#! /usr/bin/gawk -f
# Last edited on 1998-12-25 06:51:57 by stolfi

# Splits a multipage EVT-format file into one file per text unit.
# Assumes each unit is preceded by a line "## <PAGE> ... " or 
# "## <PAGE.UNIT> ...", where PAGE is a page f-number and UNIT is a 
# capital letter.
#
# Also writes to stdout a list of the files, in the order written.
# Attempts to work with any locator style (Stolfi's <fNNN.UU.LL> or 
# EVMT's <fNN.ULL>).
# 

BEGIN {
  # Command-line checking and state initialization.
  #   usage : text printed by arg_error() after its message.
  #   abort : negative while all is well; error routines set it to the
  #           exit status before calling "exit".
  usage = "split-evt-into-units -v outdir=DIR -v style=STYLE < FILE > PAGELIST";
  abort = -1;

  # Both "outdir" and "style" must be supplied with "-v"; arg_error()
  # does not return, so only the first problem found is reported.
  if (outdir == "")
    { arg_error("must define \"outdir\""); }
  else if (style == "")
    { arg_error("must define \"style\""); }
  else if (! ((style == "EVMT") || (style == "STOLFI")))
    { arg_error("invalid \"style\""); }

  # No unit has been seen yet; "junk" collects the rejected lines.
  unit = "";
  file = "";
  junk = ( outdir "/junk" );
}

function pline(lin)
{
  # Appends the line "lin" to the current unit "file".  If no unit file
  # has been selected yet, reports the problem and sends the line to
  # the "junk" file instead.
  if (file != "")
    { print lin >> file;
      return;
    }
  format_error("line with no unit");
  print lin > junk;
}

# For every input record: if "abort" holds a non-negative status, exit with it.
(abort >= 0) { exit abort; }

# Unit header line (starts with "##"): looks for a "<PAGE>" or "<PAGE.UNIT>"
# locator anywhere in the line and, if found, makes that unit the current
# one, switching output to the file "outdir/UNIT-NAME".
/^##/ { 
  # The optional ".UNIT" suffix is written as "(...)?" instead of with an
  # empty alternative "(|...)": POSIX leaves the latter undefined, while
  # both forms accept exactly the same strings in gawk.
  if ( style == "EVMT") 
    { mtc = match($0, /[<]f[0-9]+[rv]?[0-9]?([.][A-Z])?[>]/); }
  else if ( style == "STOLFI")
    { mtc = match($0, /[<]f[0-9]+[rv]?[0-9]?([.][A-Za-z][A-Za-z0-9]?)?[>]/); }
  else
    { program_error("bad style"); }
    
  if (mtc == 0)
    { # No recognizable locator: report it and divert the line to the
      # junk file.  The previously selected unit, if any, stays current.
      format_error("bad format of unit header");
      print $0 > junk;
    }
  else
    { # Finish the previous unit file before switching to the new one.
      if (file != "") { close(file); }
      # The unit name is the matched locator without its "<" and ">".
      unit = substr($0, RSTART+1, RLENGTH-2);
      file = (outdir "/" unit);
      # List the file on stdout, then copy the header line into it.
      printf "%s\n", file;
      pline($0);
    }
  next;
}

# Remaining "#" lines ("##" headers were consumed by the rule above) are
# copied unchanged to the current unit file.
/^#/ {
  pline($0); next;
}

# Lines that are empty or contain only spaces are dropped.
/^ *$/ { next; }

# Text line: must start with a well-formed locator whose page/unit part
# equals the current "unit"; such lines are appended to the unit file,
# all others are reported and diverted to the junk file.
{
  un = "";   # Page/unit part of the locator, without the leading "<".
  mtc = 0;   # Stays zero unless the whole locator is well-formed.
  if ( style == "STOLFI")
    { # Prefix is "<" PAGE "." UNIT "."; the rest must start with
      # digits, an optional lowercase letter, ";", a capital letter, ">".
      if (match($0, /^[<]f[0-9]+[rv]?[0-9]?[.][A-Za-z][A-Za-z0-9]?[.]/) != 0)
        { # Strip the leading "<" and the trailing ".".
          un = substr($0, RSTART+1, RLENGTH-2);
          mtc = match(substr($0,RLENGTH+1), /^[0-9]+[a-z]?[;][A-Z][>]/);
        }
    }
  else if ( style == "EVMT")
    { # Prefix is "<" PAGE "." UNIT-letter; the rest must start with
      # digits, ";", a capital letter, ">".
      if (match($0, /^[<]f[0-9]+[rv]?[0-9]?[.][A-Z]/) != 0)
        { # Strip only the leading "<".
          un = substr($0, RSTART+1, RLENGTH-1);
          mtc = match(substr($0,RLENGTH+1), /^[0-9]+[;][A-Z][>]/);
        }
    }
  else
    { program_error("bad style"); }

  if ((mtc != 0) && (un == unit))
    { pline($0); }
  else
    { # A malformed locator takes precedence over a unit mismatch.
      format_error((mtc == 0) ? "bad format" : "wrong unit");
      print $0 > junk;
    }
  next;
}

# Fallback for lines not handled by any rule above: report and divert to
# the junk file.  NOTE(review): the preceding rule matches every record and
# always ends with "next" (or exits), so this rule appears to be unreachable.
{
  format_error("bad line format");
  print > junk;
  next;
}

END {
  # Close the last unit file, if one was selected, and the junk file.
  if (file != "")
    { close(file); }
  close(junk);
}

function format_error(msg)
{
  # Reports the problem "msg" with the current input line on stderr,
  # tagged with FILENAME and FNR.  Does not stop the program.
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) >> "/dev/stderr";
}

function arg_error(msg)
{
  # Reports the command-line problem "msg" on stderr, followed by the
  # "usage" string, then records status 1 in "abort" and exits with it.
  printf("%s\n", msg) >> "/dev/stderr";
  printf("usage: %s\n", usage) >> "/dev/stderr";
  abort = 1;
  exit abort;
}

function program_error(msg)
{
  # Reports an internal inconsistency "msg" on stderr, tagged with
  # FILENAME and FNR, then records status 1 in "abort" and exits with it.
  printf("file %s, line %d: prog error %s\n", FILENAME, FNR, msg) >> "/dev/stderr";
  abort = 1;
  exit abort;
}