#! /usr/bin/gawk -f

# Last edited on 1998-12-27 11:02:26 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
    "cat INFILE \\\n" \
    "  | attach-reading-order \\\n" \
    "      -v table=TBLFILE \\\n" \
    "      [ -v trcodes=CHARS ] \\\n" \
    "  > OUTFILE\n" \
  );

# Reads an EVMT file from stdin and writes to stdout all text
# lines (and the attached #-comments, if any), each prefixed with
# three zero-padded fields: the reading order (6 digits), the
# transcriber number (2 digits), and the serial number of the line
# within its locator group (3 digits).  Each group is preceded by
# a separator line "#" whose transcriber and serial fields are zero.
#
# The transcriber code is mapped to its 1-based position in the
# "trcodes" parameter (01..26 with the default value); comment
# lines, and codes that do not occur in "trcodes", get 00.
#
# A #-comment is attached to the text line that follows it; blank
# lines and empty "#" lines in between are ignored.  Comments that
# are still pending when a page/unit header is found are dropped.
#
# Each line of TBLFILE should have two words LOC ORDER specifying
# the ORDER assigned to each locator LOC.  The locator should
# not include the "<>" delimiters nor the transcriber codes.
# Lines of TBLFILE that begin with "#" are skipped.
#
# Whenever an input location code is not found in the table,
# a message is printed and processing stops with exit status 1.

  if (table == "") arg_error("must specify \"-v table=FILE\"\n");
  if (trcodes == "") { trcodes = "HCDFGTJILMKQRNPZABEOSWXYVU"; }

  # State of the line buffer shared by the main rules and flush().
  # These must be set explicitly: an uninitialized awk variable used as
  # a subscript is the null string, not 0, so without this the first
  # buffered line would be stored in lin[""] and never printed.
  nv = 0;
  cur_loc = "";

  split("", dic);
  nMap=0;
  # The table is read into "tbl_line", NOT "lin": "lin" is the array of
  # buffered input lines, and gawk refuses to use as an array a variable
  # that has already been given a scalar value.
  while((getline tbl_line < table) > 0)
    { if (! match(tbl_line, /^[#]/))
        { nfld = split(tbl_line, fld);
          if (nfld != 2) arg_error(("bad table entry = \"" tbl_line "\""));
          if (fld[1] in dic) arg_error(("repeated key = \"" tbl_line "\""));
          dic[fld[1]] = fld[2];
          nMap++;
        }
    }
  close (table);
  if (nMap == 0) { arg_error(("file \"" table "\" empty or missing")); }
  # printf "loaded %6d map pairs\n", nMap > "/dev/stderr"
}

# Stop at once, with the recorded status, if an error was already flagged.
(abort >= 0) { exit abort; }

# Page/unit header, either neutralized (line starting with "##") or
# unprotected ("<...>" with no ";" inside, optionally followed by one
# "{...}" group): close the current locator group and discard any
# comments that are still pending.
/^##/ || /^<[^[<>;]*> *(|{[^{}]*} *)$/ {
  flush();
  nv = 0;
  next;
}

# Blank lines and blank comments (a lone "#", possibly followed by
# spaces) are skipped without being buffered.
/^ *$/ || /^# *$/ {
  next;
}

/^#/ {
  # Non-blank comment: append it to the buffer with transcriber number 0.
  trc[nv] = 0;
  lin[nv] = $0;
  nv += 1;
  next;
}

/^[<]/ {
  # Text line: it must begin with a locator of the form "<fLOC;T>",
  # where T is a single uppercase transcriber letter.
  if (! match($0, /^<f[0-9A-Za-z.]+[;][A-Z]>/))
    { format_error("bad locator"); }
  else
    { # The letter just before the closing ">" is the transcriber code;
      # the locator proper excludes the leading "<" and the final ";T>".
      tr = substr($0, RLENGTH-1, 1);
      loc = substr($0, RSTART+1, RLENGTH-4);
      # A change of locator closes the group collected so far.
      if (loc != cur_loc) { flush(); }
      trc[nv] = index(trcodes, tr);
      lin[nv] = $0;
      nv++;
      cur_loc = loc;
    }
  next;
}

# Any line that reached this point matched none of the known formats.
{ format_error("unrecognized format"); }

END {
  # After an error just propagate the recorded status; otherwise
  # write out the last locator group still held in the buffer.
  if (abort < 0)
    { flush(); }
  else
    { exit abort; }
}

function flush(   i,k)
{
  # Writes out the buffered lines of the current locator (cur_loc), each
  # prefixed with the reading order, transcriber number and serial
  # number, then resets cur_loc.  Comments that follow the last data
  # line are kept at the start of the buffer for the next group.
  # Uses globals dic, lin, trc, nv, cur_loc; sets global ord.

  # No open locator group: nothing to write.
  if (cur_loc == "") { return; }

  if (! (cur_loc in dic))
    { format_error("locator not in table"); ord = 0; }
  else
    { ord = dic[cur_loc]; }

  # Step back over trailing comments to just past the last data line:
  k = nv;
  while ((k > 0) && (substr(lin[k-1],1,1) == "#")) { k--; }

  # Separator line, then the data lines with their preceding comments:
  printf "%06d %02d %03d %s\n", ord, 0, 0, "#";
  i = 0;
  while (i < k)
    { printf "%06d %02d %03d %s\n", ord, trc[i], i+1, lin[i];
      i++;
    }

  # Move the skipped trailing comments to the front of the buffer:
  i = 0;
  while (k < nv)
    { lin[i] = lin[k];
      trc[i] = trc[k];
      i++; k++;
    }
  nv = i;

  cur_loc = "";
}

function format_error(msg)
{
  # Reports an input format problem on stderr, tagged with the current
  # file name and record number, flags the failure in the global
  # "abort", and terminates with exit status 1.
  abort = 1;
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  exit abort;
}

function arg_error(msg)
{
  # Reports a bad argument or bad table on stderr, followed by the usage
  # synopsis held in the global "usage" (when it is set), flags the
  # failure in the global "abort", and terminates with exit status 1.
  printf "%s\n", msg > "/dev/stderr";
  # The synopsis already ends with a newline, so none is added here.
  if (usage != "") { printf "usage: %s", usage > "/dev/stderr"; }
  abort = 1;
  exit 1;
}