#! /usr/bin/gawk -f
# Last edited on 1998-07-14 23:08:31 by stolfi

BEGIN {
  # Exit status requested by error(); a negative value means "no error".
  abort = -1;
  usage = "replace-signif-digraph DIG SYM < INFILE > OUTFILE";

  # Input is a file produced by extract-signif-chars, one record per line,
  # whose first character is the record class:
  #   0 = decoration, 1 and 2 = word and paragraph breaks,
  #   3 = one significant character.
  # Every pair of consecutive class-3 characters that spells DIG is
  # replaced by a single class-3 record carrying SYM.  Decoration records
  # between the two characters do not prevent a match; a break does.

  # Command line: exactly DIG (two characters) and SYM (one character).
  if (ARGC != 3) {
    error("bad args - usage: " usage);
  }

  dig = ARGV[1];
  if (length(dig) != 2) {
    error("bad digraph - usage: " usage);
  }
  diga = substr(dig, 1, 1);
  digb = substr(dig, 2, 1);

  repl = ARGV[2];
  if (length(repl) != 1) {
    error("bad replacement - usage: " usage);
  }

  # Hide DIG and SYM from awk so they are not opened as input files;
  # the data is then read from standard input.
  ARGC = 1;

  # Pending state:
  #   siga - previous significant char of the current word, or "" if none;
  #   deco - decoration seen after "siga" (must be "" whenever siga is "").
  siga = "";
  deco = "";

  # Number of digraphs replaced so far (reported by the END rule).
  replacements = 0;
}


/^[0]/ {
  # Class 0: decoration record.
  if (abort >= 0) { exit abort; }

  if (siga != "") {
    # A significant char is being held back: keep this decoration with it
    # (record payload only, without the class digit).
    deco = deco substr($0, 2);
    next;
  }

  # Nothing is pending, so the record can go straight through.
  print $0;
  next;
}

/^[12]/ {
  # Class 1 or 2: word or paragraph break.  A digraph cannot span a
  # break, so write out whatever is pending and then copy the break.
  if (abort >= 0) {
    exit abort;
  }
  flush_buffers();
  print $0;
  next;
}

/^[3]/ {
  # Class 3: one significant character.  Try to combine it with the
  # pending character "siga" (if any) into the digraph being replaced.
  if (abort >= 0) { exit abort; }
  if (length($0) != 2) { error(("line " NR ": wrong length in class \"3\" record")); }
  sigb = substr($0,2,1);
  if (siga == "")
    { # nothing pending - hold this char until we see what follows it
      siga = sigb;
    }
  else if ((siga == diga) && (sigb == digb))
    { # digraph complete - emit the replacement symbol instead of the pair
      printf "3%s\n", repl;
      replacements++;
      siga = "";
      if (deco != "")
        { # Decoration sat between the two halves of the digraph.  Emit it
          # right after the new symbol and clear the buffer, so that "deco"
          # is empty whenever "siga" is empty; otherwise the stale text
          # would be attached to some later, unrelated character (or be
          # lost at end of input).
          warning(("line " NR ": decoration squeezed over"));
          printf "0%s\n", deco;
          deco = "";
        }
    }
  else
    { # no match - write out the pending char and hold the new one
      flush_buffers(); siga = sigb;
    }
  next;
}

function flush_buffers()
{
  # Writes the pending significant char as a class-3 record, followed by
  # its accumulated decoration (if any) as a class-0 record, and resets
  # both buffers.  Does nothing at all when no char is pending.
  if (siga == "") { return; }

  printf "3%s\n", siga;
  siga = "";

  if (deco == "") { return; }

  printf "0%s\n", deco;
  deco = "";
}

/./ {
  # Any non-empty record that was not consumed by the class 0-3 rules
  # (each of which ends with "next") has an unknown class character.
  if (abort >= 0) {
    exit abort;
  }
  error("line " NR ": invalid character class");
}

END {
  # "exit" from a rule or from error() still runs END; in that case just
  # propagate the recorded status without producing further output.
  if (abort >= 0) { exit abort; }

  # Write out the significant char (and its decoration) still held back.
  flush_buffers();

  # Summary goes to stderr.  Uses ">>" like error() and warning() so the
  # same output name is not opened with a mix of ">" and ">>".
  printf "  replaced %d instances\n", replacements >> "/dev/stderr";
}

function error(msg)
{
  # Reports a fatal problem on stderr and terminates with status 1.
  # "abort" is set as well because the END rule still runs after "exit"
  # and uses it to stop without emitting any further output.
  abort = 1;
  printf("%s\n", msg) >> "/dev/stderr";
  exit abort;
}

function warning(msg)
{
  # Reports a non-fatal problem on stderr; processing continues.
  printf("  (warning) %s\n", msg) >> "/dev/stderr";
}