#! /usr/bin/gawk -f
# Last edited on 1998-07-14 23:41:43 by stolfi

# Extracts tuples from the output of "extract-signif-chars"

BEGIN{
  # Command-line synopsis, assembled one line at a time.
  # (Not referenced anywhere else in the visible part of this script.)
  usage =       "compute-cond-tuple-info \\\n";
  usage = usage "  -v order=ORDER \\\n";
  usage = usage "  [ -v filler=CHAR ] \\\n";
  usage = usage "  [ -v lowercase=BOOL ] \\\n";
  usage = usage "  < SIGFILE > TUPFILE";

  # The file SIGFILE must have been created by "extract-signif-chars".
  # This script writes to standard output the n-tuples of consecutive
  # significant characters read from SIGFILE, where n=ORDER.
  # The "decoration" records in SIGFILE are ignored.
  #
  # If "lowercase" is true, the significant characters are converted
  # to lower case.
  #
  # Each word break in SIGFILE is replaced by a single instance of the
  # "filler" character (which must be printable and non-blank). Each
  # paragraph break in SIGFILE is replaced by ORDER-1 consecutive fillers.

  # A negative "abort" means that no error has been reported so far;
  # error() sets it to the exit status to be used.
  abort = -1;

  # Validate the options first: init_tup() depends on "order".
  check_options();
  init_tup();
}
    
/^[0]/{
  # Records whose first character is "0" contribute nothing to the
  # tuples: they are skipped, unless an error is already pending.
  if (abort < 0) { next; }
  exit(abort);
}

/^[1]/{
  # Records whose first character is "1" feed exactly one filler
  # character into the tuple window (presumably the word-break case
  # described in the BEGIN comments).
  if (abort < 0) { push_char(filler); next; }
  exit(abort);
}

/^[2]/{
  # Records whose first character is "2" feed order-1 filler characters
  # into the tuple window (presumably the paragraph-break case described
  # in the BEGIN comments), so no tuple spans two paragraphs.
  if (abort >= 0) { exit(abort); }
  i = 1;
  while (i < order) { push_char(filler); i++; }
  next;
}

/^[3]/{
  # Records whose first character is "3" carry one significant
  # character in column 2; it is mapped through "map" and pushed into
  # the tuple window.
  
  # Same pending-error guard as the other record rules.
  if (abort >= 0) { exit(abort); }
  
  c = substr($0,2,1);
  
  # A record with nothing after the type digit would push an empty
  # string and silently produce tuples of the wrong length.
  if (c == "") { error(("missing character in type-3 record")); }
  
  # "map" is only pre-seeded for code points 0..255; any other
  # character (e.g. a multibyte one) must map to itself rather than
  # to the empty string.  The "in" test also avoids creating the entry.
  m = ((c in map) ? map[c] : c);
  
  # The filler is reserved for word/paragraph breaks.
  if (m == filler) { error(("\"filler\" character found on input")); }
  push_char(m);
  next;
}

END{
  # Propagate the status of any error reported earlier.
  if (abort >= 0) { exit(abort); }
  
  # Consistency check: the order-1 characters still held in "tup" must
  # all be fillers, i.e. the input must have ended with a paragraph break.
  # When nothing was ever pushed (empty or decoration-only input) "tup"
  # is still empty and there is nothing to verify; without this guard
  # such input would be reported as "internal error 1".
  if (tup != "")
    { for (i=1;i<order;i++) 
        { if (substr(tup,i,1) != filler) { error(("internal error 1")); } }
    }
}

function init_tup()
{
  # Resets the tuple window (globals "tup" and "wait").
  #
  # "wait" counts how many more characters must be pushed before the
  # window is full enough for push_char() to start printing tuples.
  wait = order - 1;
  tup = "";
}

function push_char(m)
{
  # Appends "m" to the tuple window "tup".  While the window is still
  # filling up ("wait" > 0) only the countdown is decremented; once it
  # is full, the window is printed as one tuple and then its first
  # character is dropped.
  tup = tup m;
  if (wait != 0) { wait--; return; }
  print tup;
  tup = substr(tup, 2);
}

function check_options(   i,c,mk,ucs,lcs,uc,lc)
{
  # Analyzes/defaults the option variables, namely
  #
  #   "order" "filler" "lowercase"
  #
  # Defines the global variable "map" that maps characters to lowercase 
  # if so desired (otherwise it maps each character to itself).
  #
  # Calls error() -- which terminates the script -- if "order" is
  # missing, out of the range 1..20 or not an integer, or if "filler"
  # is not exactly one character long.
  
  if (order == "") { error("should define \"order\""); } 
  # A fractional "order" must be rejected too: the countdown set up by
  # init_tup() would never hit exactly zero and no tuple would be printed.
  if ((order < 1) || (order > 20) || (order != int(order)))
    { error("funny \"order\""); } 
  # Force a plain numeric value from here on.
  order = int(order);
    
  if (filler == "") { filler = "_"; }
  if (length(filler) != 1)
    { error(("the \"filler\" should be a single char")); }

  # --- lowercase mapping ----------------------------------------------
  # Start from the identity map over code points 0..255.
  split("", map);
  for (i=0;i<256;i++) { c = sprintf("%c", i); map[c] = c; }
  
  if (lowercase == "") { lowercase = 0; }
  if (lowercase > 0) 
    { # Parallel strings: the i-th character of "ucs" maps to the
      # i-th character of "lcs".
      # NOTE(review): the non-ASCII part has no Ч/ч and no Я/я; those
      # gaps sit where ISO-8859-1 has the non-letters × and ß, so these
      # may be Latin-1 accented letters mis-decoded as Cyrillic --
      # confirm the intended alphabet and file encoding.
      ucs = "ABCDEFGHIJKLMNOPQRSTUVWXYZАБВГДЕЖЗИЙКЛМНОПРСТУФХЦШЩЪЫЬЭЮ";
      lcs = "abcdefghijklmnopqrstuvwxyzабвгдежзийклмнопрстуфхцшщъыьэю";
      for (i=1;i<=length(ucs);i++)
        { uc = substr(ucs,i,1); lc = substr(lcs,i,1);
          map[uc] = lc;
        }
    }

}

function error(msg)
{ 
  # Reports "msg" on the standard error stream and terminates the
  # script with exit status 1.  The status is also stored in the global
  # "abort", which the END rule checks before doing its own work.
  abort = 1;
  printf("%s\n", msg) >> "/dev/stderr";
  exit(1);
}