#! /usr/bin/gawk -f
# Last edited on 2025-09-24 16:52:14 by stolfi
# ** MUST BE SAVED IN ISO-LATIN-1 **
BEGIN {
abort = -1;
usage = ( ARGV[0] "\\\n" \
" -f FUNCS.gawk \\\n" \
" -v smp=SMP \\\n" \
" -v sec=SEC \\\n" \
" [ -v table=TABLE.tbl ] \\\n" \
" [ -v maxAlpha=NUM ] \\\n" \
" < INFILE > OUTFILE " \
);
# Converts a Langbank token file ("main.wds") into a "raw.alw" or "raw.tlw"
# language sample file for VMS comparative analysis studies.
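#
# Example invocation (file and sample names are hypothetical):
#
#   gawk -f main-wds-to-raw.gawk -f bible-funcs.gawk \
#     -v smp=vul -v sec=gen.1 \
#     < main.wds > raw.alw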
#
# Each input record must have the format "{type} {token}", where
# {type} is one of [#@$aspbn] and defines the token type, as follows:
#
# "#" = {token} is a #-comment (may include blanks).
# "$" = start of the section whose full ID is {token}, e.g. "{GEN}{c3}{v14}".
# "@" = start of line number {token} in the original text.
# "a" = {token} is an alpha token.
# "s" = {token} is a symbol-like token (numeral, math symbol, etc.).
# "p" = {token} is a punctuation-like token.
# "b" = {token} is a blank-like token (should not occur).
# "n" = {token} is a null token (should not occur).
#
# Output records will have the format "{type} {loc} {token}" where
# {token} is a text token, {type} is "a", "s", or "p", and {loc} is
# the token's location in the original book. The location {loc}
# consists of the full book section ID {curSec}, concatenated with
# the book line number {curLin} enclosed in "{" and "}".
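#
# For example, the token "principio" of the hypothetical excerpt
# above would be written as
#
#   a {GEN}{c1}{v1}{15} principio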
#
# This script first recomputes the type of each input record of type
# "a", "p", or "s" by calling a procedure from the user-specified
# library "FUNCS.gawk":
#
# smp_reclassify_token(smp, sec, curSec, curLin, type, token)
#
# where {token} is the {body} of the input record; {type} is its
# single-character type tag, as above; {smp} and {sec} are the
# desired sample and sub-sample tags, specified by the user; and
# {curSec} and {curLin} are the full section ID and line number
# containing this occurrence of {token} in the original book. The
# procedure must return a new type for {token}, according to the
# table above; or "x", meaning that the record lies in an unwanted
# intrusion within the selected section (foreign phrase, table, poem,
# etc.) that should be treated like a symbol.
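#
# A minimal sketch of such a procedure (the section pattern and the
# digit test are hypothetical examples; real libraries will be more
# elaborate):
#
#   function smp_reclassify_token(smp, sec, curSec, curLin, type, token)
#   {
#     # Discard anything outside the requested section:
#     if (curSec !~ /^[{]GEN[}]/) { return "n"; }
#     # Demote all-digit "alpha" tokens to symbols:
#     if ((type == "a") && (token ~ /^[0-9]+$/)) { return "s"; }
#     return type;
#   }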
#
# The values of {curSec} and {curLin} are obtained from the last "$"
# and "@" records, respectively, before the current record. Note
# that the format of {sec} may be very different from that of
# {curSec}; e.g. {sec="gen.1"} could mean "take the whole text from
# Genesis", i.e. {curSec ~ /{GEN}{c[0-9]+}{v[0-9]+}/}.
#
# After calling {smp_reclassify_token}, the script silently discards
# the record if its new type is "#", "n", or "b"; any new type other
# than those and "a", "p", "s", or "x" is a fatal error.
#
# The remaining "a", "p", "s", or "x" records have their {token}
# adjusted to be suitable for statistical analysis. This may include
# change of encoding, de-capitalization, compound splitting,
# suppression of some letters or tokens, etc.
#
# The script then applies the function
#
# smp_fix_token(smp, sec, type, token)
#
# where {smp} and {sec} are the desired sample and section,
# {type} is the new type ("a", "p", "s", or "x"), and {token} is
# the input token. The procedure should return a cleaned
# copy of {token}, e.g. without capitalization or undesired
# markings. It may split {token} by inserting blanks and/or
# newlines.
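#
# A minimal sketch (the specific cleanups are hypothetical examples):
#
#   function smp_fix_token(smp, sec, type, token)
#   {
#     if (type == "a")
#       { token = tolower(token);  # De-capitalize.
#         gsub(/-/, " ", token);   # Split compounds at hyphens.
#       }
#     return token;
#   }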
#
# A token that gets remapped to "*DELETE*", "*delete*", or all blanks
# will be discarded. The result of {smp_fix_token} is split at blanks
# and newlines, and each field is processed as if it were a separate
# input record, with the same {type} and {loc}.
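#
# For example, if {smp_fix_token} returns "anno domini" for the token
# "Anno-Domini", the fields "anno" and "domini" are processed as two
# separate records, both with the original {type} and {loc}.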
#
# The individual "a" records are then tested with
# {smp_is_good_token(smp, sec, type, token)}, and the {type} is
# changed to "s" if the result is FALSE. The "a", "p", and "s"
# records are then written to the output.
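#
# A minimal sketch (the lowercase-letter test is a hypothetical
# example):
#
#   function smp_is_good_token(smp, sec, type, token)
#   {
#     # Accept only nonempty strings of lowercase letters:
#     return (token ~ /^[a-z]+$/);
#   }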
#
# As for "x"-records, runs of three or more are squeezed, leaving only
# the first and last records of the run. These have their token enclosed
# in braces and marked with "*", e.g. "finis" becomes "*{finis}",
# and their type gets replaced by "s".
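#
# For instance, a run of five "x" tokens "lorem ipsum dolor sit amet"
# would be output as just two "s" records, with tokens "*{lorem}" and
# "..*{amet}".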
#
# The resulting records are written to standard output.
#
# The output is truncated after {maxAlpha} "a"-type tokens have been
# written. The default is to process the whole input file.
#
# INITIALIZATION
#
# If the {table} argument is specified, it must name a file that
# contains pairs of words "{OLD} {NEW}", one pair per line. This
# script will read that file and create an array {wmap} with
# {wmap[OLD] = NEW}. This table may then be used by
# {smp_reclassify_token} and/or {smp_fix_token}.
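#
# For example, a table file might contain (hypothetical entries):
#
#   ihs     iesus    # expand abbreviation
#   vv      uu
#
# Blank lines and "#"-comments in the table file are ignored.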
#
# The "FUNCS.gawk" library must also define a function
#
# smp_define_patterns(smp, sec)
#
# that will be called by this script, after loading the {wmap} table
# (if any) but before processing the first input record. This
# procedure could, for instance, precompile any complicated patterns
# to be used by {smp_reclassify_token}.
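#
# A minimal sketch (the global {goodSecPat} is hypothetical):
#
#   function smp_define_patterns(smp, sec)
#   {
#     # Precompile the section pattern for {smp_reclassify_token}:
#     goodSecPat = ("^[{]" toupper(substr(sec, 1, 3)) "[}]");
#   }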
if (smp == "") { arg_error("must define \"smp\""); }
if (sec == "") { arg_error("must define \"sec\""); }
if (maxAlpha == "") { maxAlpha = -1; }
if (maxRead == "") { maxRead = -1; }
debug = 0;
# debug = 1; maxRead = 200;
s = "???"; n = "???";
curSec = "";
curLin = "";
nExRun = 0; # Number of consecutive "x"-type records generated so far
# Data of last "x"-type output record (not written), if {nExRun >= 2}:
lastExSec = ""; # Location.
lastExLin = ""; # Original line number.
lastExToken = ""; # Token.
nRead = 0; # Total records read.
nWritten = 0; # Total records written to output.
nAlpha = 0; # Total "a"-type records written.
nSymbol = 0; # Total "s"-type records written.
nPunct = 0; # Total "p"-type records written.
nIntrude = 0; # Total "x"-type records written (as "s"-type).
split("", wmap);
if (table != "")
{ # Read word-remapping table, if present.
load_remapping_table(table);
}
if (field == "") { field = 0; }
smp_define_patterns(smp, sec);
}
(abort >= 0) { exit abort; }
/^ *$/ { next; }
# Stop if enough:
((maxRead >= 0) && (nRead >= maxRead)) {
exit 0;
}
($1 ~ /^.$/) {
nRead++;
# Get type tag {type} and body {token}:
type = $1;
if (type == "#")
{ token = substr($0, 3); }
else
{ if (NF != 2) { data_error("bad input format"); }
token = $2;
}
# Dispose accordingly:
if (type == "$")
{ curSec = token; }
else if (type == "@")
{ curLin = token; }
else if (type ~ /^[#nb]/)
{ } # Discard comment, null, and blank-like records.
else if (type ~ /^[aspx]/)
{ if (debug) { printf "@@ %s:[%s]\n", type, token > "/dev/stderr"; }
# Reclassify token and check whether it is inside the desired section:
otype = type;
type = smp_reclassify_token(smp, sec, curSec, curLin, type, token);
if (debug) { printf " -r-> %s:[%s]\n", type, token > "/dev/stderr"; }
if (type ~ /[apsx]/)
{
# Apply sample-specific adjustments:
otoken = token;
token = smp_fix_token(smp, sec, type, token);
if (debug) { printf " -f-> %s:[%s] -> %s:%s\n", type,otoken, type,token > "/dev/stderr"; }
if ((token == "*DELETE*") || (token == "*delete*")) { token = ""; }
# Split into separate tokens at blanks and newlines:
nwds = split(token, wds, /[ \012]+/);
# Write each token separately
for(i = 1; i <= nwds; i++)
{ wdi = wds[i]; tpi = type;
if (wdi != "")
{ if ( tpi == "a" )
{ # Re-classify each piece:
tpi = ( smp_is_good_token(smp, sec, tpi, wdi) ? type : "s" );
}
else
{ tpi = type; }
if (debug) { printf " -g-> %s:[%s]\n", tpi, wdi > "/dev/stderr"; }
output_token(tpi, curSec, curLin, wdi);
}
}
next;
}
else if (type !~ /[#nb]/)
{ data_error(("invalid new type tag \"" type "\"")); }
}
else
{ data_error(("unknown input type tag \"" type "\"")); }
next;
}
// { data_error(("invalid input type tag \"" $1 "\"")); }
END {
if (abort >= 0) { exit abort; }
flush_x_record();
printf " %d records read, %d written (%d alpha, %d symbol, %d punct, %d intrusions)\n", \
nRead, nWritten, nAlpha, nSymbol, nPunct, nIntrude > "/dev/stderr";
}
function output_token(type,aSec,aLin,token, gud)
{
# Outputs token {token} of type {type}, squeezing long runs of "x" tokens.
# Updates {nAlpha}, {nSymbol}, {nPunct}, {nIntrude}, {nWritten}.
# Manages {nExRun}, {lastExSec}, {lastExLin}, {lastExToken}.
# Exits the script (with 0) after writing {maxAlpha} "a"-records.
if (type == "x")
{
# Squeeze runs of "x" records, mark them as bad (type "s"):
token = ("*{" token "}");
type = "s";
if (nExRun == 0)
{ # First in a run of "x"-records, print it:
print fmt_token(type,aSec,aLin,token);
nWritten++; nIntrude++;
nExRun = 1;
}
else
{ # Non-first in a run of "x"-records, save it for now:
lastExSec = aSec; lastExLin = aLin; lastExToken = token;
nExRun++;
}
}
else
{ # Do we have any "x"-record waiting to be printed?
flush_x_record();
# Print the current record:
print fmt_token(type,aSec,aLin,token);
nWritten++;
if (type == "a")
{ nAlpha++;
# Have we written enough stuff:
if ((maxAlpha >= 0) && (nAlpha >= maxAlpha)) { exit 0; }
}
else if (type == "s")
{ nSymbol++; }
else if (type == "p")
{ nPunct++; }
}
}
function flush_x_record()
{
# If there is any "x"-record suspended, print it:
if (nExRun >= 2)
{ if (nExRun >= 3) { lastExToken = ( ".." lastExToken ); }
# Flush last "x"-record:
print fmt_token("s",lastExSec,lastExLin,lastExToken);
nWritten++; nIntrude++;
}
nExRun = 0;
}
function fmt_token(type,aSec, aLin, token)
{
# Formats a token {token} of type {type} for output,
# including {aSec} and {aLin}:
return sprintf("%s %s{%s} %s", type, aSec, aLin, token);
}
function load_remapping_table(file, nMap,lin,fld,nfld,nLin)
{
# Reads a word mapping table from "file", containing pairs
# of the form "ORIGINAL NEW", one pair per line.
# Stores the table in "wmap[ORIGINAL] = NEW".
nMap = 0; nLin = 0;
split("", wmap);
ERRNO = "";
while((getline lin < file) > 0) {
nLin++;
gsub(/^[ ]*/, "", lin);
if (! match(lin, /^([#]|$)/))
{ gsub(/[ ]*[#].*$/, "", lin);
nfld = split(lin, fld, " ");
if (nfld != 2) tbl_error(file, nLin, ("bad table entry = \"" lin "\""));
if (fld[1] in wmap) tbl_error(file, nLin, ("repeated key = \"" lin "\""));
wmap[fld[1]] = fld[2];
nMap++;
}
}
if (ERRNO != "") { arg_error((file ": ERRNO = " ERRNO)); }
close (file);
printf " loaded %6d map pairs\n", nMap > "/dev/stderr";
}
function arg_error(msg)
{
printf "%s\n", msg > "/dev/stderr";
printf "usage: %s\n", usage > "/dev/stderr";
abort = 1;
exit 1;
}
function data_error(msg)
{
printf "line %d: %s\n", FNR, msg > "/dev/stderr";
abort = 1; exit 1;
}
function tbl_error(file, lnum, msg)
{
printf "file %s, line %d: %s\n", file, lnum, msg > "/dev/stderr";
abort = 1; exit 1;
}