#! /bin/gawk -f
# Last edited on 2004-02-26 18:15:01 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
    "cat INFILE.txt \\\n" \
    "  | roman-code-words -f roman-code-fns.gawk \\\n" \
    "      [ -v table=TBLNAME ] \\\n" \
    "      [ -v honorCase=BOOL ] \\\n" \
    "      [ -v maxNumWords=NUMWDS [ -v maxCode=MAXCODE ] ] \\\n" \
    "  > OUTFILE.rcd" \
  );

  # Each line of the input must contain zero or more words separated
  # by spaces. Maps each plain-word in the input to a numeric code,
  # then encodes the latter with a Roman-like number system, and
  # writes the resulting code-word.
  # 
  # The file TBLNAME, which defaults to empty, should contain 
  # a list of PLAINWORD CODEWORD pairs, one per line; it is used to map
  # plain-words to code-words directly. In any case, each input word
  # that is not found in the table gets assigned the next available
  # code number, and this assignment is entered in the table.
  # 
  # By default, the valid numeric codes are sequential, starting from 1.
  # However, if "maxNumWords" is specified, the valid numeric codes
  # are spaced further apart so that NUMWDS of them will span the
  # range [1..MAXNUMCODE] of valid numeric codes.  The value
  # of MAXNUMCODE defaults to the maximum allowed by the encoding,
  # but may be set explicitly by giving the "maxCode" parameter
  # (an *encoded* number).
  # 
  # If {honorCase} is set (default), the script will attempt to copy
  # the capitalization of the input words into the code. I.e. if the
  # input word is "Mars", and "foo" is the next available code word,
  # then "foo" is assigned to "mars" and "Foo" to "Mars". This rule
  # also applies to pairs pre-defined in the initial table. Otherwise
  # "Mars" and "mars" are considered distinct words and get distinct
  # codes.
  #
  # In any case, the completed encoding table will be written at the end
  # of the output text, as a bunch of "#"-comments, bracketed by
  # the heading "# BEGIN DICTIONARY" and "# END DICTIONARY"
  #
  # {abort} stays negative while everything is fine; the error
  # handlers set it to the desired exit status.
  
  if (honorCase == "") { honorCase = 1; }
  
  renc_init();
  printf "maximum code = %s (%d)\n", \
    renc_max_code, renc_max_num > "/dev/stderr";

  if (maxNumWords == "") 
    { code_increment = 1; }
  else
    { printf "assuming max %d distinct words\n", maxNumWords > "/dev/stderr";
      # Force a numeric value and reject anything that would make the
      # division below fail (zero, negative, or non-numeric input).
      maxNumWords += 0;
      if (maxNumWords < 1) 
        { arg_error(("given \"maxNumWords\" must be a positive number")); }
      if (maxCode == "")
        { # Default to the largest code of the encoding, keeping the
          # encoded and numeric forms in step for the message below.
          maxCode = renc_max_code;
          maxNumCode = renc_max_num;
        }
      else
        { maxNumCode = renc_decode(maxCode); 
          if (maxNumCode > renc_max_num) 
            { arg_error(("given \"maxCode\" is too big")); }
        }
      printf "adjusting increment for max code = %s (%d)\n", maxCode, maxNumCode > "/dev/stderr";
      if (maxNumWords > maxNumCode) 
        { arg_error(("given \"maxNumWords\" is too big")); }
      # May be fractional; {encode_word} truncates when assigning codes.
      code_increment = maxNumCode/maxNumWords;
    }
  printf "code increment = %d\n", code_increment > "/dev/stderr";

  # Start with an empty dictionary and no codes assigned:
  split("", dic);
  num_distinct_words = 0;
  max_assigned_num_code = 0;
  max_assigned_code = "";
  
  if (table != "") { load_table(table); }
}

# Once an error handler has stored a non-negative status in {abort},
# stop reading input and leave with that status.
abort >= 0 {
  exit abort;
}

# Lines that are blank, or whose first non-space character is "#" or
# "@", are copied to the output untouched.
$0 ~ /^ *([#@]|$)/ {
  print $0;
  next;
}

# Every remaining line: replace each field by its code-word, then
# print the rebuilt record.
{
  i = 0;
  while (++i <= NF)
    { wd = $(i); $(i) = encode_word(wd); }
  print;
  next;
}

# The following is OK for texts with Latin alphabet: 
# 
# // {
#   lin = $0; res = ""; wd = "";
#   m = length(lin);
#   for (i = 1; i <= m; i++) 
#     { c = substr(lin,i,1);
#       if (is_alpha(c))
#         { w = (w c); }
#       else
#         { if (w != "") { res = (res encode_word(w)); w = ""; }
#           res = (res c); 
#         }
#    }
#   if (w != "") { res = (res encode_word(w)); w = ""; }
#   print res;
# }
# 
# function is_alpha(c)
# {
#   c = tolower(c);
#   return \
#     ((c >= "a") && (c <= "z")) || \
#     ((c >= "ß") && (c <= "ö")) || \
#     ((c >= "ø") && (c <= "ÿ"));
# }

END {
  # In awk an "exit" executed in BEGIN or in a main rule still runs
  # the END rules.  If an error handler recorded a failure status in
  # {abort}, leave right away with that status instead of reporting
  # statistics and dumping a dictionary after the error.
  if (abort >= 0) { exit abort; }

  # Summary on stderr:
  printf "%d distinct words seen\n", num_distinct_words > "/dev/stderr";
  printf "maximum code used = %s (%d)\n", max_assigned_code, max_assigned_num_code > "/dev/stderr";

  # Dump the completed table to the output as "#"-comments.  The
  # traversal order of {dic} is whatever awk chooses.
  printf "# BEGIN DICTIONARY\n";
  for (w in dic)
    { printf "# %s %s\n", w, dic[w]; }
  printf "# END DICTIONARY\n";
}

function encode_word(word,  w,num,code,pfx,sfx)
{
  # Returns the code-word for {word}, assigning a fresh code (and
  # recording it in {dic}) when the word has not been seen before.
  # An empty {word} is returned unchanged.
  #
  # Reads the globals {honorCase} and {code_increment}; updates {dic},
  # {num_distinct_words}, {max_assigned_num_code}, {max_assigned_code}.

  # Empty is not a word:
  if (word == "") { return word; }
  # Lookup word in table (by its lowercase form when honoring case):
  w = (honorCase ? tolower(word) : word);
  if (w in dic)
    { code = dic[w]; }
  else
    { num_distinct_words++;
      # {code_increment} may be fractional, hence the truncation.
      num = int(max_assigned_num_code + code_increment);
      # NOTE(review): {prog_error} is not defined in this file;
      # presumably it comes from the function library - confirm.
      if (num <= max_assigned_num_code) { prog_error(("num code bug")); }
      code = renc_encode(num);
      max_assigned_num_code = num;
      max_assigned_code = code;
      dic[w] = code;
    }
  if (honorCase && (word != w))
    { # The original {word} was not in lowercase. 
      # Try to preserve its capitalization in the code, if possible.
      # Note that some codes have non-alpha prefixes.
      if (word ~ /[A-Z].*[A-Z]/) 
        { # Two or more capitals - assume all-caps:
          code = toupper(code);
        }
      else 
        { # Assume initial-caps: capitalize the first lowercase letter
          # of the code, leaving any prefix before it untouched.
          if (match(code, /[a-z]/))
            { pfx = substr(code,1,RSTART-1); 
              sfx = substr(code,RSTART);
              code = (pfx toupper(substr(sfx,1,1)) substr(sfx,2));
            }
        }
      # We assign the modified code to {word} too, for 
      # the benefit of dumber programs who may want to use our table
      dic[word] = code;
    }
  return code;
}

function load_table(fname,    ntbl,nbadtbl,nlin,lin,fld,nfld,word,code,num,status)
{
  # Loads the mapping table "dic" from file "fname".
  # Also updates "max_assigned_num_code" and "max_assigned_code".
  #
  # Each non-blank, non-comment line must hold exactly two fields,
  # PLAINWORD and CODEWORD; anything from a "#" onwards is ignored.
  # When {honorCase} is set both fields are folded to lowercase.
  # Malformed lines, repeated plain-words, and read errors are
  # reported through {tbl_error}; a table with no lines at all is
  # reported through {arg_error}.
  max_assigned_num_code = 0;
  max_assigned_code = "";
  ntbl = 0;
  nbadtbl = 0;
  nlin = 0;
  # Keep the status of the last {getline}: 0 is end-of-file, negative
  # is a read/open failure.  Testing this is more reliable than
  # looking at the initial value of ERRNO.
  while ((status = (getline lin < fname)) > 0) { 
    nlin++;
    if (lin !~ /^[ \011]*([#]|$)/)
      { gsub(/[#].*$/, "", lin);
        nfld = split(lin, fld, " ");
        if (nfld != 2) tbl_error(fname, nlin, ("bad table entry = \"" lin "\""));
        word = fld[1]; code = fld[2];
        if (honorCase) { word = tolower(word); code = tolower(code); }
        if (word in dic) tbl_error(fname, nlin, ("repeated plainword = \"" lin "\""));
        dic[word] = code;
        # The caller-supplied -1 is what comes back for a code-word
        # that {renc_decode} cannot decode (see the test below).
        num = renc_decode(code,-1);
        if (num == -1) 
          { nbadtbl++; }
        else if (num > max_assigned_num_code) 
          { max_assigned_num_code = num;
            max_assigned_code = code;
          }
        ntbl++;
      }
  }
  if (status < 0) { tbl_error(fname, nlin, ERRNO); }
  close (fname);
  if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  printf "loaded %6d pairs from %s\n", ntbl, fname > "/dev/stderr";
  if (nbadtbl > 0) 
    { printf "!!! warning - file %s contains %d invalid word codes\n", \
        fname, nbadtbl > "/dev/stderr";
    }
}

function arg_error(msg)
{
  # Reports a bad command-line parameter: prints {msg} and the usage
  # text on stderr, records failure in {abort}, and exits with status 1.
  abort = 1;
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  exit abort;
}

function tbl_error(f,n,msg)
{
  # Reports a problem at line {n} of table file {f}: prints {msg} on
  # stderr, records failure in {abort}, and exits with status 1.
  abort = 1;
  printf("file %s, line %d: %s\n", f, n, msg) > "/dev/stderr";
  exit abort;
}

function data_error(msg)
{
  # Reports a problem at the current input line (FNR): prints {msg}
  # on stderr, records failure in {abort}, and exits with status 1.
  abort = 1;
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  exit abort;
}