#! /usr/bin/gawk -f
# Last edited on 2025-05-04 16:56:09 by stolfi

BEGIN {
  abort = -1;
  usage = ( "extract_and_count_prefixes.gawk \\\n" \
    "  [ -v spaceMarker=STRING ] \\\n" \
    "  [ -v showBadWords=BOOL ] \\\n" \
    "  < INFILE > OUTFILE" \
  );

  # Assumes the input records have fields
  # 
  #   COUNT LEXEME 
  # 
  # where LEXEME is plain EVA, with capitalized ligatures and elements
  # marked off by {}; and COUNT is its token count. For each input record
  # the program appends to each end of LEXEME a `spaceMarker' element 
  # in braces, and then writes out several records of the form
  # 
  #   COUNT PREFIX
  # 
  # where the PREFIX consists of the first K elements of 
  # the padded LEXEME, including the padding elements,
  # for all K from 1 to the maximum number.
  
  if (spaceMarker == "") { spaceMarker = "_"; }
  spaceMarkerWBraces = ( "{" spaceMarker "}" );
  if (showBadWords == "") { showBadWords = 0; }
  
  # indexed with the capitalized and bracketed lexeme prefix:
  split("", prefixCt);

  nlexemeGud = 0;  nlexemeShort = 0;
  ntokenGud = 0; ntokenShort = 0;
}

(abort >= 0) { exit abort; }

/^ *([#]|$)/ { next; }

/./ { 
  if (NF != 2) { data_error("bad line format"); }
  ct = $1; worig = $2;
  if (worig !~ /^[{}a-zA-Z?]+$/) { data_error("bad lexeme"); }
  w = worig;
 
  # split lexeme into elements:
  gsub(/[}][{]/, "} {", w);
  ne = split(w, welem, " ");
  
  # Append and prepend a space to the lexeme:
  welem[0] = spaceMarkerWBraces;
  welem[ne+1] = spaceMarkerWBraces;
    
  # Element consistency check:
  lexemeBad = 0;
  for (i = 1; i <= ne; i++) 
    { e = welem[i];
      if (e !~ /^[{][a-zA-Z?]+[}]$/) { data_error(("badly formed elem \"" e "\"")); }
    }
    
  if (ne < order - 1) 
    { nlexemeShort++; ntokenShort += ct; next; }
  else
    { nlexemeGud++; ntokenGud += ct; }
  
  extract_prefixes(welem, ne, ct, worig);
  
  next;
}

function extract_prefixes(welem, ne, ct, worig,   i,ei,p)
{
  # tallies `ct' occurrences of the prefixes `welem[0..k]'
  # for all `k' starting from 1.
  # `worig' is the original lexeme (for debugging).
  p = welem[0];
  prefixCt[p] += ct;
  for (k = 1; k <= ne+1; k++)
    { e = welem[k];
      p = ( p e );
      if (! (p in prefixCt)) { prefixCt[p] = 0; }
      prefixCt[p] += ct;
    }
}

END {
  if (abort >= 0) { exit abort; }
  if (nlexemeShort > 0)
    { printf "%d short lexemes (%d short tokens) found:\n",
        nlexemeShort, ntokenShort > "/dev/stderr";
    }

  for (p in prefixCt)
    { ct = prefixCt[p];
      printf "%7d %s\n", ct, p;
    }
}

function arg_error(msg)
{ 
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1; exit 1;
}

function data_error(msg)
{ 
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1; exit 1;
}

function table_error(msg)
{ 
  printf "error in elem table: %s\n", msg > "/dev/stderr";
  abort = 1; exit 1;
}