#! /usr/bin/gawk -f
# Last edited on 2004-01-20 03:21:19 by stolfi

BEGIN {
  usage = ( ARGV[0] "\\\n" \
    "  -v maxpre=NUM -v minmid=NUM -v maxsuf=NUM \\\n" \
    "  < INFILE.wfr > OUTFILE.pms" \
  );
  # Input is a word frequency file, with fields {COUNT WORD},
  # where {WORD} is factored into elems, each delimited by braces.
  #
  # For each input record, writes zero or more records with fields
  # {COUNT MID PRE SUF} where {PRE·MID·SUF = WORD}.
  #
  # Only generates combinations where the {MID} field has at least
  # {minmid} elems; {PRE} has between 1 and {maxpre} elems, and
  # {SUF} has between 1 and {maxsuf} elems.
  #
  # Defaults when a parameter is not given with "-v":
  # {maxpre = 2}, {minmid = 4}, {maxsuf = 2}.

  # Exit status requested by the error routines; negative means "no error".
  abort = -1;

  if (bias == "") { bias = 1; }
  if (maxpre == "") { maxpre = 2; }
  if (minmid == "") { minmid = 4; }
  if (maxsuf == "") { maxsuf = 2; }

  # A "-v" value that does not look like a decimal number stays a string,
  # and the loop tests {i <= maxpre}, {j <= maxsuf} in the record rule
  # would then be STRING comparisons, which may never become false
  # (e.g. "-v maxpre=x" loops forever).  Reject such values up front.
  num_re = "^[ \t]*[-+]?([0-9]+[.]?[0-9]*|[.][0-9]+)([eE][-+]?[0-9]+)?[ \t]*$";
  if (maxpre !~ num_re)
    { arg_error(("bad maxpre = \"" maxpre "\"\nusage: " usage)); }
  if (minmid !~ num_re)
    { arg_error(("bad minmid = \"" minmid "\"\nusage: " usage)); }
  if (maxsuf !~ num_re)
    { arg_error(("bad maxsuf = \"" maxsuf "\"\nusage: " usage)); }

  # Force numeric type so that every later comparison is numeric.
  maxpre += 0;
  minmid += 0;
  maxsuf += 0;
}

# Once an error routine has set {abort} to a non-negative status,
# stop processing input and exit with that status.
(0 <= abort) {
  exit abort;
}

/^ *[0-9]/ {
  # Handles one {COUNT WORD} record (any line starting with optional
  # blanks and a digit).  The word must look like "{e1}{e2}...{eN}".
  # Emits one output line "COUNT MID PRE SUF" for every prefix length
  # {i} in 1..maxpre and suffix length {j} in 1..maxsuf that leaves a
  # middle part of at least {minmid} elems.
  if (NF != 2) {
    data_error(("bad NF = " NF));
  }
  ct = $1;
  wd = $2;
  if (match(wd, /^[{].*[}]$/) == 0) {
    data_error(("unfactored word = «" wd "»"));
  }

  # Drop the outermost braces, then cut at each "}{" boundary so that
  # {fld[1..n]} holds the bare elems.
  wd = substr(wd, 2, length(wd)-2);
  n = split(wd, fld, /[}][{]/);

  # A word needs at least one prefix elem, {minmid} middle elems and
  # one suffix elem to yield any output.
  if (n >= minmid + 2) {
    for (i = 1; i <= maxpre; i++) {
      for (j = 1; j <= maxsuf; j++) {
        m = n - i - j;
        if (m < minmid) { continue; }
        printf "%7d ", ct;
        output_part(i, m);      # middle: elems i+1 .. i+m
        printf " ";
        output_part(0, i);      # prefix: elems 1 .. i
        printf " ";
        output_part(i+m, j);    # suffix: elems i+m+1 .. n
        printf "\n";
      }
    }
  }
  next;
}
       
function output_part(skp,num, idx)
{
  # Prints elems {fld[skp+1 .. skp+num]} of the global array {fld},
  # each wrapped in braces, with no separator and no trailing newline.
  # {idx} is a local variable, not an argument.
  for (idx = skp + 1; idx <= skp + num; idx++) {
    printf "{%s}", fld[idx];
  }
}

function data_error(msg)
{
  # Reports a problem in the current input record: writes {msg},
  # prefixed with the record number, to standard error, records the
  # failure status in {abort}, and exits with that status.
  printf("line %d: %s\n", NR, msg) >> "/dev/stderr";
  abort = 1;
  exit abort;
}

function arg_error(msg)
{
  # Reports a problem with the command-line parameters: writes {msg}
  # to standard error, records the failure status in {abort}, and
  # exits with that status.
  printf("%s\n", msg) >> "/dev/stderr";
  abort = 1;
  exit abort;
}