#! /bin/gawk -f
# Last edited on 2004-01-24 01:40:05 by stolfi

BEGIN {
  # Input: a word frequency list, one entry per line, as  COUNT FREQ WORD.
  # Lines that are blank or that start with "#" are skipped.
  #
  # Each WORD is mapped to a string SLOT by the function "slot_extract",
  # which is not defined in this file (the usage text names the package
  # SLOTFNS.gawk as its source). The SLOT is lowercased, and the COUNTs
  # of all WORDs that share the same SLOT are added together.
  #
  # Output: one line per distinct SLOT, as  TCOUNT PSLOT LENGTH  where
  #
  #   TCOUNT is the sum of the COUNTs of all WORDs with that SLOT;
  #
  #   PSLOT is the SLOT factored into "{}"-bracketed "elements" by the
  #     function "slot_factor" (also external), or "_" if that result
  #     is the empty string;
  #
  #   LENGTH is half the number of brace characters in PSLOT, i.e. the
  #     number of "{}"-bracketed elements when the braces are balanced.
  #
  # The raw SLOT string itself is not written to the output.

  # Table of per-SLOT totals, indexed by the lowercased SLOT; start empty.
  split("", t);

  # "usage" and "abort" are not read anywhere in this file; presumably they
  # are consumed by error-reporting helpers in SLOTFNS.gawk -- TODO confirm.
  usage = "extract-slot -v tkwd=TKWD  -f SLOTFNS.gawk \\\n";
  usage = usage "  < INFILE.wfr > OUTFILE.cts";
  abort = -1;
}

# Ignore records that, after optional leading spaces, are empty or begin with "#".
$0 ~ /^ *([#]|$)/ { next }

# Data record: field 1 is the word's count, field 3 is the word itself.
# Adds the count to the running total kept for the word's slot.
/./ {
  wd_count = $1;
  wd_text = $3;
  # "slot_extract" is defined outside this file; its result is lowercased
  # before being used as the key into the totals table.
  wd_slot = tolower(slot_extract(wd_text));
  t[wd_slot] += wd_count;
  next;
}

END {
  # Diagnostic on stderr: the total accumulated under the empty slot, or
  # nothing after the "=" if no word produced an empty slot.
  # The membership test is deliberate: merely referencing t[""] would create
  # that element in awk, and the loop below would then write a spurious
  # zero-count line for a slot that never occurred in the input.
  printf "t[empty] = %s\n", (("" in t) ? t[""] : "") > "/dev/stderr";

  # One output line per distinct slot:  TCOUNT PSLOT LENGTH.
  # The traversal order of "for (s in t)" is unspecified, so the output is
  # not sorted. The raw slot string "s" itself is not written out.
  for (s in t)
    { ct = t[s];
      # "slot_factor" is defined outside this file; it is expected to return
      # the slot split into "{}"-bracketed elements.
      fs = slot_factor(s);
      # Use "_" as a placeholder so the line always has three fields.
      if (fs == "") { fs = "_"; }
      # Keep only the brace characters; half their number is the element count.
      ts = fs; gsub(/[^{}]/, "", ts);
      printf "%7d %s %d\n", ct, fs, length(ts)/2;
    }
}