#! /usr/bin/gawk -f
# Last edited on 2025-05-04 17:15:53 by stolfi

BEGIN {
  # Tabulates total counts per lexeme length.
  #
  # Each input record has three fields:
  #
  #   COUNT LEXEME LENGTH
  #
  # LEXEME is a nonempty string and LENGTH is an integer.  Blank lines
  # and lines starting with "#" are skipped.
  #
  # The output has one line for every integer length from 0 up to one
  # more than the largest LENGTH seen:
  #
  #   LENGTH COUNT FREQ XLEXEME
  #
  # COUNT is the sum of the input COUNTs with that LENGTH, FREQ is
  # COUNT divided by the sum of all input COUNTs, and XLEXEME is the
  # first LEXEME read with that LENGTH.
  #
  # If the variable "sampleSep" is set (with "-v") to a non-empty string
  # SEP, XLEXEME is instead the list of all LEXEMEs with that LENGTH, in
  # input order, joined by SEP.

  # Exit status requested by {data_error}; negative means "no error".
  abort = -1;

  # Largest LENGTH seen so far.
  hi = -1;

  # Tables indexed by LENGTH: total count, and sample lexeme(s).
  split("", mct);
  split("", wsample);

  # Command line synopsis.
  usage = ( \
    "compute_elem_count_distrib.gawk \\\n" \
    "    [ -v sampleSep=SEP ] \\\n" \
    "  < INFILE > OUTFILE " \
  );
}

# Once an error status has been recorded, stop processing the input.
abort >= 0 { exit abort }

# Ignore lines that are empty, all spaces, or whose first non-space char is "#".
$0 ~ /^ *([#]|$)/ { next }

# Data record "COUNT LEXEME LENGTH": adds COUNT to the total for LENGTH
# and records LEXEME as the sample (or appends it to the list) for LENGTH.
/./ {
  if (NF != 3) { data_error(("wrong number of fields")); }
  ct = $1; w = $2; m = $3;
  # LENGTH must be a plain integer; otherwise the comparison with {hi}
  # below would be a string comparison and {hi} would become garbage.
  if (m !~ /^[-+]?[0-9]+$/) { data_error(("invalid LENGTH \"" m "\"")); }
  # Array subscripts use the string form of the field, so "03" or "+3"
  # would be filed under a key that the numeric loop in END never visits.
  # Convert to the canonical number (and turn a possible "-0" into 0).
  m = m + 0;
  if (m == 0) { m = 0; }
  if (! (m in mct)) 
    { wsample[m] = w; }
  else if (sampleSep != "")
    { wsample[m] = ( wsample[m] sampleSep w ); }
  mct[m] += ct;
  totct += ct;
  hi = (m > hi ? m : hi);
}

# Writes the table: a two-line header, then one line for each length
# from 0 to {hi+1} with its total count, relative frequency and sample.
END {
  # An "exit" in a main rule or in {data_error} still runs END; in that
  # case keep the error status and do not print a partial table.
  if (abort >= 0) { exit abort; }
  printf "# len  count   freq %s\n", (sampleSep == "" ? "example" : "strings");
  printf "# --- ------ ------ ------------------\n";
  for(m = 0; m <= hi+1; m++)
    { # With no data (or all counts zero) the total is 0; report
      # frequency 0 instead of dying on a division by zero.
      fr = (totct != 0 ? mct[m]/totct : 0);
      printf "  %3d %6d %6.4f %s\n", m, mct[m], fr, wsample[m];
    }
}

function data_error(msg)
{
  # Reports {msg} on standard error, tagged with the current input file
  # name and line number, then sets {abort} to 1 and exits with that status.
  abort = 1;
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  exit abort;
}