#! /usr/bin/gawk -f
# Last edited on 2001-01-05 23:29:30 by stolfi

BEGIN {
  # Purpose
  # -------
  # Reads lines of the form "COUNT ITEM1:ITEM2" (e.g. as produced by
  # "uniq -c") and writes, for each of them, a line of the form
  #
  #   COUNT FREQROW FREQCOL FREQTOT ITEM1:ITEM2
  #
  # where
  #   FREQROW = COUNT / (sum of all COUNTs sharing the same ITEM1),
  #   FREQCOL = COUNT / (sum of all COUNTs sharing the same ITEM2),
  #   FREQTOT = COUNT / (sum of all COUNTs in the table).
  #
  # When the variable outputTotals is true, extra entries ITEM1:+,
  # +:ITEM2 and +:+ are appended, carrying the row totals, the column
  # totals and the table total, respectively.

  # A value >= 0 means "a fatal error occurred; exit with this status".
  abort = -1;

  usage = "compute-row-col-freqs \\\n" \
          "  [ -v outputTotals=BOOL ] \\\n" \
          "  < INFILE.pct > OUTFILE.pfr";

  # Totals are not printed unless the caller asked for them with "-v".
  if (outputTotals == "")
    outputTotals = 0;

  # Input lines are buffered in order: count[k] holds the COUNT of the
  # k-th line (or "#" for a comment/blank line), pair[k] holds its
  # ITEM1:ITEM2 string (or the raw text of a comment/blank line).
  n = 0;
  split("", count);
  split("", pair);

  # Running sums of COUNT, by ITEM1 (row), by ITEM2 (column), and overall.
  tottbl = 0;
  split("", totrow);
  split("", totcol);
}

# Once an error status has been recorded, stop consuming input.
abort >= 0 {
  exit abort;
}

# Comment lines and blank lines are buffered verbatim so that END can
# echo them at their original position; the "#" count marks them.
/^[ ]*([#]|$)/ {
  pair[n] = $0;
  count[n] = "#";
  n += 1;
  next;
}

# Data line "COUNT ITEM1:ITEM2": validate it, buffer it for the END
# pass, and accumulate COUNT into the row, column and table totals.
# Error messages report the input line number (NR); the field count
# (NF) says nothing about where the bad line is.
// {
  if (NF != 2) { fatal_error(("line " NR ": bad input format = «" $0 "»")); }
  ct = $1; pr = $2;

  # A non-numeric COUNT would silently be summed as 0; reject it instead.
  if (ct !~ /^[-+]?([0-9]+[.]?[0-9]*|[.][0-9]+)([eE][-+]?[0-9]+)?$/)
    { fatal_error(("line " NR ": bad count = «" ct "»")); }

  # The pair must consist of exactly two ":"-separated items.
  nitems = split(pr, item, ":");
  if (nitems != 2) { fatal_error(("line " NR ": bad pair format = «" pr "»")); }

  # Only entries that passed validation are buffered.
  count[n] = ct; pair[n] = pr;
  n++;

  totrow[item[1]] += ct;
  totcol[item[2]] += ct;
  tottbl += ct;
  next;
}

# Final pass: replay the buffered input in its original order, printing
# comment/blank lines verbatim and each data entry together with its
# row, column and table frequencies.  When outputTotals is true, the
# row totals (ITEM1:+), column totals (+:ITEM2) and the table total
# (+:+) are appended.  Any zero total is replaced by 1 as a divisor so
# that no division by zero can occur.
END {
  if (abort >= 0) { exit abort; }
  dentbl = safe_denominator(tottbl);
  for (i = 0; i < n; i++)
    { pr = pair[i]; ct = count[i];
      if (ct == "#")
        { print pr; }
      else
        { nitems = split(pr, item, ":");
          # Every input line (comment or data) occupies one buffer slot,
          # so entry i corresponds to input line i+1.
          if (nitems != 2) { fatal_error(("line " (i+1) ": bad pair format = «" pr "»")); }
          denrow = safe_denominator(totrow[item[1]]);
          dencol = safe_denominator(totcol[item[2]]);
          print_freq_entry(ct, denrow, dencol, dentbl, pr);
        }
    }
  if (outputTotals)
    { # Row totals: relative to their own row they are 1 by definition;
      # their "column" is the whole table.
      for (it in totrow)
        { print_freq_entry(totrow[it], safe_denominator(totrow[it]), dentbl, dentbl, (it ":+")); }
      # Column totals: symmetric to the row totals.
      for (it in totcol)
        { print_freq_entry(totcol[it], dentbl, safe_denominator(totcol[it]), dentbl, ("+:" it)); }
      # Grand total.
      print_freq_entry(tottbl, dentbl, dentbl, dentbl, "+:+");
    }
}

# Returns {x}, or 1 when {x} is zero, for use as a divisor.
function safe_denominator(x)
{
  return (x == 0 ? 1 : x);
}

# Prints one output line: the count {cnt}, its fractions relative to the
# row, column and table denominators, and the pair label {key}.
function print_freq_entry(cnt, drow, dcol, dtbl, key)
{
  printf "%7d %7.5f %7.5f %7.5f %s\n", cnt, cnt/drow, cnt/dcol, cnt/dtbl, key;
}

# Writes {msg} to standard error, records exit status 1 in the global
# {abort} (so that the END rule terminates at once), and exits.
function fatal_error(msg)
{
  printf("%s\n", msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}