#! /usr/bin/gawk -f 
# Last edited on 1999-07-28 01:41:44 by stolfi

BEGIN{
  abort = -1;
  usage = "compute-freq-diffs [-normalize] FILEA FILEB";
  #
  # where FILEA and FILEB are names of files, and each line in them
  # has the format COUNT COORD LABEL, where COUNT is an integer, COORD
  # is a real number, and LABEL is any word. (The LABELs must be
  # sorted and must match in both files.)  The COORD field of the ith
  # line of FILEA is interpreted as the ith coordinate A[i] of a
  # vector in some high-dimensional space; and ditto for FILEB.
  #
  # After reading both files, prints to stdout another list of
  # COORD LABEL pairs, one per distinct LABEL, where the COORD for
  # label i is the signed difference A[i] - B[i].  The lines come out
  # in the traversal order of awk's "for (w in d)", which is
  # unspecified.  A LABEL that occurs only in FILEB is treated as
  # having A = 0; a LABEL repeated in FILEA keeps its last COORD.
  #
  # If "-normalize" is specified, the numbers are scaled
  # so that the largest has absolute value 1.0000.  When all the
  # differences are zero there is nothing to scale by, so they are
  # printed unscaled (all zero).

  # Consume leading "-..." options; error() exits, so an unknown
  # option cannot make this loop spin forever.
  while ((ARGC > 1) && (substr(ARGV[1],1,1) == "-"))
    { if (ARGV[1] == "-normalize") 
        { normalize = 1; shiftarg(); }
      else
        { error(("bad option " ARGV[1] " - usage: " usage)); }
    }
  if (ARGC != 3) { error(("ARGC = " ARGC " - usage: " usage)); }
  apt = ARGV[1]; if (apt == "") { error(("usage: " usage)); }
  bpt = ARGV[2]; if (bpt == "") { error(("usage: " usage)); }

  # Pass 1: load the coordinates of FILEA, indexed by label.
  # The result of getline is kept in rc so that a read/open failure
  # (rc < 0) can be told apart from normal end of file (rc == 0).
  # Testing ERRNO alone is unreliable because its initial value
  # differs between gawk versions.
  N = 0;
  while ((rc = (getline < apt)) > 0) 
    { N++;
      if (NF != 3) { error((apt ", line " N ": bad format")); }
      w = $3;
      d[w] = $2;
    }
  if (rc < 0) { error((apt ": " ERRNO)); }
  close(apt);

  # Pass 2: subtract the coordinates of FILEB, label by label.
  N = 0;
  while ((rc = (getline < bpt)) > 0)
    { N++;
      if (NF != 3) { error((bpt ", line " N ": bad format")); }
      w = $3;
      d[w] = d[w] - $2;
    }
  if (rc < 0) { error((bpt ": " ERRNO)); }
  close(bpt);

  # Choose the scale factor: the largest |difference| when
  # normalizing, 1 otherwise.
  if (normalize) 
    { maxd = 0;
      for (w in d)
        { da = d[w]; da = ( da < 0 ? -da : da );
          if(da > maxd) { maxd = da;}
        }
      # All differences are zero: avoid a fatal division by zero below.
      if (maxd == 0) { maxd = 1.0; }
    }
  else
    { maxd = 1.0; }
  for (w in d)
    { printf "%+7.5f %s\n",  d[w]/maxd, w; }
}

# Reports a fatal problem: writes msg (plus a newline) to the standard
# error stream, sets the global flag `abort` to 1, and terminates the
# program with exit status 1.
function error(msg)
{
   printf("%s\n", msg) > "/dev/stderr";
   abort = 1;
   exit 1;
}

# Drops the first command-line argument: every ARGV[k+1] is copied
# down into ARGV[k] for k = 1 .. ARGC-2, and then ARGC is decremented.
# ARGV[0] is left alone.  The extra parameter is only a local variable;
# callers pass no arguments.
function shiftarg(   k)
{
  k = 1;
  while (k < ARGC - 1)
    { ARGV[k] = ARGV[k + 1];
      k++;
    }
  ARGC = ARGC - 1;
}