#! /usr/bin/gawk -f
# Last edited on 2004-01-20 03:56:52 by stolfi

BEGIN {
  usage = ( ARGV[0] "\\\n" \
    "  -v bias=NUM \\\n" \
    "  -v minskew=NUM -v minhits=NUM \\\n" \
    "  < INFILE.wfr > OUTFILE.pms" \
  );
  # Input is a midfix-prefix-suffix frequency file, with fields {COUNT MID PRE SUF}.
  #  
  # Gathers all record with same {MID}, then for each pair of 
  # {PRE}s {A,B} and each pair of {SUF}s {X,Y}, evaluates the 
  # skewness of the quadruple. 
  # {COUNT MID PRE SUF} where {PRE·MID·SUF = WORD}. 
  # 
  # Only generates combinations where the {MID} field has at least
  # {minmid} elems; {PRE} has between 1 and {maxpre} elems, and
  # similarly for {SUF}.
  
  abort = -1;
  
  if (bias == "") { bias = 1; }
  if (minskew == "") { minskew = 0.10; }
  if (minhits == "") { minhits = 3; }
  omid = "";
  L2 = log(2.0);
  clear_tables();
}

(abort >= 0) { exit abort; }
    
/^[ ]*[0-9]/ {
  if (NF != 4) { data_error(("bad NF = " NF)); }
  ct = $1; mid = $2; pre = $3; suf = $4;
  if (mid != omid)
    { if (omid != "") { process_midfix(); clear_tables(); }
      omid = mid; 
    }
  if (! (pre in ct_pre)) { ct_pre[pre] = 0; pre_s[npre] = pre; npre++; }
  if (! (suf in ct_suf)) { ct_suf[suf] = 0; suf_s[nsuf] = suf; nsuf++; }
  ct_pre[pre] += ct;
  ct_suf[suf] += ct;
  ct_pair[pre,suf] = ct;
  ct_mid += ct;
  next;
}

END {
  if (abort >= 0) { exit abort; }
  if (omid != "") { process_midfix(); }
}

function clear_tables()
{
  npre = 0; split("", pre_s); split("", ct_pre);
  nsuf = 0; split("", suf_s); split("", ct_suf);
  split("", ct_pair);
  ct_mid = 0;
}

function process_midfix(   i,j,A,B,X,Y,nAX,nAY,nBX,nBY,nA,nB,nX,nY,n,d,s)
{
  if ((npre < 2) || (nsuf < 2) || (ct_mid < minhits)) { return; }
  printf "%7d %s\n", ct_mid, omid > "/dev/stderr";
  for (ia = 0; ia < npre; ia++)
    { A = pre_s[ia];
      if (ct_pre[A] >= minhits)
        { for (ib = ia+1; ib < npre; ib++)
            { B = pre_s[ib];
              if (ct_pre[B] >= minhits)
                { for (jx = 0; jx < nsuf; jx++) 
                    { X = suf_s[jx];
                      if (ct_suf[X] >= minhits)
                        { nAX = ct_pair[A,X];
                          nBX = ct_pair[B,X];
                          nX = nAX + nBX;
                          for (jy = 0; jy < nsuf; jy++) 
                            { Y = suf_s[jy];
                              if (ct_suf[Y] >= minhits)
                                { nAY = ct_pair[A,Y];
                                  nBY = ct_pair[B,Y];
                                  nY = nAY + nBY;
                                  nA = nAX + nAY;
                                  nB = nBX + nBY;
                                  if ((nA >= minhits) && (nB >= minhits) && (nX >= minhits) && (nY >= minhits))
                                    { n = nAX+nAY+nBX+nBY;
                                      d = (log(nAX+bias) - log(nAY+bias) - log(nBX+bias) + log(nBY+bias))/L2;
                                      s = d*d;
                                      if (s > minskew)
                                        { printf "%7.2f %d %d %d %d %d %s  %s %s  %s %s\n", \
                                            s, n, nAX, nAY, nBX, nBY, omid, A, B, X, Y; 
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}

function data_error(msg)
{ 
  printf "line %d: %s\n", NR, msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function arg_error(msg)
{ 
  printf "%s\n", msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}