#! /n/gnu/bin/gawk -f
# Last edited on 1998-12-20 12:46:33 by stolfi

BEGIN {
  abort = -1;
  usage = "digest-tuples [ -v weights=WTFILE ] < TUPLECTS > TUPLEMAP";
  
  # Reads a file whose lines have the form COUNT TUPLE,
  # where COUNT is an integer and TUPLE is a string of 
  # 26 characters representing the readings of one VMS character
  # position by 26 potential transcribers ("A" through "Z").
  # In this list "%" denotes "no information" and "*"
  # denotes "unreadable".
  #
  # Writes a file of the form COUNT TUPLE CON MAJ where COUNT and
  # TUPLE are as in the input, CON is the consensus reading for that
  # tuple, and MAJ the majority reading.
  #
  # Let's say that a reading is "valid" if it is not "%". If all valid
  # readings are equal to the same letter C (including "!"), the
  # consensus reading is C; otherwise, if all valid readings are
  # either "!", ".", or ",", the consensus reading is ","; otherwise
  # it is "*".
  #
  # Independently of the above, if the valid readings that are equal
  # to some character C have more than half of the total weight of
  # valid readings, then the majority reading is C; otherwise if the
  # readings "!", ".", and "," together have more than half of the
  # total valid weight, the majority reading is ","; otherwise it is
  # "*".
  
  alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

  # Read the weight table "wt[i]", indexed by the position i (1..26)
  # of the transcriber's letter in "alpha".  Each non-blank,
  # non-comment line of the weights file must be "LETTER WEIGHT".
  # NOTE(review): transcribers not listed in the weights file are left
  # without an entry and therefore count with weight 0 -- confirm that
  # this is intended.
  if (weights != "") 
    {
      nlin = 0;
      # "getline" returns -1 on a read/open error; testing "> 0" keeps
      # an unreadable file from turning this into an endless loop.
      while ((res = (getline lin < weights)) > 0)
        { nlin++;
          if ((lin !~ /^[#]/) && (lin !~ /^ *$/)) 
            { # FILENAME/FNR are not updated by "getline lin < file",
              # so the location is carried in the message itself.
              loc = "(" weights ":" nlin ") ";
              n = split(lin, fld);
              if (n != 2) { table_error(loc "bad number of fields"); }
              if (fld[1] !~ /^[A-Z]$/) { table_error(loc "bad letter"); }
              if (fld[2] !~ /^[0-9]+$/) { table_error(loc "bad weight"); }
              i = index(alpha, fld[1]);
              if (i == 0) 
                { # Diagnostics go to stderr so that they do not end
                  # up in the TUPLEMAP output.
                  print lin > "/dev/stderr"; 
                  print fld[1] > "/dev/stderr"; 
                  program_error("letter conv"); 
                }
              wt[i] = fld[2];
            }
        }
      if (res < 0) 
        { table_error("cannot read weights file \"" weights "\""); }
      close(weights);
    }
  else
    { # No weights file: every transcriber counts equally.
      for (i=1;i<=26;i++) { wt[i] = 1; }
    }
  
}

# Main rule, applied to every non-empty input record "COUNT TUPLE":
# computes the consensus (con) and weighted-majority (maj) readings of
# the 26-character tuple and prints "COUNT TUPLE CON MAJ".
/./ {
  if (NF != 2) { format_error("wrong num of fields"); }
  ct = $1;
  tp = $2;
  if (length(tp) != 26) { format_error("bad tuple length"); }

  # Consensus: "%" readings are ignored; identical readings keep their
  # value; any mixture of ",", "." and "!" collapses to ","; every
  # other disagreement yields "*" and ends the scan.
  con = "%";
  for (i = 1; i <= 26; i++)
    { ch = substr(tp, i, 1);
      if ((ch == "%") || (ch == con)) { continue; }
      if (con == "%") { con = ch; continue; }
      if ((con ~ /^[,.!]$/) && (ch ~ /^[,.!]$/)) { con = ","; continue; }
      con = "*";
      break;
    }

  # Majority: add up the weight wt[i] behind each valid reading.
  delete vote;
  wtot = 0;
  for (i = 1; i <= 26; i++)
    { ch = substr(tp, i, 1);
      if (ch != "%")
        { w = wt[i];
          vote[ch] += w;
          wtot += w;
        }
    }

  # No valid weight at all gives "%"; otherwise look for a reading
  # holding more than half of the weight, then for the punctuation
  # readings ".", "!" and "," taken together; failing both, "*".
  maj = "%";
  if (wtot != 0)
    { maj = "*";
      for (ch in vote)
        { if (2*vote[ch] > wtot) { maj = ch; break; } }
      if (maj == "*")
        { punct = vote["."] + vote["!"] + vote[","];
          if (2*punct > wtot) { maj = ","; }
        }
    }

  printf "%7d %s %s %s\n", ct, tp, con, maj;
  next;
}

# Catch-all: records not consumed by the rule above (empty lines) are
# skipped without producing any output.
{ next; }

# Prints the message MSG and the usage line on stderr, sets the
# global "abort" flag to 1 and terminates with exit status 1.
function arg_error(msg)
{
  printf("*** %s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Prints the message MSG on stderr, prefixed by the current FILENAME
# and FNR, sets the global "abort" flag to 1 and terminates with exit
# status 1.
# NOTE(review): the visible callers are all in the BEGIN rule, where
# FILENAME/FNR do not describe the weights file -- confirm whether the
# prefix is useful there.
function table_error(msg)
{
  printf("file %s, line %d: *** %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}
    
# Prints the message MSG on stderr, prefixed by the name of the
# current input file and the current record number within it, sets
# the global "abort" flag to 1 and terminates with exit status 1.
function format_error(msg)
{
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Prints the message MSG on stderr (used for internal consistency
# failures), sets the global "abort" flag to 1 and terminates with
# exit status 1.
function program_error(msg)
{
  printf("*** %s\n", msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}