#! /usr/bin/gawk -f
# Last edited on 1998-12-30 06:22:04 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
      "compute-majority-table \\\n" \
      "  [ -v alternates=ALTS ] \\\n" \
      "  [ -v weights=WTFILE ] \\\n" \
      "  < TUPLECTS > TUPLEMAP" \
    );
  
  # Reads a file whose lines have the form COUNT TUPLE, where COUNT is
  # an integer and TUPLE is a string of 26 EVA characters representing the
  # readings of one VMS character position by 26 potential
  # transcribers ("A" thru "Z"). In this list "%" denotes "no
  # information" and "*" denotes "unreadable".
  #
  # Writes a file of the form COUNT TUPLE MAJR TOTWT where COUNT and
  # TUPLE are as in the input, MAJR is the majority reading, and TOTWT
  # is the total number of votes cast on that character position.
  #
  # A reading equal to some character "C" denotes a certain number of
  # votes for "C", the number depending on "C" and on the
  # transcriber's weight (see below). A reading of "%" counts as no
  # vote, and readings of "*" get a tiny fraction (1/1000) of a vote.
  #
  # If there are no votes on some character position, the majority
  # reading is "%". If the votes for some character C are more than
  # half of the total votes cast, then the majority reading is C.
  # Otherwise, if the readings "!", ".", and "," together have more
  # than half of the total votes, the majority reading is ",". In all
  # other cases the majority reading is "*".
  #
  # The transcriber weights can be specified in the WTFILE table, which
  # has entries of the form CODE WEIGHT where CODE is a letter "A"
  # through "Z" and WEIGHT is the (non-negative, integer) weight to be
  # given to readings by the transcriber with that CODE. Lines that
  # start with "#" and lines consisting only of spaces are ignored.
  # If no WTFILE is given, every transcriber gets weight 1.
  # NOTE: when a WTFILE is given, codes that it does not list get no
  # entry in "wt", so their readings count as zero votes.
  #
  # Certain pairs of transcriber codes, like "F" and "G", are
  # actually two alternative readings by the same transcriber
  # (originally denoted by "[...|...]" in Landini's file). If
  # non-"%" readings are present for both codes, their weights
  # are divided by two.
  # 
  # These pairs of transcriber codes are specified by the string ALTS,
  # a comma-separated list of letter pairs "XY,XY,...,XY".
  
  alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

  # Compute the alternate transcription pairing map: "alter[i] = j"
  # and "alter[j] = i" if transcriber number "i" is an alternate
  # reading for transcriber "j", where "i" and "j" range in 1..26.
  npairs = split(alternates, trpair, ",");
  for (k=1; k<=npairs; k++)
    { xy = trpair[k];
      if (! match(xy, /^[A-Z][A-Z]$/)) 
        { arg_error(("bad alternative pair \"" xy "\"")); }
      xc = substr(xy, 1,1);
      yc = substr(xy, 2,1);
      x = index(alpha, xc); 
      y = index(alpha, yc);
      # A code cannot be its own alternate (it would halve its own weight):
      if (x == y)
        { arg_error(("bad alternative pair \"" xy "\"")); }
      # Each code may take part in at most one pair; name the culprit:
      if ((x == 0) || (x in alter))
        { arg_error(("bad/duplicated primary/alternate \"" xc "\"")); }
      if ((y == 0) || (y in alter))
        { arg_error(("bad/duplicated primary/alternate \"" yc "\"")); }
      alter[x] = y; alter[y] = x;
      # printf "alter[%s] = %s\n", x, y > "/dev/stderr";
      # printf "alter[%s] = %s\n", y, x > "/dev/stderr";
    }
  
  # Read weight table "wt[i]" where "i" is 1..26:
  if (weights != "") 
    {
      nentries = 0;
      # "getline" yields 1 per line read, 0 at end of file, and -1 on
      # error; the explicit "> 0" keeps an unreadable file from
      # turning this into an endless loop.
      while ((status = (getline lin < weights)) > 0)
        { if ((lin !~ /^[#]/) && (lin !~ /^ *$/)) 
            { n = split(lin, fld);
              # Quote the offending entry in each message, since no
              # line counter is maintained while reading this table.
              if (n != 2) 
                { table_error(("bad number of fields in entry \"" lin "\"")); }
              if (fld[1] !~ /^[A-Z]$/) 
                { table_error(("bad letter in entry \"" lin "\"")); }
              if (fld[2] !~ /^[0-9]+$/) 
                { table_error(("bad weight in entry \"" lin "\"")); }
              i = index(alpha, fld[1]);
              if (i == 0) 
                { # Diagnostics go to stderr; stdout carries the output table.
                  print lin > "/dev/stderr"; 
                  print fld[1] > "/dev/stderr"; 
                  program_error("letter conv");
                }
              wt[i] = fld[2];
              nentries++;
            }
        }
      if (status < 0) 
        { arg_error(("cannot read weight table \"" weights "\"")); }
      close(weights);
      if (nentries == 0) { arg_error("no entries in weight table"); }
    }
  else
    { for (i=1;i<=26;i++) { wt[i] = 1; } }
}

# Per-record rule for every non-empty input line "COUNT TUPLE": tallies
# the 26 readings of the tuple, picks the majority reading, and prints
# "COUNT TUPLE MAJR TOTWT". Uses the tables "wt" and "alter" set up in
# the BEGIN block.
/./ {
  if (NF != 2) format_error("wrong num of fields");
  count = $1;
  tuple = $2;
  # Split the tuple into single characters (empty separator):
  nread = split(tuple, reading, "");
  if (! ((nread == 26) && (length(tuple) == 26)))
    { format_error("bad tuple length"); }

  # Accumulate the votes per character in "tally", overall in "total":
  split("", tally);
  total = 0;
  for (t = 1; t <= 26; t++)
    { sym = reading[t];
      # A "%" reading casts no vote at all:
      if (sym == "%") { continue; }
      share = wt[t];
      # Halve the weight when the paired alternate code also has a reading:
      if ((t in alter) && (reading[alter[t]] != "%")) { share = share / 2.0; }
      # A "*" reading keeps only 1/1000 of its weight:
      if (sym == "*") { share = share / 1000.0; }
      tally[sym] += share;
      total += share;
    }

  # Select the majority reading:
  if (total == 0)
    { winner = "%"; }
  else
    { winner = "*";
      # At most one character can hold more than half of the votes:
      for (sym in tally)
        { if (2*tally[sym] > total) { winner = sym; break; } }
      if (winner == "*")
        { # No single winner; see whether ".", "!" and "," jointly prevail:
          soft = 0 + tally["."] + tally["!"] + tally[","];
          if (2*soft > total) { winner = ","; }
        }
    }

  # The total is printed with a small offset (0.00001) added before
  # formatting -- presumably to steady the rounding; confirm if it matters.
  printf "%7d %s %s %6.2f\n", count, tuple, winner, total + 0.00001;
  next;
}

# Catch-all rule: any record not handled above (i.e. an empty line) is skipped.
{ next; }

# Reports a problem with the command-line arguments: writes "msg" and
# the usage text to standard error, sets "abort" to 1, and terminates
# the program with exit status 1.
function arg_error(msg)
{
  printf("*** %s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports a malformed weight table: writes the table's file name and
# "msg" to standard error, sets "abort" to 1, and terminates the
# program with exit status 1.
#
# The table is read with "getline lin < weights" inside the BEGIN
# block, where FILENAME is still empty and FNR is still 0 (that form
# of getline updates neither), so the file name is taken from the
# "weights" variable and no line number is printed.
function table_error(msg)
{
  printf "file %s: *** %s\n", weights, msg > "/dev/stderr"; 
  abort = 1; exit abort;
}
    
# Reports a malformed record of the main input: writes the current
# FILENAME, the record number FNR, and "msg" to standard error, sets
# "abort" to 1, and terminates the program with exit status 1.
function format_error(msg)
{
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports an internal inconsistency: writes "msg" to standard error,
# sets "abort" to 1, and terminates the program with exit status 1.
function program_error(msg)
{
  printf("*** %s\n", msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}