#! /usr/bin/gawk -f
# Last edited on 1999-01-02 11:38:45 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
      "compute-transcriber correlations \\\n" \
      "  [ -v alternates=ALTS ] \\\n" \
      "  < TUPLECTS " \
    );

  # Reads a file whose lines have the form COUNT TUPLE, where COUNT is
  # an integer and TUPLE is a string of 26 EVA characters representing the
  # readings of one VMS character position by 26 potential
  # transcribers ("A" thru "Z"). In this list "%" denotes "no
  # information" and "*" denotes "unreadable".
  #
  # Prints tables that show how many times transcriber "i" agreed and
  # disagreed with transcriber "j", in counts and percentages. The
  # disagreements are further broken down into substantive and
  # blank-related, the latter being cases where the disagreement is
  # over "!", ".", or ",".
  #
  # Readings of "%", "*", "-", or "=" are not counted.
  #
  # Certain pairs of transcriber codes, like "F" and "G", are
  # actually two alternative readings by the same transcriber
  # (originally denoted by "[...|...]" in Landini's file). If
  # non-"%" readings are present for both codes, each counts
  # as half a reading.  I.e. if transcriber A reads "XXXX" while
  # B reads "X[X|Y]ZZ", then A and B agree on 1.5 characters
  # and disagree on 2.5.
  #
  # The "alternates" variable (ALTS) is a comma-separated list of
  # two-letter pairs, e.g. "FG,HI"; each letter may appear in at most
  # one pair, and the two letters of a pair must differ.
  
  alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

  # Compute the alternate transcription pairing map: "alter[i] = j"
  # and "alter[j] = i" if transcriber number "i" is an alternate
  # reading for transcriber "j", where "i" and "j" range in 1..26.
  npairs = split(alternates, trpair, ",");
  for (k=1; k<=npairs; k++)
    { xy = trpair[k];
      if (! match(xy, /^[A-Z][A-Z]$/)) 
        { arg_error(("bad alternative pair \"" xy "\"")); }
      cx = substr(xy, 1,1);
      cy = substr(xy, 2,1);
      x = index(alpha, cx); 
      y = index(alpha, cy);
      # A transcriber cannot be its own alternate: that would make
      # every one of its readings count as half a reading.
      if (x == y)
        { arg_error(("bad alternative pair \"" xy "\"")); }
      # Report the offending letter itself in the message.
      if((x == 0) || (x in alter))
        {arg_error(("bad/duplicated primary/alternate \"" cx "\"")); }
      if((y == 0) || (y in alter))
        {arg_error(("bad/duplicated primary/alternate \"" cy "\"")); }
      alter[x] = y; alter[y] = x;
      # printf "alter[%s] = %s\n", x, y > "/dev/stderr";
      # printf "alter[%s] = %s\n", y, x > "/dev/stderr";
    }
    
  # "trseen[i]" is set when transcriber "i" has at least one counted
  # reading; "tt[i]" accumulates the readings of transcriber "i".
  split("", trseen);
  split("", tt);

}

# Once an error has set "abort", stop at the next record with that status.
(abort >= 0) { exit abort; }

/./{
  # Processes one non-empty input line of the form COUNT TUPLE and adds
  # its contribution to the global tables "tt", "tot", "agr", "bdi"
  # and "dis".  Pair tables are indexed [i,j] with j < i.
  if (NF != 2) format_error("wrong num of fields");
  ct = $1;
  tp = $2;
  # A non-numeric COUNT would silently be treated as zero; reject it.
  if (ct !~ /^[-+]?[0-9]+$/) { format_error("bad count"); }
  nr = split(tp, rd, "");
  if ((nr != 26) || (length(tp) != 26)) { format_error("bad tuple length"); }

  # Weight of each reading in this tuple: 0 if the reading is not
  # counted ("-", "=", "%" or "*"), 0.5 if the transcriber's alternate
  # code also has a counted reading here, 1 otherwise.
  for (i=1; i<=26; i++)
    { if (rd[i] ~ /^[-=%*]$/)
        { rw[i] = 0; }
      else
        { rw[i] = ((i in alter) && (rd[alter[i]] !~ /^[-=%*]$/) ? 0.5 : 1); }
    }

  for (i=1; i<=26; i++)
    { if (rw[i] == 0) { continue; }
      ci = rd[i];
      trseen[i] = 1;
      # Readings per transcriber: weighted by COUNT and by the
      # half-reading rule, like the pairwise tables below.
      tt[i] += ct*rw[i];
      for (j=1; j<i; j++)
        { if (rw[j] == 0) { continue; }
          cj = rd[j];
          w = ct*rw[i]*rw[j];
          # tally agreement/disagreement
          if (ci == cj)
            { agr[i,j] += w; }
          else if ((ci ~ /[!,.]/) && (cj ~ /[!,.]/)) 
            { # both readings are "!", "." or ",": blank-related
              bdi[i,j] += w;
            }
          else
            { dis[i,j] += w; }
          tot[i,j] += w;
        }
    }
}

END {
  # If an error was flagged, leave with its status and print no tables.
  if (abort >= 0)
    { exit abort; }

  # Per-transcriber totals, then the pairwise overlap totals.
  print_simple_counts("readings per transcriber", tt);
  print_counts("overlapping readings", tot);

  # Each pairwise category: absolute counts, then ratio to the overlap.
  print_counts("agreement", agr);
  print_ratios("agreement", agr, tot);

  print_counts("wordspace disagreement", bdi);
  print_ratios("wordspace disagreement", bdi, tot);

  print_counts("substantive disagreement", dis);
  print_ratios("substantive disagreement", dis, tot);
}

function print_simple_counts(title,tbl,   k,val,cell)
{
  # Prints the one-dimensional table "tbl[k]" under the heading "title":
  # one line per index k in 1..26 that is present in the global "trseen",
  # labelled with the k-th letter of the global "alpha".  A zero (or
  # missing) entry is shown as "."; others with one decimal place.
  printf "\n%s (counts)\n", title;
  for (k=1; k<=26; k++)
    { if (! (k in trseen)) { continue; }
      printf "%s |", substr(alpha,k,1);
      val = tbl[k];
      if (val == 0)
        { cell = "."; }
      else
        { cell = sprintf("%.1f", val); }
      printf " %8s", cell; 
      printf "\n";
    }
}

function print_counts(title,tbl,   row,col,val,cell)
{
  # Prints the lower-triangular table "tbl[row,col]" (col < row) under
  # the heading "title".  Only indices present in the global "trseen"
  # get a row or a column.  A zero (or missing) entry is shown as ".";
  # others with one decimal place.
  printf "\n%s (counts)\n", title;
  for (row=1; row<=26; row++)
    { if (! (row in trseen)) { continue; }
      printf "%s |", substr(alpha,row,1);
      for (col=1; col<row; col++)
        { if (! (col in trseen)) { continue; }
          val = tbl[row,col];
          if (val == 0)
            { cell = "."; }
          else
            { cell = sprintf("%.1f", val); }
          printf " %8s", cell; 
        }
      printf "\n";
    }

  # Footer rule: eight dashes per printed column (columns are 1..25).
  printf "  +";
  for (col=1; col<26; col++)
    { if (! (col in trseen)) { continue; }
      printf " %8.8s", "-------------";
    }
  printf "\n";

  # Footer labels: the column's transcriber letter.
  printf "   ";
  for (col=1; col<26; col++)
    { if (! (col in trseen)) { continue; }
      printf " %8s", substr(alpha,col,1);
    }
  printf "\n";
}

function print_ratios(title,tbl,tot,   row,col,num,den,cell)
{
  # Prints the ratios "tbl[row,col]/tot[row,col]" (col < row), scaled
  # to parts per 999 and rounded, under the heading "title".  Only
  # indices present in the global "trseen" get a row or a column.
  # An entry whose total is zero (or missing) is shown as ".".
  printf "\nrelative %s (parts per 999)\n", title;
  for (row=1; row<=26; row++)
    { if (! (row in trseen)) { continue; }
      printf "%s |", substr(alpha,row,1);
      for (col=1; col<row; col++)
        { if (! (col in trseen)) { continue; }
          num = tbl[row,col];
          den = tot[row,col];
          if (den == 0)
            { cell = "."; }
          else
            { cell = sprintf("%d", int(999*num/den+0.5)); }
          printf " %3s", cell; 
        }
      printf "\n";
    }

  # Footer rule: three dashes per printed column (columns are 1..25).
  printf "  +";
  for (col=1; col<26; col++)
    { if (! (col in trseen)) { continue; }
      printf " %3.3s", "------------";
    }
  printf "\n";

  # Footer labels: the column's transcriber letter.
  printf "   ";
  for (col=1; col<26; col++)
    { if (! (col in trseen)) { continue; }
      printf " %3s", substr(alpha,col,1);
    }
  printf "\n";
}

function format_error(msg)
{
  # Reports a malformed input line on stderr, tagged with the current
  # file name and line number, then flags the error in the global
  # "abort" and exits with status 1.
  abort = 1;
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  exit abort;
}

function arg_error(msg)
{
  # Reports a bad command-line setting on stderr, followed by the
  # usage text held in the global "usage", then flags the error in
  # the global "abort" and exits with status 1.
  abort = 1;
  printf "*** %s\n", msg > "/dev/stderr"; 
  printf "usage: %s\n", usage > "/dev/stderr"; 
  exit abort;
}