#! /n/gnu/bin/gawk -f
# Last edited on 1998-12-20 12:46:33 by stolfi

BEGIN {
  abort = -1;
  usage = "digest-tuples [ -v weights=WTFILE ] < TUPLECTS > TUPLEMAP";

  # Reads a file whose lines have the form COUNT TUPLE,
  # where COUNT is an integer and TUPLE is a string of
  # 26 characters representing the readings of one VMS character
  # position by 26 potential transcribers ("A" through "Z").
  # In this list "%" denotes "no information" and "*"
  # denotes "unreadable".
  #
  # Writes a file of the form COUNT TUPLE CON MAJ where COUNT and
  # TUPLE are as in the input, CON is the consensus reading for that
  # tuple, and MAJ the majority reading.
  #
  # Let's say that a reading is "valid" if it is not "%".  If all valid
  # readings are equal to the same letter C (including "!"), the
  # consensus reading is C; otherwise, if all valid readings are
  # either "!", ".", or ",", the consensus reading is ","; otherwise
  # it is "*".
  #
  # Independently of the above, if the valid readings that are equal
  # to some character C have more than half of the total weight of
  # valid readings, then the majority reading is C; otherwise if the
  # readings "!", ".", and "," together have more than half of the
  # total valid weight, the majority reading is ","; otherwise it is
  # "*".
  #
  # When a tuple has no valid reading at all (or the valid readings
  # have zero total weight), the corresponding output field is "%".
  #
  # The optional WTFILE has lines of the form LETTER WEIGHT, where
  # LETTER is a transcriber code ("A".."Z") and WEIGHT is a
  # non-negative integer.  Lines starting with "#" and blank lines are
  # ignored.  Transcribers not listed in WTFILE get no table entry and
  # therefore count with weight 0.  Without WTFILE every transcriber
  # has weight 1.

  alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

  # Read weight table "wt[i]", indexed by transcriber position 1..26:
  if (weights != "") {
    # "getline var < file" does not update FILENAME/FNR, so the line
    # number used in table_error messages is tracked by hand.
    wt_line = 0;
    # getline returns 1 on success, 0 at end of file, -1 on error; the
    # explicit "> 0" keeps an unreadable file from looping forever.
    while ((status = (getline lin < weights)) > 0) {
      wt_line++;
      if ((lin !~ /^[#]/) && (lin !~ /^ *$/)) {
        n = split(lin, fld);
        if (n != 2) { table_error("bad number of fields"); }
        if (fld[1] !~ /^[A-Z]$/) { table_error("bad letter"); }
        if (fld[2] !~ /^[0-9]+$/) { table_error("bad weight"); }
        i = index(alpha, fld[1]);
        if (i == 0) { print lin; print fld[1]; program_error("letter conv"); }
        wt[i] = fld[2];
      }
    }
    if (status < 0) {
      arg_error(sprintf("cannot read weights file \"%s\"", weights));
    }
    close(weights);
  } else {
    for (i = 1; i <= 26; i++) { wt[i] = 1; }
  }
}

# Main rule: digest one non-empty input line "COUNT TUPLE".
/./ {
  if (NF != 2) format_error("wrong num of fields");
  ct = $1;
  tp = $2;
  if (length(tp) != 26) { format_error("bad tuple length"); }

  con = consensus(tp);
  maj = majority(tp);

  printf "%7d %s %s %s\n", ct, tp, con, maj;
  next;
}

# Anything else (i.e. empty lines) is silently skipped.
// { next; }

function is_punct(c)
{
  # True iff "c" is one of the interchangeable readings "!", ".", ",".
  return ((c == ",") || (c == ".") || (c == "!"));
}

function consensus(tp,    i, c, con)
{
  # Returns the consensus reading of the 26-character tuple "tp":
  # "%" if there is no valid reading, the common reading if all valid
  # readings agree, "," if they are all among "!", ".", ",", and "*"
  # otherwise.
  con = "%";
  for (i = 1; i <= 26; i++) {
    c = substr(tp, i, 1);
    if ((c == "%") || (c == con)) { continue; }
    if (con == "%") {
      # First valid reading seen so far.
      con = c;
    } else if (is_punct(con) && is_punct(c)) {
      con = ",";
    } else {
      # Irreconcilable disagreement; later readings cannot change it.
      con = "*";
      break;
    }
  }
  return con;
}

function majority(tp,    i, c, w, wtot, vote, maj)
{
  # Returns the majority reading of the 26-character tuple "tp",
  # weighting the reading at position "i" by the global "wt[i]":
  # "%" if the total valid weight is zero, the reading that holds more
  # than half of the valid weight if there is one, "," if "!", "." and
  # "," jointly hold more than half, and "*" otherwise.
  wtot = 0;
  for (i = 1; i <= 26; i++) {
    c = substr(tp, i, 1);
    if (c != "%") {
      w = wt[i];
      vote[c] += w;
      wtot += w;
    }
  }
  if (wtot == 0) { return "%"; }

  maj = "*";
  for (c in vote) {
    # At most one reading can exceed half the total, so scan order
    # does not matter.
    if (2*vote[c] > wtot) { maj = c; break; }
  }
  if ((maj == "*") && (2*(vote["."] + vote["!"] + vote[","]) > wtot)) {
    maj = ",";
  }
  return maj;
}

function arg_error(msg)
{
  # Reports a command-line/parameter error plus the usage line, then exits.
  printf "*** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

function table_error(msg)
{
  # Reports an error in the weights file, then exits.  The location
  # comes from "weights" and "wt_line" because FILENAME and FNR are
  # not set while the table is read with getline in the BEGIN rule.
  printf "file %s, line %d: *** %s\n", weights, wt_line, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function format_error(msg)
{
  # Reports a malformed line of the main input, then exits.
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function program_error(msg)
{
  # Reports an internal inconsistency, then exits.
  printf "*** %s\n", msg > "/dev/stderr";
  abort = 1;
  exit abort;
}