#! /usr/bin/gawk -f
# Last edited on 2004-10-12 19:36:46 by stolfi

BEGIN {
  abort = -1;
  usage = ( ARGV[0] " < NAME.gpr > NAME.dic" );

  # Reads a list of glossing pairs "{TOKEN} {GLOSS}".  Outputs a file
  # with one line for each distinct {TOKEN}; the format is "{TOKEN}
  # {GLOSSES}", where {GLOSSES} is the "/"-separated list of all the
  # distinct {GLOSS} fields associated with {TOKEN}, in order of first
  # appearance.  A gloss that occurred more than once is followed by
  # its occurrence count in parentheses.

  # Lines that have {TOKEN == "×"} are assumed to be unpaired
  # glosses; the {GLOSS} is then replaced with "×" too.

  # Blank lines and lines starting with "#" are ignored, except that
  # a line of the form "# {jod:N:N.N}" or "# {sent:N:N.N}" is taken
  # to be a locator, which is remembered and shown in error messages.

  split("", ntr); # {ntr[w]} is number of distinct glosses for word {w}.
  split("", tr);  # {tr[w,0..ntr[w]-1]} are those glosses.
  split("", ct);  # {ct[w,0..ntr[w]-1]} are the corresponding token counts.

  nglosses = 0;   # Total glossing pairs read.
  nwords = 0;     # Number of distinct left-hand words.
  loc = "{}";     # Last locator seen.
}

# Once an error has been flagged, stop processing input records.
(abort >= 0) { exit(abort); }

/^[#] *[{](jod|sent)[:][0-9]+[:][0-9]+[.][0-9]+[}]/ {
  # Locator line.  The locator is the first blank-delimited token
  # after the leading "#".  It is extracted explicitly (rather than
  # with {$2}) so that it is also found when no blank separates the
  # "#" from the locator.
  tmp = $0;
  sub(/^[#] */, "", tmp);
  split(tmp, fld, " ");
  loc = fld[1];
  next;
}

/^ *([#]|$)/ {
  # Comment/blank line
  next;
}

/./ {
  # Data line: must be exactly "{TOKEN} {GLOSS}".
  if (NF != 2) { data_error("bad NF"); }

  # Get lefthand token {o} and gloss {t}:
  o = $1; t = $2;

  # Collapse unpaired glosses:
  if (o == "×") { t = "×"; }

  # If first occurrence of {o}, initialize {ntr[o]}.
  if (! (o in ntr)) { ntr[o] = 0; nwords++; }

  # Find index {k} of this gloss, or set {k=ntr[o]}:
  for (k = 0; k < ntr[o]; k++) {
    if (tr[o,k] == t) { break; }
  }

  # If new gloss, save it and initialize {ct[o,k]}:
  if (k >= ntr[o]) { ntr[o]++; tr[o,k] = t; ct[o,k] = 0; }

  # Count gloss instance:
  ct[o,k]++;
  nglosses++;
  next;
}

END {
  # An {exit} inside a rule or function still runs {END}; bail out
  # here without printing anything if an error was flagged.
  if (abort >= 0) { exit(abort); }

  # Print stats
  printf "%7d glossing pairs lines read\n", nglosses > "/dev/stderr";
  printf "%7d distinct left-hand words\n", nwords > "/dev/stderr";

  # Print words and their glosses.  The words come out in the
  # (unspecified) order of the {for (o in ntr)} traversal.
  for (o in ntr) {
    printf "%-35s ", o;
    for (k = 0; k < ntr[o]; k++) {
      # Get gloss and count:
      t = tr[o,k]; c = ct[o,k];
      printf "%s%s", (k > 0 ? "/" : ""), t;
      # The count is shown only for glosses seen more than once.
      if (c > 1) { printf "(%d)", c; }
    }
    printf "\n";
  }
  # Flush all open output streams.
  fflush("");
}

function arg_error(msg)
{
  # Reports a command-line error {msg} plus the usage string on
  # standard error, sets {abort}, and terminates with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an input error {msg} on standard error, tagged with the
  # current file name, line number and last locator seen; then sets
  # {abort} and terminates with status 1.
  printf "%s:%d: %s ** %s\n", FILENAME, FNR, loc, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}