#! /usr/bin/gawk -f
# Last edited on 1998-07-14 23:41:54 by stolfi

# Extracts tuples from the output of "extract-signif-chars"

BEGIN {
  usage = ( \
    "compute-cond-tuple-info \\\n" \
    " -v order=ORDER \\\n" \
    " [ -v filler=CHAR ] \\\n" \
    " [ -v lowercase=BOOL ] \\\n" \
    " < SIGFILE > TUPFILE" \
  );

  # The file SIGFILE must have been created by "extract-signif-chars".
  # This script writes to standard output the n-tuples of consecutive
  # significant characters read from SIGFILE, where n=ORDER.
  # The "decoration" records in SIGFILE are ignored.
  #
  # If "lowercase" is true the significant characters are converted to lower case.
  #
  # The word breaks in SIGFILE are replaced by a single instance of the "filler"
  # character (which must be printable and non-blank). The paragraph breaks
  # in SIGFILE are replaced by ORDER-1 consecutive fillers.
  #
  # Records are dispatched on their first character ("0".."3"); records
  # starting with anything else match no rule and are silently skipped.

  # "abort" is the pending exit status: -1 means "no error so far".
  # It is set by error() so that the END rule can re-issue the exit status.
  abort = -1;

  check_options();
  init_tup();
}

# Type "0" records: skipped (presumably the "decoration" records).
/^[0]/ {
  if (abort >= 0) { exit(abort); }
  next;
}

# Type "1" records: contribute a single filler.
/^[1]/ {
  if (abort >= 0) { exit(abort); }
  push_char(filler);
  next;
}

# Type "2" records: contribute ORDER-1 fillers, so that no printed tuple
# mixes significant characters from both sides of the break.
/^[2]/ {
  if (abort >= 0) { exit(abort); }
  for (i = 1; i < order; i++) { push_char(filler); }
  next;
}

# Type "3" records: the significant character is the second character
# of the record.
/^[3]/ {
  if (abort >= 0) { exit(abort); }
  c = substr($0, 2, 1);
  # Characters that have no entry in "map" stand for themselves.  Indexing
  # "map" directly would yield "" for them and silently shorten the tuples.
  m = ((c in map) ? map[c] : c);
  if (m == filler) { error(("\"filler\" character found on input")); }
  push_char(m);
  next;
}

END {
  # An "exit" issued by error() still runs END; propagate its status.
  if (abort >= 0) { exit(abort); }
  # The pending partial tuple must consist only of fillers, which is the
  # state left behind by a trailing type "2" record.
  for (i = 1; i < order; i++) {
    if (substr(tup, i, 1) != filler) { error(("internal error 1")); }
  }
}

function init_tup()
{
  # Resets the sliding window: "tup" is the partial tuple collected so far
  # and "wait" is the number of characters still to be pushed before each
  # further push completes a tuple.
  tup = "";
  wait = order - 1;
}

function push_char(m)
{
  # Appends "m" to the sliding window.  Once the window holds "order"
  # characters it is printed and its oldest character is discarded.
  tup = (tup m);
  if (wait == 0) {
    print tup;
    tup = substr(tup, 2);
  } else {
    wait--;
  }
}

function check_options(    i,c,ucs,lcs,uc,lc)
{
  # Analyzes/defaults the option variables, namely
  #
  #   "order" "filler" "lowercase"
  #
  # Defines the global variable "map" that maps characters to lowercase
  # if so desired.

  if (order == "") { error("should define \"order\""); }
  # "order" must be a whole number: push_char() counts "wait" down and
  # tests it for exact equality with 0, which a fractional order never hits.
  if ((order < 1) || (order > 20) || (order != int(order))) {
    error("funny \"order\"");
  }

  if (filler == "") { filler = "_"; }
  if (length(filler) != 1) { error(("the \"filler\" should be a single char")); }

  # --- lowercase mapping ----------------------------------------------

  # Start from the identity mapping on character codes 0..255.
  split("", map);
  for (i = 0; i < 256; i++) {
    c = sprintf("%c", i);
    map[c] = c;
  }

  if (lowercase == "") { lowercase = 0; }
  if (lowercase > 0) {
    # "ucs" and "lcs" are paired position by position, so they must have
    # the same number of characters.
    # NOTE(review): the non-ASCII letters may be the result of a past
    # character-set conversion of this file -- confirm they are the intended ones.
    ucs = "ABCDEFGHIJKLMNOPQRSTUVWXYZАБВГДЕЖЗИЙКЛМНОПРСТУФХЦШЩЪЫЬЭЮ";
    lcs = "abcdefghijklmnopqrstuvwxyzабвгдежзийклмнопрстуфхцшщъыьэю";
    for (i = 1; i <= length(ucs); i++) {
      uc = substr(ucs, i, 1);
      lc = substr(lcs, i, 1);
      map[uc] = lc;
    }
  }
}

function error(msg)
{
  # Prints "msg" to the standard error stream, records the failure in the
  # global "abort" (checked by the rules and by END), and exits with status 1.
  printf "%s\n", msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}