#! /usr/bin/gawk -f
# Last edited on 1999-01-06 06:43:08 by stolfi

# Usage:
#
#   cat RAWMAP \
#     | format-word-location-map \
#         -v nblocks=NBLOCKS \
#         [-v maxlen=MAXLEN] \
#         [-v ctwd=CTWD] \
#         [-v html={0|1}] \
#         [-v title=TITLE] \
#         [-v blockHeadings=FILE] \
#         [-v totOnly={0|1}] \
#         [-v showProps={0|1}] \
#         [-v showPattern={0|1}] \
#         [-v showLineNumber={0|1}] \
#         [-v showAbsCounts={0|1}] \
#         [-v showRelCounts={0|1}] \
#         [-v showAvgPos={0|1}] \
#     > OUTFILE
#
# The script reads a raw word location map, as produced by
# make-word-location-map, consisting of records of the form
#
#   TOTCT XXX...XXX PATTN STRING TAG PNUM LOC OBS LANG
#
# and formats it nicely, adding the following fields:
#
#   AVP and DVP    the mean and standard deviation
#                  of the block number for this STRING;
#
#   YYY...YYY      the per-block occurrence counts,
#                  divided by the line's TOTCT.
#
# The "nblocks" option is mandatory: it gives the number of per-block
# counts XXX...XXX in each record.  If "blockHeadings" is given (and
# "html" is 1), the lines of that file are printed above the per-block
# count columns.
#
# The following options request specific fields to be printed:
#
#   showAvgPos=1      prints the fields AVP and DVP
#
#   showAbsCounts=1   prints the absolute counts per block XXX...XXX
#
#   showRelCounts=1   prints the relative counts per block YYY...YYY
#
#   showPattern=1     prints the PATTN field in addition to the STRING.
#
#   showLineNumber=1  prints a sequential line number (starting from 0).
#
#   showProps=1       prints the fields TAG PNUM LOC OBS LANG
#
# If TAG = "=" the line is interpreted as a pattern total,
# and formatted specially.
#
# If "totOnly" is 1 then only lines with TAG = "=" are printed.
#
# Blank lines in the input produce blank lines in the output.
#
# Each per-block count is printed with CTWD bytes.  If CTWD > 1 then
# the maximum value printed MAXCT is 10^(CTWD-1)-1, with at least one
# leading blank; else MAXCT is 9.  The percentages are scaled from
# [0% _ 100%] to [0 _ MAXCT] and rounded.
function error(msg) {
  # Reports the fatal error "msg" on stderr (prefixed by the current
  # input record number), raises the global "abort" flag that the main
  # rule and the END rule test, and terminates with exit status 1.
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function html_beg_line(tg) {
  # Opens the HTML <font> element that colors a line according to its tag "tg".
  # "h" is the pseudo-tag used by this script for heading and dash lines.
  if (tg == "-") {
    printf "<font color=#000000>";
  } else if (tg == "+") {
    printf "<font color=#993300>";
  } else if (tg == "=") {
    printf "<font color=#003366>";
  } else if (tg == "h") {
    printf "<font color=#9900ff>";
  } else {
    printf "<font color=#ff3300>";
  }
}

function html_end_line(tg) {
  # Closes the <font> element opened by html_beg_line.  "tg" is unused.
  printf "</font>";
}

function avp(c,    i, s, n) {
  # Computes the average string position from histogram "c".
  # Returns 0 for an empty (or all-zero) histogram instead of dividing by zero.
  # NOTE(review): the main rule indexes blocks from 0, so "i - 0.5" places
  # block 0 at -0.5; confirm whether "i + 0.5" was intended.
  s = 0.0;
  n = 0;
  for (i in c) {
    s += (i - 0.5)*c[i];
    n += c[i];
  }
  if (n == 0) return 0.0;
  return s/n;
}

function dev(c, a,    i, d, bias, slop, ss, n) {
  # Computes the estimated standard deviation of the string position
  # from the histogram "c" and average position "a".
  # The bias term tries to fix the deviation so that
  # rare strings do not come out looking localized.
  # Uses the global "nblocks".  Returns 0 for an empty (or all-zero)
  # histogram instead of dividing by zero.
  ss = 0.0;
  n = 0;
  for (i in c) {
    d = (i - 0.5) - a;
    ss += (d*d)*c[i];
    n += c[i];
  }
  if (n == 0) return 0.0;
  slop = (nblocks - 1.0)/n;
  bias = (1.0 + slop*slop)/12.0;
  return sqrt(ss/n + bias);
}

function print_line(pa, st, tg, tt, ct, pn, lc, ob, lg,    av, dv, bn, rct) {
  # Prints a line for pattern "pa", string "st", tag "tg", total count "tt",
  # per-block counts "ct[0..nblocks-1]", p-number "pn", location "lc",
  # obs "ob", language "lg".
  # Also computes the average position and deviation.
  # Also increments the line counter (global "line_count").
  if (html) html_beg_line(tg);
  if (showLineNumber) printf "%5d ", line_count;
  if (showPattern) printf "%-*s ", maxlen, pa;
  printf "%-*s ", maxlen, st;
  if (showAvgPos) {
    # The statistics are taken over this line's own histogram "ct".
    av = avp(ct);
    dv = dev(ct, av);
    printf "%5.1f %5.1f ", av, dv;
  }
  printf "%5d ", tt;
  if (showAbsCounts) {
    printf " ";
    for (bn = 0; bn < nblocks; bn++) {
      if (ct[bn] == 0)
        printf "%*s", ctwd, ".";
      else if (ct[bn] >= maxct)
        printf "%*d", ctwd, maxct;   # clip so the column width is respected
      else
        printf "%*d", ctwd, ct[bn];
    }
    printf " ";
  }
  if (showRelCounts) {
    printf " ";
    for (bn = 0; bn < nblocks; bn++) {
      # Relative count scaled to [0 _ maxct], with one pseudo-count per block.
      rct = int(((ct[bn] + 1)*maxct)/(tt + nblocks) + 0.5);
      if (ct[bn] == 0)
        printf "%*s", ctwd, ".";
      else if (rct >= maxct)
        printf "%*d", ctwd, maxct;
      else
        printf "%*d", ctwd, rct;
    }
    printf " ";
  }
  if (showProps) {
    # A "-" in the input stands for a missing property.
    printf " %1s", (lg == "-" ? "?" : lg);
    printf " %-4s", (pn == "-" ? "" : ("p" pn));
    printf " %-15s", (lc == "-" ? "" : lc);
    printf " %s", (ob == "-" ? "" : ob);
  }
  if (html) html_end_line(tg);
  printf "\n";
  line_count++;
}

function print_headings_major() {
  # Prints the major column headings.
  # Must match print_line.
  if (html) html_beg_line("h");
  if (showLineNumber) printf "%5s ", " ";
  if (showPattern) printf "%-*s ", maxlen, " ";
  printf "%-*s ", maxlen, " ";
  if (showAvgPos) {
    printf "%5s %5s ", " ", " ";
  }
  printf "%5s ", " ";
  if (showAbsCounts) {
    printf " ";
    printf "%-*s", nblocks*ctwd, "abs counts";
    printf " ";
  }
  if (showRelCounts) {
    printf " ";
    printf "%-*s", nblocks*ctwd, "rel counts";
    printf " ";
  }
  if (showProps) {
    printf " %1s", " ";
    printf " %-4s", " ";
    printf " %-15s", " ";
    printf " %s", " ";
  }
  if (html) html_end_line("h");
  printf "\n";
}

function print_headings_minor(    bn) {
  # Prints the minor column headings.
  # Must match print_line.
  if (html) html_beg_line("h");
  if (showLineNumber) printf "%5s ", "line";
  if (showPattern) printf "%-*s ", maxlen, "pattern";
  printf "%-*s ", maxlen, "word(s)";
  if (showAvgPos) {
    printf "%5s %5s ", "av.bl", "dv.bl";
  }
  printf "%5s ", "totct";
  if (showAbsCounts) {
    printf " ";
    for (bn = 0; bn < nblocks; bn++) {
      printf "%*s", ctwd, sprintf("%02d", bn);
    }
    printf " ";
  }
  if (showRelCounts) {
    printf " ";
    for (bn = 0; bn < nblocks; bn++) {
      printf "%*s", ctwd, sprintf("%02d", bn);
    }
    printf " ";
  }
  if (showProps) {
    printf " %1s", "L";
    printf " %-4s", "page";
    printf " %-15s", "location";
    printf " %s", "notes";
  }
  if (html) html_end_line("h");
  printf "\n";
}

function print_dashes(    i, bn) {
  # Prints dashes for all fields.
  # Must match print_line.
  if (html) html_beg_line("h");
  if (showLineNumber) printf "----- ";
  if (showPattern) {
    for (i = 0; i < maxlen; i++) printf "-";
    printf " ";
  }
  for (i = 0; i < maxlen; i++) printf "-";
  printf " ";
  if (showAvgPos) {
    printf "----- ----- ";
  }
  printf "----- ";
  if (showAbsCounts) {
    printf " ";
    for (bn = 0; bn < nblocks; bn++) {
      if (ctwd == 1) {
        printf "-";
      } else {
        # One leading blank per column, then dashes for the remaining width.
        printf " ";
        for (i = 0; i < ctwd - 1; i++) printf "-";
      }
    }
    printf " ";
  }
  if (showRelCounts) {
    printf " ";
    for (bn = 0; bn < nblocks; bn++) {
      if (ctwd == 1) {
        printf "-";
      } else {
        printf " ";
        for (i = 0; i < ctwd - 1; i++) printf "-";
      }
    }
    printf " ";
  }
  if (showProps) {
    printf " -";
    printf " ----";
    printf " ---------------";
    printf " ------";
  }
  if (html) html_end_line("h");
  printf "\n";
}

function compute_block_column(bn,    skip) {
  # Returns the screen column of the first char in the count for block "bn".
  # Must match print_line.
  # Assumes absolute and relative counts are printed in the same format.
  skip = 0;
  if (showLineNumber) skip += 6;        # line number
  if (showPattern) skip += maxlen + 1;  # pattern
  skip += maxlen + 1;                   # string
  if (showAvgPos) skip += 12;           # average block index and spread
  skip += 6;                            # total count
  skip += bn*ctwd;                      # previous blocks
  return 1 + skip;
}

function html_block_headings(file,    lin, col, i, status) {
  # Prints the page numbers (verticalized) over the per-block counts:
  # each line of "file" is copied above the absolute and/or relative
  # count columns.  Calls error() if "file" cannot be read.
  if (!(showAbsCounts || showRelCounts)) return;
  col = compute_block_column(0);
  # getline returns 1 per record, 0 at end of file, and -1 on error.
  while ((status = (getline lin < file)) > 0) {
    html_beg_line("h");
    for (i = 1; i < col; i++) printf " ";
    if (showAbsCounts) {
      printf " ";
      printf "%s", lin;
      printf " ";
    }
    if (showRelCounts) {
      printf " ";
      printf "%s", lin;
      printf " ";
    }
    html_end_line("h");
    printf "\n";
  }
  # Rely on the getline status rather than on the idle value of ERRNO.
  if (status < 0) error((file ": " ERRNO));
  close(file);
}

function html_head(title) {
  # Prints the HTML preamble, then the column headings
  # (including the optional per-block headings file "blockHeadings").
  printf "<html>\n";
  printf "<head><title>Voynich Manuscript - %s</title></head>\n", title;
  printf "<body bgcolor=#ccffcc>\n";
  printf "<h1>Voynich Manuscript</h1>\n";
  printf "<h2>%s</h2>\n", title;
  printf "<font size=1>\n";
  printf "<pre>\n";
  print_headings_major();
  if (blockHeadings != "") {
    print_dashes();
    html_block_headings(blockHeadings);
  }
  print_headings_minor();
  print_dashes();
}

function html_tail() {
  # Closes the elements opened by html_head.
  printf "</pre>\n";
  printf "</font>\n";
  printf "</body>\n";
  printf "</html>\n";
}

BEGIN {
  # Option defaults and validation.
  abort = 0;
  if (maxlen == 0) maxlen = 16;
  if (nblocks == 0) error("must specify \"-v nblocks\"");
  if (title == "") title = "Word occurrence map";
  if (ctwd == 0) ctwd = 1;
  # Largest per-block value that fits the column: 9 for one-char columns,
  # else 10^(ctwd-1)-1 so that at least one leading blank remains.
  if (ctwd == 1) {
    maxct = 9;
  } else {
    maxct = 1;
    for (i = 1; i < ctwd; i++) { maxct = maxct*10; }
    maxct--;
  }
  if (html) html_head(title);
  split("", blct);
  prev_patt = "";
}

# Blank input lines are copied to the output.
/^ *$/ {
  print;
  next;
}

# Data records: TOTCT, nblocks per-block counts, then
# PATTN STRING TAG PNUM LOC OBS LANG.
/./ {
  if (abort) exit 1;
  if (NF != 8 + nblocks) error("wrong number of fields");
  ttct = $1;
  for (i = 0; i < nblocks; i++) blct[i] = $(i + 2);
  patt = $(nblocks + 2);
  strn = $(nblocks + 3);
  tagg = $(nblocks + 4);
  pnum = $(nblocks + 5);
  locn = $(nblocks + 6);
  obss = $(nblocks + 7);
  lang = $(nblocks + 8);
  if (totOnly && (tagg != "=")) next;
  # Separate consecutive patterns by a blank line (not in totals-only mode).
  if ((patt != prev_patt) && (prev_patt != "") && (! totOnly)) printf "\n";
  print_line(patt, strn, tagg, ttct, blct, pnum, locn, obss, lang);
  prev_patt = patt;
}

END {
  # After a fatal error do not emit the HTML tail; just propagate the failure.
  if (abort) exit 1;
  if (html) html_tail();
}