#! /bin/gawk -f
# Last edited on 2002-02-12 00:31:41 by stolfi

# Extracts the text words from an EVT file. In other words, removes
# the location codes, #-comments, chapter headers, and replaces BLANK
# and all other punctuation by blanks.

BEGIN {
  abort = -1;
  usage = ( "evt-to-wds \\\n" \
    " [ -v showBreaks=BOOL ] \\\n" \
    " [ -v showParags=BOOL ] \\\n" \
    " [ -v showPuncts=BOOL ] \\\n" \
    " [ -v showLocation=BOOL ] \\\n" \
    " [ -v smashSymbols=BOOL ] \\\n" \
    " < INFILE > OUTFILE" \
  );

  # The input is in EVT format, generalized as follows.
  # At the beginning of the file (before the first
  # non-comment line) there may be one or more
  # comments containing charset declarations of the form
  # NAME = "..." where NAME is one of the following:
  #
  #   NULL    characters to be deleted.
  #   BLANK   characters used for interword spaces.
  #   ALPHA   characters that may occur in words.
  #   SYMBOL  non-alpha symbols like $ or digits in English.
  #   PUNCT   punctuation symbols.
  #   BREAK   character(s) indicating a significant gap or line break.
  #   PARAG   character(s) indicating a paragraph break.
  #
  # These sets should be disjoint and should not
  # include any of the characters " <>#{}" which have
  # special meaning in the EVT format. Multiple
  # declarations of the same NAME are concatenated.
  # The sets NULL, PUNCT and SYMBOL may be empty (default); the
  # rest, if not specified, default to the
  # Voynichese EVA values, namely
  #
  #   NULL   = "!"
  #   BLANK  = ".,"
  #   ALPHA  = "a...zA...Z"
  #   SYMBOL = "?%*"
  #   PUNCT  = ""
  #   BREAK  = "-"
  #   PARAG  = "="
  #
  # Each non-comment line should contain a line locator, zero or more
  # spaces (SP), and a text string containing no SPs. The original
  # spaces should have been replaced by BLANK characters. A paragraph
  # break should be indicated by a PARAG character, either at the end of
  # a line or as a line by itself. The line locator should match
  # the pattern /^<[a-zA-Z0-9.;]*>$/
  #
  # The input lines are initially stripped of locators, comments,
  # ASCII blanks, and NULL characters. Then:
  #
  #   * Each BLANK character is replaced by a space (ASCII SP).
  #
  #   * One space is inserted after each line.
  #
  #   * The lines are concatenated together.
  #
  #   * One PARAG character is inserted at the end of the last line.
  #     NOTE(review): the code below seeds "leftover" with a PARAG
  #     before the FIRST line (where it is dropped as a leading
  #     delimiter) and appends nothing at end of input -- confirm
  #     which behavior is intended.
  #
  #   * If "showPuncts" is true, a BLANK character is inserted on either
  #     side of each PUNCT character, else every PUNCT character is
  #     replaced by a space;
  #
  #   * If "showParags" is true, any maximal string of space, BREAK, or PARAG
  #     characters containing at least one PARAG is replaced by a BREAK-PARAG-space
  #     sequence; otherwise every PARAG character is replaced by a BREAK.
  #
  #   * If "showBreaks" is true, any maximal string of space or BREAK characters
  #     containing at least one BREAK is replaced by a space-BREAK-space
  #     sequence; otherwise every BREAK character is replaced by space.
  #
  #   * If "smashSymbols" is true, every SYMBOL character is replaced by "?".
  #
  #   * Any space, BREAK, or PARAG characters at the beginning of the
  #     input stream are ignored.
  #
  #   * Every maximal sequence of one or more spaces is replaced by a single space.
  #
  # Finally, each maximal nonempty string delimited by spaces is considered
  # to be one token and is written out to a separate line.
  #
  # If "showLocation" is false, each output line contains only the
  # token. If "showLocation" is true, each output line has the format
  # LOC TYPE TOKEN where LOC is the EVT-style line locator where the
  # TOKEN occurred, and TYPE is
  #
  #   3 if the TOKEN contained any SYMBOL characters, otherwise
  #   2 if it contained any ALPHA characters, otherwise
  #   1 if it contained any PUNCT, BREAK, or PARAG characters, otherwise
  #   0.
  #
  # Line-final delimiters (BREAKs and PARAGs) will be reported as belonging to the next
  # line, if there is one.

  # Options not given with "-v" default to false:
  if (showBreaks == "") { showBreaks = 0; }
  if (showParags == "") { showParags = 0; }
  if (showPuncts == "") { showPuncts = 0; }
  if (showLocation == "") { showLocation = 0; }
  if (smashSymbols == "") { smashSymbols = 0; }

  # Charsets specified in input file:
  NULL = "";   default_NULL = 1;
  BLANK = "";  default_BLANK = 1;
  ALPHA = "";  default_ALPHA = 1;
  SYMBOL = ""; default_SYMBOL = 1;
  PUNCT = "";  default_PUNCT = 1;
  BREAK = "";  default_BREAK = 1;
  PARAG = "";  default_PARAG = 1;

  ndata = 0;  # Number of input data lines (excluding comments).
  nwdout = 0; # Number of output words
}

# Stop consuming input once an error has been flagged:
(abort >= 0) { exit abort; }

# Process charset definitions, if any:

/^[#] *[A-Z]+ *= *".*" *$/ {
  # Declarations are only honored before the first data line.
  if (ndata > 0) { data_error("late charset defintion"); }
}

/^[#] *NULL *= *".*" *$/ {
  NULL = (NULL get_val($0)); default_NULL = 0;
}

/^[#] *BLANK *= *".*" *$/ {
  BLANK = (BLANK get_val($0)); default_BLANK = 0;
}

/^[#] *ALPHA *= *"[^"]*" *$/ {
  ALPHA = (ALPHA get_val($0)); default_ALPHA = 0;
}

/^[#] *SYMBOL *= *"[^"]*" *$/ {
  SYMBOL = (SYMBOL get_val($0)); default_SYMBOL = 0;
}

/^[#] *PUNCT *= *".*" *$/ {
  PUNCT = (PUNCT get_val($0)); default_PUNCT = 0;
}

/^[#] *BREAK *= *".*" *$/ {
  BREAK = (BREAK get_val($0)); default_BREAK = 0;
}

/^[#] *PARAG *= *".*" *$/ {
  PARAG = (PARAG get_val($0)); default_PARAG = 0;
}

function get_val(def)
{
  # Returns the quoted value of a charset declaration line "def"
  # (of the form `# NAME = "..."`), escaped by quote_special() so it
  # can be pasted inside a regex bracket expression.
  gsub(/^[#][ ]*[A-Z]+ *= *"/, "", def);
  gsub(/" *$/, "", def);
  return(quote_special(def));
}

function quote_special(chars)
{
  # Backslash-escapes the characters "\", "-", "]" and "^" in "chars",
  # so that the result is taken literally inside "[...]".
  #
  # Backslashes must be handled first, before the other substitutions
  # add new ones. The replacement "\\\\&" (backslash + matched text) is
  # used rather than "\\\\" because the latter yields only ONE backslash
  # under the POSIX sub/gsub rules, which would make this step a no-op.
  gsub(/[\\]/, "\\\\&", chars);
  gsub(/[-]/, "\\-", chars);
  gsub(/[\]]/, "\\]", chars);
  gsub(/[\^]/, "\\^", chars);
  return chars;
}

# Discard remaining #-comments:
/^[#]/ { next; }

# Discard blank lines:
/^ *$/ { next; }

# Process contents lines (possibly empty):
/^[<]/ {
  # The charsets are final once the first data line is seen:
  if (ndata == 0) { make_patterns(); leftover = parag_char; }
  ndata++;
  if (! match($0, /^<[a-zA-Z0-9.;]*>/)) { data_error("bad locator"); }
  loc = substr($0, 2, RLENGTH-2);
  lin = substr($0, RLENGTH+1);
  # Remove {}-comments (twice for nested "{}"):
  gsub(/[{][^{}]*[}]/, "", lin);
  gsub(/[{][^{}]*[}]/, "", lin);
  # Remove null characters:
  if (NULL != "") { gsub(null_pat, "", lin); }
  # Apply transformations:
  process_line();
  # Output words (without final newline):
  output_words(loc, lin);
  next;
}

END {
  # "exit" in a rule or function still runs END; do not flush
  # pending tokens after an error has been reported.
  if (abort >= 0) { exit abort; }
  # With no data lines the patterns were never built and there is
  # nothing pending, so there is nothing to flush.
  if (ndata == 0) { exit 0; }
  # Push an empty line through so that the delimiters saved from the
  # last data line get their final form, then write them out.
  lin = "";
  process_line();
  if (lin != "") { prog_error(("invalid leftover \"" lin "\"")); }
  output_words(loc, leftover);
  # printf "%7d data lines read\n", ndata > "/dev/stderr";
  # printf "%7d words written\n", nwdout > "/dev/stderr";
}

function output_words(loc,lin,   wd,nwd,j,wjOrg,wjSmash,wtype)
{
  # Splits "lin" at whitespace and writes each token on its own line,
  # preceded by "loc" and the token type if "showLocation" is true.
  # Increments the global "nwdout" once per token.
  nwd = split(lin, wd);
  for (j = 1; j <= nwd; j++) {
    nwdout++;
    wjOrg = wd[j];
    # Map symbol chars to "?" if so requested
    wjSmash = wjOrg;
    if ((SYMBOL != "") && (smashSymbols)) { gsub(symbol_pat, "?", wjSmash); }
    if (match(wjSmash, invalid_out_char_pat)) {
      data_error(("invalid output character = \"" substr(wjSmash,RSTART,RLENGTH) "\""));
    }
    # Output word in desired format
    if (showLocation) {
      # Determine word type (from the un-smashed token):
      if ((SYMBOL != "") && (wjOrg ~ symbol_pat)) {
        wtype = 3;
      } else if ((ALPHA != "") && (wjOrg ~ alpha_pat)) {
        wtype = 2;
      } else if ((PUNCT != "") && (wjOrg ~ punct_pat)) {
        wtype = 1;
      } else if ((BREAK != "") && (wjOrg ~ break_pat)) {
        wtype = 1;
      } else if ((PARAG != "") && (wjOrg ~ parag_pat)) {
        wtype = 1;
      } else {
        wtype = 0;
        printf "! wtype = 0 loc = %s word = «%s»\n", loc, wjOrg > "/dev/stderr";
      }
      printf "%s %d %s\n", loc, wtype, wjSmash;
    } else {
      print wjSmash;
    }
  }
}

function process_line()
{
  # Takes "leftover" and "lin" (without NULL characters and comments),
  # applies the transformations defined above, then splits any
  # trailing spaces, BREAK, and PARAG into "leftover".
  # Works on the globals "lin" and "leftover".
  gsub(blank_pat, " ", lin);
  lin = ( leftover lin " " );
  leftover = "";
  if (PUNCT != "") {
    if (showPuncts) {
      lin = gensub(punct_pat, " \\0 ", "g", lin);
    } else {
      gsub(punct_pat, " ", lin);
    }
  }
  if (showParags) {
    gsub(parag_pat, break_parag_space_chars, lin);
  } else {
    gsub(parag_pat, break_char, lin);
  }
  if (showBreaks) {
    gsub(break_pat, space_break_space_chars, lin);
  } else {
    gsub(break_pat, " ", lin);
  }
  # Delimiters before the very first word are ignored:
  if (nwdout == 0) { gsub(leading_delim_pat, "", lin); }
  gsub(/[ ][ ]+/, " ", lin);
  # Save trailing delimiters in "leftover":
  if (match(lin, trailing_delim_pat)) {
    leftover = substr(lin,RSTART,RLENGTH);
    lin = substr(lin,1,RSTART-1);
  }
  # validity check:
  if (lin != "") {
    # if ((nwdout == 0) == (match(lin, /^[ ]/) != 0))
    #   { data_error(("inconsistent leading space \"" lin "\"")); }
    if (match(lin, /[ ]$/)) { data_error(("invalid trailing space \"" lin "\"")); }
    if (match(lin, /[ ][ ]/)) { data_error(("invalid double space \"" lin "\"")); }
  }
}

function make_patterns()
{
  # Fills in the default charsets and builds the global patterns and
  # strings used by process_line() and output_words(): null_pat,
  # blank_pat, symbol_pat, alpha_pat, punct_pat, parag_pat, break_pat,
  # leading_delim_pat, trailing_delim_pat, invalid_out_char_pat,
  # parag_char, break_char, break_parag_space_chars and
  # space_break_space_chars.
  #
  # Defaults go through quote_special(), like declared charsets do:
  # the charsets are concatenated inside bracket expressions, and an
  # unescaped default BREAK "-" followed by PARAG "=" would turn
  # "[ -=]" into the range SP..'=' (which includes "%", "*" and digits).
  if (default_NULL) { NULL = quote_special("!"); }
  if (default_BLANK) { BLANK = quote_special(".,"); }
  if (default_ALPHA) { ALPHA = quote_special("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); }
  if (default_SYMBOL) { SYMBOL = quote_special("?*%"); }
  if (default_PUNCT) { PUNCT = ""; }
  if (default_BREAK) { BREAK = quote_special("-"); }
  if (default_PARAG) { PARAG = quote_special("="); }

  if (NULL != "") {
    null_pat = ( "[" NULL "]" );
    # printf "null_pat = /%s/\n", null_pat > "/dev/stderr";
  }

  if (BLANK == "") { arg_error("BLANK charset cannot be empty"); }
  blank_pat = ( "[" BLANK "]" );
  # printf "blank_pat = /%s/\n", blank_pat > "/dev/stderr";

  if (SYMBOL != "") {
    symbol_pat = ( "[" SYMBOL "]" );
    # printf "symbol_pat = /%s/\n", symbol_pat > "/dev/stderr";
  }

  if (ALPHA == "") { arg_error("ALPHA must be non-empty"); }
  alpha_pat = ( "[" ALPHA "]" );
  # printf "alpha_pat = /%s/\n", alpha_pat > "/dev/stderr";

  if (PUNCT != "") {
    punct_pat = ( "[" PUNCT "]" );
    # printf "punct_pat = /%s/\n", punct_pat > "/dev/stderr";
  }

  if (PARAG == "") { arg_error("PARAG must be non-empty"); }
  parag_pat = ( "[ " BREAK "]*[" PARAG "][ " BREAK PARAG "]*" );
  # printf "parag_pat = /%s/\n", parag_pat > "/dev/stderr";
  # The last character of the set is the one used on output:
  parag_char = substr(PARAG,length(PARAG),1);

  if (BREAK == "") { arg_error("BREAK must be non-empty"); }
  break_pat = ( "[ ]*[" BREAK "][ " BREAK "]*" );
  # printf "break_pat = /%s/\n", break_pat > "/dev/stderr";
  break_char = substr(BREAK,length(BREAK),1);

  break_parag_space_chars = ( break_char parag_char " " );
  space_break_space_chars = ( " " break_char " " );

  leading_delim_pat = ( "^[ " BREAK PARAG "]+" );
  trailing_delim_pat = ( "[ " BREAK PARAG "]+$" );

  # Non-blank valid output characters:
  invalid_out_char_pat = ( "[^ " \
    ALPHA \
    (smashSymbols ? "?" : SYMBOL) \
    (showPuncts ? PUNCT : "") \
    (showParags ? quote_special(parag_char) : "") \
    (showBreaks ? quote_special(break_char) : "") \
    "]" \
  );
  # printf "invalid_out_char_pat = /%s/\n", invalid_out_char_pat > "/dev/stderr";
}

# Anything not consumed by an earlier rule is malformed input:
// { data_error("neither text nor comment"); }

function arg_error(msg)
{
  # Reports a bad option or charset, prints the usage, and aborts.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort=1;
  exit 1;
}

function data_error(msg)
{
  # Reports an input error at the current line and aborts.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function prog_error(msg)
{
  # Reports an internal inconsistency and aborts.
  printf "PROG ERROR: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1;
}