#! /usr/bin/gawk
# Last edited on 2025-07-04 16:22:59 by stolfi

# Common functions for scripts of Notes/076.
# To be included in other gawk scripts.

function extract_words(raw_lin,   lin)
  {
    # Returns the words of the text line {raw_lin}, mapped to lowercase
    # and separated by single blanks.
    #
    # Removes the leading locator "<...>" from {raw_lin}, the inline
    # comments "<!...>", and the special IVTFF markers "<->", "<~>",
    # "<$>", "<%>". Then strips all leading and trailing punctuation
    # chars, and replaces each internal string of one or more
    # punctuation chars with ' '.
    #
    # Calls {data_error} (defined by the including script) if the
    # locator is malformed, if any '<' or '>' remains after the removals
    # above, or if the text contains any of the chars "@*%$()".
    # NOTE(review): presumably {data_error} aborts the script; if it
    # returns, processing simply continues with the offending line.
    #
    # The second parameter {lin} is a local work variable, declared in
    # the parameter list because awk has no other way to make a variable
    # local. Callers must pass only {raw_lin}. Without this declaration
    # {lin} would be global and every call would clobber any variable
    # of the same name in the including script.

    lin = raw_lin;

    # Remove locator and leading blanks (anchored, so at most one match):
    sub(/^[<][0-9A-Za-z.]+[>][ ]*/, "", lin);
    if (match(lin, /^[<]/))
      { data_error("malformed locator"); }

    # Remove inline comments:
    gsub(/[<][!][^<>]*[>]/, "", lin);

    # Remove special IVTFF markers:
    gsub(/[<][-~$%][>]/, "", lin);

    # Any angle bracket still present is an unrecognized construct:
    if (match(lin, /[<>]/))
      { data_error("malformed comment or '<...>' marker"); }
    if (match(lin, /[@*%$()]/))
      { data_error("invalid char in text"); }

    # Map to lowercase:
    lin = tolower(lin);

    # Replace strings of word delimiters by space:
    gsub(/[-<>=«».,' ()]+/, " ", lin);

    # Remove leading, trailing, and superfluous spaces:
    sub(/^[ ]+/, "", lin);
    sub(/[ ]+$/, "", lin);
    gsub(/[ ][ ]+/, " ", lin);

    return lin;
  }