#! /usr/bin/gawk
# Last edited on 2026-01-12 17:00:33 by stolfi

# Common functions for scripts of Notes/076.
# To be included in other gawk scripts.

function extract_words(raw_lin,   lin) {
  # Returns the words of the text line {raw_lin} as a single string:
  # lowercased, with no leading or trailing blanks, and with exactly
  # one blank between consecutive words.
  #
  # Processing steps, in order:
  #   1. Removes a leading locus ID, namely a '<...>' group at the start
  #      of the line containing only letters, digits, and periods,
  #      together with any blanks that follow it.
  #   2. Removes inline comments of the form '<!...>'.
  #   3. Removes the special IVTFF markers '<->', '<~>', '<$>', '<%>'.
  #   4. Maps the text to lowercase.
  #   5. Replaces each run of word delimiters (any of "-<>=«».,' ()")
  #      by a single blank, then trims and squeezes blanks.
  #
  # Calls {data_error} with a message if, after step 1, the line still
  # begins with '<'; if, after step 3, any '<' or '>' remains; or if
  # the text contains any of the chars '@', '*', '%', '$', '(', ')'.
  # The function {data_error} is not defined in this file; it must be
  # provided by the including script, and what happens after it is
  # called depends on that definition.
  #
  # The parameter {lin} is a local work variable (the awk idiom for
  # local variables); callers must pass only {raw_lin}.  Declaring it
  # here keeps the function from clobbering a global {lin} of the caller.
  #
  # NOTE(review): comments and markers are deleted, not replaced by a
  # blank, so two words separated only by such an item get joined.
  # Confirm that this is intended.

  lin = raw_lin;

  # Remove locus ID and leading blanks.  The pattern is anchored,
  # so a single {sub} suffices:
  sub(/^[<][0-9A-Za-z.]+[>][ ]*/, "", lin);
  # Use '~' rather than {match} for the tests below, so that the
  # global variables RSTART and RLENGTH are not disturbed:
  if (lin ~ /^[<]/) { data_error("malformed locus ID"); }

  # Remove inline comments:
  gsub(/[<][!][^<>]*[>]/, "", lin);

  # Remove special IVTFF markers:
  gsub(/[<][-~$%][>]/, "", lin);

  # Any angle bracket left at this point is not part of a valid item:
  if (lin ~ /[<>]/) { data_error("malformed comment or '<...>' marker"); }
  if (lin ~ /[@*%$()]/) { data_error("invalid char in text"); }

  # Map to lowercase:
  lin = tolower(lin);

  # Replace strings of word delimiters by space:
  gsub(/[-<>=«».,' ()]+/, " ", lin);

  # Remove leading, trailing, and superfluous spaces:
  sub(/^[ ]+/, "", lin);
  sub(/[ ]+$/, "", lin);
  gsub(/[ ][ ]+/, " ", lin);

  return lin;
}