#! /usr/bin/gawk
# Last edited on 2025-06-24 04:22:57 by stolfi

# Common functions for scripts of Notes/077.
# To be included in other gawk scripts.

function extract_words(raw_lin,    lin)
{
    # Returns the words of {raw_lin} as a single lowercase string, with
    # exactly one blank between consecutive words and no leading or
    # trailing blanks.
    #
    # Parameters:
    #   {raw_lin}  the input line, expected to begin with a locator.
    #   {lin}      NOT an argument: it is a local work variable, declared
    #              in the parameter list because that is the only way to
    #              get a local in AWK.  Callers must pass only {raw_lin}.
    #
    # Processing steps:
    #   1. Removes the locator, namely the leading run of characters from
    #      the set [<>0-9A-Za-z.] plus any blanks that follow it.
    #   2. Maps the rest to lowercase.
    #   3. Replaces each maximal string of the punctuation chars
    #      '=', '.', ',', "'", ' ', '(', ')' with a single ' '.
    #   4. Removes the leading and trailing blanks.
    #
    # Characters outside the punctuation set above (e.g. tabs, '-', '?')
    # are left untouched.
    #
    # NOTE: step 1 does not check that the leading token really is a
    # locator; a line that begins directly with a letter or digit loses
    # that first alphanumeric token.

    lin = raw_lin;

    # The pattern is anchored at the start, so it can match at most once.
    sub(/^[<>0-9A-Za-z.]+[ ]*/, "", lin);

    lin = tolower(lin);

    # The blank is part of the class, so after this pass every separator
    # is exactly one blank; no separate blank-squeezing pass is needed.
    gsub(/[=.,' ()]+/, " ", lin);

    sub(/^[ ]+/, "", lin);
    sub(/[ ]+$/, "", lin);

    return lin;
}