#! /usr/bin/gawk -f
# Last edited on 2025-09-12 08:31:53 by stolfi

# Reads a file "maind.wds" from the "projects/langbank" database.
# Outputs to standard output, one per line, the words in a selected
# part of each paragraph, converted to lowercase, without punctuation.
# Summary statistics are written to standard error at the end.

# Assumes that the input file is in langbank's "main.wds" format,
# with sectioning lines starting with "$":
#
#   "$ {B}"               Start of the book (ignored).
#   "$ {B}{h{N}}"         Start of section {N} (1,2,...).
#   "$ {B}{h{N}}{tt}"     Start of title (herb name) of section {N} (1,2,...).
#   "$ {B}{h{N}}{p{M}}"   Start of parag {M} (1,2,...) of section {N}.
#
# Other recognized lines are word entries ("a {WORD}"), punctuation or
# symbol entries ("p ..." or "s ...", ignored), and comment, line number,
# or blank lines (starting with "#" or "@", or empty; ignored).
#
# For each section, the script will collect one logical parag
# consisting of the words in the title and parag "{p1}" of the section.
# Parags other than "{p1}" are ignored.
#
# The user must define (with "-v") the parameters {sample_pos} and
# {sample_size}. The {sample_pos} may be 0, 1, or 2, meaning
# respectively the start, the middle, and the end of each parag. The
# {sample_size} tells how many words to take from each part. In any
# case, in a parag with {W} words (including the title), at most {W/3}
# words (rounded down) are used.  Sections where that leaves fewer
# than 2 words are skipped with a warning.
#
# NOTE(review): {check_num_arg}, {data_error} and {prog_error} are not
# defined in this portion of the file; presumably they are supplied
# elsewhere (later in the file or in a library loaded alongside).
# {data_error} and {prog_error} are assumed not to return -- confirm.

BEGIN {
  abort = -1
  sample_size = check_num_arg("sample_size", sample_size, 1, 200)
  sample_pos = check_num_arg("sample_pos", sample_pos, 0, 2)

  npara = 0    # Number of parags processed (sampled and written).
  nwd_rd = 0   # Number of word entries ("a" lines) read.
  nwd_pr = 0   # Number of words in the processed parags.
  nwd_ot = 0   # Number of words written.

  cur_sec = ""   # Code of current section, "{B}{h{N}}"
  has_title = 0; # Set to 1 if a section has non-empty title.
  has_body = 0;  # Set to 1 if a section has non-empty body.
  state = -1     # Parsing state: -1 before title, 0 = parsing title, 1 = parsing body, 2 = skipping.
  split("", word) # Words of current parag, indexed from 0.
  nw = 0;         # Number of words in current parag.
}

(abort >= 0) { exit(abort); }

/^[ ]*([#@]|$)/ {
  # Comment, line number, or blank line:
  next;
}

/^[$] *[{]B[}] *$/ {
  # Start of book, ignore:
  if (state != -1) { data_error("unexpected book start"); }
  next;
}

/^[$] *[{]B[}][{]h[0-9]+[}] *$/ {
  # New section:
  finish_parag()
  start_parag(header_code($0))
  next
}

/^[$] *[{]B[}][{]h[0-9]+[}][{]tt[}] *$/ {
  # Start of section title:
  start_title(header_code($0))
  next
}

/^[$] *[{]B[}][{]h[0-9]+[}][{]p1[}] *$/ {
  # Start of first parag:
  start_body(header_code($0))
  next
}

/^[$] *[{]B[}][{]h[0-9]+[}][{]p[0-9]+[}] *$/ {
  # Start of some other parag, ignore:
  state = 2
  next
}

/^[$]/ {
  # Unexpected or invalid section:
  data_error("unexpected section")
}

/^a[ ]/ {
  # Word entry:
  nwd_rd++;
  if (state == 2) { next; }
  if (state == -1) { data_error("missing title or parag header"); }
  append_word($2)
  next
}

/^[ps][ ]/ {
  # Punctuation or symbol - ignore
  next
}

// {
  # Unrecognized word:
  data_error("invalid line format")
}

END {
  if (abort >= 0) { exit(abort); }
  finish_parag()
  # Guard the averages: {npara} is zero when the input is empty or
  # when every section was skipped, and division by zero is fatal.
  avg_pr = (npara > 0 ? nwd_pr/npara : 0)
  avg_ot = (npara > 0 ? nwd_ot/npara : 0)
  printf "%5d words in input file\n", nwd_rd > "/dev/stderr"
  printf "%5d non-empty parags processed\n", npara > "/dev/stderr"
  printf "%5d words considered", nwd_pr > "/dev/stderr"
  printf " (average %5.2f per parag)\n", avg_pr > "/dev/stderr"
  printf "%5d words written", nwd_ot > "/dev/stderr"
  printf " (average %5.2f per parag)\n", avg_ot > "/dev/stderr"
}

function header_code(line) {
  # Returns the section code of the "$" header line {line}: the line
  # minus the leading "$", any blanks after it, and any trailing blanks.
  # Unlike "$2", this also works when there is no blank after the "$",
  # which the header patterns above accept.
  sub(/^[$] */, "", line)
  sub(/ *$/, "", line)
  return line
}

function start_parag(sec) {
  # Started a new level 2 section with code {sec}.
  # Clears the word list and the per-section flags, and sets the
  # parsing state to "before title".
  # printf "# %s\n", sec > "/dev/stderr"
  split("", word);
  nw = 0;
  has_title = 0;
  has_body = 0;
  cur_sec = sec;
  state = -1
}

function finish_parag(   no,ko,io) {
  # Finishes the current section, if any: selects {no} of the {nw}
  # collected words according to {sample_size} and {sample_pos},
  # prints them one per line, and updates the statistics counters.
  # Does nothing if no section has been started yet.
  # ({no,ko,io} are local variables, not parameters.)
  if (cur_sec == "") { return; }
  if ((state != 1) && (state != 2)) { prog_error("bad state") }
  if (has_title == 0) { data_error("missing title"); }
  if (has_body == 0) {
    printf "!! empty body, ignored\n" > "/dev/stderr"
    return;
  }
  if (nw == 0) { prog_error("bug {nw}") }

  # Take {sample_size} words, but never more than a third of the parag:
  no = sample_size
  if (no > int(nw/3)) { no = int(nw/3); }
  if (no < 2) {
    printf "!! section %s too short\n", cur_sec > "/dev/stderr";
    return
  }
  npara++
  nwd_pr += nw
  nwd_ot += no

  # Index {ko} of the first sampled word: start, end, or middle.
  ko = (sample_pos == 0 ? 0 : (sample_pos == 2 ? nw-no : int((nw-no)/2)))
  for (io = 0; io < no; io++) { print word[ko+io]; }
  state = -1
}

function start_title(sec) {
  # New section "{B}{h{N}}{tt}":
  # The code {sec} must be the current section code followed by "{tt}",
  # and the title must be the first thing in the section.
  if (substr(sec, 1, length(cur_sec)) != cur_sec) { data_error("sec mismatch"); }
  if (substr(sec, length(cur_sec)+1) != "{tt}") { prog_error("bug {tt}"); }
  if (state != -1) { data_error("sec sequence error") }
  if (nw != 0) { data_error("dup title?") }
  state = 0
}

function start_body(sec) {
  # New section "{B}{h{N}}{p1}":
  # The code {sec} must be the current section code followed by "{p1}",
  # and must come right after a non-empty title.
  if (substr(sec, 1, length(cur_sec)) != cur_sec) { data_error("sec mismatch"); }
  if (substr(sec, length(cur_sec)+1) != "{p1}") { prog_error("bug {p1}"); }
  if (state != 0) { data_error("sec sequence error") }
  if (nw == 0) { data_error("empty title") }
  state = 1
}

function append_word(w) {
  # Appends the word {w}, converted to lowercase, to the current parag,
  # and records whether it belongs to the title or to the body.
  if ((state != 0) && (state != 1)) { prog_error("bad state (word)"); }
  if (state == 0) { has_title = 1 }
  if (state == 1) { has_body = 1 }
  w = tolower(w)
  word[nw] = w
  nw++
}