#! /usr/bin/gawk -f
# Last edited on 2025-09-12 08:31:53 by stolfi

# Reads a file "maind.wds" from the "projects/langbank" database.
# Outputs the words in selected parts of each paragraph,
# converted to lowercase, without punctuation.

# Assumes that the input file is in langbank's "main.wds" format,
# with sectioning lines starting with "$":
#
#   "$ {B}{h{N}}{tt}" Start of title (herb name) of section {N} (1,2,...).
#   "$ {B}{h{N}}{p{M}}" Start of parag {M} (1,2,...) of section {N}.
# 
# For each section, the script will collect one logical parag
# consisting of the words in the title and parag "{p1}" of the section.
# Parags other than "{p1}" are ignored.
# 
# The user must define (with "-v") the parameters {sample_pos} and
# {sample_size}. The {sample_pos} may be 0, 1, or 2, meaning
# respectively the start, the middle, and the end of each parag. The
# {sample_size} tells how many words to take from each part. In any
# case, in a parag with {W} words (including the title), at most {W/3}
# words are used.

BEGIN {
  # Exit status requested by an error; negative means "no error so far"
  # (the main rules and the END block exit with it once it is >= 0).
  abort = -1

  # Check the user-supplied ("-v") parameters. The helper is defined
  # outside this file; presumably it validates the value against the
  # given range [lo..hi] and returns it -- TODO confirm.
  sample_size = check_num_arg("sample_size", sample_size, 1, 200)
  sample_pos = check_num_arg("sample_pos", sample_pos, 0, 2)

  # Statistics, reported to stderr by the END block:
  #   {npara}  parags that were actually sampled and written;
  #   {nwd_rd} word entries ("a" lines) read from the input;
  #   {nwd_pr} total words in the sampled parags;
  #   {nwd_ot} total words written to the output.
  npara = nwd_rd = nwd_pr = nwd_ot = 0

  # Parser state: -1 = before title, 0 = parsing title,
  # 1 = parsing body (parag "{p1}"), 2 = skipping other parags.
  state = -1

  # Code "{B}{h{N}}" of the section being collected ("" before the first),
  # and flags telling whether its title and body got at least one word.
  cur_sec = ""
  has_title = has_body = 0

  # Words of the current logical parag, {word[0..nw-1]}.
  nw = 0
  split("", word)
}

# Main input rules. They are tried in order for every input line, so the
# specific "$" header patterns must stay ahead of the catch-all ones.

# Stop as soon as an error status has been recorded in {abort}.
(abort >= 0) { exit(abort); }

function header_code(lin) {
  # Returns the section code of the "$" header line {lin}, that is, the
  # line minus the leading "$", the blanks after it, and trailing blanks.
  # The header patterns below accept zero blanks after the "$"; in that
  # case field splitting leaves {$2} empty, so the code is taken from
  # the whole line instead. {lin} is a by-value copy; {$0} is untouched.
  sub(/^[$] */, "", lin)
  sub(/ *$/, "", lin)
  return lin
}

/^[ ]*([#@]|$)/ { 
  # Comment, line number, or blank line:
  next;
}

/^[$] *[{]B[}] *$/ { 
  # Start of book, ignore (only valid before any title or parag):
  if (state != -1) { data_error("unexpected book start"); }
  next;
}

/^[$] *[{]B[}][{]h[0-9]+[}] *$/ { 
  # New section: flush the previous one, then start collecting anew.
  finish_parag()
  start_parag(header_code($0))
  next
}

/^[$] *[{]B[}][{]h[0-9]+[}][{]tt[}] *$/ { 
  # Start of section title:
  start_title(header_code($0))
  next
}

/^[$] *[{]B[}][{]h[0-9]+[}][{]p1[}] *$/ {
  # Start of first parag:
  start_body(header_code($0))
  next
}

/^[$] *[{]B[}][{]h[0-9]+[}][{]p[0-9]+[}] *$/ {
  # Start of some other parag, ignore its words:
  state = 2
  next
}

/^[$]/ { 
  # Unexpected or invalid section:
  data_error("unexpected section")
}

/^a[ ]/ { 
  # Word entry; counted even when it falls in a skipped parag:
  nwd_rd++; 
  if (state == 2) { next; }
  if (state == -1) { data_error("missing title or parag header"); }
  append_word($2)
  next
}

/^[ps][ ]/ { 
  # Punctuation or symbol - ignore
  next
}
  
// { 
  # Anything that reached this point matched none of the known formats:
  data_error("invalid line format")
}

END {
  # Flushes the last section and reports statistics to stderr.
  # If an error status was recorded, exits with it and reports nothing.
  if (abort >= 0) { exit(abort); }
  finish_parag()

  # Per-parag averages. {npara} is zero when the input had no section
  # that could be sampled; report 0.00 then instead of dividing by zero
  # (which is a fatal error in gawk).
  avg_pr = (npara > 0 ? nwd_pr/npara : 0)
  avg_ot = (npara > 0 ? nwd_ot/npara : 0)

  printf "%5d words in input file\n", nwd_rd > "/dev/stderr"
  printf "%5d non-empty parags processed\n", npara > "/dev/stderr"
  printf "%5d words considered", nwd_pr > "/dev/stderr"
  printf " (average %5.2f per parag)\n", avg_pr > "/dev/stderr"
  printf "%5d words written", nwd_ot > "/dev/stderr"
  # The average here is of words WRITTEN ({nwd_ot}), not words considered.
  printf " (average %5.2f  per parag)\n", avg_ot > "/dev/stderr"
}

function start_parag(sec) {
  # Begins collecting the logical parag of a new section whose code
  # "{B}{h{N}}" is {sec}: records the code, goes back to the
  # "before title" state, and discards any words and flags left over
  # from the previous section.
  # printf "# %s\n", sec > "/dev/stderr"  
  cur_sec = sec
  state = -1
  has_title = has_body = 0
  nw = 0
  split("", word)
}

function finish_parag(   no,ko,io) {
  # Finishes the current section, if any: selects a sample of the words
  # collected in {word[0..nw-1]} (title plus parag "{p1}") and prints
  # them to stdout, one per line, updating the statistics counters.
  #
  # Called with no arguments; {no,ko,io} are local temporaries (declared
  # as extra parameters so they do not leak into the global namespace).
  #
  # Does nothing if no section has been started yet. Sections with an
  # empty body, or too short to yield a sample, are reported on stderr
  # and skipped without being counted.

  if (cur_sec == "") { return; }
  if ((state != 1) && (state != 2)) { prog_error("bad state") }
  if (has_title == 0) { data_error("missing title"); }
  if (has_body == 0) {
    printf "!! empty body, ignored\n" > "/dev/stderr" 
    return;
  }
  if (nw == 0) { prog_error("bug {nw}") }

  # Number of words to output: {sample_size}, capped at a third of the parag.
  no = sample_size
  if (no >int(nw/3)) { no = int(nw/3); }
  # NOTE(review): with {sample_size} = 1 (which the BEGIN check allows)
  # {no} is at most 1, so every section is rejected here -- confirm
  # whether the minimum {sample_size} should be 2 or this test relaxed.
  if (no < 2) {
    printf "!! section %s too short\n", cur_sec > "/dev/stderr";
    return
  }

  npara++
  nwd_pr += nw
  nwd_ot += no

  # Index of the first sampled word: start of the parag for
  # {sample_pos} = 0, end for 2, centered otherwise.
  ko = (sample_pos == 0 ? 0 : (sample_pos == 2 ? nw-no : int((nw-no)/2)))
  for (io = 0; io < no; io++) { print word[ko+io]; }
  state = -1
}

function start_title(sec) {
  # Handles a title header line whose full code is {sec}, expected to
  # be "{B}{h{N}}{tt}". Complains unless {sec} is the current section
  # code {cur_sec} followed by exactly "{tt}", the section is still in
  # the "before title" state, and no words have been collected yet.
  # Then switches to the title-parsing state (0).

  # The header must belong to the current section ...
  if (cur_sec != substr(sec, 1, length(cur_sec))) {
    data_error("sec mismatch");
  }
  # ... and what follows the section code must be the title tag:
  if ("{tt}" != substr(sec, length(cur_sec)+1)) {
    prog_error("bug {tt}");
  }
  # A title may only come first in its section:
  if (!(state == -1)) {
    data_error("sec sequence error")
  }
  if (!(nw == 0)) {
    data_error("dup title?")
  }
  state = 0
}

function start_body(sec) {
  # Handles the header line of the first parag, whose full code is
  # {sec}, expected to be "{B}{h{N}}{p1}". Complains unless {sec} is
  # the current section code {cur_sec} followed by exactly "{p1}", the
  # title was being parsed, and the title contributed at least one word.
  # Then switches to the body-parsing state (1).

  # The header must belong to the current section ...
  if (cur_sec != substr(sec, 1, length(cur_sec))) {
    data_error("sec mismatch");
  }
  # ... and what follows the section code must be the first-parag tag:
  if ("{p1}" != substr(sec, length(cur_sec)+1)) {
    prog_error("bug {p1}");
  }
  # The body must come right after the title:
  if (!(state == 0)) {
    data_error("sec sequence error")
  }
  if (!(nw != 0)) {
    data_error("empty title")
  }
  state = 1
}

function append_word(w) {
  # Appends the word {w}, converted to lowercase, to the current parag
  # {word[0..nw-1]}, and records whether it came from the title
  # (state 0) or from the body (state 1). Any other state is reported
  # as a program error.
  if (!((state == 0) || (state == 1))) { prog_error("bad state (word)"); }
  if (state == 0) {
    has_title = 1
  } else if (state == 1) {
    has_body = 1
  }
  word[nw++] = tolower(w)
}