#! /bin/bash -eu
# Last edited on 2025-11-06 11:11:37 by stolfi

# Reads a transcription file from "stdin" in a modified IVTFF format.
# Extracts the running text words, one per line, and labels them
# with the section and subsection.  Writes the result to stdout.

# Each output line has format
#
#   "{SUBSEC} {FNUM} {LSEQ} {IW} {JW} {WORD}" 
#  
# where {SUBSEC} is the section and subsection (like "hea.2" or
# "unk.4"), {IW} is the word index from start of line, {JW} is index
# from end of line, {WORD} is a single word or "=" for parag break or
# "-" for line break.
#
# There is a lone parag break "=" at the end of the file.

# Enable strict mode here as well as on the "#!" line: options given on
# the "#!" line are ignored when the script is run as "bash SCRIPT", and
# without "pipefail" a failure in "sed" or "extract_parag_words.py" would
# be masked by the exit status of the last pipeline stage.
set -euo pipefail

# Filter stdin to stdout through three stages:
#
#   1. "sed" replaces every occurrence of "fRos" by "f85v2"
#      (presumably a folio-id normalization -- confirm against the data).
#
#   2. "extract_parag_words.py" is expected to emit the running text
#      words, one per line (per the header comment; script not shown here).
#
#   3. "map_field.gawk" is called with field 1 as both the input and the
#      output field, the table "fnum-to-subsec.tbl", and "???" as the
#      default substitute -- presumably used for keys missing from the
#      table (verify against "map_field.gawk").
#
# "sed" reads the inherited stdin directly; no leading "cat" is needed.
sed -e 's:fRos:f85v2:g' \
  | extract_parag_words.py \
  | map_field.gawk \
      -v inField=1 \
      -v outField=1 \
      -v table=fnum-to-subsec.tbl \
      -v defSubst='???'