#! /bin/bash -ue
# Last edited on 2025-05-04 22:45:52 by stolfi

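# Creates an EVT file specific for this ${lang} and ${book}.
# The ${lang} must be "voyn".  The ${book} is actually a
# subtext of the VMS, one of:
#
#   tak  prose and label units, transcriber tag "H" (Takahashi).
#   maj  prose and label units, transcriber tag "A" (majority).
#   prs  prose units only, transcriber tag "A".
#   lab  label units only, transcriber tag "A".
#   ini  prose units, tag "A", with omitMedial and omitFinal set.
#   mid  prose units, tag "A", with omitInitial and omitFinal set.
#   fin  prose units, tag "A", with omitInitial and omitMedial set.
#
# For "ini", "mid", and "fin" the EVT file is a symbolic link to the
# EVT file of "prs", which must already exist.
#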

# Command-line arguments: the language and the book (sub-text) name.
# The "${N-}" form yields an empty string when the argument is absent,
# so that the "-u" option does not abort with "unbound variable" before
# the checks below get a chance to print a meaningful message.
lang="${1-}"
book="${2-}"

if [[ -z "${lang}" ]]; then echo "** lang not specified" 1>&2; exit 1; fi
if [[ -z "${book}" ]]; then echo "** book not specified" 1>&2; exit 1; fi

# Both arguments are known to be present here, so this cannot fail.
shift 2

# Only the Voynich "language" is supported by this script.
if [[ "${lang}" != "voyn" ]]; then echo "** cannot handle language '${lang}'" 1>&2 ; exit 1; fi

# Top folders for output data and output TeX tables of this note.
# Both are created if missing.  The expansions are quoted because
# ${book} has not been validated yet at this point: an unquoted value
# with blanks or wildcards would make "mkdir" create stray directories.
gen_book_dir="gen/${lang}/${book}"; mkdir -p "${gen_book_dir}"
tex_book_dir="tex/${lang}/${book}"; mkdir -p "${tex_book_dir}"

# Chooses, from ${book}, the three parameters that define the sub-text:
#   trans_tag  transcriber code whose lines are to be extracted;
#   utypes     comma-separated list of text unit types to include;
#   line_sel   array of extra "-v name=1" options (omitInitial,
#              omitMedial, omitFinal); empty when nothing is omitted.
# Exits with status 1 if ${book} is not one of the known names.

# Bash's "echo" does not interpret "\n" unless given "-e", so the
# trailing blank line is produced with "printf" instead.
printf '%s\n\n' "defining the transcriber code and text unit types to include in ${lang}/${book} ..." 1>&2

# Unit types for prose datasets:
prs_utypes="parags,starred-parags,circular-lines,circular-text,radial-lines,titles"
# Unit types for labels datasets:
lab_utypes="labels,words"

# Defaults, overridden below where a book differs: the majority
# transcription, prose units only, no token positions omitted.
trans_tag="A"
utypes="${prs_utypes}"
line_sel=( )

case "${book}" in
  tak)
    trans_tag="H" # Extract Takahashi's transcription.
    utypes="${prs_utypes},${lab_utypes}"
    ;;
  maj)
    utypes="${prs_utypes},${lab_utypes}"
    ;;
  prs)
    ;;
  lab)
    utypes="${lab_utypes}"
    ;;
  ini)
    line_sel=( -v omitMedial=1 -v omitFinal=1 )
    ;;
  mid)
    line_sel=( -v omitInitial=1 -v omitFinal=1 )
    ;;
  fin)
    line_sel=( -v omitInitial=1 -v omitMedial=1 )
    ;;
  *)
    echo "** invalid book ${book}" 1>&2 ; exit 1
    ;;
esac

echo "    transcriber's tag = '${trans_tag}'" 1>&2
echo "    unit types to consider = '${utypes}'" 1>&2
# The "-" default keeps older Bash versions from treating the
# expansion of an empty array as an unbound variable under "-u".
echo "    token positions to consider = '${line_sel[*]-}'" 1>&2

# Creates the file "raw.evt" in ${gen_book_dir}.
#
# For the books "ini", "mid", and "fin" the file is just a relative
# symbolic link to the "raw.evt" of the "prs" book, which must exist
# and be non-empty.  For any other book the file is extracted from
# the source EVT file of transcriber ${trans_tag}, which, like the
# unit-to-type table, must exist and be non-empty.
raw_evt="${gen_book_dir}/raw.evt"
echo "creating EVT file ${raw_evt} for the whole book ${lang}/${book} ..."
if [[ "${book}" == "ini" || "${book}" == "mid" || "${book}" == "fin" ]]; then
  prs_raw_evt="gen/${lang}/prs/raw.evt"
  echo "linking ${raw_evt} to ${prs_raw_evt} ..."
  if [[ ! -s "${prs_raw_evt}" ]]; then echo "** file ${prs_raw_evt} is missing" 1>&2; exit 1; fi
  # Remove any previous file or link so that "ln" does not fail.
  rm -fv "${raw_evt}"
  ln -s -r "${prs_raw_evt}" "${raw_evt}"
  ls -l "${raw_evt}"
else
  source_evt="inp/${lang}/source-${trans_tag}.evt"
  echo "extracting the relevant lines from ${source_evt}" 1>&2
  if [[ ! -s "${source_evt}" ]]; then echo "** missing file ${source_evt}" 1>&2 ; exit 1; fi
  utype_tbl="gen/${lang}/unit-to-type.tbl"
  echo "    unit to unit type table = ${utype_tbl}" 1>&2
  if [[ ! -s "${utype_tbl}" ]]; then echo "** no table ${utype_tbl}" 1>&2; exit 1; fi
  # Pipeline stages:
  #   1. Drop every line containing ";X>" where X is any character
  #      other than ${trans_tag}.  Uses "grep -E" because "egrep" is
  #      obsolescent and warns on every call in recent GNU grep.
  #   2. Replace each "&" followed by three characters from "*!" and
  #      one from "*!;" with the fixed string "*!!!!".
  #   3. Filter through the local script basify_weirdos.gawk.
  #   4. Filter through select_units.gawk, which is given the unit
  #      type list and the unit-to-type table (presumably to keep
  #      only the units of the requested types -- see that script).
  grep -E -v -e '[;][^'"${trans_tag}"'][>]' "${source_evt}" \
    | sed -e 's/[&][*!][*!][*!][*!;]/*!!!!/g' \
    | ./basify_weirdos.gawk \
    | ./select_units.gawk \
        -v types="${utypes}" \
        -v table="${utype_tbl}" \
    > "${raw_evt}"
  # Presumably reports the sizes of the input and output files.
  ./vms_wc.sh "${source_evt}" "${raw_evt}"
  # ./show_first_last_lines.sh 10 10 ${raw_evt}
fi