#! /bin/bash
# Last edited on 2025-09-24 19:32:06 by stolfi

# Builds the main token files "dat/{LANG}/{BOOK}/{SEC}/{SIZEOPT}.tlw" of a
# sample, one for each section listed in "dat/{LANG}/{BOOK}/sections-ok.tags"
# plus one for the pseudo-section "tot.1".
#
# Arguments (both required):
#   {LANG}/{BOOK}  sample name; selects the directory "dat/{LANG}/{BOOK}".
#   whole|trunc    size option; also the base name of the ".tlw" files written.

# Strict mode is enabled with "set" rather than on the "#!" line so that it
# still applies when the script is started as "bash <script>".
set -ue

cmd="${0##*/}"
usage="${cmd} {LANG}/{BOOK} [whole|trunc]"

if [[ $# -ne 2 ]]; then
  echo "usage: ${usage}" 1>&2; exit 1
fi

smp="$1"; shift
sizeopt="$1"; shift

# Reject a bad size option now, before any link is re-created or any
# "make" is run on behalf of an invocation that cannot succeed.
case "${sizeopt}" in
  whole|trunc)
    ;;
  *)
    echo "invalid sizeopt = ${sizeopt}" 1>&2
    echo "usage: ${usage}" 1>&2
    exit 1
    ;;
esac

echo "=== extracting the main token file dat/${smp}/*/${sizeopt}.tlw ===" 1>&2

# Attributes of the pseudo-section "tot.1", split on whitespace into an array.
# Field [1] is used as the sub-directory of "langbank" holding the source text.
totattr=( $(get_sample_section_attrs.sh "${smp}/tot.1") )
echo "  totattr = ( ${totattr[*]} )" 1>&2
if [[ ${#totattr[@]} -lt 2 ]]; then
  echo "** cannot determine the langbank directory of ${smp}/tot.1" 1>&2; exit 1
fi
langbook="${totattr[1]}"

echo "  creating link dat/${smp}/org to ../../langbank/${langbook}" 1>&2
relink_source_dir.sh "dat/${smp}" org "../../langbank/${langbook}"
srcFile="dat/${smp}/org/main.src"
wdsFile="dat/${smp}/org/main.wds"
if [[ ! -s "${srcFile}" ]]; then
  echo "** ${srcFile} not found" 1>&2; exit 1
fi

echo "  recreating ${wdsFile} from ${srcFile}" 1>&2
( cd "dat/${smp}/org" && make main.wds ) 2>&1 | sed -e 's:^:    :g' 1>&2
# The pipeline's own status is that of "sed", so the status of "make" must be
# taken from PIPESTATUS; otherwise a failed rebuild would go unnoticed and a
# stale "main.wds" could be used.
makeStatus="${PIPESTATUS[0]}"
if [[ "${makeStatus}" -ne 0 ]]; then
  echo "** failed to make ${wdsFile} (status ${makeStatus})" 1>&2; exit 1
fi

# Generating the word-map table, if appropriate.
# The table is rebuilt only when the sample directory has a readable
# "word-map.make"; otherwise an existing table is left untouched.
tblFile="dat/${smp}/word-map.tbl"
tblMake="dat/${smp}/word-map.make"
if [[ -r "${tblMake}" ]]; then
  echo "  recreating ${tblFile} from ${wdsFile}" 1>&2
  ( cd "dat/${smp}" && make -f word-map.make ) 2>&1 | sed -e 's:^:    :g' 1>&2
  # The pipeline reports the status of "sed"; fetch that of "make" explicitly
  # so that a failed rebuild does not leave a stale table in use.
  tblStatus="${PIPESTATUS[0]}"
  if [[ "${tblStatus}" -ne 0 ]]; then
    echo "** failed to make ${tblFile} (status ${tblStatus})" 1>&2; exit 1
  fi
fi

# Stop right away unless the sample directory has a readable word-map table,
# source-directory Makefile, and "sample_fns.gawk" library.
for f in word-map.tbl org/Makefile sample_fns.gawk ; do
  if [[ -r "dat/${smp}/${f}" ]]; then
    continue
  fi
  echo "*** missing dat/${smp}/${f}" 1>&2
  exit 1
done

# Build file "{sizeopt}.tlw" for each section listed in "sections-ok.tags",
# and then for the pseudo-section "tot.1".  A regular section is converted
# from the source word file by "wds_to_tlw.gawk".  The "tot.1" file is made
# by merging the "whole.tlw" files of all sections, unless there are no
# sections, in which case it is converted like a regular section.
secs=( $(cat "dat/${smp}/sections-ok.tags") )
# The "+" form keeps an empty {secs} from tripping "set -u" in bash before 4.4.
for sec in ${secs[@]+"${secs[@]}"} "tot.1" ; do
  smpsec="${smp}/${sec}"

  echo "  ... creating main token files dat/${smpsec}/${sizeopt}.tlw ..." 1>&2

  # Create directories if needed
  for pd in dat tex ; do
    dir="${pd}/${smpsec}"
    if [[ ! -d "${dir}" ]]; then mkdir -pv "${dir}"; fi
  done

  # Obtain sample size for this section:
  case "${sizeopt}" in
    whole)
      # Large enough to act as "no limit".
      gudnum=9999999
      ;;
    trunc)
      # The limit is field [2] of the section's attribute list.
      secattr=( $(get_sample_section_attrs.sh "${smpsec}") )
      # echo "${smpsec} secattr = [${secattr[*]}]" 1>&2
      if [[ ${#secattr[@]} -lt 3 ]]; then
        echo "** cannot determine the sample size of ${smpsec}" 1>&2; exit 1
      fi
      gudnum="${secattr[2]}"
      ;;
    *)
      echo "invalid sizeopt = ${sizeopt}" 1>&2; exit 1
      ;;
  esac

  # Create link to source file in "dat/{smpsec}" directory:
  echo "    creating link dat/${smpsec}/org to ../../../langbank/${langbook}" 1>&2
  relink_source_dir.sh "dat/${smpsec}" org "../../../langbank/${langbook}"
  wdsFile="dat/${smpsec}/org/main.wds"
  if [[ ! -s "${wdsFile}" ]]; then
    echo "** ${wdsFile} not found" 1>&2; exit 1
  fi

  # Creating main ".tlw" file:
  rFile="dat/${smpsec}/${sizeopt}.tlw"
  /bin/rm -fv "${rFile}" 2>&1 | sed -e 's:^:    :g' 1>&2
  if [[ ( "/${sec}" != "/tot.1" ) || ( ${#secs[@]} -eq 0 ) ]]; then
    echo "    creating ${rFile} (gudnum = ${gudnum}) from ${wdsFile}" 1>&2
    echo "    using dat/${smp}/sample_fns.gawk and dat/${smp}/word-map.tbl" 1>&2
    wds_to_tlw.gawk \
        -f "dat/${smp}/sample_fns.gawk" \
        -v table="dat/${smp}/word-map.tbl" \
        -v smp="${smp}" \
        -v sec="${sec}" \
        -v maxAlpha="${gudnum}" \
      < "${wdsFile}" \
      > "${rFile}"
  else
    echo "    making ${rFile} (gudnum = ${gudnum}) from whole ${secs[*]}" 1>&2
    # The merge always reads the "whole.tlw" files of the sections,
    # whatever the value of {sizeopt}.
    sec_rFiles=()
    for s in "${secs[@]}" ; do
      sec_rFiles+=( "dat/${smp}/${s}/whole.tlw" )
    done
    merge_tlw_files.sh "${gudnum}" "${sec_rFiles[@]}" > "${rFile}"
  fi
  # wc -l ${rFile}

done

# Print summaries of the "{sizeopt}.tlw" files of all sections and of "tot.1".
# The file list is built directly as an array, so that {smp} and {sizeopt}
# are never interpreted as part of a "sed" program.
all_tFiles=()
# The "+" form keeps an empty {secs} from tripping "set -u" in bash before 4.4.
for s in ${secs[@]+"${secs[@]}"} "tot.1" ; do
  all_tFiles+=( "dat/${smp}/${s}/${sizeopt}.tlw" )
done
dicio-wc "${all_tFiles[@]}" 2>&1 | sed -e 's:^:    :g' 1>&2
