# Makefile to build the word-map table
# Last edited on 2023-05-10 19:16:53 by stolfi

# Output: the word-map table built by this makefile.
TBL      := word-map.tbl

# Input: the word list that is fed to the token extractor.
SRC      := org/main.wds

# Listed as a prerequisite of the table, so that editing this file forces
# a rebuild.  NOTE(review): presumably this makefile's own name -- confirm.
MAKEFILE := word-map.make

# Default goal: build the word-map table.  Declared phony so that a stray
# file named "all" in the working directory cannot mask it.
.PHONY: all
all: ${TBL}

# Directories holding the helper scripts.  Both are rooted at STOLFIHOME,
# which must come from the environment or the command line; fail early
# with a clear message instead of silently resolving to "/projects/...".
ifndef STOLFIHOME
  $(error STOLFIHOME is not set; it must name the directory that contains projects/voynich/work)
endif

# Scripts used by the table-building recipe.
WORKBIN := ${STOLFIHOME}/projects/voynich/work
# NOTE(review): currently identical to WORKBIN and not referenced in the
# visible part of this file -- confirm whether it is still needed.
BANKBIN := ${STOLFIHOME}/projects/voynich/work

# We obtain the list of English words from the plaintext itself, using
# the same sample-functions that we used for the plaintext sample
# (engl/wow).

# Sample whose helper functions and word table are reused here; also passed
# to wds-to-tlw as the "smp" variable.
ENGLSMP := engl/wow
# Gawk library handed to wds-to-tlw with "-f".
ENGLAWK := ../../${ENGLSMP}/sample-fns.gawk
# Word table handed to wds-to-tlw as the "table" variable.
ENGLTBL := ../../${ENGLSMP}/word-map.tbl
# Scratch file holding the shuffled token list.  It is named after the
# target (rather than a fixed name in /tmp) so that builds running at the
# same time in different directories cannot overwrite each other's tokens.
ENGLTKS := ${TBL}.engl-tks.tmp

# We assign Roman codes to the words of the "War of the Worlds"
# in random order, while retaining all copies of each word.
# Therefore the most frequent words will tend to get 
# low codes, but codeword length will be uniform throughout the
# whole encoded text.

# Builds the word-map table from ${SRC}.
#
# Normal prerequisites are the source, this makefile, and every script
# whose output ends up in the table.  The preview formatter is order-only
# (after "|"): it must exist for the recipe to run, but changing it does
# not alter the table, so it does not force a rebuild.
#
# ${SRC} must remain the FIRST prerequisite, because the recipe reads it
# through "$<".
${TBL}: ${SRC} ${MAKEFILE} \
        ${WORKBIN}/wds-to-tlw \
        ${ENGLAWK} ${ENGLTBL} \
        ${WORKBIN}/roman-code-words \
        ${WORKBIN}/roman-encoding.gawk \
        ${WORKBIN}/roman-encoding-subtractive.gawk \
        | ${WORKBIN}/format-words-filled
# Step 1: run the source through wds-to-tlw, keep field 3 of the records
# whose field 1 is "a", and shuffle them: each token gets a pseudo-random
# key (fixed seed 4615, so the order is reproducible), the list is sorted
# by that key, and the key is dropped again.
	cat $< \
	  | ${WORKBIN}/wds-to-tlw \
	      -f ${ENGLAWK} \
	      -v table=${ENGLTBL} \
	      -v smp="${ENGLSMP}" \
	      -v sec="tot.1" \
	  | gawk 'BEGIN{ srand(4615); } ($$1 == "a") { printf "%10.8f %s\n", rand(), $$3; }' \
	  | sort -b -k1,1g \
	  | gawk '//{ print $$2; }' \
	  > ${ENGLTKS}
# Step 2: show the first 1000 shuffled tokens on standard output.  This is
# a preview only and does not feed into the table.
	head -n 1000 ${ENGLTKS} | ${WORKBIN}/format-words-filled
# Step 3: encode the tokens and extract the dictionary section.  The sed
# script keeps only the lines between the BEGIN/END DICTIONARY markers,
# drops the marker lines themselves, strips the leading "#" and blanks,
# and, on lines without "*DELETE*", replaces the first run of blanks by
# " @".  The result goes to a temporary file first, so that a failure
# here cannot leave a truncated table that looks up to date.
	cat ${ENGLTKS} \
	  | ${WORKBIN}/roman-code-words \
	      -f ${WORKBIN}/roman-encoding.gawk \
	      -f ${WORKBIN}/roman-encoding-subtractive.gawk \
	      -v honorCase=1 \
	  | sed \
	      -e '/^# BEGIN DICTIONARY/,/^# END DICTIONARY/!d' \
	      -e '/DICTIONARY/d' \
	      -e 's/^#[ ]*//' \
	      -e '/[*]DELETE[*]/!s/[ ][ ]*/ @/' \
	  > $@.tmp
# Step 4: install the finished table in one move, then discard the
# scratch token list.
	mv -f $@.tmp $@
	rm -f ${ENGLTKS}