# Makefile to build the word-map table
# Last edited on 2023-05-10 19:15:31 by stolfi

# Output table, its source word list, and the makefile name
# (word-map.make, presumably this file).  ${MAKEFILE} is listed as a
# prerequisite of ${TBL}, so editing it forces the table to be rebuilt.
TBL := word-map.tbl
SRC := org/main.wds
MAKEFILE := word-map.make

# "all" names a command, not a file: declare it phony so that a stray
# file called "all" can never make the default goal look up to date.
.PHONY: all
all: ${TBL}

# Directories holding the helper programs used by the recipe.
# STOLFIHOME is not set in the visible part of this file; it has to come
# from the environment or from the make command line.
# NOTE(review): BANKBIN has the same value as WORKBIN and is not
# referenced anywhere in the visible part of this file -- confirm
# whether it is still needed.
WORKBIN := ${STOLFIHOME}/projects/voynich/work
BANKBIN := ${STOLFIHOME}/projects/voynich/work

# We extract the list of Chinese ideograms from the plaintext itself,
# using the same sample-functions we used for the pinyinified Chinese
# version (chin/red); except that the word table only maps punctuation
# to "*DELETE*" (just in case those words are not marked "p" in the
# source "main.wds"). So the output is GB codes instead of Pinyin.

# Sample name, the gawk function library of that sample, the word table
# passed to the extraction step, and the scratch file that holds the
# shuffled token list between recipe lines.
# All four are simply-expanded (":="), like the other variables in this
# file; the values are literals, so nothing needs late binding.
CHINSMP := chin/red
CHINAWK := ../../${CHINSMP}/sample-fns.gawk
CHINTBL := gb-punct-delete.tbl
# NOTE(review): fixed scratch name in /tmp -- two builds running at the
# same time on one machine would overwrite each other's file.
CHINTKS := /tmp/.chin.tks

# We assign Roman-style codes to the ideograms of the "Red Mansion"
# in random order, while retaining all copies of each ideogram.
# Therefore the most frequent ideograms will tend to get 
# low codes, but codeword length will be uniform throughout the
# whole encoded text.

# If a recipe line fails after the target has been modified, delete the
# target instead of leaving a truncated table that looks up to date.
.DELETE_ON_ERROR:

# Build the word-map table in three steps:
#   1. run the source through wds-to-tlw, keep field 3 of the records
#      whose field 1 is "a", and shuffle them by sorting on a random key
#      generated with a fixed seed (so the order is reproducible); the
#      result is saved in ${CHINTKS};
#   2. pipe the first 1000 shuffled tokens through format-words-filled
#      (its output is not redirected, so it goes to the build log);
#   3. feed all tokens to roman-code-words and keep only the section of
#      its output between the "# BEGIN DICTIONARY" / "# END DICTIONARY"
#      markers, stripped of the leading "#", as the table.
# format-words-filled does not contribute to the table, so it is an
# order-only prerequisite: it must exist before the recipe starts, but
# changing it does not trigger a rebuild.
${TBL}: ${SRC} ${MAKEFILE} \
            ${WORKBIN}/wds-to-tlw \
            ${CHINAWK} ${CHINTBL} \
            ${WORKBIN}/roman-code-words \
            ${WORKBIN}/roman-encoding.gawk \
            ${WORKBIN}/roman-encoding-pseudo-voynich-1.gawk \
            | ${WORKBIN}/format-words-filled
# Step 1: $< is ${SRC}, the first prerequisite.
	cat $< \
	  | ${WORKBIN}/wds-to-tlw \
	      -f ${CHINAWK} \
	      -v table=${CHINTBL} \
	      -v smp="${CHINSMP}" \
	      -v sec="tot.1" \
	  | gawk 'BEGIN{ srand(4615); } ($$1 == "a") { printf "%10.8f %s\n", rand(), $$3; }' \
	  | sort -b -k1,1g \
	  | gawk '//{ print $$2; }' \
	  > ${CHINTKS}
# Step 2: preview only; "-n 1000" replaces the obsolescent "-1000" form.
	head -n 1000 ${CHINTKS} | ${WORKBIN}/format-words-filled
# Step 3: the last sed expression inserts " @" in place of the first run
# of blanks on every line that does not mention *DELETE*.
	cat ${CHINTKS} \
	  | ${WORKBIN}/roman-code-words \
	      -f ${WORKBIN}/roman-encoding.gawk \
	      -f ${WORKBIN}/roman-encoding-pseudo-voynich-1.gawk \
	      -v honorCase=1 \
	  | sed \
	      -e '/^# BEGIN DICTIONARY/,/^# END DICTIONARY/!d' \
	      -e '/DICTIONARY/d' \
	      -e 's/^#[ ]*//' \
	      -e '/[*]DELETE[*]/!s/[ ][ ]*/ @/' \
	  > $@
# Cleanup: -f so that a scratch file that has already vanished cannot
# fail the recipe after the table is complete.
	/bin/rm -f ${CHINTKS}