# Last edited on 2002-01-20 18:53:02 by stolfi 
# Compute the token/word length histogram

# Name of this makefile.  It is handed to the recursive ${MAKE} call
# through "-f", and it is listed as a prerequisite of the generated
# data files so that they are rebuilt after this file is edited.
# NOTE(review): presumably this must match the name under which the
# file is actually stored -- confirm.
MAKEFILE := tw-length-hists.make

######################################################################
# Makefile for computing the token and word length
# histograms for a specified sample, quality subset
# (raw, good, or bad), and element factoring. 
# Caller must define 
#   ${LANG} = "voyn", "chin", etc.;
#   ${BOOK} = "wow", "vms", etc.
#   ${QUAL} = "raw", "gud", or "bad".
#   ${ELEM} = "bgly", "qoko", "viqr", etc.
#
# Each required parameter is first assigned a sentinel value.  A
# definition given on the make command line takes precedence over
# these assignments, so the sentinel survives only when the caller did
# not supply the parameter; the matching conditional then skips
# everything up to its "endif".  (A value that comes only from the
# environment, such as the locale variable LANG, is replaced by the
# sentinel unless make is run with "-e".)
LANG := LANG.IS.UNDEFINED
ifneq (${LANG},LANG.IS.UNDEFINED)

BOOK := BOOK.IS.UNDEFINED
ifneq (${BOOK},BOOK.IS.UNDEFINED)

# Directory of the sample and its "tot.1" subdirectory; the rules
# below prefix these with "dat/" or "exp/".
SAMPLE_DIR := ${LANG}/${BOOK}
SAMPLE_TOT_DIR := ${SAMPLE_DIR}/tot.1

QUAL := QUAL.IS.UNDEFINED
ifneq (${QUAL},QUAL.IS.UNDEFINED)

# Input file of the factoring rule, for the chosen quality subset.
WFR_FILE := ${SAMPLE_TOT_DIR}/${QUAL}.wfr

ELEM := ELEM.IS.UNDEFINED
ifneq (${ELEM},ELEM.IS.UNDEFINED)

# Output file of the factoring rule, for the chosen element set.
CTS_FILE := ${SAMPLE_TOT_DIR}/${QUAL}-fact-${ELEM}.cts

.PHONY: all

# Default goal: build the element-count file, then re-run this makefile
# once with TKWD=t (tokens) and once with TKWD=w (words) to build the
# "single-hist" goal for each.
#
# All required parameters, including BOOK, are passed explicitly so the
# sub-make does not depend on MAKEFLAGS forwarding alone.  The loop
# stops at the first failing sub-make and returns its exit status;
# otherwise a failure of the "t" run would be hidden by a successful
# "w" run.
all: dat/${CTS_FILE}
	for tkwd in t w; do \
	  ${MAKE} LANG=${LANG} BOOK=${BOOK} QUAL=${QUAL} ELEM=${ELEM} TKWD=$$tkwd \
	    -f ${MAKEFILE} single-hist || exit $$?; \
	done

# Gawk library loaded by factor-field-general with "-f"; it is selected
# by the element set ${ELEM} and lives in the sample's "dat" directory.
FACTOR_LIB := dat/${SAMPLE_DIR}/factor-text-to-${ELEM}.gawk

# Element-count file, built from the ".wfr" file of the chosen subset.
# The pipeline runs factor-field-general with inField=3 and outField=4
# (presumably it factors field 3 into elements and stores the result
# in field 4 -- confirm against that script), keeps fields 1, 3 and 4
# of every line, and feeds them to compute-elem-counts.
# The scripts, the factoring library and this makefile are listed as
# prerequisites so that editing any of them forces a rebuild.
dat/${CTS_FILE}: dat/${WFR_FILE} \
             factor-field-general ${FACTOR_LIB} \
             compute-elem-counts ${MAKEFILE}
	@echo "$< -> $@"
	cat $< | \
	  factor-field-general -f ${FACTOR_LIB} -v inField=3 -v outField=4 | \
	  gawk '//{ print $$1, $$3, $$4; }' | \
	  compute-elem-counts > $@

######################################################################
# Recursive make for each language, book, 
# element type, sample quality, and token/word counting.
# Caller must define ${LANG}, ${BOOK}, ${ELEM}, ${QUAL} and also
#   ${TKWD} = "t" (tokens) or "w" (words).
#
# TKWD gets a sentinel value that is replaced only by a command-line
# definition (the "all" recipe supplies TKWD=t and TKWD=w).  Without
# one, the conditional below hides the per-TKWD rules.
TKWD := TKWD.IS.UNDEFINED
ifneq (${TKWD},TKWD.IS.UNDEFINED)

# TeX file defining the average-length macro for this TKWD.
AVG_TEX := ${SAMPLE_TOT_DIR}/${QUAL}-fact-${ELEM}-${TKWD}-avlen.tex

# Length histogram file for this TKWD.
LHI_FILE := ${SAMPLE_TOT_DIR}/${QUAL}-fact-${ELEM}-${TKWD}.lhi

# Goal requested by the recursive make in the "all" recipe: the length
# histogram and the average-length TeX file for one value of ${TKWD}.
# Declared phony because it names a task, not a file; a stray file
# called "single-hist" must not make the goal look up to date.
.PHONY: single-hist
single-hist: dat/${LHI_FILE} dat/${AVG_TEX}

# Length histogram, built from the element-count file.
# For every non-empty input line the first field is kept when
# ${TKWD} is "t" and replaced by 1 otherwise (presumably field 1 is
# the occurrence count, so that tokens are weighted by frequency and
# words count once each -- confirm).  Fields 3 and 4 are passed through
# unchanged, and the result goes to compute-elem-count-distrib.
# The finished histogram is also printed to standard output.
dat/${LHI_FILE}: dat/${CTS_FILE} compute-elem-count-distrib ${MAKEFILE}
	@echo "$< -> $@"
	cat $< | \
	  gawk -v tkwd="${TKWD}" \
	    '/./{ print (tkwd == "t" ? $$1 : 1), $$3, $$4; }' | \
	  compute-elem-count-distrib > $@
	cat $@

# TeX file with one \def giving the average of field 1 of the
# histogram, weighted by field 2, printed with two decimals.  Lines
# starting with "#" and empty lines are ignored.  The macro name is
# built from ${LANG}, ${BOOK}, "Avg", "Tk" or "Wd" (for ${TKWD} "t" or
# anything else), "N" and ${ELEM}.
#
# The output is written to "$@.tmp" and renamed only after gawk
# succeeds, so a failed run cannot leave a truncated target that looks
# up to date.  A histogram with zero total weight is reported as an
# explicit error instead of a division by zero.
# The result is printed and then passed to update-paper-include
# together with the corresponding path under "exp/".
dat/${AVG_TEX}: dat/${LHI_FILE}
	gawk \
	    -v lg="${LANG}" -v bk="${BOOK}" -v ek="${ELEM}" -v tw="${TKWD}" \
	    ' /^[#]/{next;} \
	      /./{ t += $$2; e += $$2*$$1; } \
	      END { \
	        if (t == 0) { \
	          print "avlen: empty histogram, cannot compute average length" > "/dev/stderr"; \
	          exit 1; \
	        } \
	        xtw = ( tw == "t" ? "Tk" : "Wd" ); \
	        printf "\\def\\%s%sAvg%sN%s{%.2f}\n", lg, bk, xtw, ek, e/t; \
	      } \
	    ' \
	    < $< > $@.tmp
	mv -f $@.tmp $@
	cat $@
	update-paper-include $@ exp/${AVG_TEX}

endif
# End of ${LANG}/${BOOK}/${ELEM}/${QUAL}/${TKWD} recursion
######################################################################

endif
endif
endif
endif
# End of ${LANG}/${BOOK}/${ELEM}/${QUAL} recursion
######################################################################