# Makefile for creating a language sample
# Last edited on 2002-01-15 15:04:36 by stolfi

######################################################################
# Caller must define the following variables:
#  
#   SAMPLE = name for text sample, e.g. "engl/wow".
#   SUBSEC = subsection tag, e.g. "cos.2" or "tot.t".
#   SIZETAG = "whole" or "trunc"
#
# Placeholder defaults.  "?=" assigns only when the caller has not
# already supplied a value (on the command line, in the environment,
# or in a makefile that includes this one), so a caller-provided
# setting is never clobbered.  While SAMPLE still holds its
# placeholder, the conditional that follows disables the rest of
# this file.
SAMPLE ?= SAMPLE.IS.UNDEFINED
SUBSEC ?= SUBSEC.IS.UNDEFINED
SIZETAG ?= SIZETAG.IS.UNDEFINED
ifneq "${SAMPLE}" "SAMPLE.IS.UNDEFINED"

# Fixed locations: the repository that holds all samples, and the
# directory from which helper programs are run as $(BIN)/<name>.

SAMPLE_REP := sample
BIN        := $(HOME)/bin

# Locations derived from SAMPLE:

SAMPLE_DIR := $(SAMPLE_REP)/$(SAMPLE)
SOURCE_DIR := $(SAMPLE_DIR)/source

SOURCE_EVT := $(SOURCE_DIR)/main.evt
SAMPLE_AWK := $(SAMPLE_DIR)/sample-fns.gawk

# Token files of the selected subsection (raw, good, bad):

RAW_TKS := $(SAMPLE_DIR)/$(SUBSEC)/raw.tks
GUD_TKS := $(SAMPLE_DIR)/$(SUBSEC)/gud.tks
BAD_TKS := $(SAMPLE_DIR)/$(SUBSEC)/bad.tks

# Word-frequency files, one for each token file:

RAW_WFR := $(SAMPLE_DIR)/$(SUBSEC)/raw.wfr
GUD_WFR := $(SAMPLE_DIR)/$(SUBSEC)/gud.wfr
BAD_WFR := $(SAMPLE_DIR)/$(SUBSEC)/bad.wfr

# Default goal: the word-frequency files for the raw, good and bad
# token streams of the selected subsection.  "all" names no file, so
# it is declared phony; otherwise a stray file called "all" would
# make the goal look permanently up to date.
.PHONY: all
all: ${RAW_WFR} ${GUD_WFR} ${BAD_WFR}

# The recipes in this file write their targets through ">", so a
# recipe that fails leaves a truncated file that is newer than its
# prerequisites.  Have make delete such a target so that the next
# run rebuilds it.
.DELETE_ON_ERROR:

######################################################################
# The following applies only for bona-fide sections, not for "tot.t"
# 
ifneq "${SUBSEC}" "tot.t"

# Extract, from the complete EVT source, a partial EVT file for
# this subsection only:

SAMPLE_EVT := $(SAMPLE_DIR)/$(SUBSEC)/raw.evt

$(SAMPLE_EVT): $(SOURCE_EVT) select-evt-lines $(SAMPLE_AWK)
	ls -ld $(SOURCE_EVT)
	cat $(SOURCE_EVT) \
	  | ./select-evt-lines -f $(SAMPLE_AWK) \
	      -v sample=$(SAMPLE) \
	      -v subsec=$(SUBSEC) \
	  > $@

# Rebuild the source EVT file in the source repository
# if it is out of date with respect to any of these files:

SOURCE_SOURCES := \
  ${SOURCE_DIR}/main.org \
  ${SOURCE_DIR}/Makefile \
  ${SOURCE_DIR}/preprocess-org \
  ${BIN}/org-to-evt

# The rebuild is delegated to the makefile in the source directory.
# It is invoked through ${MAKE} rather than a literal "make" so that
# the sub-make is the same program as this one and honours options
# such as -n, -k and -j given to the parent.
${SOURCE_EVT}: ${SOURCE_SOURCES}
	cd ${SOURCE_DIR} && ${MAKE} all

# Table file passed to fix-raw-words:

SAMPLE_TBL := ${SAMPLE_DIR}/word-map.tbl

# File holding the number of raw token lines to keep for the chosen
# SIZETAG.  NOTE: the number is read once, while the makefile is
# being parsed, so RAWNUM_FILE must already exist at that point.

RAWNUM_FILE := ${SAMPLE_DIR}/${SUBSEC}/${SIZETAG}-raw.num
RAWNUM := ${shell cat ${RAWNUM_FILE}}

# Get raw tokens from the EVT file, keeping only the first RAWNUM
# lines of the word stream.  The count is given as "head -n N": with
# the old "head -N" spelling an empty RAWNUM turns the command into
# "head -", which quietly copies the first 10 lines of standard
# input; "head -n" with no number fails instead.

${RAW_TKS}: ${SAMPLE_EVT} ${SAMPLE_TBL} ${RAWNUM_FILE} \
            ${BIN}/evt-to-wds fix-raw-words ${SAMPLE_AWK}
	cat ${SAMPLE_EVT} \
	  | ${BIN}/evt-to-wds \
	      -v showBreaks=0 \
	      -v showParags=0 \
	      -v showPuncts=0 \
	      -v smashSymbols=1 \
	  | ./fix-raw-words \
	      -f ${SAMPLE_AWK} \
	      -v sample=${SAMPLE} \
	      -v table=${SAMPLE_TBL} \
	  | head -n ${RAWNUM} \
	  > ${RAW_TKS}

# Separate raw tokens into good and bad.
#
# Good tokens: run the filter with writeGood=1.  The filter is the
# script in the current directory that is listed as a prerequisite,
# so it is invoked as "./select-gud-bad-words"; a bare name would be
# looked up in PATH and fail whenever "." is not on it.

${GUD_TKS}: ${RAW_TKS} \
            select-gud-bad-words ${SAMPLE_AWK}
	cat ${RAW_TKS} \
	  | ./select-gud-bad-words \
	      -f ${SAMPLE_AWK} \
	      -v sample=${SAMPLE} -v writeGood=1 \
	  > ${GUD_TKS}

# Bad tokens: the raw tokens passed through select-gud-bad-words
# with writeGood=0.
$(BAD_TKS): $(RAW_TKS) select-gud-bad-words $(SAMPLE_AWK)
	cat $(RAW_TKS) \
	  | ./select-gud-bad-words -f $(SAMPLE_AWK) \
	      -v sample=$(SAMPLE) \
	      -v writeGood=0 \
	  > $@

endif
# End of code for normal subsections
# ======================================================================

ifeq "${SUBSEC}" "tot.t"

# Tags of the subsections to be combined, read from the sample
# directory while the makefile is being parsed:

SUBSECS := ${shell cat ${SAMPLE_DIR}/subsections-ok.tags}

# Stop right away if the list is missing or empty.  Otherwise the
# rules for the combined token files would have no prerequisites and
# their recipes would run "cat" with no file operands, which reads
# standard input instead of failing.
ifeq "${strip ${SUBSECS}}" ""
${error No subsections found in ${SAMPLE_DIR}/subsections-ok.tags}
endif

# Directory of each subsection, and the token files inside them:

SUBSECDIRS := ${addprefix ${SAMPLE_DIR}/,${SUBSECS}}

ALL_RAW_TKS := ${addsuffix /raw.tks,${SUBSECDIRS}}
ALL_GUD_TKS := ${addsuffix /gud.tks,${SUBSECDIRS}}
ALL_BAD_TKS := ${addsuffix /bad.tks,${SUBSECDIRS}}

# Each combined token file is the plain concatenation of the
# per-subsection files of the same kind.  "$+" expands to the
# prerequisites in the order written, duplicates included.

$(RAW_TKS): $(ALL_RAW_TKS)
	cat $+ > $@

$(GUD_TKS): $(ALL_GUD_TKS)
	cat $+ > $@

$(BAD_TKS): $(ALL_BAD_TKS)
	cat $+ > $@

endif
# End of code for "tot.t" subsection
# ======================================================================

# Rule to compute word frequencies.
#
# The token file is reduced by "sort | uniq -c" to one "count word"
# line per distinct token.  These lines are ordered by decreasing
# count and then by word, and piped through ./compute-freqs.  The
# sort keys are written with -k because the obsolete "+POS1 -POS2"
# notation is rejected by current sort implementations;
# "-k1,1nr -k2,2" selects the same two keys as "+0 -1nr +1 -2".

%.wfr: %.tks  compute-freqs
	cat $< \
	  | sort | uniq -c | expand \
	  | sort -b -k1,1nr -k2,2 \
	  | ./compute-freqs \
	  > $@

endif
# End of code for whole makefile
# ======================================================================
