# Makefile for creating a language sample
# Last edited on 2002-02-12 00:34:55 by stolfi

# File listed as a prerequisite of the rules below, so that their outputs
# are rebuilt when it changes (presumably this makefile itself -- confirm).
MAKERULES := lang-sample.make

######################################################################
# Caller must define the following variables:
#  
#   SAMPLE = name for text sample, e.g. "engl/wow".
#   SUBSEC = subsection tag, e.g. "cos.2" or "tot.1".
#   SIZETAG = "whole" or "trunc"
#
# Placeholder defaults.  "?=" keeps any value the caller has already
# supplied (command line, environment, or an including makefile) and
# installs the ".IS.UNDEFINED" sentinel only when the variable is not
# set at all.  A plain ":=" would overwrite values set by an including
# makefile and thereby switch off everything below.
SAMPLE ?= SAMPLE.IS.UNDEFINED
SUBSEC ?= SUBSEC.IS.UNDEFINED
SIZETAG ?= SIZETAG.IS.UNDEFINED
ifneq "${SAMPLE}" "SAMPLE.IS.UNDEFINED"

# Top-level input and output directories:

# Root directory under which every sample has its own subdirectory:
SAMPLE_REP := dat
# Directory of the shared tools evt-to-wds and org-to-evt:
BIN := ${HOME}/bin

# Derived directories and file names:

# Directory of this sample, and the directory holding its source text:
SAMPLE_DIR := ${SAMPLE_REP}/${SAMPLE}
SOURCE_DIR := ${SAMPLE_DIR}/source

# Source EVT file, and the sample-specific file passed with "-f" to the
# filter scripts:
SOURCE_EVT := ${SOURCE_DIR}/main.evt
SAMPLE_AWK := ${SAMPLE_DIR}/sample-fns.gawk

# Token streams, with and without location codes:

# All of these files live in the directory of the current subsection.
# raw.lts holds the tokens with their location codes, raw.tks the
# tokens alone:
RAW_LTS := ${SAMPLE_DIR}/${SUBSEC}/raw.lts
RAW_TKS := ${SAMPLE_DIR}/${SUBSEC}/raw.tks

# Word counts and frequencies:

# raw.wfr covers every token; gud.wfr and bad.wfr are the two selections
# that select-gud-bad-words makes from it (writeBad=0 and writeBad=1):
RAW_WFR := ${SAMPLE_DIR}/${SUBSEC}/raw.wfr
GUD_WFR := ${SAMPLE_DIR}/${SUBSEC}/gud.wfr
BAD_WFR := ${SAMPLE_DIR}/${SUBSEC}/bad.wfr

# Every file this makefile produces for the current subsection:
DERIVED_FILES := ${RAW_LTS} ${RAW_TKS} ${RAW_WFR} ${GUD_WFR} ${BAD_WFR}

# "all" names a task, not a file: declaring it phony keeps a stray file
# called "all" from making the build look up to date.
.PHONY: all
all: ${DERIVED_FILES}

# If there are subsections other than "tot.1", the "tot.1" subsection
# must be created from them, in order to get the right
# number of words from each subsection.

# Subsection tags read from subsections.tags, whitespace-normalized
# (empty when cat prints nothing):
ALL_SUBSECS := ${strip ${shell cat ${SAMPLE_DIR}/subsections.tags}}

# "tot.1" is treated as the total subsection only when some tags were
# read; otherwise the placeholder *NONE* is used instead:
TOT_SUBSEC := ${if ${ALL_SUBSECS},tot.1,*NONE*}

######################################################################
# The following applies only to subsections whose raw words are 
# to be extracted directly from the EVT file:
ifneq "${SUBSEC}" "${TOT_SUBSEC}"

# Create partial EVT source file with this subsection only:

# Output of the subsection filter:
SAMPLE_EVT := ${SAMPLE_DIR}/${SUBSEC}/raw.evt

# Pipe the whole source EVT file through ./select-evt-lines, passing it
# ${SAMPLE_AWK} with -f and the sample and subsection names with -v.
# The two echo lines report which subsection set is in effect.
${SAMPLE_EVT}: ${SOURCE_EVT} ${MAKERULES} select-evt-lines ${SAMPLE_AWK}
	@echo "ALL_SUBSECS = '${ALL_SUBSECS}'"
	@echo "TOT_SUBSEC = '${TOT_SUBSEC}'"
	ls -ld $<
	cat $< \
	  | ./select-evt-lines -f ${SAMPLE_AWK} -v sample=${SAMPLE} -v subsec=${SUBSEC} \
	  > $@

# Rebuild the source EVT file in the source repository
# if it is out of date:

# Files whose modification makes the source EVT file out of date:
SOURCE_SOURCES := \
  ${SOURCE_DIR}/main.org \
  ${SOURCE_DIR}/Makefile \
  ${SOURCE_DIR}/preprocess-org \
  ${BIN}/org-to-evt

# Delegate to the Makefile in the source directory.  ${MAKE} (rather than
# a literal "make") runs the same make program as the parent and lets
# options such as -n and -j carry over to the sub-make.
${SOURCE_EVT}: ${SOURCE_SOURCES}
	cd ${SOURCE_DIR} && ${MAKE} all

# Table file handed to fix-raw-words through its "table" variable:
SAMPLE_TBL := ${SAMPLE_DIR}/word-map.tbl

# File whose contents are read once, while this makefile is parsed, and
# passed as "num" to the gawk stage that cuts off the token stream:
RAWNUM_FILE := ${SAMPLE_DIR}/${SUBSEC}/${SIZETAG}-raw.num
RAWNUM := ${shell cat ${RAWNUM_FILE}}

# get raw tokens with locators from EVT file:

# The output of ${BIN}/evt-to-wds is filtered by ./fix-raw-words and then
# cut off by gawk.  The gawk stage counts the records whose second field
# is not 1 and exits, without printing the current record, as soon as
# that count exceeds ${RAWNUM}.
${RAW_LTS}: ${SAMPLE_EVT} ${SAMPLE_TBL} ${RAWNUM_FILE} ${MAKERULES} \
            ${BIN}/evt-to-wds fix-raw-words ${SAMPLE_AWK}
	cat $< \
	  | ${BIN}/evt-to-wds \
	      -v showBreaks=0 -v showParags=1 -v showPuncts=1 \
	      -v smashSymbols=1 -v showLocation=1 \
	  | ./fix-raw-words \
	      -f ${SAMPLE_AWK} -v sample=${SAMPLE} \
	      -v table=${SAMPLE_TBL} -v field=3 \
	  | gawk -v num=${RAWNUM} \
	      '($$2 != 1){n++;} (n > num){exit 0;} //{print;}' \
	  > $@

# Remove locators leaving only the raw tokens (including punctuation):

# Keep only field 3 of every non-empty record of the located stream:
${RAW_TKS}: ${RAW_LTS} ${MAKERULES}
	cat $< | gawk '/./ { print $$3; }' > $@

endif
# End rules for subsections extracted from the EVT file
######################################################################

######################################################################
# The following applies only to the subsection that is the total of the others:
# 
ifeq "${SUBSEC}" "${TOT_SUBSEC}"

# One directory per tag in ${ALL_SUBSECS}:
SUBSECDIRS := ${foreach tag,${ALL_SUBSECS},${SAMPLE_DIR}/${tag}}

ALL_RAW_LTS := ${SUBSECDIRS:%=%/raw.lts}

# The located token stream of the total is the concatenation, in tag
# order, of the streams of the listed subsections:
${RAW_LTS}: ${ALL_RAW_LTS} ${MAKERULES}
	@echo "ALL_SUBSECS = '${ALL_SUBSECS}'"
	@echo "TOT_SUBSEC = '${TOT_SUBSEC}'"
	cat ${ALL_RAW_LTS} > $@

# The raw.tks file of every listed subsection:
ALL_RAW_TKS := ${SUBSECDIRS:%=%/raw.tks}

# The plain token stream of the total is likewise a concatenation of the
# per-subsection token files:
${RAW_TKS}: ${ALL_RAW_TKS} ${MAKERULES}
	@echo "ALL_SUBSECS = '${ALL_SUBSECS}'"
	@echo "TOT_SUBSEC = '${TOT_SUBSEC}'"
	cat ${ALL_RAW_TKS} > $@

endif
# End of code for subsection that is the total of others
######################################################################

# Count raw word occurrences and compute their rel. frequencies:

# Tally each distinct token (lines containing "=" are dropped first),
# order the tallies by decreasing count and then by token, and feed the
# result to compute-freqs.
#
# compute-freqs is run as ./compute-freqs so that the program executed
# is the very file named in the prerequisites, whatever PATH contains.
# The sort keys are written with -k: the older "+0 -1nr +1 -2" spelling
# is obsolete and not accepted by every sort.  Plain grep replaces the
# deprecated egrep ("=" needs no extended syntax).
${RAW_WFR}: ${RAW_TKS} ${MAKERULES} compute-freqs
	@echo "${RAW_TKS} -> ${RAW_WFR}"
	cat $< \
	  | grep -v '=' \
	  | sort | uniq -c | expand \
	  | sort -b -k 1,1nr -k 2,2 \
	  | ./compute-freqs \
	  > $@
 
# Extract the good words:

# Filter the raw frequency table with writeBad=0.  The filter is run as
# ./select-gud-bad-words so that the program executed is the very file
# named in the prerequisites, whatever PATH contains.
${GUD_WFR}: ${RAW_WFR} ${MAKERULES} select-gud-bad-words ${SAMPLE_AWK}
	@echo "${RAW_WFR} -> ${GUD_WFR}"
	cat $< \
	  | ./select-gud-bad-words \
	      -f ${SAMPLE_AWK} \
	      -v inField=3 -v writeBad=0 \
	  > $@

# Extract the bad words:

# Filter the raw frequency table with writeBad=1.  The filter is run as
# ./select-gud-bad-words so that the program executed is the very file
# named in the prerequisites, whatever PATH contains.
${BAD_WFR}: ${RAW_WFR} ${MAKERULES} select-gud-bad-words ${SAMPLE_AWK}
	@echo "${RAW_WFR} -> ${BAD_WFR}"
	cat $< \
	  | ./select-gud-bad-words \
	      -f ${SAMPLE_AWK} \
	      -v inField=3 -v writeBad=1 \
	  > $@


# Rule to compute word frequencies:

# Generic fallback for any .tks file without an explicit rule: tally the
# distinct tokens, order the tallies by decreasing count and then by
# token, and feed them to ./compute-freqs.  The sort keys are written
# with -k: the older "+0 -1nr +1 -2" spelling is obsolete and not
# accepted by every sort.
%.wfr: %.tks compute-freqs
	cat $< \
	  | sort | uniq -c | expand \
	  | sort -b -k 1,1nr -k 2,2 \
	  | ./compute-freqs \
	  > $@

endif
# End of code for whole makefile
# ======================================================================