# Last edited on 2004-07-24 03:03:54 by stolfi 
# Compute the lists of replicated words and their frequencies

# Name of this makefile. It is listed as a prerequisite of every data
# rule below, so the outputs are rebuilt whenever the makefile changes.
MAKEFILE := dup-word-lists.make
# Disabled alternative: an empty value makes ${MAKEFILE} expand to nothing
# in the prerequisite lists (presumably to switch that dependency off).
# MAKEFILE := 

######################################################################
# Makefile for computing the repetitious word lists for a specified
# language sample. Caller must define
#   ${LANG} = "voyn", "chin", etc.;
#   ${BOOK} = "wow", "vms", etc.
#
# The two assignments below only install sentinel defaults. A definition
# given on the make command line takes precedence over an assignment in
# the makefile, so the sentinels survive only when the caller did not
# pass the variable. In that case the matching "ifneq" is false and
# everything up to its closing "endif" (near the end of the file) is skipped.
#
# NOTE(review): LANG is also the POSIX locale environment variable, and
# make passes command-line variables on to the commands it runs, so tools
# such as sort will see LANG set to the sample language -- confirm that
# this is harmless.
LANG := LANG.IS.UNDEFINED
ifneq "${LANG}" "LANG.IS.UNDEFINED"

BOOK := BOOK.IS.UNDEFINED
ifneq "${BOOK}" "BOOK.IS.UNDEFINED"

# Sample directory and its "tot.1" subdirectory. The rules below always
# use these with a "dat/" prefix.
SAMPLE_DIR := ${LANG}/${BOOK}
SAMPLE_TOT_DIR := ${LANG}/${BOOK}/tot.1

# Tool directory under ${STOLFIHOME}; not referenced by any rule visible
# in this file.
BIN := ${STOLFIHOME}/bin

# Choose the EVT source file and the word-cleanup filter for the sample.
#
#   any sample except "voyn":
#     the EVT file is the sample's source/main.evt, and words go through
#     ./fix-words, loaded with the sample's gawk functions and word-map
#     table (both under dat/);
#   "voyn":
#     the EVT file is tot.1/raw.evt and the filter is a plain /bin/cat,
#     which needs no extra dependencies.
#
# FIX_WORDS_DEPS lists the files that a rule using ${FIX_WORDS} should
# depend on.

ifneq "${LANG}" "voyn"
  SAMPLE_AWK := ${SAMPLE_DIR}/sample-fns.gawk
  SAMPLE_TBL := ${SAMPLE_DIR}/word-map.tbl
  EVT_FILE := ${SAMPLE_DIR}/source/main.evt
  FIX_WORDS_DEPS := ./fix-words dat/${SAMPLE_AWK} dat/${SAMPLE_TBL}
  FIX_WORDS := ./fix-words -f dat/${SAMPLE_AWK} \
    -v sample=${LANG}/${BOOK} -v table=dat/${SAMPLE_TBL} -v field=5
else
  EVT_FILE := ${SAMPLE_TOT_DIR}/raw.evt
  FIX_WORDS := /bin/cat
  FIX_WORDS_DEPS :=
endif
# Data files, all relative to "dat/" and located in the sample's tot.1
# directory:
#   TLW_FILE         input word list read by the replication rules;
#   REP_FILE         output of list-replicate-words with lineWidth=0;
#   REPFM_FILE       output of list-replicate-words with lineWidth=100;
#   REPFR_FILE       per-word replication counts, output of compute-freqs;
#   TEX_REP_SUMMARY  TeX summary written by summarize-rep-stats.
TLW_FILE := ${SAMPLE_TOT_DIR}/raw.tlw
REP_FILE := ${SAMPLE_TOT_DIR}/raw.rep
REPFM_FILE := ${SAMPLE_TOT_DIR}/raw.repfm
REPFR_FILE := ${SAMPLE_TOT_DIR}/raw.repfr
TEX_REP_SUMMARY := ${SAMPLE_TOT_DIR}/raw-rep-summary.tex
# "all" and "rep-list" are commands, not files.
.PHONY: all rep-list

# Every data rule below redirects its output straight into the target.
# Without this, a recipe that fails half-way leaves a truncated file whose
# timestamp is newer than its prerequisites, and the next run of make
# would treat it as up to date. With it, make removes the target of any
# recipe that exits with an error.
.DELETE_ON_ERROR:

# Top-level goal: build everything that "rep-list" stands for.
all: rep-list

# The two replication lists (unformatted and formatted), the table of
# replicated-word counts, and the TeX summary.
rep-list: dat/${REP_FILE} dat/${REPFM_FILE} dat/${REPFR_FILE} dat/${TEX_REP_SUMMARY}
          
# Unformatted list of word replications.
#
# The gawk filter keeps the records of the TLW file whose first field is
# "a" and prints their fields in the order 2, 1, 3. The result is piped
# through list-replicate-words with lineWidth=0, and the first ten lines
# of the new file are shown as a quick check.

# Passed as "context" to list-replicate-words; the frequency rule also
# uses it to locate a field in the output. Presumably the number of
# neighbouring words shown around each replication -- TODO confirm.
CONTEXT := 3

dat/${REP_FILE}: dat/${TLW_FILE} list-replicate-words ${MAKEFILE}
	@echo "$< -> $@"
	gawk '($$1 == "a"){ print $$2, $$1, $$3; }' < $< \
	  | list-replicate-words -v sep='_' -v context=${CONTEXT} -v lineWidth=0 \
	  > $@
	head -n 10 < $@
          
# Formatted list of word replications.
#
# Same pipeline as the unformatted list, except that list-replicate-words
# is given lineWidth=100 (presumably the width at which its output lines
# are laid out -- TODO confirm). The gawk filter keeps the TLW records
# whose first field is "a" and prints fields 2, 1, 3. The first ten lines
# of the result are shown as a quick check.

dat/${REPFM_FILE}: dat/${TLW_FILE} list-replicate-words ${MAKEFILE}
	@echo "$< -> $@"
	gawk '($$1 == "a"){ print $$2, $$1, $$3; }' < $< \
	  | list-replicate-words \
	      -v sep='_' \
	      -v context=${CONTEXT} \
	      -v lineWidth=100 \
	  > $@
	head -n 10 < $@
          
# Extract replicated words and count them
#
# From every non-empty line of the unformatted replication list, gawk
# prints field number 3 + 2*CONTEXT + 1 (field 10 for CONTEXT = 3), which
# is taken to be the replicated word. "sort | uniq -c" counts the
# occurrences of each word and "expand" turns tabs into spaces. The
# counts are then ordered by decreasing count (numeric key 1) and, among
# equal counts, by word (key 2), and passed to compute-freqs. The first
# ten lines of the result are shown as a quick check.
#
# The sort keys use the "-k" form. The older "+0 -1nr +1 -2" spelling of
# the same keys is obsolescent and is rejected by current versions of
# sort.

dat/${REPFR_FILE}: dat/${REP_FILE} \
             compute-freqs \
             ${MAKEFILE}
	@echo "$< -> $@"
	gawk -v ctx=${CONTEXT} \
	    'BEGIN { ctr = 3 + 2*ctx + 1; } /./ { print $$(ctr); }' \
	    < $< \
	  | sort | uniq -c | expand \
	  | sort -b -k1,1nr -k2,2 \
	  | compute-freqs \
	  > $@
	head -n 10 < $@

# TeX summary of the replication statistics.
#
# summarize-rep-stats is given only the language, the book and the
# literal subdirectory name "tot.1"; it is not handed the prerequisite
# files on its command line (presumably it locates them itself -- TODO
# confirm). Its output is shown on the terminal and then passed to
# update-paper-include together with the matching path under exp/.

dat/${TEX_REP_SUMMARY}: dat/${REPFR_FILE} dat/${TLW_FILE} ${MAKEFILE} summarize-rep-stats
	@echo "$< -> $@"
	summarize-rep-stats ${LANG} ${BOOK} tot.1 > $@
	cat $@
	update-paper-include $@ exp/${TEX_REP_SUMMARY}

# Closes: ifneq "${BOOK}" "BOOK.IS.UNDEFINED"
endif
# Closes: ifneq "${LANG}" "LANG.IS.UNDEFINED"
endif
# End of ${LANG}/${BOOK} recursion
######################################################################