# Last edited on 2025-05-02 17:16:16 by stolfi 
# Compute the duplicate word lists for a language sample

# Name of this makefile, listed as a prerequisite of every generated file
# so that editing the makefile forces a rebuild.  Derived from
# ${MAKEFILE_LIST} instead of being hard-coded, so it stays correct if the
# file is ever renamed.
MAKEFILE := $(lastword ${MAKEFILE_LIST})

######################################################################
# Makefile for computing the duplicate word lists for a specified
# language sample. Caller must define 
#   ${LANG} = "voyn", "chin", etc.;
#   ${BOOK} = "wow", "vms", etc.
#
# Guard: the body of this makefile is active only when the caller overrides
# ${LANG} (command-line assignments take precedence over these ":=" defaults,
# so the sentinel value survives only if the caller defined nothing).
LANG := LANG.IS.UNDEFINED
ifneq "${LANG}" "LANG.IS.UNDEFINED"

# Same guard for the ${BOOK} sample name.
BOOK := BOOK.IS.UNDEFINED
ifneq "${BOOK}" "BOOK.IS.UNDEFINED"

# Directory of this language/book sample, and of its whole-sample ("tot.1")
# aggregation, relative to the data root.
SAMPLE_DIR := ${LANG}/${BOOK}
SAMPLE_TOT_DIR := ${LANG}/${BOOK}/tot.1

# Personal script directory.  NOTE(review): ${BIN} is not referenced in the
# visible part of this file -- possibly used by an including makefile, or
# vestigial; confirm before removing.
BIN := ${STOLFIHOME}/bin

# Select the interlinear (EVT) source file and the word-cleanup filter for
# this sample.  Voynichese ("voyn") samples use the raw token file as-is,
# with /bin/cat as a no-op filter; every other language reads the main EVT
# file and cleans words through fix_words.gawk, driven by a per-sample
# function library and word-mapping table.

ifeq (${LANG},voyn)
  # Voynichese: raw file, no cleanup needed.
  EVT_FILE := ${SAMPLE_TOT_DIR}/raw.evt
  FIX_WORDS := /bin/cat
  FIX_WORDS_DEPS :=
else
  # Other languages: per-sample cleanup machinery.
  EVT_FILE := ${SAMPLE_DIR}/source/main.evt
  SAMPLE_AWK := ${SAMPLE_DIR}/sample-fns.gawk
  SAMPLE_TBL := ${SAMPLE_DIR}/word-map.tbl
  FIX_WORDS := ./fix_words.gawk \
    -f dat/${SAMPLE_AWK} \
    -v sample=${SAMPLE_DIR} \
    -v table=dat/${SAMPLE_TBL} \
    -v field=5
  FIX_WORDS_DEPS := ./fix_words.gawk dat/${SAMPLE_AWK} dat/${SAMPLE_TBL}
endif
# Derived data files, all relative to the dat/ (and tex/) directories:
#   TLW_FILE        - token list (input of the duplicate-word extraction).
#   DUP_FILE        - duplicate-word occurrences, unformatted (lineWidth=0).
#   DUPFM_FILE      - duplicate-word occurrences, formatted (lineWidth=100).
#   DUPFR_FILE      - duplicated words with counts and frequencies.
#   TEX_DUP_SUMMARY - TeX summary of the duplication statistics.
TLW_FILE := ${SAMPLE_TOT_DIR}/raw.tlw
DUP_FILE := ${SAMPLE_TOT_DIR}/raw.dup
DUPFM_FILE := ${SAMPLE_TOT_DIR}/raw.dupfm
DUPFR_FILE := ${SAMPLE_TOT_DIR}/raw.dupfr
TEX_DUP_SUMMARY := ${SAMPLE_TOT_DIR}/raw-dup-summary.tex

# The recipes below redirect straight into their targets; delete the target
# when a recipe fails, so a truncated output file is not mistaken for an
# up-to-date one on the next run.
.DELETE_ON_ERROR:

.PHONY: all dup-list

all: dup-list

dup-list: dat/${DUP_FILE} dat/${DUPFM_FILE} dat/${DUPFR_FILE} dat/${TEX_DUP_SUMMARY}
# List word duplications (unformatted)

# Number of context words printed on each side of a duplication.
CONTEXT := 3

# The gawk prefilter keeps only the records whose first field is "a" and
# reorders the fields to {$2, $1, $3} for list-duplicate-words.
# NOTE(review): list-duplicate-words is listed as a plain-file prerequisite
# in the current directory but invoked without a path -- it must also be
# reachable through ${PATH}; confirm.
dat/${DUP_FILE}: dat/${TLW_FILE} \
             list-duplicate-words \
             ${MAKEFILE}
	@echo "$< -> $@"
	gawk '($$1 == "a"){ print $$2, $$1, $$3; }' $< \
	  | list-duplicate-words \
	      -v sep='_' \
	      -v context=${CONTEXT} \
	      -v lineWidth=0 \
	  > $@
	head -10 $@
# List word duplications (formatted)

# Same pipeline as the unformatted list above, but with lineWidth=100 so
# list-duplicate-words wraps its output for human reading.
dat/${DUPFM_FILE}: dat/${TLW_FILE} \
             list-duplicate-words \
             ${MAKEFILE}
	@echo "$< -> $@"
	gawk '($$1 == "a"){ print $$2, $$1, $$3; }' $< \
	  | list-duplicate-words \
	      -v sep='_' \
	      -v context=${CONTEXT} \
	      -v lineWidth=100 \
	  > $@
	head -10 $@
# Extract duplicated words and count them

# From the unformatted duplication list, select the duplicated word itself
# (field number 3 + 2*CONTEXT + 1, matching the layout produced by
# list-duplicate-words with context=${CONTEXT}), count the occurrences of
# each word, sort by decreasing count, and let compute-freqs append the
# relative frequencies.
dat/${DUPFR_FILE}: dat/${DUP_FILE} \
             compute-freqs \
             ${MAKEFILE}
	@echo "$< -> $@"
	gawk -v ctx=${CONTEXT} \
	    ' BEGIN {ctr = 3 + 2*ctx + 1; } \
	      /./{ print $$(ctr); } \
	    ' $< \
	  | sort | uniq -c | expand \
	  | sort -b -k1nr -k2 \
	  | compute-freqs \
	  > $@
	head -10 $@

# Compute frequency of duplications

# Build the TeX summary table for the sample, show it, and install it into
# the paper's tex/ directory.
# NOTE(review): update_paper_include.sh writes tex/${TEX_DUP_SUMMARY}, a
# file other than this rule's target -- that product is invisible to make's
# dependency graph; consider giving it its own rule.
dat/${TEX_DUP_SUMMARY}: dat/${DUPFR_FILE} dat/${TLW_FILE} \
             ${MAKEFILE} \
             summarize_dup_stats.csh
	@echo "$< -> $@"
	summarize_dup_stats.csh ${LANG} ${BOOK} tot.1 > $@
	cat $@
	update_paper_include.sh $@ tex/${TEX_DUP_SUMMARY}

endif
# (closes the ${BOOK} guard)
endif
# (closes the ${LANG} guard)
# End of ${LANG}/${BOOK} recursion
######################################################################