#! /usr/bin/gawk -f
# Last edited on 2016-05-09 12:15:33 by stolfilocal

# Ranks how similar a base text is to a set of candidate texts.
# Claimed to identify authors.
# Original version (in perl) by Scott Roberts, 2016.
# Converted to gawk by Jorge Stolfi, 2016.
#
# All parameters are given on the command line with "-v NAME=VALUE":
# {baseText}, {candTexts}, {nRead}, {omitPunct}, {debug}.  One line per
# candidate text is written to standard output; diagnostics and the
# best match are written to standard error.

BEGIN \
  {
    abort = -1;

    # Usage summary printed by {arg_error}.  Must be set before any
    # argument check, since {arg_error} exits immediately:
    usage = "gawk -f PROGRAM -v baseText=NAME -v candTexts=NAME[:SKIP],...";
    usage = (usage " -v nRead=NUMBER -v omitPunct=FLAG -v debug=FLAG");

    # Check command-line arguments:
    if ((ARGV[1] == "--info") || (ARGV[1] == "--help") || (ARGV[1] == "-h"))
      { print_info(); exit(0); }

    if (baseText == "") { arg_error("must define {baseText}"); }

    if (candTexts == "") { arg_error("must define {candTexts}"); }
    gsub(/[ ]/, "", candTexts);
    nCands = split(candTexts, candText, ",");
    if (nCands <= 0) { arg_error("invalid {candTexts}"); }

    if (nRead == "") { arg_error("invalid {nRead}"); }
    nRead += 0; # Ensure it is numeric.

    if (omitPunct == "") { arg_error("invalid {omitPunct}"); }
    omitPunct += 0; # Ensure it is numeric.

    if (debug == "") { arg_error("invalid {debug}"); }
    debug += 0; # Ensure it is numeric.

    # Counts occurrences of each word in each text.  Indexed by the word.
    split("", baseCount); # Word counts in the base text.

    # Get the word counts from the base text:
    printf "using %d tokens from each text\n", nRead > "/dev/stderr";
    baseNW = read_text(baseText, nRead, omitPunct, debug, baseCount);

    # Process the candidate texts; the smallest score wins:
    bestScore = 1.0e10;
    bestText = "NONE";
    for (kF = 1; kF <= nCands; kF++)
      {
        split("", candCount); # Word counts in the candidate text.
        candNW = read_text(candText[kF], nRead, omitPunct, debug, candCount);
        score = compute_score_scott(baseCount, candCount, nRead);
        printf "text = %s words = %d score = %10.3f\n", candText[kF], candNW, score > "/dev/stderr";
        printf "%10.3f %6d %s\n", score, candNW, candText[kF];
        if (score < bestScore) { bestScore = score; bestText = candText[kF]; }
      }
    printf "\n" > "/dev/stderr";
    printf "best match = %s score = %10.3f\n", bestText, bestScore > "/dev/stderr";
    fflush("/dev/stdout");
  }

function compute_score_scott(baseCount,candCount,nRead,   w,sum,baseN,candN,nt,ns)
  {
    # The Scott-Roberts dissimilarity score between two word-count
    # tables.  For every word {w} of {baseCount}, with count {p} there
    # and count {q} in {candCount} ({q = 0.25} if the word is absent),
    # adds {log10(p/q)} to the score, but only when {p > q}.
    #
    # Returns the sum.  Prints to stderr how many words contributed a
    # term and how many were skipped.  The {nRead} parameter is not
    # used; it is kept so that existing calls remain valid.

    sum = 0; # Sum of natural logs of the ratios.
    nt = 0;  # Number of terms included.
    ns = 0;  # Number of terms skipped.
    for (w in baseCount)
      {
        baseN = baseCount[w] + 0.0;
        # Test membership first: merely referencing {candCount[w]}
        # would create that entry in the caller's array.
        candN = ((w in candCount) ? candCount[w] + 0.0 : 0.0);
        if (candN < 1) { candN = 0.25; }
        if (baseN > candN)
          { sum += log(baseN/candN); nt++; }
        else
          { ns++; }
      }
    printf "added %d terms skipped %d terms\n", nt, ns > "/dev/stderr";
    # {log} is the natural logarithm, so convert to base 10 by
    # dividing by {log(10)}:
    return sum/log(10);
  }

function read_text(text,nRead,omitPunct,debug,count,   fname,nSkip,nTot,nDist,nlin,lin,nfld,fld,w,nch,status)
  {
    # Reads {nRead} tokens from file "in/{text}/main.wds".
    # If {text} ends with ":{nSkip}", skips that many tokens before
    # reading {nRead}.
    #
    # Only lines starting with "a" or "p" are tokens; other lines are
    # ignored.  Each token line must have exactly two fields: the tag
    # and the word.  If {omitPunct} is true, tokens with tag "p" are
    # consumed but not counted.  Words are lowercased, and their
    # occurrence counts are accumulated in {count}, which the caller
    # should clear beforehand.
    #
    # Returns the number of distinct words added to {count}.  Aborts
    # the program if the file cannot be read, has a malformed token
    # line, or has fewer than {nSkip + nRead} tokens.

    # Extract the number of tokens to skip, if any:
    if (match(text, /[:][0-9]+$/))
      { nSkip = substr(text, RSTART+1) + 0;
        text = substr(text, 1, RSTART-1);
      }
    else
      { nSkip = 0; }

    # Assemble the file name:
    fname = ("in/" text "/main.wds");
    if (debug) { printf "reading file %s\n", fname > "/dev/stderr"; }

    # Read tokens from file:
    nTot = 0;   # Tokens read from file, including skipped ones.
    nDist = 0;  # Number of distinct words.
    nlin = 0;   # Lines read from file.
    nch = 0;    # If debugging, number of characters in current debug line.
    status = 0; # Result of the last {getline}: 1 = line, 0 = EOF, -1 = error.
    while ((nTot < nSkip + nRead) && ((status = (getline lin < fname)) > 0))
      {
        nlin++;
        if (match(lin, /^[ap]/))
          {
            nfld = split(lin, fld, " ");
            # printf "%6d %d [%s] [%s]\n", nlin, nfld, fld[1], fld[2] > "/dev/stderr";
            if ((nfld != 2) || ((fld[1] != "a") && (fld[1] != "p")))
              { tbl_error(fname, nlin, ("bad file entry = \"" lin "\"")); }
            if ((nTot >= nSkip) && ((fld[1] == "a") || (! omitPunct)))
              {
                w = tolower(fld[2]);
                if (! (w in count)) { nDist++; }
                count[w]++;
                if (debug)
                  {
                    # Echo the word, breaking lines at about 72 columns:
                    if (nch > 0) { printf " " > "/dev/stderr"; nch++; }
                    if (nch + length(w) > 72) { printf "\n" > "/dev/stderr"; nch = 0; }
                    printf "%s", w > "/dev/stderr";
                    nch += length(w);
                  }
              }
            # Skipped and omitted tokens still count towards the total:
            nTot++;
          }
      }
    if (debug) { printf "\n" > "/dev/stderr"; }

    # Check for I/O errors (including a file that could not be opened).
    # The {getline} status is used rather than {ERRNO} alone, because
    # {ERRNO} may hold a stale message from some earlier operation:
    if (status < 0)
      { tbl_error(fname, nlin, ((ERRNO != "") ? ERRNO : "read error")); }
    close (fname);

    # An empty file gets its own message, before the generic count check:
    if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
    if (nTot != nSkip + nRead)
      { tbl_error(fname, nlin, ("only " nTot " words found")); }

    if (debug)
      {
        # Print summary:
        printf "read %d lines %d words (%d distinct)", nlin, nTot - nSkip, nDist > "/dev/stderr";
        printf " after skipping %d\n", nSkip > "/dev/stderr";
      }
    return nDist;
  }

function arg_error(msg)
  {
    # Reports a command-line argument error {msg} with the usage
    # summary (global {usage}) on stderr, and exits with status 1.
    printf "%s\n", msg > "/dev/stderr";
    printf "usage: %s\n", usage > "/dev/stderr";
    abort = 1;
    exit 1
  }

function tbl_error(f,n,msg)
  {
    # Reports error {msg} at line {n} of file {f} on stderr, and exits
    # with status 1.
    printf "%s:%d: %s\n", f, n, msg > "/dev/stderr";
    abort = 1;
    exit 1
  }

function data_error(msg)
  {
    # Reports error {msg} at the current record of the main input
    # ({FILENAME}, {FNR}) on stderr, and exits with status 1.
    printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
    abort = 1;
    exit 1
  }

function print_info()
  {
    # Prints the program's documentation to stderr.
    printf "PURPOSE\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "This program takes a base text and computes the\n" > "/dev/stderr";
    printf "ScottRoberts word-entropy difference between it\n" > "/dev/stderr";
    printf "and a set of given candidate texts.\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "ARGUMENTS\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "The user must provide certain arguments in the command line,\n" > "/dev/stderr";
    printf "with the syntax \"-v PARAMETER=VALUE\".\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf " -v nRead=NUMBER the number of tokens to read from each text.\n" > "/dev/stderr";
    printf " -v baseText=NAME the name of the base text.\n" > "/dev/stderr";
    printf " -v candTexts=NAMES a list of comma-separated candidate text names.\n" > "/dev/stderr";
    printf " -v omitPunct=FLAG if nonzero, punctuation tokens (tag \"p\") are not counted.\n" > "/dev/stderr";
    printf " -v debug=FLAG if nonzero, prints the file names and words read to stderr.\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "A token is one occurrence of a word in the text. Each\n" > "/dev/stderr";
    printf "text name may be followed by \":\" and a number that specifies\n" > "/dev/stderr";
    printf "the number of tokens to skip at the beginning of the text.\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "INPUTS\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "The text with a given NAME is read from file \"in/NAME/main.wds\".\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "Each input file should be in the \".wds\"\n" > "/dev/stderr";
    printf "format. Namely, each line should contain one word or punctuation,\n" > "/dev/stderr";
    printf "preceded by the tag \"a\" or \"p\", respectively, and a\n" > "/dev/stderr";
    printf "blank space. Lines that do not start with \"a\" or \"p\"\n" > "/dev/stderr";
    printf "are ignored.\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "OUTPUT\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "The program writes to standard output one line for each\n" > "/dev/stderr";
    printf "candidate text, containing the score, the number of\n" > "/dev/stderr";
    printf "distinct words, and the text's name.\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "SCORING FORMULA\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "The ScottRoberts score is: for each word in the base file, divide\n" > "/dev/stderr";
    printf "its count {p} by the word count from the candidate file {q} and\n" > "/dev/stderr";
    printf "then take the log base 10 of the ratio. If the word was not found\n" > "/dev/stderr";
    printf "in the current file, assume {q=0.25}. Do this only if {p > q}. Sum\n" > "/dev/stderr";
    printf "for all words.\n" > "/dev/stderr";
  }