#! /usr/bin/gawk -f
# Last edited on 2016-05-09 12:15:33 by stolfilocal

BEGIN \
  {
    # Ranks how similar a base text is to each of several candidate texts. 
    # Claimed to identify authors.
    # Original version (in perl) by Scott Roberts, 2016.
    # Converted to gawk by Jorge Stolfi, 2016.
    #
    # All parameters are expected as "-v NAME=VALUE" assignments:
    # {baseText}, {candTexts}, {nRead}, {omitPunct}, {debug}.
    # One line per candidate is written to standard output; all
    # diagnostics go to standard error.
    
    abort = -1;
    
    # Usage summary printed by {arg_error}.  A value supplied from
    # outside (e.g. with "-v usage=...") is respected.
    if (usage == "")
      { usage = (ARGV[0] " -f SCRIPT -v baseText=NAME -v candTexts=NAME[:SKIP],... -v nRead=NUMBER -v omitPunct=0|1 -v debug=0|1"); }
    
    # Check command-line arguments:
    if ((ARGV[1] == "--info") || (ARGV[1] == "--help") || (ARGV[1] == "-h"))
      { print_info(); exit(0); }

    if (baseText == "") { arg_error("must define {baseText}"); }

    if (candTexts == "") { arg_error("must define {candTexts}"); }
    gsub(/[ ]/,"",candTexts);
    nCands = split(candTexts, candText, ",");
    if (nCands <= 0) { arg_error("invalid {candTexts}"); }
    # An empty list entry (as in "a,,b") would yield the bogus file "in//main.wds":
    for (kF = 1; kF <= nCands; kF++) 
      { if (candText[kF] == "") { arg_error("empty text name in {candTexts}"); } }
    
    if (nRead == "") { arg_error("must define {nRead}"); }
    nRead += 0; # Ensure it is numeric.
    # A zero, negative, fractional or non-numeric count cannot be honored by {read_text}:
    if ((nRead < 1) || (nRead != int(nRead))) { arg_error("invalid {nRead}"); }

    if (omitPunct == "") { arg_error("must define {omitPunct}"); }
    omitPunct += 0; # Ensure it is numeric.

    if (debug == "") { arg_error("must define {debug}"); }
    debug += 0; # Ensure it is numeric.

    # Counts occurrences of each word in each text. Indexed by the word.
    split("", baseCount); # Word counts in the base text.
    
    # Get the word counts from the base text:
    printf "using %d tokens from each text\n", nRead > "/dev/stderr";
    baseNW = read_text(baseText, nRead, omitPunct, debug, baseCount);
    
    # Process the candidate texts; the lowest score is the best match:
    bestScore = 1.0e10;
    bestText = "NONE";
    for (kF = 1; kF <= nCands; kF++) 
      { split("", candCount); # Word counts in the candidate text.
        candNW = read_text(candText[kF], nRead, omitPunct, debug, candCount); 
        score = compute_score_scott(baseCount, candCount, nRead);
        printf "text = %s  words = %d  score = %10.3f\n", candText[kF], candNW, score > "/dev/stderr";
        printf "%10.3f %6d %s\n", score, candNW, candText[kF];
        if (score < bestScore) { bestScore = score; bestText = candText[kF]; }
      }
    printf "\n"  > "/dev/stderr";
    printf "best match = %s  score = %10.3f\n", bestText, bestScore > "/dev/stderr";
    fflush("/dev/stdout");
  }

function compute_score_scott(baseCount,candCount,nRead,  w,sum,baseN,candN,nt,ns)
  { 
    # Returns the Scott-Roberts dissimilarity score between two word
    # count tables (lower means more similar).
    #
    #   {baseCount}  word counts of the base text, indexed by word.
    #   {candCount}  word counts of the candidate text, indexed by word.
    #                It is only read, never modified.
    #   {nRead}      unused; kept so that existing callers keep working.
    #
    # For each word of {baseCount} with count {p}, let {q} be its count
    # in {candCount}, or 0.25 if that count is missing or below 1.  The
    # score is the sum of log10(p/q) over the words with {p > q}.
    # Also reports the number of terms added and skipped on standard error.

    sum = 0; # Sum of natural logs of the ratios.
    nt = 0;  # Number of terms included.
    ns = 0;  # Number of terms skipped.
    for (w in baseCount)
      { baseN = baseCount[w] + 0.0;
        # Test membership first: merely referencing {candCount[w]} would
        # create that element in the caller's array.
        candN = ((w in candCount) ? candCount[w] + 0.0 : 0);
        if (candN < 1) { candN = 0.25; }
        if (baseN > candN) 
          { sum += log(baseN/candN); nt++; }
        else
          { ns++; }
      }
    printf "added %d terms  skipped %d terms\n", nt, ns > "/dev/stderr";
    # {log} is the natural logarithm; log10(x) = log(x)/log(10).
    return sum/log(10);
  }

function read_text(text,nRead,omitPunct,debug,count,  fname,nSkip,nTot,nDist,nlin,lin,nfld,fld,w,nch,status)
  { 
    # Reads {nRead} tokens from file "in/{text}/main.wds" and tallies
    # them, lowercased, into the array {count} (indexed by word).
    # If {text} ends with ":{nSkip}", skips that many tokens before 
    # reading {nRead}.
    #
    # Only lines starting with "a" or "p" are tokens; each must have
    # exactly two fields, the tag ("a" or "p") and the word.  If
    # {omitPunct} is true, "p" tokens are not tallied, but they still
    # count toward {nSkip} and {nRead}.  If {debug} is true, the tallied
    # words and a summary are printed to standard error.
    #
    # Returns the number of distinct words added to {count}.  Aborts
    # the program on read errors, malformed lines, or if the file has
    # fewer than {nSkip + nRead} tokens.

    # Extract the number of tokens to skip, if any:
    if (match(text,/[:][0-9]+$/))
      { nSkip = substr(text,RSTART+1) + 0; 
        text = substr(text,1,RSTART-1);
      }
    else
      { nSkip = 0; }
      
    # Assemble the file name:
    fname = ("in/" text "/main.wds");
    if (debug) { printf "reading file %s\n", fname > "/dev/stderr"; }
    
    # Read tokens from file:
    nTot = 0;   # Tokens read from file, including skipped ones.
    nDist = 0;  # Number of distinct words.
    nlin = 0;   # Lines read from file.
    nch = 0;    # If debugging, number of characters in current debug line.
    status = 1; # Result of the last {getline}: 1 = line read, 0 = EOF, -1 = error.
    while (nTot < nSkip + nRead)
      { status = (getline lin < fname);
        if (status <= 0) { break; }
        nlin++;
        if (match(lin, /^[ap]/))
          { nfld = split(lin, fld, " ");
            if ((nfld != 2) || ((fld[1] != "a") && (fld[1] != "p")))
              { tbl_error(fname, nlin, ("bad file entry = \"" lin "\"")); }
            if ((nTot >= nSkip) && ((fld[1] == "a") || (! omitPunct))) 
              { w = tolower(fld[2]);
                if (! (w in count)) { nDist++; }
                count[w]++;
                if (debug)
                  { # Wrap the debug listing at about 72 columns, without trailing blanks:
                    if (nch > 0)
                      { if (nch + 1 + length(w) > 72) 
                          { printf "\n" > "/dev/stderr"; nch = 0; }
                        else
                          { printf " " > "/dev/stderr"; nch++; }
                      }
                    printf "%s", w > "/dev/stderr"; nch += length(w);
                  }
              }
            nTot++;
          }
      }
    if (debug) { printf "\n" > "/dev/stderr"; }

    # Check for I/O errors, using the {getline} status rather than the
    # global {ERRNO}, which is never cleared after an earlier failure:
    if (status < 0) 
      { tbl_error(fname, nlin, ((ERRNO != "") ? ERRNO : "read error")); }
    close (fname);
    if (nTot != nSkip + nRead) 
      { if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
        tbl_error(fname, nlin, ("only " nTot " words found"));
      }
    
    if (debug)
      { # Print summary:
        printf "read %d lines %d words (%d distinct)", nlin, nTot - nSkip, nDist > "/dev/stderr";
        printf " after skipping %d\n", nSkip > "/dev/stderr";
      }
    return nDist;
  }

function arg_error(msg)
  { 
    # Reports a command-line argument error {msg} on standard error,
    # followed by a usage hint, then sets the global {abort} flag and
    # terminates the program with exit status 1.
    printf "%s\n", msg > "/dev/stderr";
    if (usage != "")
      { printf "usage: %s\n", usage > "/dev/stderr"; }
    else
      { # No usage summary was defined; avoid printing an empty "usage:" line.
        printf "run with \"--info\" for a description of the required \"-v\" arguments\n" > "/dev/stderr";
      }
    abort = 1;
    exit 1
  }

function tbl_error(f,n,msg)
  { 
    # Reports problem {msg} found at line {n} of file {f}, in the
    # "FILE:LINE: MESSAGE" format, on standard error.  Then raises the
    # global {abort} flag and ends the program with status 1.
    printf("%s:%d: %s\n", f, n, msg) > "/dev/stderr";
    abort = 1;
    exit(1);
  }

function data_error(msg)
  { 
    # Reports problem {msg} on standard error, tagged with the current
    # main input file {FILENAME} and its record number {FNR}.  Then
    # raises the global {abort} flag and ends the program with status 1.
    printf("%s:%d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
    abort = 1;
    exit(1);
  }

function print_info()
  { 
    # Prints the program's manual (purpose, arguments, input and output
    # formats, scoring formula) to standard error.
    printf "PURPOSE\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "This program takes a base text and computes the\n" > "/dev/stderr";
    printf "ScottRoberts word-entropy difference between it\n" > "/dev/stderr";
    printf "and a set of given candidate texts.\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "ARGUMENTS\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "The user must provide certain arguments in the command line,\n" > "/dev/stderr";
    printf "with the syntax \"-v PARAMETER=VALUE\".\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "  -v nRead=NUMBER   the number of tokens to read from each text.\n" > "/dev/stderr";
    printf "  -v baseText=NAME   the name of the base text.\n" > "/dev/stderr";
    printf "  -v candTexts=NAMES  a list of comma-separated candidate text names.\n" > "/dev/stderr";
    printf "  -v omitPunct=BOOL  if nonzero, punctuation tokens are not tallied.\n" > "/dev/stderr";
    printf "  -v debug=BOOL   if nonzero, prints diagnostics to standard error.\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "A token is one occurrence of a word or punctuation in the text.\n" > "/dev/stderr";
    printf "Punctuation tokens count toward NUMBER even when they are omitted\n" > "/dev/stderr";
    printf "from the tally.  Each\n" > "/dev/stderr";
    printf "text name may be followed by \":\" and a number that specifies\n" > "/dev/stderr";
    printf "the number of tokens to skip at the beginning of the text.\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "INPUTS\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "The text with a given NAME is read from file \"in/NAME/main.wds\".\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "Each input file should be in the \".wds\"\n" > "/dev/stderr";
    printf "format.  Namely, each line should contain one word or punctuation,\n" > "/dev/stderr";
    printf "preceded by the tag \"a\" or \"p\", respectively, and a\n" > "/dev/stderr";
    printf "blank space.  Lines that do not start with \"a\" or \"p\"\n" > "/dev/stderr";
    printf "are ignored.\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "OUTPUT\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "The program writes to standard output one line for each\n" > "/dev/stderr";
    printf "candidate text, containing the score, the number of\n" > "/dev/stderr";
    printf "distinct words, and the text's name.\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "SCORING FORMULA\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";
    printf "The ScottRoberts score is: for each word in the base file, divide\n" > "/dev/stderr";
    printf "its count {p} by the word count from the candidate file {q} and\n" > "/dev/stderr";
    printf "then take the log base 10 of the ratio. If the word was not found\n" > "/dev/stderr";
    printf "in the current file, assume {q=0.25}. Do this only if {p > q}. Sum\n" > "/dev/stderr";
    printf "for all words.\n" > "/dev/stderr";
  }