#! /usr/bin/gawk -f
# Last edited on 2004-09-26 10:38:26 by stolfi

BEGIN {
  # Reads a list of tokens, one per line; each token must be non-empty
  # and contain no embedded spaces. Writes one line per distinct word,
  # in the format
  #
  #    "{COUNT} {FIRST} {LAST} {DIFF} {WORD}"
  #
  # where {COUNT} is the number of occurrences of the word, {FIRST} and
  # {LAST} are the positions of its first and last occurrences
  # (counting from 0), and {DIFF} is {LAST} - {FIRST}.

  # Per-word tables, all indexed by the word itself. The split() calls
  # leave each one as an empty array.
  split("", ct);   # number of occurrences
  split("", fo);   # position of first occurrence
  split("", lo);   # position of last occurrence

  # Exit status requested by an error routine; negative means
  # "no error so far".
  abort = -1;

  # Command-line summary printed by arg_error.
  usage = ARGV[0] " < INPUT.wds > OUTPUT.flo";
}

# Once an error status has been recorded in `abort`, stop at the next record.
abort >= 0 { exit abort }

# Tally one token per input line.
#
# Only lines holding exactly one field are accepted. Lines that are
# blank, contain only whitespace, or carry more than one field do not
# match here and fall through to the catch-all rule that reports
# "bad word". (The former pattern /./ let a whitespace-only line be
# counted as the empty word, and silently dropped every field after
# the first on a multi-word line.)
#
# Updates, for the word wd:
#   ct[wd]  number of occurrences seen so far
#   fo[wd]  zero-based position of the first occurrence
#   lo[wd]  zero-based position of the most recent occurrence
NF == 1 {
  wd = $1;
  pos = FNR - 1;   # FNR counts from 1; reported positions count from 0
  if (! (wd in ct)) { ct[wd] = 0; fo[wd] = pos; }
  lo[wd] = pos;
  ct[wd]++;
  next
}

# Catch-all: any record that reaches this rule is reported as an error.
{
  data_error("bad word");
}

END {
  # An error status was recorded earlier: exit with it and print nothing.
  if (abort >= 0) { exit abort; }

  # One output line per distinct word: COUNT FIRST LAST DIFF WORD.
  # The order of a for-in traversal is not specified by awk.
  for (wd in fo) {
    span = lo[wd] - fo[wd];
    printf("%7d %7d %7d %7d %s\n", ct[wd], fo[wd], lo[wd], span, wd);
  }
}

# Prints msg and the usage summary on standard error, records exit
# status 1 in `abort`, and terminates with that status.
function arg_error(msg)
{
  printf("** %s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit abort;
}

# Prints msg on standard error, prefixed by the current input file name
# and line number, records exit status 1 in `abort`, and terminates
# with that status.
function data_error(msg)
{
  printf("%s:%d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}