#! /usr/bin/gawk -f
# Last edited on 2004-09-13 19:18:37 by stolfi

BEGIN {

  abort = -1;
  usage = ( ARGV[0] " [-v wsep=CHAR] [-v maxlen=NUM] < INFILE > OUTFILE" );
  
  # Reads a text, one word per line.
  # Outputs all repeated maximal substrings.

  if (maxlen == "") { maxlen = 30; }
  if (wsep == "") { wsep = "."; }
  
  nstates = 0; # Number of states
  
  # Indexed by state:
  split("", count); # Number of input substrings leading to this state.
  split("", chars); # Characters leading out from this state.
  
  # Indexed by state and char:
  split("", nextst);  # Next state.
  
  curlen = 0;  # Number of currently active states.
  # Indexed [0..curlen-1]
  split("", active);   # {active[k]} is state spelled by last {k} characters.
  
  # Set up root state:
  root = nstates;
  nstates++;
  count[root] = 0;
  lpath[root] = 0;
  
  # Append the root as an active state:
  active[curlen] = root;
  curlen++;
  count[root]++;
  
  # Statistics
  nwords = 0; # Words read
  nchars = 0; # Characters processed (incl. word seps)
}

/./ { 
  if (nwords > 0) { process(wsep); }
  wd = $0;
  nwd = length(wd)
  for (i = 1; i <= nwd; i++) { process(substr(wd,i,1)); }
  nwords++;
}

END {
  printf "# %d words read \n", nwords;
  printf "# %d chars processed \n", nchars;
  printf "# %d states created \n", nstates;
  # Spit out states with count higher than 1: 
  printreps(root, "");
}

function process(ch,    len,st,stnew)
{
  nchars++;
  # Append {ch} to all current states:
  for (len = curlen-1; len >= 0; len--)
    { st = active[len];
      if (! ((st,ch) in nextst))
        { stnew = nstates; 
          nextst[st,ch] = stnew;
          count[stnew] = 0;
          chars[stnew] = "";
          chars[st] = (chars[st] ch);
          nstates++;
        }
      else
        { stnew = nextst[st,ch]; }
      active[len+1] = stnew;
      count[stnew]++;
    }
  # Increment {curlen} until it reaches {maxlen}:
  if (curlen < maxlen) { curlen++; }
  # Set the root as an active state:
  count[root]++;
}

function printreps(st,pre,  chs,nchs,i,ch,stnew)
{
  if (count[st] > 1) 
    { chs = chars[st];
      nchs = length(chs);
      if (nchs != 1) { printf "%9d %3d %s\n", count[st], length(pre), pre; } 
      for (i = 1; i <= nchs; i++)
        { ch = substr(chs,i,1);
          stnew = nextst[st,ch];
          printreps(stnew, (pre ch));
        }
    }
}