#! /bin/gawk -f 
# Last edited on 2004-07-24 03:04:32 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
    "list-duplicate-words \\\n" \
    "   [ -v sep=SEPSTRING ] \\\n" \
    "   [ -v context=NUM ] \\\n" \
    "   [ -v lineWidth=NUM ] \\\n" \
    "   < INFILE.wds > OUTFILE.txt" \
  );
  
  # Reads a list of words, one per line, in the format
  # 
  #   LOC TYPE WORD
  # 
  # where LOC is a line locator, WORD is any non-empty string, and
  # TYPE an integer indicating its type (see evt-to-wds).
  # 
  # Each WORD is considered to be `delim' if TYPE = 1, `plain'
  # otherwise. Assumes an empty delim word between any two adjacent
  # plain words; and joins together any string of two or more
  # consecutive delim words.
  #  
  # Prints out a file with lines of the form
  #
  #   LOC  LWDn LSPn ... LWD1 LSP1 WORD SEP WORD RSP1 RWD1 ... RSPn RWDn 
  # 
  # where WORD is a plain word that occurs repeated in the file,
  # separated by SEP; and LWDi, LSPi, RSPj, RWDj are the adjacent
  # words and separators. The LOC field is the locator recorded for
  # the first of the two WORD occurrences. The number "n" of context
  # words is defined by the "context" parameter.
  # NOTE(review): older documentation listed "LOC LPOS FPOS" here, but
  # only the single LOC field of the input is stored and printed.
  # 
  # If "lineWidth" is not given or zero, the fields are printed in 
  # free format; if any delim field ("LSPi", SEP, or "RSPj") is empty,  
  # it is replaced by "sep" (default "_").  If "lineWidth" is nonzero,
  # the output is spaced and padded so that the locator fields are
  # aligned at the left margin, and the SEPs are aligned 
  # and centered in the remaining columns; and empty
  # delims are omitted.
  # 
  if (sep == "") { sep = "_"; }
  if (context == "") { context = 3; }
  if (lineWidth == "") { lineWidth = 0; }

  # Both numeric parameters feed the ring-buffer index arithmetic and
  # the padding computation, so reject anything that is not a plain
  # non-negative integer instead of silently misbehaving.
  # ("arg_error" exits; the END rule then returns the "abort" status.)
  if (context !~ /^[0-9]+$/)
    { arg_error("bad \"context\" = [" context "]: must be a non-negative integer"); }
  if (lineWidth !~ /^[0-9]+$/)
    { arg_error("bad \"lineWidth\" = [" lineWidth "]: must be a non-negative integer"); }
  context += 0;    # Force numeric type from here on.
  lineWidth += 0;
  
  # The words are in "buf[0..nbuf-1]".
  # Their original positions are "pos[0..nbuf-1]".
  # Adjacent delims are already concatenated. 
  # Synthetic delims are "" for now (turned "sep" on printout).
  split("", buf);
  nbuf = 4*context+3;  # Size of token buffer (plain and delim).
  ibuf = 0;            # Buffered tokens are "buf[ibuf+k]", "k=0..nbuf-1", modulo.
  ctr = 2*context;     # First token of repeat pair is "buf[ibuf+ctr]", modulo.
  split("", pos);      # Location of "buf[i]" is "pos[i]"
  for (i = 0; i < nbuf; i++) { buf[i] = ""; pos[i] = ""; }
  # Start as if a delim had just been seen, so that a leading delim is
  # joined into the (empty) last slot and no synthetic delim is pushed
  # before the first plain word.
  last_was_delim = 1;
}

# Once an error has been flagged, stop processing input records.
abort >= 0 { exit abort }

# Ignore blank lines and "#" comment lines (leading spaces allowed).
$0 ~ /^[ ]*([#]|$)/ { next }

/./ {
  # Each data record must be exactly "LOC TYPE WORD".
  if (NF != 3) { data_error("bad line format"); }
  wtype = $2;
  wd = $3;
  # In formatted mode the locator is pre-padded to a fixed width.
  if (lineWidth == 0)
    { wpos = $1; }
  else
    { wpos = sprintf("%-19s", $1); }
  is_delim = (wtype == 1);
  if (! is_delim)
    { # Plain word: keep plain/delim tokens strictly alternating by
      # inserting an empty delim after a preceding plain word.
      if (! last_was_delim) { push_token("", wpos); }
      push_token(wd, wpos);
      check_for_dup();
      last_was_delim = 0;
    }
  else if (last_was_delim)
    { # Consecutive delims are merged into a single token.
      join_token(wd, wpos);
    }
  else
    { push_token(wd, wpos);
      last_was_delim = 1;
    }
  next;
}

END{
  if (abort >= 0) { exit abort; }
  # Flush the buffer with empty tokens so that the last words of the
  # input also pass through the central pair of slots.
  #
  # "check_for_dup" is only meaningful right after a token has been
  # pushed into a plain-word slot; one push later the central slots hold
  # delims, and two equal non-empty delims would be reported as a
  # duplicate word. So first restore the plain/delim alternation, then
  # push the padding in (plain, delim) pairs and check only after the
  # plain one.
  if (! last_was_delim) { push_token("", ""); last_was_delim = 1; }
  for (i = 0; i < nbuf; i += 2)
    { push_token("", "");   # Empty stand-in for a plain word.
      check_for_dup();
      push_token("", "");   # Empty stand-in for a delim.
    }
}

function join_token(wd,wpos,   last)
{
  # Glues "wd" onto the end of the most recently buffered token (the
  # slot just before "ibuf", modulo "nbuf"), and makes "wpos" the
  # recorded location of that token.
  last = (ibuf + nbuf-1) % nbuf;
  pos[last] = wpos;
  buf[last] = ( buf[last] wd );
}

function push_token(wd,wpos)
{
  # Stores token "wd" (plain or delim) and its location "wpos" in the
  # slot at "ibuf", overwriting the oldest buffered token, and then
  # advances "ibuf" circularly.
  buf[ibuf] = wd;
  pos[ibuf] = wpos;
  ibuf = (ibuf + 1) % nbuf;
}

function check_for_dup(   first,second)
{
  # Looks at the two central tokens of the buffer, "ctr" and "ctr+2"
  # slots past "ibuf" (modulo "nbuf"). If they are equal, non-empty,
  # and contain no "?", prints the whole buffer as one output line,
  # formatted when "lineWidth" is positive and plain otherwise.
  first = (ibuf + ctr) % nbuf;
  second = (first + 2) % nbuf;
  if (buf[first] == "") { return; }
  if (buf[first] ~ /[?]/) { return; }
  if (buf[first] != buf[second]) { return; }
  if (lineWidth > 0)
    { print_buf_formatted(); }
  else
    { print_buf_plain(); }
}

function print_buf_formatted(    k,tok,loc,half,fill)
{
  # Prints the buffer as one aligned line: the locator of the central
  # token in a 19-column field, then enough blanks to bring the middle
  # of the central separator to a fixed column, then every non-empty
  # token preceded by one blank.

  loc = pos[(ibuf + ctr) % nbuf];
  if (loc == "") { loc = "_"; }
  printf("%-19s ", loc);

  # Printed width of everything up to and including the first WORD:
  # each non-empty token costs its length plus the leading blank.
  half = 0;
  for (k = 0; k <= ctr; k++)
    { tok = buf[(ibuf + k) % nbuf];
      if (tok != "") { half += length(tok) + 1; }
    }
  # Plus the distance to the middle of the central separator (or to
  # the middle of the gap, when that separator is empty).
  tok = buf[(ibuf + ctr + 1) % nbuf];
  if (tok != "")
    { half += 1 + length(tok)/2; }
  else
    { half += 1/2; }

  # Leading blanks needed to center the separator; never negative.
  fill = int((lineWidth - 19)/2 - half);
  if (fill < 0) { fill = 0; }
  printf "%*s", fill, "";

  # Emit the tokens, oldest first, skipping empty ones.
  for (k = 0; k < nbuf; k++)
    { tok = buf[(ibuf + k) % nbuf];
      if (tok == "") { continue; }
      printf(" %s", tok);
    }
  printf("\n");
}

function print_buf_plain(    k,j,wd,ps)
{
  # Prints the buffer in free format: the locator of the central token
  # (or "_" if it has none), then all "nbuf" tokens, oldest first, each
  # preceded by one blank. Empty tokens are printed as "sep".
  #
  # The loop index "k" is declared local here; it used to leak into
  # the global namespace.
  ps = pos[(ibuf + ctr) % nbuf];
  if (ps == "") { ps = "_"; }
  printf("%s", ps);
  for (k = 0; k < nbuf; k++)
    { j = (ibuf + k) % nbuf;
      wd = buf[j]; 
      if (wd == "") { wd = sep; } 
      printf(" %s", wd);
    }
  printf("\n");
}

function arg_error(msg,   text)
{ 
  # Reports a bad command-line parameter: writes "msg" and the usage
  # summary to standard error, flags the failure in "abort", and
  # exits with status 1.
  text = sprintf("%s\n", msg);
  printf "%s", text >> "/dev/stderr";
  text = sprintf("usage: %s\n", usage);
  printf "%s", text >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg,   text)
{ 
  # Reports a malformed input record: writes "msg", tagged with the
  # current record number "FNR", to standard error, flags the failure
  # in "abort", and exits with status 1.
  text = sprintf("line %d: %s\n", FNR, msg);
  printf "%s", text >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function prog_error(msg,   text)
{ 
  # Reports an internal inconsistency: writes "msg", tagged with the
  # current record number "FNR" and "prog error", to standard error,
  # flags the failure in "abort", and exits with status 1.
  text = sprintf("line %d: prog error - %s\n", FNR, msg);
  printf "%s", text >> "/dev/stderr";
  abort = 1;
  exit 1;
}