#! /usr/bin/gawk -f
# Last edited on 2002-03-05 00:16:15 by stolfi

# Reads a file containing lines of the form LOC WORD where LOC is a
# locator string, and WORD is a word.  Builds the equivalence
# relation where two consecutive WORDs with same LOC are equivalent.
# Outputs one line WORD ROOT for each distinct input WORD, where
# ROOT is a canonical representative of its equivalence class.

BEGIN {
  abort = -1; 
  split("", eqv);  # `eqv[w]' is the parent of `w' in the union-find tree.
  split("", esz);  # `eqv[w] == w' means `w' is root, and `esz[w]' is the tree size.
}

(abort >= 0) {exit abort;} 


(NF == 2) {
  loc = $1; w = $2;
  if (! (w in eqv)) 
    { eqv[w] = w; esz[w] = 1; }
  else
    { w = find_root(w); }
  if (loc == oloc) 
    { if (ow < w) 
        { eqv[w] = ow; esz[ow] += esz[w]; esz[w] = "0";  }
      else
        { eqv[ow] = w; esz[w] += esz[ow]; esz[ow] = "0"; }
    }
  ow = w; 
  oloc = loc;
  next;
}

/./{ data_error("bad line type"); }

END {
  if (abort >= 0) {exit abort;} 
  for (w in eqv)
    { ow = find_root(w); print w, ow; }
}

function find_root(w, t,p,r)
{
  t = w;
  while ((p = eqv[t]) != t) { t = p; }
  r = t;
  t = w;
  while ((p = eqv[t]) != t) { eqv[t] = r; t = p; }
  return r;
}

function data_error(msg)
{
  printf "*** line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1; exit abort;
}