#! /usr/bin/gawk -f

# Usage: "$0 [-v skip=m] [-v take=n] < INFILE > OUTFILE"
#
# Reads a bunch of strings from a file, sorts them by similarity.
# When comparing, skips m characters and then only looks at n.

function error(msg)
{ 
  printf "line %d: %s\n", NR, msg > "/dev/stderr"
  abort = 1
  exit
}

function char_dist(xc, yc)
{
  return (xc == yc ? 0 : 1 )
}

function dist(x, y, skip, take,  i, xc, yc, s)
{
  # String distance from x to y
  s = 0;
  for(i=1;i<=take;i++)
    { xc = substr(x, i+skip, 1);
      yc = substr(y, i+skip, 1);
      s = s + char_dist(xc, yc)
    }
  return s;
}
  

function strsort(str, out, skip, take,  i, w, k, dw, d, da, db, dk, n)
{
  # sorts "str" by similarity and put the result in out[0..nout-1].
  # Skips 'skip' bytes and then compares "take" bytes.
  # Assumes "str" is a vector with the words as arguments.
  # Returns number of strings. 
  n = 0;
  split("", dw);
  # printf "skip=%d take=%d\n", skip, take > "/dev/stderr";
  for (w in str)
    { if (n == 0) 
        { out[0] = w; n++; }
      else if(n == 1)
        { out[1] = w; n++; dw[0] = dist(out[0], out[1], skip, take); }
      else
        { # find two consecutive elems most similar to w
          db =  dist(out[0], w, skip, take);
          dk = db; k = -1;
          for (i=0; i<n-1; i++)
            { da = db;
              db = dist(out[i+1], w, skip, take);
              d = da + db - dw[i];
              if (d < dk) { dk = d; k = i; }
            }
          if (db < dk) { dk = db; k = n-1; }
          # Now insert:
          for (i=n-1; i > k; i--) { out[i+1] = out[i]; }
          n++;
          out[k+1] = w;
          if (k >= 0) 
            { dw[k] = dist(out[k], out[k+1], skip, take); 
              # printf "%s", out[k] > "/dev/stderr";
            }
          # printf "(%s)", out[k+1] > "/dev/stderr";
          if (k < n-1) 
            { dw[k+1] = dist(out[k+1], out[k+2], skip, take);
              # printf "%s\n", out[k+2] > "/dev/stderr";
            }
        }
    }
  return n;
}

BEGIN {
  minlen=99999999;
  split("", str);
}

/./ {
  w = $0;
  str[w] = 1; 
  n=length(w); minlen = (n<minlen?n:minlen);
}

END {
  if (take==0) { take = minlen - skip; }
  if (take < 0) error("negative key length");
  split("", out);
  nout = strsort(str, out, skip, take);
  for (i=0;i<nout;i++) print out[i];
}