#! /usr/bin/gawk -f
# Last edited on 2001-01-15 03:18:12 by stolfi

BEGIN {
  # Reads records of the form COUNT WORD, where WORD is a sequence of
  # elements, each one enclosed in braces {}.
  #
  # For each record, the script partitions the WORD into an alternating
  # sequence of the form OKOKOKO...KO, where each O is a string of zero
  # or more elements consisting only of the specified GLYPHS letters,
  # and each K is a single element containing at least one non-GLYPHS
  # letter.
  #
  # Then, for each O-string, it outputs COUNT {LEFT}<STRING>{RIGHT},
  # where STRING is that O-string (deprived of braces and surrounded by
  # <>), and {LEFT} and {RIGHT} are the previous and next K elements,
  # respectively.  The script provides a dummy K element "{_}" before
  # the first O-slot and after the last O-slot.

  # Exit status requested by an error routine; negative means "no error".
  abort = -1;

  # Command-line synopsis, assembled one line at a time.
  usage = "extract-glyph-strings \\\n";
  usage = usage "  -v glyphs=GLYPHS \\\n";
  usage = usage "  < INFILE > OUTFILE";

  # The "glyphs" variable is mandatory and must be purely alphanumeric,
  # so that it can be dropped verbatim into a bracket expression.
  if (glyphs == "")
    { arg_error("must define \"glyphs\""); }
  if (! (glyphs ~ /^[a-zA-Z0-9]+$/))
    { arg_error("bad value for \"glyphs\""); }

  # Regexp fragment matching any single GLYPHS letter.
  gpat = "[" glyphs "]";
}

# Defensive guard: stop reading input once an error routine has
# recorded an exit status in "abort".
(abort >= 0) {
  exit abort;
}

# Lines starting with "#" are copied to the output unchanged and are
# not processed any further.
/^#/ {
  print $0;
  next;
}

/./ {
  # Processes one non-empty data record: $1 is COUNT, $2 is the WORD
  # made of brace-delimited elements.  Prints one line
  # "COUNT {LEFT}<STRING>{RIGHT}" for every O-slot of the WORD.

  # Insert word-start and word-stop (K-like) markers
  w = ("{_}" $2 "{_}");

  # Merge consecutive GLYPHS-only elements together.  Both neighbours
  # must consist entirely of GLYPHS letters; merging on the mere basis
  # of the two letters adjacent to "}{" would fuse an O element into a
  # K element that happens to begin or end with a GLYPHS letter.
  # A single global pass joins elements pairwise, so repeat until no
  # adjacent GLYPHS-only pair remains.
  pat = ( "[{](" gpat "+)[}][{](" gpat "+)[}]" );
  while (w ~ pat) { w = gensub(pat, "{\\1\\2}", "g", w); }

  # Replace the outer braces of GLYPHS-only elements by <>
  pat = ( "[{](" gpat "+)[}]" );
  w = gensub(pat, "<\\1>", "g", w);

  # Mark empty GLYPHS-slots (two adjacent K elements) with <>
  w = gensub(/[}][{]/, "}<>{", "g", w);

  # Duplicate K elements (with space separator) for sharing: each K
  # closes one KOK triple and opens the next one.
  w = gensub(/[{]([^{}]*)[}]/, "{\\1} {\\1}", "g", w);

  # Now split into KOK triples.  The first and last pieces are the
  # unpaired halves of the duplicated "{_}" markers.
  n = split(w, wf);
  if (wf[1] != "{_}") { data_error(("bad karma \"" wf[1] "\"")); }
  if (wf[n] != "{_}") { data_error(("bad karma \"" wf[n] "\"")); }

  # Every remaining piece must have the exact form {K}<O>{K}.
  pat = ( "^[{][^{}]+[}][<]" gpat "*[>][{][^{}]+[}]$" );
  for (i=2; i<=n-1; i++)
    { if (wf[i] !~ pat)
        { data_error(("bad karma \"" wf[i] "\"")); }
      print $1, wf[i];
    }
}

function data_error(msg)
{
  # Reports a malformed input record on the standard error stream,
  # prefixed with the current line number of the input file, then
  # terminates the program with exit status 1.
  #   msg: description of the problem (string).
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";

  # Record the failure status before leaving.
  abort = 1;
  exit 1;
}

function arg_error(msg)
{
  # Reports an invalid command-line setting on the standard error
  # stream, followed by the usage synopsis held in the global "usage",
  # then terminates the program with exit status 1.
  #   msg: description of the problem (string).
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";

  # Record the failure status before leaving.
  abort = 1;
  exit 1;
}