#! /usr/bin/gawk -f
# Last edited on 2004-11-19 02:56:31 by stolfi

# Reads an interlinear file with "[..|..|..]" groups and
# unfolds such lines into multiple lines.

BEGIN {
  # Build the "newcode" table, indexed by (original code, alternative number).
  # Alternative 1 of every original transcriber code is the code itself.
  oldcodes = "CFTLKRJU";
  i = 1;
  while (i <= length(oldcodes)) {
    c = substr(oldcodes, i, 1);
    newcode[c, 1] = c;
    i++;
  }

  # Codes assigned to the 2nd (and, for "F", 3rd) alternative of a line.
  # Codes with no entry here have no replacement for alternatives beyond 1.
  newcode["C", 2] = "D";
  newcode["F", 2] = "G";
  newcode["F", 3] = "H";
  newcode["J", 2] = "I";
  newcode["K", 2] = "Q";
  newcode["L", 2] = "M";
}

function first_alt(txt)
{
  # Given a comment-free Voynich string, keeps only the first
  # alternative of every "[..|..]" group and returns the result.

  # Drop everything from the first "|" up to the next "]".
  txt = gensub(/\|[^\]]*\]/, "]", "g", txt);
  # What is left of each group is "[first]"; strip the brackets.
  return gensub(/\]/, "", "g", gensub(/\[/, "", "g", txt));
}

function other_alts(txt)
{
  # Given a comment-free Voynich string, discards the first
  # alternative of every "[..|..]" group and returns what remains.
  # A group left with a single alternative loses its brackets.

  # Inner call: "[first|" becomes "[".
  # Outer call: a bracket pair with no "|" inside is replaced by its contents.
  return gensub(/\[([^\]\|]*)\]/, "\\1", "g",
                gensub(/\[[^\|\]]*\|/, "[", "g", txt));
}

function remove_bangs(old,   out, len, brace, piece)
{
  # Returns a copy of "old" with every "!" deleted, except those
  # that lie inside '{}' comments; comments are copied verbatim.
  # The input is consumed from the left, one piece per iteration.
  out = "";
  for (len = length(old); len != 0; len = length(old)) {
    brace = index(old, "{");

    if (brace == 1) {
      # The remaining text starts with a comment: copy it untouched.
      if (match(old, /^{[^}]*}/) > 0) {
        out = out substr(old, 1, RLENGTH);
        old = substr(old, RLENGTH + 1);
      } else {
        # Unterminated comment: report it and keep the rest as is.
        printf "line %d, missing '\}'\n", NR > "/dev/stderr";
        out = out old;
        old = "";
      }
      continue;
    }

    # Plain text up to the next comment (or to the end of the string).
    if (brace == 0) brace = len + 1;
    piece = substr(old, 1, brace - 1);
    gsub(/[!]/, "", piece);
    out = out piece;
    old = substr(old, brace);
  }
  return out;
}

function extract_choice(old, fst,    out, len, brace, piece)
{
  # Applies first_alt (when "fst" is true) or other_alts (when it is
  # false) to every stretch of "old" that lies outside '{}' comments,
  # copying the comments themselves verbatim, and returns the result.
  out = "";
  for (len = length(old); len != 0; len = length(old)) {
    brace = index(old, "{");

    if (brace == 1) {
      # The remaining text starts with a comment: copy it untouched.
      if (match(old, /^{[^}]*}/) > 0) {
        out = out substr(old, 1, RLENGTH);
        old = substr(old, RLENGTH + 1);
      } else {
        # Unterminated comment: report it and keep the rest as is.
        printf "line %d, missing '\}'\n", NR > "/dev/stderr";
        out = out old;
        old = "";
      }
      continue;
    }

    # Plain text up to the next comment (or to the end of the string).
    if (brace == 0) brace = len + 1;
    piece = substr(old, 1, brace - 1);
    old = substr(old, brace);
    out = out (fst ? first_alt(piece) : other_alts(piece));
  }
  return out;
}

function has_alts(txt, n, i, chunk)
{
  # Returns 1 if "txt" contains a "[" outside '{}' comments, else 0.
  # Only "txt" is meant to be passed by callers; "n", "i" and "chunk"
  # are local work variables.
  #
  # The string is scanned left to right.  Text before the next "{" is
  # searched for "["; a leading '{...}' comment is skipped without
  # being inspected.  An unterminated comment is reported on stderr
  # and ends the scan.
  while ((n = length(txt)) != 0)
    { i = index(txt, "{");
      if (i != 1)
        { # Plain text up to the next comment, or to the end of "txt".
          if (i == 0) { i = n+1; }
          chunk = substr(txt, 1, i-1); txt = substr(txt, i);
          if (index(chunk, "[") != 0) return 1; 
        }
      else
        { # "txt" starts with a comment: skip over it.  Nothing is
          # accumulated here, so no global variable is touched.
          match(txt, /^{[^}]*}/);
          if (RSTART > 0) 
            { txt = substr(txt, RLENGTH + 1); }
          else
            { printf "line %d, missing '\}'\n", NR > "/dev/stderr";
              txt = "";
            }
        }
    }
  return 0;
}

function fix_loc(loc, k,   code, subst, pos)
{
  # Returns "loc" with its transcriber code "x" (the capital letter in
  # the first ";x>" of "loc") replaced by "newcode[x,k]".  "loc" is
  # returned unchanged when "k" is 1, when "loc" is empty, or when it
  # contains no ";x>".  If the table has no entry for (x,k), an error
  # is written to stderr and "?" is substituted.
  if ((k == 1) || (loc == "")) return loc;

  pos = match(loc, /;[A-Z]>/);
  if (pos == 0) return loc;

  # "pos" is the index of the ";"; the code letter follows it.
  code = substr(loc, pos + 1, 1);
  if ((code, k) in newcode) {
    subst = newcode[code, k];
  } else {
    printf "error: no %dth replacement for code %s\n", k, code > "/dev/stderr";
    subst = "?";
  }
  return (substr(loc, 1, pos) subst substr(loc, pos + 2));
}

function unfold_line(loc, msg,   k, rest)
{
  # Given a line with location prefix "loc" (possibly empty) and
  # Voynich text "msg" (possibly with '{}' comments), prints one or
  # more lines, choosing successive alternatives in every group.  For
  # the second and successive lines, each transcriber's code "x" in
  # "loc" is replaced by "newcode[x,k]".
  #
  # "!" characters outside comments are removed from "msg" first.
  # Only "loc" and "msg" are meant to be passed by callers; "k" (the
  # number of the alternative being printed) and "rest" are locals.
  msg = remove_bangs(msg);
  k = 1;
  while (has_alts(msg))
    { print (fix_loc(loc, k) extract_choice(msg, 1));
      rest = extract_choice(msg, 0);
      if (rest == msg)
        { # Dropping the first alternatives changed nothing, so the
          # remaining "[" does not belong to a well-formed group (for
          # example it is unclosed, or the group is split by a
          # comment).  Stop here instead of looping forever.
          printf "line %d, malformed '[' group\n", NR > "/dev/stderr";
          return;
        }
      msg = rest;
      k++;
    }
  print (fix_loc(loc, k) msg);
}  

# Blank lines, lines whose first non-blank character is "#", and
# lines consisting only of a "<...>" locator are copied unchanged.
/^ *$/ || /^ *#/ || /^<[^>]*> *$/ { print; next }

# Any other line starting with "<": the first 19 characters are taken
# as the location prefix and the remainder as the text to unfold.
/^</ {
  unfold_line(substr($0, 1, 19), substr($0, 20));
  next;
}

# Any other line containing "[": unfolded with an empty location.
# NOTE(review): a line matching none of these rules is not printed by
# anything visible here -- confirm that dropping it is intended.
/\[/ {
  unfold_line("", $0);
  next;
}