#! /bin/gawk -f
# Last edited on 2023-05-10 12:08:08 by stolfi

# Converts a text from ".src" format to a list of words, one per line

BEGIN {
  # Exit status requested by an error routine; negative means "no error so far".
  abort = -1;
  usage = ( ARGV[0] " \\\n"  \
    "  < main.src > main.wds" \
  );
  # 
  # See "src-format.txt" for the input file format.
  # See "wds-format.txt" for the output file format.
  
  # TO DO: !!! Use locator lines to print errors with correct filename and line 

  # Section stack:
  split("", sectag);       # {sectag[n]} is the tag of the open section at level {n}.
  minlevel = 1;            # Minimum section level.
  curlevel = minlevel - 1; # Current section level; {minlevel-1} means no section is open.
  prevtag = "";            # Tag of section just ended, if any.
  seccount = 0;            # Counts sections, for debugging output.
  
  # Character type table.  Each entry is a one-letter class code, the
  # first letter of the class keywords accepted by "@chars"
  # (alpha, symbol, punct, blank, null, invalid).
  split("", chtype);
  chtype[" "] = "b";       # ASCII space is a blank.
  # Characters with syntactic meaning in the source format are
  # invalid as plain contents characters:
  chtype["@"] = "i";
  chtype["#"] = "i";
  chtype["{"] = "i";
  chtype["}"] = "i";
  
  # Word type and remapping tables, both indexed by the input word;
  # they are filled by {read_word_table}:
  split("", wdtype);
  split("", wdmap);
}

# Defensive guard: if an error status has been recorded, stop at the next record.
abort >= 0 { exit abort }

{
  # Applied to every record before the other rules see it.
  # Turn TAB, FF, CR and byte \240 into plain ASCII spaces:
  gsub(/[\011\014\015\240]/, " ", $0);
  # Drop the run of blanks at the end of the line, if any:
  sub(/[ ]+$/, "", $0);
}

$0 ~ /^ *([#]|$)/ {
  # Blank lines and "#" comment lines are copied through as comment entries.
  printf("# %s\n", $0);
  next;
}

$0 ~ /^ *[@]chars[ ]*(alpha|symbol|punct|blank|null|invalid)[ ]*[{].*[}][ ]*$/ {
  # "@chars CLASS {CHARS}": assigns class CLASS to every character in CHARS.
  # The directive itself is echoed as a comment entry:
  printf("# %s\n", $0);

  # Class code = first letter of the class keyword:
  type = $0;
  sub(/^[ ]*@chars[ ]*/, "", type);
  sub(/[ ]*[{].*[}][ ]*$/, "", type);
  type = substr(type, 1, 1);

  # Characters listed between the outermost braces:
  chars = $0;
  sub(/^[ ]*@chars[ ]*[a-z]+[ ]*[{]/, "", chars);
  sub(/[}][ ]*$/, "", chars);

  # Record each character; a character already classified may only be
  # re-declared with the same class.
  i = 1;
  while (i <= length(chars)) {
    c = substr(chars, i, 1);
    if (! (c in chtype))
      chtype[c] = type;
    else if (chtype[c] != type)
      data_error(("bad character {" c "} in chars"));
    i++;
  }
  next;
}

$0 ~ /^ *[@]wordmap[ ]*(alpha|symbol|punct|blank|null|invalid)[ ]*[{].*[}][ ]*$/ {
  # "@wordmap CLASS {FILE}": loads a word table from FILE, giving class
  # CLASS to every word in it.  The directive is echoed as a comment entry:
  printf("# %s\n", $0);

  # Class code = first letter of the class keyword:
  type = $0;
  sub(/^[ ]*@wordmap[ ]*/, "", type);
  sub(/[ ]*[{].*[}][ ]*$/, "", type);
  type = substr(type, 1, 1);

  # File name = text between the outermost braces:
  fname = $0;
  sub(/^[ ]*@wordmap[ ]*[a-z]+[ ]*[{]/, "", fname);
  sub(/[}][ ]*$/, "", fname);

  # Load the table into {wdtype} and {wdmap}:
  read_word_table(fname, type);
  next;
}

$0 ~ /^[@]begin[ ]*[{][^ {}]+[}][ ]*$/ {
  # "@begin {TAG}": opens a new section nested in the current one.
  # The directive is echoed as a comment entry:
  printf("# %s\n", $0);

  # TAG is the text between the braces:
  tag = $0;
  sub(/^[ ]*@begin[ ]*[{]/, "", tag);
  sub(/[}][ ]*$/, "", tag);

  # Push it on the section stack, then emit the new section locator:
  begin_section(tag);
  output_section_locator();
  next;
}

$0 ~ /^[@]end[ ]*[{][^ {}]+[}][ ]*$/ {
  # "@end {TAG}": closes open sections up to and including the one tagged TAG.
  # The directive is echoed as a comment entry:
  printf("# %s\n", $0);

  # TAG is the text between the braces:
  tag = $0;
  sub(/^[ ]*@end[ ]*[{]/, "", tag);
  sub(/[}][ ]*$/, "", tag);

  # Pop the section stack, then emit the locator of what remains open:
  end_section(tag);
  output_section_locator();
  next;
}

/^[@]section[ ]*[0-9]+[ ]*[{][^ {}]+[}][ ]*$/ {
  # "@section LEVEL {TAG}": closes any open sections at level LEVEL or
  # deeper, then opens a new section TAG at level LEVEL.
  # Output it as comment entry:
  printf "# %s\n", $0;
  # Extract the nesting level (as text, for error messages):
  levtxt = $0;
  gsub(/^[ ]*@section[ ]*/, "", levtxt);
  gsub(/[ ]*[{].*[}][ ]*$/, "", levtxt);
  # The extracted level is a string; force it to a number so that the
  # comparisons below are numeric rather than lexicographic (otherwise
  # e.g. "10" <= 9 would hold) and so that {sectag[lev]} uses the same
  # subscript that {begin_section} stored:
  lev = levtxt + 0;
  # Extract the section tag:
  tag = $0;
  gsub(/^[ ]*@section[ ]*[0-9]+[ ]*[{]/, "", tag);
  gsub(/[}][ ]*$/, "", tag);
  # Unstack section until the given level:
  if ((lev < minlevel) || (lev > curlevel + 1))
    { data_error(("@invalid level \"" levtxt "\"")); }
  else if (lev <= curlevel)
    { end_section(sectag[lev]); }
  # Sanity check: the new section must now go exactly one level down.
  if (lev != curlevel + 1)
    { data_error(("program bug: curlevel")); }
  begin_section(tag);
  # Output the section locator:
  output_section_locator();
  next;
}

$0 ~ /./ {
  # Anything that reaches this rule is a contents line.
  # Emit its line locator (line number within the current input file):
  printf("@ %d\n", FNR);
  # Break the line into words and emit them:
  process_contents_line($0);
  # Once some contents have been seen, the tag of the previously closed
  # section may legitimately be reused:
  prevtag = "";
  next;
}

END {
  # If an error status was recorded, just propagate it.
  if (abort >= 0)
    exit abort;
  # Close every section still open, down to the outermost one:
  if (minlevel <= curlevel)
    end_section(sectag[minlevel]);
  # Terminate the progress trace on the diagnostic stream:
  printf("\n") > "/dev/stderr";
}

function output_section_locator(   lev, path)
{
  # Writes to standard output one "$ " line listing the tags of all
  # currently open sections, outermost first, each wrapped in braces.
  path = "";
  for (lev = minlevel; lev <= curlevel; lev++)
    path = path "{" sectag[lev] "}";
  printf("$ %s\n", path);
}

function begin_section(tag,  j)
{
  # Opens a new section tagged {tag} one level below the current one.
  # The tag must be non-empty, differ from the tag of the section that
  # was just closed, and differ from the tag of every open section.
  if (tag == "")
    data_error(("empty section tag"));
  if (tag == prevtag)
    data_error(("consecutive sections with same tag \"" tag "\""));
  j = minlevel;
  while (j <= curlevel) {
    if (sectag[j] == tag)
      data_error(("nested sections with same tag \"" tag "\""));
    j++;
  }

  # Push the tag:
  sectag[++curlevel] = tag;
  # The new section has no closed sibling yet:
  prevtag = "";

  # Progress trace on the diagnostic stream: deeper sections share a
  # line until more than 7 are on it; shallow sections always start an
  # indented new line.
  if ((seccount <= 7) && (curlevel - minlevel >= 2))
    printf(" ") > "/dev/stderr";
  else {
    printf("\n%*s", 2*(curlevel-minlevel), "") > "/dev/stderr";
    seccount = 0;
  }
  printf("{ %s", tag) > "/dev/stderr";
  seccount++;
}

function end_section(tag)
{
  # Closes open sections, innermost first, up to and including the one
  # tagged {tag}.  It is an error if no open section carries that tag.
  for (;;) {
    if (curlevel < minlevel) break;       # Stack exhausted.
    if (sectag[curlevel] == tag) break;   # Found the requested section.
    end_current_section();
  }
  if (curlevel < minlevel)
    data_error(("@end tag mismatch \"" tag "\""));
  # Close the requested section itself:
  end_current_section();
}

function end_current_section()
{
  # Pops the innermost open section off the stack.
  # Progress trace on the diagnostic stream:
  printf(" }") > "/dev/stderr";
  # A non-empty {prevtag} means another section was closed just before
  # this one; setting the counter high makes the next opening start a
  # fresh trace line:
  if (prevtag != "")
    seccount = 100;
  # Remember the closed tag (the next section opened would be its
  # sibling) and step up one level:
  prevtag = sectag[curlevel--];
}

function process_contents_line(lin,   c,ct,w,wt,len)
{
  # Parses the contents line {lin} and outputs its words, one per line.
  #
  # The line is scanned left to right.  Three kinds of items are recognized:
  #   "@T{TEXT}"  explicit-type construct, with T one of "a","s","p","n","b";
  #   "{TEXT}"    embedded comment, which is skipped;
  #   any other   single character, handled according to {chtype}.
  # {w} accumulates the word being built and {wt} its type ("a" until a
  # symbol character is appended, then "s").
  #
  # Any malformed construct or unclassified/invalid character is reported
  # through {data_error}.

  # We must be inside a section:
  if (curlevel < minlevel) { data_error(("missing a @begin or @section")); }
  # Parse line:
  w = ""; wt = "a";
  while (lin != "")
    { c = substr(lin, 1, 1);
      if (c == "@")
        { # Explicit-type text construct
          if (! match(lin, /^[@][aspnb][{][^{}]+[}]/))
            { data_error(("malformed embedded @-construct \"" lin "\"")); } 
          # Length of the whole construct; saved so that it cannot be
          # disturbed by the calls below:
          len = RLENGTH;
          ct = substr(lin, 2, 1);
          if (ct != "n")
            { # Non-null text, flush {w}:
              lookup_and_output_word(w, wt);
              w = ""; wt = "a";
              if (ct != "b") 
                { # Output argument words (the text between the braces):
                  output_words(substr(lin, 4, len-4), ct);
                }
            }
          # Null text ("n") is simply skipped, without breaking the
          # current word.  In every case, consume the construct exactly once:
          lin = substr(lin, len+1);
        }
      else if (c == "{")
        { # Embedded {}-comment, ignore:
          if (! match(lin, /^[{][^{}]*[}]/))
            { data_error(("malformed {}-comment \"" lin "\"")); } 
          lin = substr(lin, RLENGTH+1);
        }
      else 
        { # Single character
          if (! (c in chtype))
            { # Character was never classified
              data_error(("illegal character \"" c "\""));
            } 
          ct = chtype[c];
          if (ct == "i")
            { # Character explicitly classified as invalid
              data_error(("illegal input character \"" c "\""));
            }
          else if (ct == "n")
            { # Null char, ignore
            }
          else if (ct == "a")
            { # Alpha char: append to word, preserve type
              w = (w c);
            }
          else if (ct == "s")
            { # Symbol char: append to word, mark it as symbol
              w = (w c); wt = "s";
            }
          else
            { # Flush current word:
              lookup_and_output_word(w, wt);
              w = ""; wt = "a";
              if (ct == "p") 
                { # Punct char: a word unto itself:
                  w = c; wt = "p"; 
                  lookup_and_output_word(w, wt);
                  w = ""; wt = "a";
                }
              else if (ct == "b") 
                { # Blank char: ignore it
                }
              else
                { # Program bug
                  data_error(("invalid class \"" ct "\" for char \"" c "\""));
                } 
            }
          lin = substr(lin, 2);
        }
    }
  # Flush the word pending at end of line, if any:
  lookup_and_output_word(w, wt);
}

function lookup_and_output_word(w, wt)
{
  # Outputs word {w} with type {wt}, after consulting the word tables.
  # An empty word is silently skipped.  If {w} has an entry in the
  # tables, both its type and its spelling are replaced by the table's.
  # Words of type "i" are reported as errors; words of type "b" or "n"
  # are dropped.
  if (w == "")
    return;
  if (w in wdtype) {
    wt = wdtype[w];
    w = wdmap[w];
  }
  if (wt == "i")
    data_error(("invalid word \"" w "\""));
  else if ((wt != "b") && (wt != "n"))
    output_word(w, wt);
}

function output_words(wds, wt,   w,nw,iw)
{
  # Breaks {wds} into words with the default field splitting of
  # {split} and outputs each of them with type {wt}, bypassing the
  # word tables.
  nw = split(wds, w);
  iw = 0;
  while (iw < nw)
    output_word(w[++iw], wt);
}

function output_word(w, wt)
{
  # Writes one output entry: the type code, a space, and the word.
  printf("%s %s\n", wt, w);
}

function read_word_table(fname,wt,    nwords,nlines,lin,fld,nfld,wa,wb,rc)
{
  # Reads the word table file {fname} and records every word in it
  # with type {wt}.
  #
  # Blank lines and lines starting with "#" are skipped; a trailing
  # "#" comment preceded by blanks is removed.  Each remaining line has
  # one or two fields: the input word and, optionally, the word to
  # output in its place (default: the input word itself).  The type is
  # stored in {wdtype} and the replacement in {wdmap}, both indexed by
  # the input word.
  #
  # Errors in the file go through {tbl_error}; an empty file goes
  # through {arg_error}.  Progress is reported on the diagnostic stream.
  nwords = 0;
  nlines = 0;
  printf "reading wordmap of type = %s \"%s\"... ", wt, fname > "/dev/stderr";
  # {rc} keeps the status of the last read: positive for a line,
  # zero at end of file, negative on failure.
  while ((rc = (getline lin < fname)) > 0) { 
    nlines++;
    if (! match(lin, /^[ \011]*([#]|$)/))
      { gsub(/[ ]+[#].*$/, "", lin);
        gsub(/^[ ]+/, "", lin);
        nfld = split(lin, fld, " ");
        if (nfld > 2) tbl_error(fname, nlines, ("bad table entry = \"" lin "\""));
        if (nfld < 1) tbl_error(fname, nlines, ("program error: nfld"));
        wa = fld[1]; wb = (nfld < 2 ? wa : fld[2]);
        if (wa in wdtype) tbl_error(fname, nlines, ("repeated word in tables = \"" lin "\""));
        wdmap[wa] = wb;
        wdtype[wa] = wt;
        nwords++;
      }
  }
  # Decide on failure from the read status itself rather than from the
  # global ERRNO, which may still hold the text of an earlier error:
  if (rc < 0) { tbl_error(fname, nlines, (ERRNO != "" ? ERRNO : "read error")); }
  close (fname);
  if (nlines == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  printf " %d words\n", nwords > "/dev/stderr"
}

function arg_error(msg)
{
  # Reports {msg} followed by the usage text on the diagnostic stream,
  # then terminates with status 1.
  printf("%s\nusage: %s\n", msg, usage) > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_error(msg)
{
  # Reports {msg} on the diagnostic stream, prefixed with the current
  # input file name and line number, then terminates with status 1.
  # The leading newline ends any progress-trace line in progress.
  printf("\n%s:%d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}

function tbl_error(f,n,msg)
{
  # Reports {msg} on the diagnostic stream as an error at line {n} of
  # table file {f}, then terminates with status 1.
  printf("%s:%d: %s\n", f, n, msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}