#! /usr/bin/gawk -f
# Last edited on 2001-03-12 22:26:27 by stolfi

# Process Culpeper's herbal

BEGIN {
  # Exit status requested by an error routine; negative means "no error".
  abort = -1;

  # Usage text, printed by arg_error().
  usage = "process-cul \\\n" "  < INFILE.txt > OUTFILE.evt";

  # Each contents line is tagged by output_contents_line() with a
  # locator of the form <Xppp.Ukk.nnn>, where
  #   X   is a letter identifying the part of the book (a, b, c, ...),
  #   ppp is the chapter number (sequential through the whole book),
  #   U   is a letter identifying the unit type,
  #   kk  is the sequence number of the unit within the chapter,
  #   nnn is the line number within the unit.

  # --- Position within the book ---
  curpage = "";         # Current page number ("" = none seen yet).
  curpart = "";         # Current part identifier ("" = none seen yet).
  curchapter = "";      # Current chapter number ("" = not inside a chapter).
  nchapters = 0;        # Number of completed nonempty chapters.

  # --- Position within the chapter ---
  curunit = "";         # Current unit number within chapter ("" = no open unit).
  nchapunits = "";      # Number of completed nonempty units in chapter.
  nunitlines = 0;       # Number of lines already seen in the current unit.
  nparlines = 0;        # Number of lines already seen in the current paragraph.
  seplines = 0;         # TRUE = add a paragraph mark after each line of the unit.

  # --- Marginal notes ---
  gobbling = 0;         # TRUE = we are inside a @note..@endnote pair.
  nunitnotes = 0;       # Number of saved note lines awaiting output.
  split("", unitnote);  # unitnote[0..nunitnotes-1] are the pending note lines.

  # --- Output state ---
  lastwasblank = 0;     # TRUE if the last line written was a blank "#" comment.
}

# Once an error routine has set `abort', stop consuming input.
abort >= 0 {
  exit abort;
}

# Comment line: a "#" preceded only by spaces (tabs are not accepted here).
# The leading spaces are dropped and the line is copied to the output.
/^ *[#]/ {
  lin = $0;
  sub(/^ */, "", lin);
  output_comment_line(lin);
  next;
}

# Line that is empty or consists only of spaces: emit a blank "#" comment.
/^ *$/ {
  output_comment_line("#");
  next;
}

# An "@" preceded by at least one character (even a blank) is an error;
# this is tested before any whitespace cleanup is applied.
/.+[@]/ {
  data_error("@ not on column 1");
}

# Whitespace normalization for every remaining non-empty line (contents
# and "@" control lines alike). No `next': later rules see the cleaned $0.
/./ {
  sub(/^[ 	]+/, "");     # strip leading blanks and tabs
  sub(/[ 	]+$/, "");     # strip trailing blanks and tabs
  gsub(/[ 	]+/, " ");    # squeeze each inner run to one space
}

# "@note": opens a marginal note. The directive line itself and all lines
# up to the matching "@endnote" are saved in unitnote[] and written out
# later by spit_out_notes().
/^[@]note([ ]|$)/ {
  if (gobbling) {
    data_error("inconsistent @note");
  }
  gobbling = 1;
  unitnote[nunitnotes++] = $0;
  next;
}

# "@endnote": closes a marginal note. The line is saved as well; if no
# unit is open right now the saved notes are written out immediately,
# otherwise they stay pending until the current unit is ended.
/^[@]endnote([ ]|$)/ {
  if (! gobbling) {
    data_error("inconsistent @endnote");
  }
  unitnote[nunitnotes++] = $0;
  gobbling = 0;
  if (curunit == "") {
    spit_out_notes();
  }
  next;
}

# Any other line seen while inside a note: save it. Control lines are
# not allowed inside a note.
gobbling {
  if ($0 ~ /^[@]/) {
    data_error("unexpected @ in note");
  }
  unitnote[nunitnotes++] = $0;
  next;
}

# "@page N": page boundary. Remember the page number (second field),
# echo the directive, and add a "# page N" comment.
/^[@]page([ ]|$)/ {
  curpage = $2;
  output_control_line($0);
  output_comment_line("# page " curpage);
  next;
}

# "@part X": a new part of the book begins. The running chapter is
# closed first; chapter numbering is NOT restarted between parts.
/^[@]part([ ]|$)/ {
  end_current_chapter();
  curpart = $2;
  output_control_line($0);
  next;
}

# "@chapter": close the running chapter (if any) and open the next one.
/^[@]chapter([ ]|$)/ {
  end_current_chapter();
  output_control_line($0);
  begin_new_chapter();
  next;
}

# Shared action of the unit-opening directives below: close the running
# unit, echo the directive line ($0), and open a unit of type `type'.
# When `sep' is true, every contents line of the new unit is treated as
# a paragraph of its own (see `seplines').
function open_unit_for_directive(type, sep)
{
  end_current_unit();
  output_control_line($0);
  begin_new_unit(type);   # also resets seplines to 0
  if (sep) {
    seplines = 1;
  }
}

# --- Titles ---

# Book title.
/^[@]booktitle([ ]|$)/ { open_unit_for_directive("B", 0); next; }

# Chapter title.
/^[@]chaptitle([ ]|$)/ { open_unit_for_directive("C", 0); next; }

# Figure title.
/^[@]figtitle([ ]|$)/ { open_unit_for_directive("F", 0); next; }

# Section title.
/^[@]sectitle([ ]|$)/ { open_unit_for_directive("S", 0); next; }

# Subsection title.
/^[@]subsectitle([ ]|$)/ { open_unit_for_directive("R", 0); next; }

# --- Paragraph delimiter ---

# "@=": close the running paragraph and start a fresh one.
/^[@][=]([ ]|$)/ {
  end_current_parag();
  begin_new_parag();
  next;
}

# --- Labels ---

# A numeric or letter label.
/^[@](indexlabel|listlabel)([ ]|$)/ { open_unit_for_directive("L", 0); next; }

# --- Lists (one paragraph per line) ---

# List of people names.
/^[@]namelist([ ]|$)/ { open_unit_for_directive("N", 1); next; }

# List of plant names.
/^[@]plantlist([ ]|$)/ { open_unit_for_directive("M", 1); next; }

# List of index entries.
/^[@]indexlines([ ]|$)/ { open_unit_for_directive("X", 1); next; }

# List of table-of-contents entries.
/^[@]contentslines([ ]|$)/ { open_unit_for_directive("K", 1); next; }

# --- Text and verse ---

# Start of a running text block. If a "P" unit is already open the
# directive is a continuation: it is dropped and the unit goes on.
/^[@]text([ ]|$)/ {
  if (curunittype != "P") {
    open_unit_for_directive("P", 0);
  }
  next;
}

# A short text fragment, usually not a sentence.
/^[@]textitem([ ]|$)/ { open_unit_for_directive("T", 0); next; }

# Verse introduced by "@englverse".
/^[@]englverse([ ]|$)/ { open_unit_for_directive("V", 0); next; }

# Verse introduced by "@latinverse".
/^[@]latinverse([ ]|$)/ { open_unit_for_directive("Q", 0); next; }

# A line that became empty only after whitespace cleanup (e.g. it held
# nothing but tabs) is dropped silently.
/^ *$/ {
  next;
}

# Any "@" line that reaches this point matched none of the directives.
/^[@]/ {
  data_error("unknown @ directive");
}

# Everything else is a contents line. A trailing blank is appended
# before output; in "seplines" units each line is its own paragraph.
/./ {
  output_contents_line($0 " ");
  if (seplines) {
    end_current_parag();
    begin_new_parag();
  }
  next;
}

END {
  # An `exit' executed in a rule or function still runs this END rule;
  # in that case just propagate the requested status.
  if (abort >= 0) { exit abort; }

  # A "@note" that was never closed would otherwise be dumped as a unit
  # that is never ended or counted. Report it like the other
  # note-pairing errors instead of emitting malformed output.
  if (gobbling) { data_error("unterminated @note at end of input"); }

  # Close the last chapter (and, through it, the last unit and any
  # pending notes).
  end_current_chapter();
}

function end_current_chapter()
{
  # Closes the running chapter, if there is one: ends its last unit,
  # counts the chapter in "nchapters" when it had at least one nonempty
  # unit, and writes "]" to the progress trace on stderr.
  # With no chapter open, only checks that no unit state is left over.
  if (curchapter == "") {
    if (curunit != "") {
      data_error("inconsistent curunit (0)");
    }
    if (nchapunits != "") {
      data_error("inconsistent nchapunits (0)");
    }
    return;
  }

  end_current_unit();
  if (nchapunits > 0) {
    nchapters++;
  }
  curchapter = "";
  nchapunits = "";
  printf "]\n" > "/dev/stderr";
}

function end_current_unit()
{
  # Closes the running unit (see do_end_current_unit) and then writes
  # out any marginal notes that were saved while it was open; the notes
  # become extra units of their own.
  do_end_current_unit();
  if (nunitnotes > 0) {
    spit_out_notes();
  }
}

function do_end_current_unit()
{
  # Closes the running unit without touching the pending notes: ends
  # its last paragraph, counts it in "nchapunits" if it produced any
  # line, and resets the per-unit state. The line count (when nonzero)
  # and ")" are written to the progress trace on stderr.
  if (curunit == "") {
    # No unit open: there must be no counted lines either.
    if (nunitlines > 0) {
      data_error("inconsistent curunit (1)");
    }
    return;
  }

  end_current_parag();
  if (nunitlines > 0) {
    nchapunits++;
    printf "%d", nunitlines > "/dev/stderr";
  }
  curunit = "";
  curunittype = "";
  nunitlines = 0;
  printf ")" > "/dev/stderr";
}

function end_current_parag()
{
  # Closes the running paragraph. If it produced at least one line, a
  # "=" marker line is written as its terminator. The paragraph line
  # counter is cleared in either case.
  if (nparlines > 0) {
    output_contents_line("=");
  }
  nparlines = 0;
}

function begin_new_parag()
{
  # Opens a new paragraph: no lines seen in it yet.
  nparlines = 0;
}

function begin_new_unit(newtype)
{
  # Opens a unit whose type letter is `newtype'.
  # Preconditions (the first two are checked): the previous unit has
  # been closed, so "curunit" and "curunittype" are empty; "nchapunits"
  # is the number of completed nonempty units in the chapter.
  # The new unit is numbered nchapunits + 1, "(<type><number>:" goes to
  # the progress trace on stderr, and a blank "#" comment is written
  # first unless the previous output line was already one.
  if (curunit != "") {
    data_error("inconsistent curunit (2)");
  }
  if (curunittype != "") {
    data_error("inconsistent curunittype (2)");
  }

  curunit = nchapunits + 1;
  curunittype = newtype;
  printf "(%s%d:", curunittype, curunit > "/dev/stderr";

  nunitlines = 0;
  seplines = 0;   # callers that want one paragraph per line set it afterwards
  if (! lastwasblank) {
    output_comment_line("#");
  }
  begin_new_parag();
}

function begin_new_chapter()
{
  # Opens the next chapter. A part must already be set and no chapter
  # may be open. The chapter is numbered nchapters + 1, announced with
  # a "## <part+chapter>" comment in the output and with "[<number>="
  # in the progress trace on stderr; its unit counter starts at 0.
  if (curpart == "") {
    data_error("unspecified part (2)");
  }
  if (curchapter != "") {
    data_error("inconsistent curchapter (2)");
  }

  curchapter = nchapters + 1;
  output_comment_line(sprintf("## <%s%03d>", curpart, curchapter));
  printf "[%d=", curchapter > "/dev/stderr";
  nchapunits = 0;
}

function spit_out_notes(  i,pp)
{
  # Writes out the saved marginal notes, unitnote[0..nunitnotes-1], and
  # then empties the list. Each @note..@endnote pair becomes a unit of
  # type "G". Assumes that no unit is open when it is called
  # (begin_new_unit checks this).
  #
  # The saved directive lines are recognized with the same patterns as
  # the rules that saved them, so "@note"/"@endnote" lines that carry
  # arguments are handled here too instead of being rejected later
  # with a misleading "bad @ in saved note".
  #
  # Locals: i = index into unitnote, pp = the saved line being handled.
  if (nunitnotes > 0)
    { for (i = 0; i < nunitnotes; i++)
        { pp = unitnote[i];
          if (pp ~ /^[@]note([ ]|$)/)
            { # Echo the directive and open the note's own unit.
              output_control_line(pp);
              begin_new_unit("G");
            }
          else if (pp ~ /^[@]endnote([ ]|$)/)
            { # The @endnote line itself is suppressed; just close
              # the note unit (without recursing into the notes).
              do_end_current_unit();
            }
          else if (pp ~ /^[@]/)
            { data_error("bad @ in saved note"); }
          else
            { output_contents_line(pp); }
        }
      nunitnotes = 0;
      split("", unitnote);
    }
}

function output_contents_line(lin,   loc)
{
  # Writes the contents line `lin', prefixed with its locator
  # <part+chapter.type+unit.line>. Page, part, chapter and unit must
  # all be defined. Every run of blanks/tabs in `lin' is replaced by a
  # single "_". Updates the unit and paragraph line counters and
  # clears "lastwasblank".
  # Local: loc = the formatted locator.
  nunitlines++;
  nparlines++;
  curline = nunitlines;

  if (curpage == "") {
    data_error("unspecified page");
  }
  if (curpart == "") {
    data_error("unspecified part");
  }
  if (curchapter == "") {
    data_error("unspecified chapter");
  }
  if (curunit == "") {
    data_error("unspecified unit");
  }

  gsub(/[ 	]+/, "_", lin);
  loc = sprintf("<%s%03d.%s%02d.%03d>", curpart, curchapter, curunittype, curunit, curline);
  printf "%-18s %s\n", loc, lin;
  lastwasblank = 0;
}

function output_comment_line(lin)
{
  # Writes the comment line `lin' with trailing blanks/tabs removed.
  # `lin' must start with "#". Sets "lastwasblank" to 1 when the comment
  # is blank (a "#" followed only by spaces), and to 0 otherwise.
  sub(/[ 	]+$/, "", lin);
  if (lin !~ /^[#]/) {
    data_error("bad comment");
  }
  lastwasblank = (lin ~ /^[#][ ]*$/) ? 1 : 0;
  print lin;
}

function output_control_line(lin)
{
  # Writes an "@" control line unchanged.
  print lin;
}

function arg_error(msg)
{
  # Reports a command-line error: writes `msg' and the usage text to
  # stderr, records the failure in "abort", and exits with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_error(msg)
{
  # Reports an error in the input data: writes `msg' to stderr,
  # prefixed with the current record number (FNR), records the failure
  # in "abort", and exits with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}