#! /usr/bin/gawk -f
# Last edited on 2001-03-12 22:26:27 by stolfi

# Process Culpeper's herbal
#
# Reads a marked-up text from standard input and writes it to standard
# output with a location tag prepended to every contents line.
# Lines starting with "#" are comments and are copied through; lines
# starting with "@" are control directives that delimit pages, parts,
# chapters, units and paragraphs.  Progress marks are written to
# "/dev/stderr".

BEGIN {
  abort = -1;
  usage = ( "process-cul \\\n" \
    " < INFILE.txt > OUTFILE.evt" \
  );

  # Lines will be numbered with <Xppp.Ukk.nnn> where
  #   X is a letter identifying the part of the book (a,b,c...)
  #   ppp is the chapter number (sequential through book)
  #   U is a letter identifying the unit type
  #   kk is the seq number of the unit within the chapter
  #   nnn is the line number within the unit
  #
  # Unit type letters assigned by the rules below:
  #   B = @booktitle     C = @chaptitle     F = @figtitle
  #   S = @sectitle      R = @subsectitle   L = @indexlabel / @listlabel
  #   N = @namelist      M = @plantlist     X = @indexlines
  #   K = @contentslines P = @text          T = @textitem
  #   V = @englverse     Q = @latinverse    G = marginal note (@note)

  curpage = "";      # Current page number.
  curpart = "";      # Current part number.
  curchapter = "";   # Current chapter number.
  nchapters = 0;     # Number of completed nonempty chapters.
  curunit = "";      # Current unit number within chapter.
  nchapunits = "";   # Number of completed nonempty units in chapter.
  nunitlines = 0;    # Number of lines already seen in current unit.
  nparlines = 0;     # Number of lines already seen in current parag.
  seplines = 0;      # TRUE = add parag mark to each line in current unit
  gobbling = 0;      # TRUE = we are inside a @note..@endnote pair.
  nunitnotes = 0;    # Number of lines of notes attached to current parag.
  split("", unitnote); # unitnote[0..nunitnotes-1] are the pending notes.
  lastwasblank = 0;  # TRUE if last non-@ line written was a blank #-comment.
}

# Stop processing as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Comment line: strip leading blanks and copy it through.
/^ *[#]/ {
  lin = $0;
  gsub(/^ */,"",lin);
  output_comment_line(lin);
  next;
}

# Blank line: written out as an empty "#" comment.
/^ *$/ {
  output_comment_line("#");
  next;
}

# An "@" is only allowed as the very first character of a line.
/.+[@]/ {
  data_error("@ not on column 1");
}

/./ {
  # General contents and control line cleanup
  gsub(/^[ ]+/, "", $0);
  gsub(/[ ]+$/, "", $0);
  gsub(/[ ]+/, " ", $0);
}

/^[@]note([ ]|$)/ {
  # start of marginal note. Gobble it up and discharge it
  # at the next end-of-unit
  if (gobbling) { data_error("inconsistent @note"); }
  gobbling = 1;
  unitnote[nunitnotes] = $0;
  nunitnotes++;
  next;
}

/^[@]endnote([ ]|$)/ {
  # end of marginal note. Keep it and discharge it
  # at the next end-of-paragraph
  if (! gobbling) { data_error("inconsistent @endnote"); }
  unitnote[nunitnotes] = $0;
  nunitnotes++;
  gobbling = 0;
  # With no unit open there is nothing to wait for: flush right away.
  if (curunit == "") { spit_out_notes(); }
  next;
}

(gobbling) {
  # We are inside a marginal note
  if ($0 ~ /^[@]/) { data_error("unexpected @ in note"); }
  unitnote[nunitnotes] = $0;
  nunitnotes++;
  next;
}

/^[@]page([ ]|$)/ {
  # Page boundary. Print it as a comment.
  curpage = $2;
  output_control_line($0);
  output_comment_line(("# page " curpage));
  next;
}

/^[@]part([ ]|$)/ {
  # begin another part of book
  end_current_chapter();
  curpart = $2;
  # Do not reset chapter numbers between parts
  output_control_line($0);
  next;
}

/^[@]chapter([ ]|$)/ {
  # start a new chapter
  end_current_chapter();
  output_control_line($0);
  begin_new_chapter();
  next;
}

/^[@]booktitle([ ]|$)/ {
  # start of book's title
  end_current_unit();
  output_control_line($0);
  begin_new_unit("B");
  next;
}

/^[@]chaptitle([ ]|$)/ {
  # start of chapter title
  end_current_unit();
  output_control_line($0);
  begin_new_unit("C");
  next;
}

/^[@]figtitle([ ]|$)/ {
  # start of figure title
  end_current_unit();
  output_control_line($0);
  begin_new_unit("F");
  next;
}

/^[@]sectitle([ ]|$)/ {
  # start of section title
  end_current_unit();
  output_control_line($0);
  begin_new_unit("S");
  next;
}

/^[@]subsectitle([ ]|$)/ {
  # start of subsection title
  end_current_unit();
  output_control_line($0);
  begin_new_unit("R");
  next;
}

/^[@][=]([ ]|$)/ {
  # paragraph delimiter
  end_current_parag();
  begin_new_parag();
  next;
}

/^[@](indexlabel|listlabel)([ ]|$)/ {
  # A numeric or letter label
  end_current_unit();
  output_control_line($0);
  begin_new_unit("L");
  next;
}

/^[@]namelist([ ]|$)/ {
  # Start of a list of people names
  end_current_unit();
  output_control_line($0);
  begin_new_unit("N");
  # Must follow begin_new_unit(), which resets seplines to 0.
  seplines = 1;
  next;
}

/^[@]plantlist([ ]|$)/ {
  # Start of a list of plant names
  end_current_unit();
  output_control_line($0);
  begin_new_unit("M");
  seplines = 1;
  next;
}

/^[@]indexlines([ ]|$)/ {
  # Start of a list of index entries
  end_current_unit();
  output_control_line($0);
  begin_new_unit("X");
  seplines = 1;
  next;
}

/^[@]contentslines([ ]|$)/ {
  # Start of a list of table-of-contents entries
  end_current_unit();
  output_control_line($0);
  begin_new_unit("K");
  seplines = 1;
  next;
}

/^[@]text([ ]|$)/ {
  # Start (or continuation) of running text block
  # A "@text" inside an open "P" unit is silently absorbed.
  if (curunittype != "P") {
    end_current_unit();
    output_control_line($0);
    begin_new_unit("P");
  }
  next;
}

/^[@]textitem([ ]|$)/ {
  # A short text fragment, usually not a sentence
  end_current_unit();
  output_control_line($0);
  begin_new_unit("T");
  next;
}

/^[@]englverse([ ]|$)/ {
  end_current_unit();
  output_control_line($0);
  begin_new_unit("V");
  next;
}

/^[@]latinverse([ ]|$)/ {
  end_current_unit();
  output_control_line($0);
  begin_new_unit("Q");
  next;
}

# Blank lines are already consumed by the earlier blank-line rule;
# this one is kept only as a safeguard.
/^ *$/ { next; }

/^[@]/ {
  data_error("unknown @ directive");
}

/./ {
  # Contents line, phew!
  output_contents_line(($0 " "));
  if (seplines) {
    # In list-like units every line is its own paragraph.
    end_current_parag();
    begin_new_parag();
  }
  next;
}

END {
  if (abort >= 0) { exit abort; }
  # A note still open at end of input would otherwise be lost or
  # leave an unfinished note unit behind.
  if (gobbling) { data_error("unterminated @note"); }
  end_current_chapter();
}

function end_current_chapter()
{
  # Finishes off the current chapter, and increments "nchapters"
  # if the chapter had at least one nonempty unit.
  # When no chapter is open, only checks that no unit is open either.
  if (curchapter != "") {
    end_current_unit();
    if (nchapunits > 0) { nchapters++; }
    curchapter = "";
    nchapunits = "";
    printf "]\n" > "/dev/stderr";
  } else {
    if (curunit != "") { data_error("inconsistent curunit (0)"); }
    if (nchapunits != "") { data_error("inconsistent nchapunits (0)"); }
  }
}

function end_current_unit()
{
  # Finishes off the current unit, and defines the default "nchapunits".
  # Also dumps any pending notes as extra units.
  do_end_current_unit();
  if (nunitnotes > 0) { spit_out_notes(); }
}

function do_end_current_unit()
{
  # Finishes off the current unit, and increments "nchapunits"
  # if the unit had any lines.
  # Does NOT try to dump the pending notes.
  if (curunit != "") {
    end_current_parag();
    if (nunitlines > 0) {
      nchapunits++;
      printf "%d", nunitlines > "/dev/stderr";
    }
    curunit = "";
    curunittype = "";
    nunitlines = 0;
    printf ")" > "/dev/stderr";
  } else {
    if (nunitlines > 0) { data_error("inconsistent curunit (1)"); }
  }
}

function end_current_parag()
{
  # Finishes off the current paragraph: if it has any lines,
  # emits a closing contents line consisting of "=".
  if (nparlines > 0) { output_contents_line("="); }
  nparlines = 0;
}

function begin_new_parag()
{
  # Starts a new paragraph.
  nparlines = 0;
}

function begin_new_unit(newtype)
{
  # Initializes a new unit of the given type letter "newtype".
  # Assumes that the previous unit has been
  # finished, that there are no pending notes, and that
  # nchapunits is the number of complete nonempty units in chapter.
  if (curunit != "") { data_error("inconsistent curunit (2)"); }
  if (curunittype != "") { data_error("inconsistent curunittype (2)"); }
  curunit = nchapunits + 1;
  curunittype = newtype;
  printf "(%s%d:", curunittype, curunit > "/dev/stderr";
  nunitlines = 0;
  seplines = 0;
  # Make sure the unit is preceded by a blank comment line.
  if (! lastwasblank) { output_comment_line("#"); }
  begin_new_parag();
}

function begin_new_chapter()
{
  # Opens the next chapter (numbered nchapters + 1) and writes its
  # "## <Xppp>" header comment.  Requires a current part and no
  # chapter already open.
  if (curpart == "") { data_error("unspecified part (2)"); }
  if (curchapter != "") { data_error("inconsistent curchapter (2)"); }
  curchapter = nchapters + 1;
  output_comment_line(sprintf("## <%s%03d>", curpart, curchapter));
  printf "[%d=", curchapter > "/dev/stderr";
  nchapunits = 0;
}

function spit_out_notes(   i,pp)
{
  # Dumps the marginal notes at the end of a paragraph.
  # Assumes that the current paragraph has already been
  # finalized except for the notes, and curunit is
  # the new unit number.
  # Each saved @note..@endnote group becomes a unit of type "G".
  if (nunitnotes > 0) {
    for(i = 0; i < nunitnotes; i++) {
      pp = unitnote[i];
      # Use the same patterns as the input rules, so that a directive
      # that was accepted with arguments is also recognized here.
      if (pp ~ /^[@]note([ ]|$)/) {
        output_control_line(pp);
        begin_new_unit("G");
      } else if (pp ~ /^[@]endnote([ ]|$)/) {
        # suppress the @endnote line
        do_end_current_unit();
      } else if (pp ~ /^[@]/) {
        data_error("bad @ in saved note");
      } else {
        output_contents_line(pp);
      }
    }
    nunitnotes = 0;
    split("", unitnote);
  }
}

function output_contents_line(lin,   loc)
{
  # Writes the contents line "lin" prefixed with its location tag
  # <Xppp.Ukk.nnn>, replacing each run of blanks by "_".
  # Counts the line in the current unit and paragraph.
  nunitlines++;
  nparlines++;
  curline = nunitlines;
  if (curpage == "") { data_error("unspecified page"); }
  if (curpart == "") { data_error("unspecified part"); }
  if (curchapter == "") { data_error("unspecified chapter"); }
  if (curunit == "") { data_error("unspecified unit"); }
  gsub(/[ ]+/, "_", lin);
  loc = sprintf("<%s%03d.%s%02d.%03d>", curpart, curchapter, curunittype, curunit, curline);
  printf "%-18s %s\n", loc, lin;
  lastwasblank = 0;
}

function output_comment_line(lin)
{
  # Outputs the comment line `lin', and sets `lastwasblank'
  # to 1 when the line is just "#" (plus optional blanks), else 0.
  gsub(/[ ]+$/, "", lin);
  if (lin !~ /^[#]/) { data_error("bad comment"); }
  lastwasblank = 0;
  if (lin ~ /^[#][ ]*$/) { lastwasblank = 1; }
  print lin;
}

function output_control_line(lin)
{
  # Outputs an "@" line
  print lin;
}

function arg_error(msg)
{
  # Reports a command-line error plus the usage text, and exits
  # with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an input error tagged with the current record number,
  # and exits with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}