#! /usr/bin/gawk -f
# Last edited on 2012-02-19 06:00:08 by stolfilocal

# Reads a file in ".wds" format from standard input and prints the text
# as plain lines with a few TeX-like macros ("\titpg{..}{..}",
# "\chapt{..}{..}", "\finpg{..}", "\emph{..}", "\mth{..}").
# !!! Maybe too specific to {port/cso} and {port/csm} !!!
#
# Input lines are classified by their first character:
#   "$ ..."  section marker; the trailing "{tag}" selects the section
#   "# ..."  comment, ignored
#   "@ ..."  internal info, ignored
#   "a W"    alphabetic word W
#   "s W"    symbol W
#   "p W"    punctuation W ("÷" = end of paragraph, "=" = hard line
#            break, "_" = italics toggle)
# Any other line is a fatal format error.
#
# NOTE(review): the section patterns contain unescaped "{" and "}".
# They appear to rely on gawk treating a brace that does not start a
# valid interval expression as a literal character -- confirm before
# running under a different awk.

BEGIN {
  abort = -1;  # Exit status requested by an error function; -1 = none.
  usage = ( ARGV[0] " \\\n" \
    " < some.wds > some.txt" \
  );

  # See "wds-format.txt" for a description of the input file format.

  nlines = 0;  # Number of lines read.
  nwdin = 0;   # Number of words/symbols read.
  nptin = 0;   # Number of punctuation symbols read.

  olin = "";   # Current output line.

  # Parsing state (should use current section):
  ital = 0;  # 1 within italic text.
  math = 0;  # 1 within math formula.
  intx = 0;  # 1 within normal text.
  inct = 0;  # 1 within book title, book author, chapter number or chapter title.

  # "%%" yields a literal percent sign; a bare "% C" is not a valid
  # printf conversion.
  printf "%% Created by wds-to-plaintext.gawk\n";
}

# Stop processing input once an error function has requested an abort.
(abort >= 0) { exit abort; }

# Normalization applied to every input line before classification.
// {
  # Get rid of funny spaces (TAB, FF, CR, and octal 240):
  gsub(/[\011\014\015\240]/, " ");
  # Remove trailing blanks:
  gsub(/[ ]+$/, "");
  nlines++;
}

/^[\#][ ]*$/ {
  # Empty comment line, ignore:
  next;
}

# ----------------------------------------------------------------------
# Title page

/^[\$][ ].*{tpg} *$/ {
  # Start of title page:
  if ((intx != 0) || (inct != 0)) {
    data_warning(("misplaced title page"));
  }
  intx = 0;
  inct = 0;
  math = 0;
  next;
}

/^[\$][ ].*{tpg}{tt} *$/ {
  # Start of book title; opens the first argument of "\titpg":
  if ((intx != 0) || (inct != 0)) {
    data_warning(("misplaced book title"));
  }
  olin = "\\titpg{";
  intx = 0;
  inct = 1;
  math = 0;
  next;
}

/^[\$][ ].*{tpg}{au} *$/ {
  # Start of book author; closes the title argument and opens the next:
  if ((intx != 0) || (inct != 1)) {
    data_warning(("misplaced book author"));
  }
  olin = ( olin "}{" );
  intx = 0;
  math = 0;
  next;
}

# ----------------------------------------------------------------------
# Chapter headers:

/^[\$][ ].*{c[0-9]+} *$/ {
  # Start of chapter:
  if (inct != 0) {
    # Assume it was in a book title:
    olin = ( olin "}" );
  }
  if (math != 0) {
    # Must have been in math formula:
    olin = ( olin "}" );
  }
  if (ital != 0) {
    # Italics were never closed; close them here and clear the flag so
    # that the next "_" opens a fresh "\emph{" instead of emitting a
    # stray "}".
    data_warning(("unclosed italics"));
    olin = ( olin "}" );
    ital = 0;
  }
  if (intx != 0) {
    # Handle as parag break:
    output_olin();
  }
  output_olin();
  intx = 0;
  inct = 0;
  math = 0;
  next;
}

/^[\$][ ].*{c[0-9]+}{cn} *$/ {
  # Start of chapter number; opens the first argument of "\chapt":
  if (inct != 0) {
    data_warning(("dup chapter number"));
  }
  output_olin();
  olin = "\\chapt{";
  intx = 0;
  inct = 1;
  math = 0;
  next;
}

/^[\$][ ].*{c[0-9]+}{tt} *$/ {
  # Start of chapter title; closes the number argument and opens the next:
  if ((intx != 0) || (inct != 1)) {
    data_warning(("chapter title without chapter number"));
  }
  olin = ( olin "}{" );
  intx = 0;
  math = 0;
  next;
}

/^[\$][ ].*{tx} *$/ {
  # Start or restart of normal prose:
  if (inct != 0) {
    # End chapter title:
    olin = ( olin "}" );
  }
  if (math != 0) {
    # Must have been in math formula:
    olin = ( olin "}" );
  }
  if (intx == 0) {
    # Treat as paragraph break:
    output_olin();
    output_olin();
  }
  intx = 1;
  inct = 0;
  math = 0;
  next;
}

/^[\$][ ].*{(latp|frcp|itap|engp)} *$/ {
  # Start of foreign phrase, set {intx} but do not break parag.
  if (inct != 0) {
    data_warning(("foreign phrase in chapter title"));
  }
  if (math != 0) {
    olin = ( olin "}" );
  }
  intx = 1;
  math = 0;
  next;
}

/^[\$][ ].*{math} *$/ {
  # Start of math formula, output "\mth{" and set {intx,math} but do
  # not break parag.
  if (inct != 0) {
    data_warning(("math formula in chapter title"));
  }
  if (math != 0) {
    # Close the previous formula before opening a new one:
    olin = ( olin "}" );
  }
  olin = ( olin "\\mth{" );
  intx = 1;
  math = 1;
  next;
}

# ----------------------------------------------------------------------
# Final page

/^[\$][ ].*{fpg} *$/ {
  # Start of final page:
  if (inct != 0) {
    data_warning(("misplaced final page"));
  }
  if (math != 0) {
    olin = ( olin "}" );
  }
  if (intx != 0) {
    # Handle as parag break:
    output_olin();
    output_olin();
  }
  intx = 0;
  inct = 0;
  # The formula brace (if any) was closed above; clear the flag, as
  # every other section rule does, so it is not closed a second time.
  math = 0;
  next;
}

/^[\$][ ].*{fpg}{tt} *$/ {
  # Start of final page title; opens the argument of "\finpg":
  if ((intx != 0) || (inct != 0)) {
    data_warning(("misplaced final title"));
  }
  olin = "\\finpg{";
  intx = 0;
  inct = 1;
  next;
}

# ----------------------------------------------------------------------
# Other sections:

/^[\$][ ]/ {
  # Start of other sections, handle as end of paragraph, reset {intx}:
  data_warning(("unhandled section «" $0 "»"));
  output_olin();
  output_olin();
  intx = 0;
  inct = 0;
  next;
}

# ----------------------------------------------------------------------
# Non-sections:

/^[@\#][ ]/ {
  # Internal info or comment line, ignore:
  next;
}

/^[asp][ ]/ {
  # Alpha, symbol, or punctuation, output it:

  # Grab the type and word:
  type = substr($0, 1, 1);
  word = substr($0, 3);

  # Get the last two characters {bisp,prev} on current line:
  nolin = length(olin);
  prev = (nolin < 1 ? "" : substr(olin, nolin, 1));
  bisp = (nolin < 2 ? "" : substr(olin, nolin-1, 1));

  # Decide the spaces {befo} to add before {word} (or flush at end of
  # parag and skip the word altogether):
  if (type == "p") {
    # Punctuation:
    nptin++;
    if (word == "÷") {
      # End of paragraph: flush current line and emit a blank line.
      output_olin();
      output_olin();
      ital = 0;
      math = 0;
      next;
    } else if (word == "=") {
      # Hard line break:
      output_olin();
      next;
    } else if (word == "_") {
      # Italics toggle (left as a plain symbol inside math, where the
      # recoding below turns it into "/"):
      if (math) {
        befo = "";
      } else if (ital) {
        befo = "";
        word = "}";
        ital = 0;
      } else {
        befo = " ";
        word = "\\emph{";
        ital = 1;
      }
    } else if (word == "(") {
      befo = (prev == "(" ? "" : " ");
    } else if (word == "-") {
      if (prev == "-") {
        # Second dash in a row: extend the dash already on the line.
        befo = "";
        word = "--";
      } else {
        befo = " ";
      }
    } else if (word == "«") {
      befo = ((prev == "(") ? "" : " ");
    } else {
      # All other punctuation attaches to the preceding text:
      befo = "";
    }
  } else {
    # Alpha or symbol:
    nwdin++;
    if ((bisp == "-") && (prev == "-")) {
      # After an em-dash:
      befo = " ";
    } else if (prev == "»") {
      # After a close quote:
      befo = " ";
    } else if ((prev == "(") || (prev == "«") || (prev == "-") || (prev == "{") || (prev == "~")) {
      # Attach directly to an opening delimiter, hyphen, or "~":
      befo = "";
    } else {
      befo = " ";
    }
  }

  # No blanks needed at begin-of-line:
  if (prev == "") { befo = ""; }

  # Recode some symbols:
  gsub(/[_]/, "/", word);
  gsub(/[~]/, "-", word);
  gsub(/\^/, ".~", word);

  # Does it fit in the current line?  (A word with no space before it
  # is never split from what precedes it.)
  if ((befo == "") || (length(olin) + length(befo) + length(word) < 72)) {
    # Append {word} with its space before:
    olin = (olin befo word);
  } else {
    # Flush the line and append {word} without space:
    output_olin();
    olin = word;
  }
  next;
}

// {
  # Anything not consumed by a rule above is malformed:
  data_error(("bad line format"));
  next;
}

END {
  if (abort >= 0) { exit abort; }
  if (inct == 0) {
    data_warning(("missing final page?"));
  } else {
    # Close off final page:
    olin = ( olin "}" );
  }
  if (olin != "") { output_olin(); }
  # Statistics go to stderr so they do not mix with the text output:
  printf "%8d lines read\n", nlines > "/dev/stderr";
  printf "%8d words/symbols read\n", nwdin > "/dev/stderr";
  printf "%8d punctuation read\n", nptin > "/dev/stderr";
}

function output_olin(  )
{
  # Prints the current output line {olin} (possibly empty, which yields
  # a blank line) and clears it.
  print olin;
  olin = "";
}

function arg_error(msg)
{
  # Reports a command-line error {msg} plus the usage text on stderr
  # and exits with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_warning(msg)
{
  # Reports a non-fatal problem {msg} at the current input line on
  # stderr; processing continues.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
}

function data_error(msg)
{
  # Reports a fatal problem {msg} at the current input line, echoes the
  # offending line on stderr, and exits with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  printf " %s\n", $0 > "/dev/stderr";
  abort = 1;
  exit 1;
}

function tbl_error(f,n,msg)
{
  # Reports a fatal problem {msg} at line {n} of file {f} on stderr and
  # exits with status 1.
  printf "file %s, line %d: %s\n", f, n, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}