#! /usr/bin/gawk -f
# Last edited on 2012-02-19 06:00:08 by stolfilocal
# Reads a file in ".wds" format, prints the text in plain.
# !!! Maybe too specific to {port/cso} and {port/csm} !!!
BEGIN {
  # Exit status requested by an error handler; negative means "no error yet".
  abort = -1;
  # Usage text printed by {arg_error}.
  usage = ( ARGV[0] " \\\n" \
    " < some.wds > some.txt" \
  );
  #
  # See "wds-format.txt" for a description of the input file format.
  nlines = 0; # Number of lines read.
  nwdin = 0;  # Number of words/symbols read.
  nptin = 0;  # Number of punctuation symbols read.
  olin = "";  # Current output line.
  # Parsing state (should use current section):
  ital = 0; # 1 within italic text.
  math = 0; # 1 within math formula.
  intx = 0; # 1 within normal text.
  inct = 0; # 1 within book title, book author, chapter number or chapter title
  # Header line of the output.  A literal percent sign in a printf format
  # must be written "%%"; a bare "%" starts a conversion specification.
  printf "%% Created by wds-to-plaintext.gawk\n"
}
# Once an error handler has set {abort}, stop with that status.
abort >= 0 {
  exit abort;
}

# Every record: count it and normalize its whitespace.
{
  nlines += 1;
  # Tabs, form feeds, carriage returns and \240 characters become plain blanks:
  gsub(/[\011\014\015\240]/, " ", $0);
  # Drop the run of blanks (if any) at the end of the record:
  sub(/[ ]+$/, "", $0);
}

# A lone "#", optionally followed by blanks, is an empty comment: skip it.
/^[\#][ ]*$/ {
  next;
}
# ----------------------------------------------------------------------
# Title page
/^[\$][ ].*{tpg} *$/ {
  # Title page opener: expected only while outside both prose and headings.
  if (! ((intx == 0) && (inct == 0))) {
    data_warning("misplaced title page");
  }
  # Return to the neutral state:
  intx = inct = math = 0;
  next;
}
/^[\$][ ].*{tpg}{tt} *$/ {
  # Book title: expected only while outside both prose and headings.
  if (! ((intx == 0) && (inct == 0))) {
    data_warning("misplaced book title");
  }
  # Begin the "\titpg{" group; this replaces whatever was pending in {olin}.
  olin = "\\titpg{";
  inct = 1;
  intx = math = 0;
  next;
}
/^[\$][ ].*{tpg}{au} *$/ {
  # Book author: expected with a heading open ({inct} = 1) and prose off.
  if (! ((intx == 0) && (inct == 1))) {
    data_warning("misplaced book author");
  }
  # End the title argument and open the author argument; {inct} stays as is.
  olin = olin "}{";
  math = 0;
  intx = 0;
  next;
}
# ----------------------------------------------------------------------
# Chapter headers:
/^[\$][ ].*{c[0-9]+} *$/ {
  # Start of chapter: close any open groups, flush pending text, reset state.
  if (inct != 0)
    { # Assume it was in a book title; close that group:
      olin = ( olin "}" );
    }
  if (math != 0)
    { # Close the open math formula:
      olin = ( olin "}" );
    }
  if (ital != 0)
    { # Italics were left open: report it, close the group, and clear the
      # flag so that the next "_" opens a new italic span instead of
      # emitting a stray "}".
      data_warning(("unclosed italics"));
      olin = ( olin "}" );
      ital = 0;
    }
  if (intx != 0)
    { # Handle as parag break (the second flush below then prints a blank line):
      output_olin();
    }
  output_olin();
  intx = 0;
  inct = 0;
  math = 0;
  next;
}
/^[\$][ ].*{c[0-9]+}{cn} *$/ {
  # Chapter number: a heading already in progress counts as a duplicate.
  if (inct != 0) {
    data_warning("dup chapter number");
  }
  # Flush the pending line, then open the "\chapt{" group.
  output_olin();
  olin = "\\chapt{";
  inct = 1;
  intx = math = 0;
  next;
}
/^[\$][ ].*{c[0-9]+}{tt} *$/ {
  # Chapter title: expected with a heading open ({inct} = 1) and prose off.
  if (! ((intx == 0) && (inct == 1))) {
    data_warning("chapter title without chapter number");
  }
  # End the number argument and open the title argument; {inct} stays as is.
  olin = olin "}{";
  math = 0;
  intx = 0;
  next;
}
/^[\$][ ].*{tx} *$/ {
  # Normal prose begins or resumes.
  # Append one closing brace per open group: heading first, then math formula.
  olin = olin (inct != 0 ? "}" : "") (math != 0 ? "}" : "");
  if (intx == 0) {
    # Not already in prose, so treat as a paragraph break:
    # flush the pending line and follow it with an empty one.
    output_olin();
    output_olin();
  }
  math = 0;
  inct = 0;
  intx = 1;
  next;
}
/^[\$][ ].*{(latp|frcp|itap|engp)} *$/ {
  # Foreign phrase: switch to prose mode without breaking the paragraph.
  if (inct != 0) {
    data_warning("foreign phrase in chapter title");
  }
  # Close an open math formula, if any:
  olin = olin (math != 0 ? "}" : "");
  math = 0;
  intx = 1;
  next;
}
/^[\$][ ].*{math} *$/ {
  # Start of math formula: append "\mth{" to the current line and set
  # {intx,math}, but do not break the paragraph.
  if (inct != 0) { data_warning(("math formula in chapter title")); }
  # A formula that is still open gets closed before the new one starts:
  if (math != 0) { olin = ( olin "}" ); }
  olin = ( olin "\\mth{" );
  intx = 1;
  math = 1;
  next;
}
# ----------------------------------------------------------------------
# Final page
/^[\$][ ].*{fpg} *$/ {
  # Start of final page.
  if (inct != 0)
    { data_warning(("misplaced final page")); }
  # Close an open math formula, if any:
  if (math != 0) { olin = ( olin "}" ); }
  if (intx != 0)
    { # Handle as parag break: flush the line, then print an empty one.
      output_olin();
      output_olin();
    }
  intx = 0;
  inct = 0;
  # The formula group was closed above, so leave math mode too; otherwise
  # a later "_" is still handled as in-math and a later section appends a
  # second "}".
  math = 0;
  next;
}
/^[\$][ ].*{fpg}{tt} *$/ {
  # Final page title: expected only while outside both prose and headings.
  if (! ((intx == 0) && (inct == 0))) {
    data_warning("misplaced final title");
  }
  # Begin the "\finpg{" group; this replaces whatever was pending in {olin}.
  olin = "\\finpg{";
  inct = 1;
  intx = 0;
  next;
}
# ----------------------------------------------------------------------
# Other sections:
/^[\$][ ]/ {
  # A "$" section line that reaches this rule: report it and treat it as
  # the end of a paragraph.
  data_warning("unhandled section «" $0 "»");
  # Flush the pending line and follow it with an empty one:
  output_olin();
  output_olin();
  # Leave both prose and heading modes:
  inct = 0;
  intx = 0;
  next;
}
# ----------------------------------------------------------------------
# Non-sections:
/^[@\#][ ]/ {
  # Lines starting with "@" or "#" plus a blank carry internal info or
  # comments; they produce no output.
  next;
}
/^[asp][ ]/ {
# Alpha ("a"), symbol ("s"), or punctuation ("p") record: append its text
# to the current output line {olin}, choosing the blank {befo} (if any) that
# goes before it and wrapping the line when it would get too long.
# Grab the type (first character) and the word (everything from column 3):
type = substr($0, 1, 1);
word = substr($0, 3);
# Get the last two characters {bisp,prev} on current line
# ({prev} is the last one, {bisp} the one before it; "" when absent):
nolin = length(olin);
prev = (nolin < 1 ? "" : substr(olin, nolin, 1));
bisp = (nolin < 2 ? "" : substr(olin, nolin-1, 1));
# Decide the spaces {befo} to add before {word}.  Paragraph ends and hard
# line breaks instead flush the output and skip to the next record:
if (type == "p")
{ # Punctuation:
nptin++;
if (word == "÷")
{ # End of paragraph: flush the line, print an empty one, and drop the
# italic and math flags (no closing brace is emitted here):
output_olin();
output_olin();
ital = 0;
math = 0;
next;
}
else if (word == "=")
{ # Hard line break:
output_olin();
next;
}
else if (word == "_")
{ # Underscore: kept as is inside math (it is recoded to "/" below);
# otherwise it toggles italics, closing or opening an "\emph{" group:
if (math)
{ befo = ""; }
else if (ital)
{ befo = ""; word = "}"; ital = 0; }
else
{ befo = " "; word = "\\emph{"; ital = 1; }
}
else if (word == "(")
{ # Open parenthesis: blank before it unless it follows another "(":
befo = (prev == "(" ? "" : " "); }
else if (word == "-")
{ # Dash: directly after a "-" already on the line, append "--" with no
# blank; otherwise a single "-" preceded by a blank:
if (prev == "-")
{ befo = ""; word = "--"; }
else
{ befo = " "; }
}
else if (word == "«")
{ # Open quote: blank before it unless it follows "(":
befo = ((prev == "(") ? "" : " "); }
else
{ # Any other punctuation attaches to the preceding text:
befo = ""; }
}
else
{ # Alpha or symbol:
nwdin++;
if ((bisp == "-") && (prev == "-"))
{ # After an em-dash:
befo = " ";
}
else if (prev == "»")
{ # After a close quote:
befo = " ";
}
else if ((prev == "(") || (prev == "«") || (prev == "-") || (prev == "{") || (prev == "~"))
{ # Directly after an opener, a single hyphen, or "~": no blank.
befo = ""; }
else
{ befo = " "; }
}
# No blanks needed at begin-of-line:
if (prev == "") { befo = ""; }
# Recode some symbols: "_" becomes "/", "~" becomes "-", "^" becomes ".~".
# The "^" rule runs last, so the "~" it inserts is not turned into "-".
# The substituted words "}", "\emph{" and "--" contain none of these.
gsub(/[_]/, "/", word);
gsub(/[~]/, "-", word);
gsub(/\^/, ".~", word);
# Does it fit in the current line?  A word with no blank before it is
# always glued on, whatever the resulting length; otherwise the line must
# stay under 72 as measured by length():
if ((befo == "") || (length(olin) + length(befo) + length(word) < 72))
{ # Append {word} with its space before:
olin = (olin befo word);
}
else
{ # Flush the line and append {word} without space:
output_olin();
olin = word;
}
next;
}
{
  # A record that reaches this rule matched no known line format.
  data_error("bad line format");
  next;
}
END {
  # An error handler already chose the exit status: honour it, skip the rest.
  if (abort >= 0) {
    exit abort;
  }
  if (inct != 0) {
    # A heading group is still open: close it off.
    olin = olin "}";
  } else {
    data_warning("missing final page?");
  }
  # Flush whatever text is still pending:
  if (length(olin) > 0) {
    output_olin();
  }
  # Summary counts go to the error stream, not to the converted text:
  printf("%8d lines read\n", nlines) > "/dev/stderr";
  printf("%8d words/symbols read\n", nwdin) > "/dev/stderr";
  printf("%8d punctuation read\n", nptin) > "/dev/stderr";
}
function output_olin()
{
  # Write the pending output line {olin} (possibly empty) to standard
  # output, then start a new, empty line.
  print(olin);
  olin = "";
}
function arg_error(msg)
{
  # Report a command-line problem: print {msg} and the usage text on the
  # error stream, record the failure in {abort}, and exit with status 1.
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit abort;
}
function data_warning(msg)
{
  # Non-fatal diagnostic: print {msg} on the error stream, tagged with the
  # current input record number, and carry on.
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
}
function data_error(msg)
{
  # Fatal input error: print {msg} tagged with the current record number,
  # echo the offending record, record the failure in {abort}, and exit
  # with status 1.
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  printf(" %s\n", $0) > "/dev/stderr";
  abort = 1;
  exit abort;
}
function tbl_error(f,n,msg)
{
  # Fatal error at line {n} of file {f}: print {msg} with that location on
  # the error stream, record the failure in {abort}, and exit with status 1.
  printf("file %s, line %d: %s\n", f, n, msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}