#! /bin/gawk -f
# Last edited on 2023-05-10 12:08:08 by stolfi
# Converts a text from ".src" format to a list of words, one per line
BEGIN {
  # Exit status requested by an error routine; it stays negative until one
  # of them sets it, and the main rules and END check it before doing work.
  abort = -1;
  usage = ( ARGV[0] " \\\n" \
    " < main.src > main.wds" \
  );
  #
  # See "src-format.txt" for the input file format.
  # See "wds-format.txt" for the output file format.

  # TO DO: !!! Use locator lines to print errors with correct filename and line

  # Section stack
  split("", sectag);       # {sectag[n]} is the tag of level {n}
  minlevel = 1;            # Minimum section level.
  curlevel = minlevel-1;   # Current section level.
  prevtag = "";            # Tag of section just ended, if any.
  seccount = 0;            # Counts sections, for debugging output.

  # Character type table, indexed by single character.  The one-letter
  # classes used by this script are "a" (alpha), "s" (symbol), "p" (punct),
  # "b" (blank), "n" (null) and "i" (invalid).
  # (The original cleared a misspelled array {chype} here, leaving
  # {chtype} uninitialized and creating a stray unused array.)
  split("", chtype);
  chtype[" "] = "b";
  chtype["@"] = "i";
  chtype["#"] = "i";
  chtype["{"] = "i";
  chtype["}"] = "i";

  # Word type and remapping table:
  split("", wdtype);       # {wdtype[w]} is the class of word {w}
  split("", wdmap);        # {wdmap[w]} is the replacement text for word {w}
}
# Once an error routine has set {abort}, stop processing input records.
abort >= 0 {
  exit abort;
}
# Applies to every record: normalize whitespace before the other rules run.
{
  # Turn TAB, FF, CR and octal-240 characters into plain spaces:
  gsub(/[\011\014\015\240]/, " ");
  # Drop blanks at the end of the line (the pattern is anchored at the
  # end, so there is at most one match):
  sub(/[ ]+$/, "");
}
# Blank lines and "#" comment lines.
/^ *([#]|$)/ {
  # Copy the line to the output as a comment entry:
  printf("# %s\n", $0);
  next;
}
# "@chars CLASS {LIST}" directive: assigns class CLASS to each character
# of LIST in the {chtype} table.
/^ *[@]chars[ ]*(alpha|symbol|punct|blank|null|invalid)[ ]*[{].*[}][ ]*$/ {
  # Echo the directive as a comment entry:
  printf("# %s\n", $0);

  # The class name lies between the keyword and the opening brace;
  # only its first letter is kept as the class code:
  type = $0;
  sub(/^[ ]*@chars[ ]*/, "", type);
  sub(/[ ]*[{].*[}][ ]*$/, "", type);
  type = substr(type, 1, 1);

  # The character list is whatever lies between the outermost braces:
  chars = $0;
  sub(/^[ ]*@chars[ ]*[a-z]+[ ]*[{]/, "", chars);
  sub(/[}][ ]*$/, "", chars);

  # Record the class of each listed character.  A character that already
  # has a different class is an error; one with the same class is fine.
  i = 1;
  while (i <= length(chars))
    { c = substr(chars, i, 1);
      if (! (c in chtype))
        { chtype[c] = type; }
      else if (chtype[c] != type)
        { data_error(("bad character {" c "} in chars")); }
      i++;
    }
  next;
}
# "@wordmap CLASS {FILE}" directive: loads a word table from FILE, with
# all its words tagged as class CLASS.
/^ *[@]wordmap[ ]*(alpha|symbol|punct|blank|null|invalid)[ ]*[{].*[}][ ]*$/ {
  # Echo the directive as a comment entry:
  printf("# %s\n", $0);

  # The class name lies between the keyword and the opening brace;
  # only its first letter is kept as the class code:
  type = $0;
  sub(/^[ ]*@wordmap[ ]*/, "", type);
  sub(/[ ]*[{].*[}][ ]*$/, "", type);
  type = substr(type, 1, 1);

  # The file name is whatever lies between the outermost braces:
  fname = $0;
  sub(/^[ ]*@wordmap[ ]*[a-z]+[ ]*[{]/, "", fname);
  sub(/[}][ ]*$/, "", fname);

  # Load the table into {wdtype} and {wdmap}:
  read_word_table(fname, type);
  next;
}
# "@begin {TAG}" directive: opens a section nested in the current one.
/^[@]begin[ ]*[{][^ {}]+[}][ ]*$/ {
  # Echo the directive as a comment entry:
  printf("# %s\n", $0);

  # The tag is the text between the braces:
  tag = $0;
  sub(/^[ ]*@begin[ ]*[{]/, "", tag);
  sub(/[}][ ]*$/, "", tag);

  # Push the new section and emit the updated locator line:
  begin_section(tag);
  output_section_locator();
  next;
}
# "@end {TAG}" directive: closes open sections up to and including TAG.
/^[@]end[ ]*[{][^ {}]+[}][ ]*$/ {
  # Echo the directive as a comment entry:
  printf("# %s\n", $0);

  # The tag is the text between the braces:
  tag = $0;
  sub(/^[ ]*@end[ ]*[{]/, "", tag);
  sub(/[}][ ]*$/, "", tag);

  # Pop sections until the one with this tag has been closed, then
  # emit the updated locator line:
  end_section(tag);
  output_section_locator();
  next;
}
# "@section LEVEL {TAG}" directive: closes any open sections at depth
# LEVEL or deeper, then opens a new section TAG at depth LEVEL.
/^[@]section[ ]*[0-9]+[ ]*[{][^ {}]+[}][ ]*$/ {
  # Output it as comment entry:
  printf "# %s\n", $0;

  # Extract the nesting level (as written in the input):
  levtxt = $0;
  gsub(/^[ ]*@section[ ]*/, "", levtxt);
  gsub(/[ ]*[{].*[}][ ]*$/, "", levtxt);
  # The extracted text is a string, and awk compares a string with a number
  # as strings (so "2" > "10").  Force a numeric value so that the level
  # tests below are numeric and {sectag[lev]} uses the canonical subscript
  # (e.g. "02" becomes 2).
  lev = levtxt + 0;

  # Extract the section tag:
  tag = $0;
  gsub(/^[ ]*@section[ ]*[0-9]+[ ]*[{]/, "", tag);
  gsub(/[}][ ]*$/, "", tag);

  # Unstack section until the given level:
  if ((lev < minlevel) || (lev > curlevel + 1))
    { data_error(("@invalid level \"" levtxt "\"")); }
  else if (lev <= curlevel)
    { end_section(sectag[lev]); }
  # Sanity check: the new section must now be exactly one level deeper.
  if (lev != curlevel + 1)
    { data_error(("program bug: curlevel")); }
  begin_section(tag);

  # Output the section locator:
  output_section_locator();
  next;
}
# Any remaining non-empty line is a contents (text) line.
/./ {
  # Emit the line locator, giving the input line number:
  printf("@ %d\n", FNR);

  # Split the line into words and write them out:
  process_contents_line($0);

  # Once some contents have been seen, a section tag may be reused:
  prevtag = "";
  next;
}
END {
  # An error routine already ran: just propagate its exit status.
  if (abort >= 0)
    { exit abort; }

  # Close whatever sections are still open, outermost included:
  if (curlevel >= minlevel)
    { end_section(sectag[minlevel]); }

  # Finish the progress report on the error stream:
  printf "\n" > "/dev/stderr";
}
function output_section_locator(    lev)
{
  # Writes to standard output a "$" locator line listing the tags of all
  # currently open sections, outermost first, each enclosed in braces.
  printf("$ ");
  lev = minlevel;
  while (lev <= curlevel)
    { printf("{%s}", sectag[lev]);
      lev++;
    }
  printf("\n");
}
function begin_section(tag,    j)
{
  # Opens a new section with the given {tag}, nested inside the current
  # one, and reports it on the error stream.  Fails if {tag} is empty,
  # equals the tag of the section that was just closed, or equals the
  # tag of any section that is still open.

  if (tag == "")
    { data_error(("empty section tag")); }
  if (tag == prevtag)
    { data_error(("consecutive sections with same tag \"" tag "\"")); }
  j = minlevel;
  while (j <= curlevel)
    { if (sectag[j] == tag)
        { data_error(("nested sections with same tag \"" tag "\"")); }
      j++;
    }

  # Push the tag onto the section stack:
  sectag[++curlevel] = tag;

  # The next "@begin" will be the first one inside this section:
  prevtag = "";

  # Progress report: start a new indented line for shallow sections, or
  # after enough sections on the current line; otherwise stay on the line.
  if ((seccount <= 7) && (curlevel - minlevel >= 2))
    { printf " " > "/dev/stderr"; }
  else
    { printf "\n%*s", 2*(curlevel-minlevel), "" > "/dev/stderr";
      seccount = 0;
    }
  printf "{ %s", sectag[curlevel] > "/dev/stderr";
  seccount++;
}
function end_section(tag)
{
  # Closes open sections, innermost first, up to and including the one
  # whose tag is {tag}.  Fails if no open section has that tag.

  # Pop every section nested inside the requested one:
  while (curlevel >= minlevel)
    { if (sectag[curlevel] == tag) { break; }
      end_current_section();
    }

  # Running out of sections means the tag was never opened:
  if (curlevel < minlevel)
    { data_error(("@end tag mismatch \"" tag "\"")); }

  # Pop the requested section itself:
  end_current_section();
}
function end_current_section()
{
  # Closes the innermost open section and reports it on the error stream.

  printf " }" > "/dev/stderr";

  # A non-empty {prevtag} means another section was closed just before
  # this one; push {seccount} past the threshold so that the next section
  # opened starts a new report line:
  if (prevtag != "")
    { seccount = 100; }

  # Remember the closed tag (the next "@begin" will be its sibling) and
  # pop the stack:
  prevtag = sectag[curlevel--];
}
function process_contents_line(lin,    c,ct,w,wt,len)
{
  # Parses the contents line {lin} and writes its words to standard output.
  #
  # The line is consumed from the left.  Three kinds of items are recognized:
  #   "@T{TEXT}"  explicit-type text, with T one of "a", "s", "p", "n", "b";
  #   "{TEXT}"    an embedded comment, which is skipped;
  #   any other single character, handled according to its class in {chtype}.
  # Alpha and symbol characters accumulate into the current word {w}, whose
  # type {wt} is "a" unless a symbol character was added.  Punctuation,
  # blanks, and non-null "@" constructs end the current word.
  #
  # Fails (through {data_error}) on malformed constructs, on characters that
  # are not in {chtype} or are classed invalid, and when called outside any
  # section.

  # We must be inside a section:
  if (curlevel < minlevel) { data_error(("missing a @begin or @section")); }

  # Parse line:
  w = ""; wt = "a";
  while (lin != "")
    { c = substr(lin, 1, 1);
      if (c == "@")
        { # Explicit-type text construct
          if (! match(lin, /^[@][aspnb][{][^{}]+[}]/))
            { data_error(("malformed embedded @-construct \"" lin "\"")); }
          # Save the construct length now, so that it does not depend on
          # what the helpers called below do with {RLENGTH}:
          len = RLENGTH;
          ct = substr(lin, 2, 1);
          if (ct != "n")
            { # Non-null text, flush {w}:
              lookup_and_output_word(w, wt);
              w = ""; wt = "a";
              if (ct != "b")
                { # Output argument words (the text between the braces):
                  output_words(substr(lin, 4, len-4), ct);
                }
            }
          # Skip the construct exactly once.  (The original advanced {lin}
          # twice for null text, discarding input that followed it.)
          lin = substr(lin, len+1);
        }
      else if (c == "{")
        { # Embedded {}-comment, ignore:
          if (! match(lin, /^[{][^{}]*[}]/))
            { data_error(("malformed {}-comment \"" lin "\"")); }
          lin = substr(lin, RLENGTH+1);
        }
      else
        { # Single character
          if (! (c in chtype))
            { # Invalid char
              data_error(("illegal character \"" c "\""));
            }
          ct = chtype[c];
          if (ct == "i")
            { # Illegal char
              data_error(("illegal input character \"" c "\""));
            }
          else if (ct == "n")
            { # Null char, ignore
            }
          else if (ct == "a")
            { # Alpha char: append to word, preserve type
              w = (w c);
            }
          else if (ct == "s")
            { # Symbol char: append to word, mark it as symbol
              w = (w c); wt = "s";
            }
          else
            { # Flush current word:
              lookup_and_output_word(w, wt);
              w = ""; wt = "a";
              if (ct == "p")
                { # Punct char: a word unto itself:
                  w = c; wt = "p";
                  lookup_and_output_word(w, wt);
                  w = ""; wt = "a";
                }
              else if (ct == "b")
                { # Blank char: ignore it
                }
              else
                { # Program bug
                  data_error(("invalid class \"" ct "\" for char \"" c "\""));
                }
            }
          lin = substr(lin, 2);
        }
    }
  # Flush the word still pending at end of line, if any:
  lookup_and_output_word(w, wt);
}
function lookup_and_output_word(w, wt)
{
  # Writes word {w} of type {wt} to the output, after applying the word
  # tables: a word listed in {wdtype} takes its type from there and is
  # replaced by its {wdmap} entry.  An empty {w} is a no-op.  Words whose
  # final type is blank or null are dropped; invalid words are an error.

  if (w == "") { return; }

  # Table lookup.  The type must be fetched before {w} is replaced:
  if (w in wdtype)
    { wt = wdtype[w];
      w = wdmap[w];
    }

  if (wt == "i")
    { data_error(("invalid word \"" w "\"")); }
  else if ((wt != "b") && (wt != "n"))
    { output_word(w, wt); }
}
function output_words(wds, wt,    w,nw,iw)
{
  # Splits {wds} into words with awk's default field splitting and writes
  # each one as type {wt}, without consulting the word tables.
  nw = split(wds, w);
  iw = 1;
  while (iw <= nw)
    { output_word(w[iw], wt);
      iw++;
    }
}
function output_word(w, wt)
{
  # Writes one output entry: the type code {wt}, a space, and the word {w}.
  printf("%s %s\n", wt, w);
}
function read_word_table(fname,wt,    nwords,nlines,lin,fld,nfld,wa,wb)
{
  # Reads the word table file {fname} and stores its entries in {wdtype}
  # and {wdmap}, all with type {wt}.  Each entry line has one or two
  # fields: a word and, optionally, its replacement (the word itself when
  # omitted).  Blank lines and "#" comments are skipped.  Fails on lines
  # with more than two fields, on words that are already in the tables,
  # on read errors, and when the file yields no lines at all.

  nwords = 0;
  nlines = 0;
  printf "reading wordmap of type = %s \"%s\"... ", wt, fname > "/dev/stderr";

  while ((getline lin < fname) > 0)
    { nlines++;

      # Skip blank lines and whole-line comments:
      if (match(lin, /^[ \011]*([#]|$)/)) { continue; }

      # Strip a trailing comment and any leading blanks:
      sub(/[ ]+[#].*$/, "", lin);
      sub(/^[ ]+/, "", lin);

      nfld = split(lin, fld, " ");
      if (nfld > 2) tbl_error(fname, nlines, ("bad table entry = \"" lin "\""));
      if (nfld < 1) tbl_error(fname, nlines, ("program error: nfld"));

      # First field is the word, second (if present) its replacement:
      wa = fld[1];
      if (nfld < 2)
        { wb = wa; }
      else
        { wb = fld[2]; }

      if (wa in wdtype) tbl_error(fname, nlines, ("repeated word in tables = \"" lin "\""));
      wdmap[wa] = wb;
      wdtype[wa] = wt;
      nwords++;
    }

  # Report any I/O error left in {ERRNO}:
  if (! ((ERRNO == "0") || (ERRNO == ""))) { tbl_error(fname, nlines, ERRNO); }
  close(fname);

  if (nlines == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  printf " %d words\n", nwords > "/dev/stderr";
}
function arg_error(msg)
{
  # Reports an invocation error: prints {msg} and the usage text on the
  # error stream, then terminates with exit status 1.
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  # {abort} makes the END rule propagate the failure status.
  abort = 1;
  exit 1;
}
function data_error(msg)
{
  # Reports an error in the input data: prints {msg} on the error stream,
  # prefixed with the current input file name and line number, then
  # terminates with exit status 1.

  # Start on a fresh line, in case a progress report is pending:
  printf("\n") > "/dev/stderr";
  printf("%s:%d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  # {abort} makes the END rule propagate the failure status.
  abort = 1;
  exit 1;
}
function tbl_error(f,n,msg)
{
  # Reports an error in a word table file: prints {msg} on the error
  # stream, prefixed with file name {f} and line number {n}, then
  # terminates with exit status 1.
  printf("%s:%d: %s\n", f, n, msg) > "/dev/stderr";
  # {abort} makes the END rule propagate the failure status.
  abort = 1;
  exit 1;
}