#! /usr/bin/gawk -f
# Extracts significant characters from a text.
# Last edited on 1998-07-12 08:26:49 by stolfi

BEGIN {
  # Command-line synopsis, assembled one line at a time.
  usage =       "extract-signif-chars \\\n";
  usage = usage "  [ -v errors=CHARS ] \\\n";
  usage = usage "  [ -v ignore=CHARS ] \\\n";
  usage = usage "  [ -v blanks=CHARS ] \\\n";
  usage = usage "  [ -v normal=CHARS ] \\\n";
  usage = usage "  < INFILE > OUTFILE";
  #
  # Reads INFILE and extracts its significant characters.  The output
  # is a sequence of fixed-format records, one per line, each of the
  # form CLASS STR with nothing between the two fields.  STR is a
  # substring of the input and CLASS is a one-digit code for its kind:
  #
  #   CLASS = 0: STR contains non-significant chars.
  #   CLASS = 1: STR is a word separator.
  #   CLASS = 2: STR is a paragraph separator.
  #   CLASS = 3: STR contains a single significant char.
  #
  # Newlines inside STR are written as CRs ("\015").
  #
  # Default classification of input characters:
  #
  #   - Non-printable, non-blank ISO Latin-1 characters are always
  #     errors.
  #   - Letters (plain and accented) and digits are significant.
  #   - All other printable ISO Latin-1 characters, together with
  #     LF (#10), FF (#12), SP (#32) and NBSP (#160), are word
  #     separators.
  #
  # The defaults can be overridden with these option variables
  # (set with "-v"):
  #
  #   "normal"        (string) significant characters.
  #   "errors"        (string) invalid input characters.
  #   "ignore"        (string) characters to be simply ignored.
  #   "blanks"        (string) word separator characters.
  #
  # A string STR of "ignore"s and "blanks" that contains at least two
  # newlines (not counting newlines that are "ignore"d or that belong
  # to "#"-comments) is output as a class 2 string.  In addition, an
  # empty class 2 string is always output before and after the whole
  # text.
  #
  # Any other string STR of consecutive "ignore" and "blank"
  # characters that has at least one "blank" is a word break
  # (class 1).

  # "abort" is negative while everything is fine; error() sets it to
  # the exit status, and every rule checks it before doing any work.
  abort = -1;

  # LF, FF, SP and NBSP, as octal escapes.
  iso_blanks = "\012\014\040\240";

  check_options();
  init_buf();
}

# We maintain a string buffer "buf" with all non-significant
# characters seen since the last significant char.  The variable
# "numLines" counts newline characters in "buf", and "numBlanks"
# counts "blanks" characters.  "numLines" is forced to 2 at begin-text
# and at end-text so that a paragraph break is output at both places.
  
function init_buf()
{
  # Start with an empty buffer and no blanks seen, but pretend two
  # newlines were already read: flush_buf() treats more than one
  # newline as a paragraph separator (class 2).
  numLines = 2;
  numBlanks = 0;
  buf = "";
}

function flush_buf(    i,n,h,cl)
{
  # Writes the pending non-significant text as one output record and
  # resets the buffer state.
  #
  # The record class is 2 (paragraph separator) when more than one
  # newline was counted, else 1 (word separator) when at least one
  # blank was counted, else 0 (other non-significant text).
  cl = (numLines > 1) ? 2 : ((numBlanks > 0) ? 1 : 0);

  # An empty class 0 record carries no information, so skip it.
  if (! ((cl == 0) && (buf == "")))
    { printf "%d%s\n", cl, encode(buf); }

  # Reset the counters and the buffer for the next run.
  numLines = 0;
  numBlanks = 0;
  paraBrk = 0;
  buf = "";
}

function encode(str,    out)
{
  # Returns a copy of "str" in which every newline is replaced by a
  # CR ("\015"), so that the result fits on one output line.
  out = str;
  gsub(/\n/, "\015", out);
  return out;
}

# processing an input character:

function process_char(c,  cl)
{
  # Handles one input character "c" according to its entry in the
  # global table "class".
  cl = class[c];
  if (cl == 1)
    { # Blank: buffer it and count it; a newline also counts as a line.
      buf = (buf c);
      numBlanks++;
      if (c == "\n") { numLines++; }
    }
  else if (cl == 0)
    { # Ignored: buffer it without counting anything.
      buf = (buf c);
    }
  else if (cl == -1)
    { # Invalid: report it; error() terminates the program.
      error(("line " NR ": invalid character \"" c "\""));
    }
  else
    { # Significant: emit the pending text first, then a class 3
      # record holding this character alone.
      flush_buf();
      printf "3%c\n", c;
    }
}

# Record processing:

$0 ~ /^ *[#]/ {
  # Stop at once if an error was already reported.
  if (abort >= 0) { exit(abort); }
  # A "#"-comment line (possibly preceded by spaces) goes into the
  # buffer verbatim, newline included.  None of its characters are
  # classified, so its newline is not counted in "numLines".
  buf = (buf $0 "\n");
  next;
}

// {
  # Stop at once if an error was already reported.
  if (abort >= 0) { exit(abort); }

  # Split off a leading "<f...>" tag with its trailing spaces, if
  # there is one; "lin" is whatever remains of the record.
  loc = "";
  lin = $0;
  if (match($0, /^<f[^ >]*> */))
    { loc = substr($0, 1, RLENGTH);
      lin = substr($0, RLENGTH+1);
    }

  # The tag is buffered as it stands, without classification.
  buf = (buf loc);

  # Classify the rest one character at a time, then the newline that
  # ended the record.
  n = length(lin);
  i = 1;
  while (i <= n)
    { process_char(substr(lin,i,1));
      i++;
    }
  process_char("\n");
  next;
}

# Flush any buffered input (forcing a paragraph break):

END {
  if (abort < 0)
    { # Normal end of input: pretend two newlines were seen, so that
      # the last record written is a paragraph separator (class 2).
      numLines = 2;
      flush_buf();
    }
  else
    { # An error was reported earlier; pass its status on.
      exit(abort);
    }
}

function check_options(   i,c,mk,str)
{
  # Builds the global table "class" from the built-in defaults and
  # from the option variables
  #
  #     "normal" "errors" "ignore" "blanks"
  #
  # (all optional, set with "-v").  Reads the global "iso_blanks".
  # Calls error(), which terminates the program, when an option is
  # inconsistent.
  #
  # Defines the global variables
  #
  #   "class"
  #

  # "mk[c]" is number of explicit definitions for character "c":

  split("", mk);

  # "class[c]" is the input class of character "c".
  #    -1 - character is an error.
  #     0 - character is to be ignored.
  #     1 - character is a blank.
  #     3 - character is significant.

  split("", class);

  # --- default character classes --------------------------------
  #
  # The table is built from numeric ISO Latin-1 codes with
  # sprintf("%c"), exactly as the consistency check below enumerates
  # the characters.  It therefore does not depend on the encoding in
  # which this source file happens to be stored.

  for (i = 0; i < 256; i++)
    { c = sprintf("%c", i);
      if ((i < 10) || (i == 11) || ((i > 12) && (i < 32)) || ((i > 126) && (i < 160)))
        { # Control codes other than LF (#10) and FF (#12), plus DEL
          # and #128..#159, are errors.  Setting "mk[c]" makes any
          # redefinition by an option count as a multiple definition.
          class[c] = -1; mk[c] += 1;
        }
      else if (((i >= 48) && (i <= 57)) || \
               ((i >= 65) && (i <= 90)) || \
               ((i >= 97) && (i <= 122)) || \
               ((i >= 192) && (i <= 255) && (i != 215) && (i != 247)))
        { # Digits, plain letters and accented letters are
          # significant.  #215 and #247 are the multiplication and
          # division signs, not letters.
          class[c] = 3;
        }
      else
        { # LF, FF, SP, NBSP, ASCII punctuation, the symbols
          # #161..#191, and #215 and #247 are word separators.
          class[c] = 1;
        }
    }

  # --- "normal" --------------------------------------------------
  # A character that is an error by default (class -1), or one of the
  # basic blanks, cannot be declared significant.
  str = normal;
  for (i = 1; i <= length(str); i++)
    { c = substr(str,i,1);
      if ((class[c] == -1) || (index(iso_blanks,c) != 0))
        { error(("normal characters must be printable and non-blank")); }
      class[c] = 3; mk[c] += 1;
    }

  # --- process "errors" option ----------------------------------------
  for (i = 1; i <= length(errors); i++)
    { c = substr(errors,i,1); class[c] = -1; mk[c] += 1; }

  # --- process "ignore" option ----------------------------------------
  for (i = 1; i <= length(ignore); i++)
    { c = substr(ignore,i,1); class[c] = 0; mk[c] += 1; }

  # --- process "blanks" option ---------------------------------------
  for (i = 1; i <= length(blanks); i++)
    { c = substr(blanks,i,1); class[c] = 1; mk[c] += 1; }

  # --- consistency checking ----------------------------------------
  # Every code 0..255 must have a class, and no character may have
  # been explicitly defined more than once.  The built-in error
  # characters count as one explicit definition.
  for (i = 0; i < 256; i++)
    { c = sprintf("%c", i);
      if (! (c in class))
        { error(("character \"\\" sprintf("%03o", i) "\" not defined")); }
      else if (mk[c] > 1)
        { error(("character \"\\" sprintf("%03o", i) "\" multiply defined")); }
    }

}

function error(msg)
{
  # Records a failure status in the global "abort", writes "msg" as
  # one line on standard error, and terminates the program with
  # exit status 1.
  abort = 1;
  printf "%s\n", msg >>  "/dev/stderr";
  exit 1;
}