#! /usr/bin/gawk -f
# Last edited on 2002-03-04 21:39:46 by stolfi

# Reads an EVMT-format file, with basified weirdos.  Removes all
# comments, fillers, and empty lines, maps "?" and "%" to "*", and
# turns all spaces and breaks into " ".  Inserts a beginning-of-paragraph
# ("=") or beginning-of-line ("-") marker in front of the text, appends
# an end-of-paragraph ("=") or end-of-line ("-") marker after it,
# and removes the "<>" around the line locator.

BEGIN {
  # The first text line seen is treated as starting a paragraph.
  parag_break = 1;

  # Negative means "no error so far"; data_error() stores the exit status here.
  abort = -1;
}

# Once an error status has been recorded, stop at the next record with it.
abort >= 0 { exit abort; }

# Discard blank and comment-only records, with or without a leading locator.
/^ *([#]|$)/ || /^<[^>]*> *([#]|$)/ { next; }

# Main rule: handles every record that was not skipped as blank/comment.
# For each record whose cleaned text is non-empty, prints (OFS-separated):
#   locator, "=" or "-" (paragraph-initial or not), text, "=" or "-"
#   (paragraph-final or not).
# Globals used across records:
#   onlin          - locator of the previous record, minus its ";X" suffix
#   first_in_parag - whether the current line number starts a paragraph
#   parag_break    - whether the next new line number starts a paragraph
// {

  # Split line into locator (without the enclosing "<>") and text.
  # A malformed locator is fatal: data_error() exits the program.
  if (match($0, /^[<][f][0-9]+[rv][0-9]*[.][A-Za-z][0-9]*[.][0-9]+[a-z]?([;][A-Z])?[>]/))
    { loc = substr($0, 2, RLENGTH-2);
      lin = substr($0, RLENGTH+1);
    }
  else
    { data_error("bad locator format"); }

  # Remove internal "{...}" comments, innermost first, repeating until
  # none is left so that comments nested to any depth disappear.
  # Each successful pass shortens the text, so the loop terminates.
  while (gsub(/{[^{}]*}/, "", lin) > 0) { }

  # Remove "!" fillers and map "?" and "%" to "*".
  gsub(/[!]/, "", lin);
  gsub(/[?%]/, "*", lin);

  # Decide whether line is parag-initial.  The decision is taken once per
  # line number (locator minus the trailing ";X" version suffix), so all
  # versions of the same line get the same marker.
  nlin = loc;
  gsub(/[;][A-Za-z]+$/, "", nlin);
  if (nlin != onlin)
    { first_in_parag = parag_break;
      # Start collecting paragraph-final evidence for this line number.
      parag_break = 0;
      onlin = nlin;
    }

  # This version is parag-final if its text ends with "=" optionally
  # followed by other space/break characters.
  last_in_parag = (lin ~ /[=][-\/., ]*$/);

  # Replace all EVA spaces by ascii space, and trim both ends:
  gsub(/[-\/=., ]+/, " ", lin);
  gsub(/^[ ]+/, "", lin);
  gsub(/[ ]+$/, "", lin);

  # Write out:
  if (lin != "")
    { print loc, (first_in_parag ? "=" : "-" ), lin, (last_in_parag ? "=" : "-" ); }

  # Remember whether the line (any version) was parag-final: accumulate
  # instead of overwriting, so a later version lacking the "=" does not
  # erase a paragraph end seen in an earlier version of the same line.
  if (last_in_parag) { parag_break = 1; }
  next;
}

# Fallback for any non-empty record that no earlier rule consumed.
# NOTE(review): as the file stands, the preceding "//" rule matches every
# record and ends with "next", so this rule appears to be unreachable.
/./ {
  data_error("bad line type");
}

# Reports a fatal input error and terminates the program.
#   msg - text describing the problem.
# Writes "*** line N: msg" to standard error, N being the current FNR,
# records exit status 1 in the global "abort", and exits with that status.
function data_error(msg)
{
  abort = 1;
  printf("*** line %d: %s\n", FNR, msg) > "/dev/stderr";
  exit abort;
}