#! /bin/gawk -f
# Last edited on 2002-01-15 05:16:47 by stolfi

BEGIN {
  # Selects, from an EVMT-formatted source file, a subset of lines
  # that is adequate for statistical analysis.
  #
  # Command-line variables (set with "-v"):
  #   sample    (required) sample name; echoed in the output header
  #             and in the final statistics line.
  #   subsec    (required) subsection tag; passed as the first argument
  #             of the selection predicate.
  #   maxLines  (optional) if specified, the output is truncated after
  #             that many data lines (not counting comments).
  #
  # The package SAMPLEFNS.gawk (loaded with an extra "-f") must define
  # the predicate select_evt_line(subsec,chapter,unit,linenum).

  # "abort" is negative while everything is fine; the error functions
  # set it to the exit status so that later rules and END can honor it.
  abort = -1;
  usage = ( \
    "cat INFILE.evt \\\n" \
    "  | select-evt-lines \\\n" \
    "      -f SAMPLEFNS.gawk \\\n" \
    "      -v sample=SAMPLE -v subsec=SEC.K \\\n" \
    "      -v maxLines=NUM \\\n" \
    "  > OUTFILE.evt" \
  );

  if (sample == "") { arg_error("must define \"sample\""); }
  if (subsec == "") { arg_error("must define \"subsec\""); }
  # No limit requested: use a bound that is effectively "unlimited".
  if (maxLines == "") { maxLines = 999999999; }
  nread = 0;  # Number of data lines read
  nwrite = 0; # Number of data lines written
  # Header comments of the output file.  (The first format has no
  # conversions, so it takes no arguments.)
  printf "# SELECTED SUBSET\n";
  printf "# Extracted by select-evt-lines (sample = %s subsec = %s)\n", sample, subsec;
}

# Once an error has been flagged (abort set to a non-negative status),
# stop processing input and leave with that status.
abort >= 0 {
  exit abort;
}

# Lines that are empty, contain only spaces, or start (after optional
# spaces) with "#" are echoed unchanged and are not counted as data.
/^[ ]*([#]|$)/ {
  print $0;
  next;
}

# The quota of data lines has been written: on the next non-comment
# line, leave a marker in the output and finish with success status.
nwrite >= maxLines {
  printf("# TRUNCATED AFTER %d DATA LINES\n", maxLines);
  exit 0;
}

# Data line: starts with a locator "<CHAPTER.UNIT.LINENUM>", optionally
# carrying a version suffix, "<CHAPTER.UNIT.LINENUM;VERSION>".
# The line is counted, its locator is parsed, and the line is printed
# if the select_evt_line predicate accepts it.
#
# NOTE(review): "loc", "lin", "version" and "locf" are ordinary global
# awk variables, so the externally supplied select_evt_line could read
# them; that cannot be checked from this file, so all are still set.
/^[<]/ {
  nread++;
  # Parse the line locator.  The ";VERSION" part is optional; without
  # it in the pattern, the version-splitting code below could never run
  # and versioned locators would be rejected as malformed.
  if (! match($0, /^[<][a-zA-Z0-9]+[.][A-Za-z0-9]+[.][A-Za-z0-9]+([;][A-Za-z0-9]+)?[>]/))
    { data_error(("bad line locator format \"" substr($0,1,19) "\"")); }
  loc = substr($0, 1, RLENGTH);   # The locator, delimiters included.
  lin = substr($0, RLENGTH + 1);  # Everything after the locator.
  # Drop the "<" and ">" delimiters (and any blanks):
  gsub(/[ <>]/, "", loc);
  # Split off the version code, defaulting to "A" when absent:
  if (match(loc, /[;][A-Za-z0-9]+$/))
    { version = substr(loc, RSTART+1, RLENGTH-1);
      loc = substr(loc, 1, RSTART-1);
    }
  else
    { version = "A"; }
  # Break the remaining "CHAPTER.UNIT.LINENUM" into its three fields:
  gsub(/[.]/, " ", loc);
  nf = split(loc, locf);
  if (nf != 3) { data_error(("bad number of fields in locator \"" loc "\"")); }
  chapter = locf[1];
  unit = locf[2];
  linenum = locf[3];
  # Decide if line should be saved:
  if (select_evt_line(subsec, chapter, unit, linenum)) { nwrite++; print; }
  next;
}

# Anything that reaches this point is neither a comment/blank line
# nor a data line starting with "<": report it and abort.
{
  data_error("neither text nor comment");
}

END {
  # After an error, only propagate the recorded exit status; otherwise
  # report the line counts on the standard error stream.
  if (abort < 0) {
    printf("%s: %7d data lines read, %7d written\n", sample, nread, nwrite) > "/dev/stderr";
  } else {
    exit abort;
  }
}

# Reports a command-line argument error: writes "msg" and the usage
# text to the standard error stream, records the failure in the global
# "abort", and terminates with status 1.
function arg_error(msg)
{
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit abort;
}

# Reports an error in the input data: writes "msg", prefixed with the
# current record number (FNR), to the standard error stream, records
# the failure in the global "abort", and terminates with status 1.
function data_error(msg)
{
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}