#! /usr/bin/gawk -f
# Last edited on 2025-04-29 20:57:54 by stolfi

# Counts the number of non-comment, non-filler, non-space
# bytes per text unit and transcriber, in an EVT-format file.

BEGIN {
  # Exit status requested by an error handler; negative means "no error so far".
  abort = -1;

  # Usage message, assembled one option per line:
  usage = ARGV[0] "\\\n";
  usage = usage " -v scribes=STRING \\\n";
  usage = usage " -v countChars=BOOL \\\n";
  usage = usage " -v countLines=BOOL \\\n";
  usage = usage " -v perPage=BOOL \\\n";
  usage = usage " -v perUnit=BOOL \\\n";
  usage = usage " [<] INFILE > TOTALS \\\n";

  # The list of transcriber codes is the only mandatory option:
  if (scribes == "")
    { arg_error("must define \"scribes\""); }

  # Options left undefined get the values that reproduce the
  # behavior of older versions of this script:
  if (countChars == "") { countChars = 1; }
  if (countLines == "") { countLines = 0; }
  if (perUnit == "") { perUnit = 1; }
  if (perPage == "") { perPage = 0; }

  # Per-unit subtotals imply per-page subtotals:
  if (perUnit) { perPage = 1; }

  # Any kind of subtotal selects the horizontal (one row per group) layout:
  horFmt = (perUnit || perPage);

  # {guy[0..nguys-1]} are the transcriber codes, in order of registration;
  # {seen[t]=1} iff code {t} has been registered.
  split("", guy);
  split("", seen);
  nguys=0;

  # Register each character of {scribes} as a transcriber code:
  len = length(scribes);
  i = 1;
  while (i <= len)
    { cert_guy(substr(scribes,i,1));
      i++;
    }

  cur_p = "*";   # Current page, or "*" if none yet.
  cur_u = "*.*"; # Current "page.unit"; the unit part is "*" if there is none.

  # Counters, indexed by level {k}: 0 = whole file, 1 = page, 2 = unit.
  split("", ct); # {ct[k,t]} is the level-{k} char count for transcriber {t}.
  split("", rt); # {rt[k,t]} is the level-{k} line count for transcriber {t}.
  split("", ng); # {ng[k]} is the level-{k} total line count.

  clr(2); clr(1); clr(0);

  # The horizontal table starts with a header row and a dash row:
  if (horFmt)
    { head_hor();
      dash_hor();
    }
}

# If an error was flagged, stop reading input and exit with that status:
abort >= 0 { exit abort }

# Lines that are empty or consist only of spaces carry no data:
$0 ~ /^[ ]*$/ { next }

# Lines that start with "#" are comments:
$0 ~ /^[#]/ { next }

/^[<][^<>; ]*[>] *[{][^{}]*[}] *$/ {
  # Page/unit/locus header line: a locator "<...>" with no transcriber
  # code, followed by a single {}-comment.  It contributes nothing to
  # the counts but may start a new page or unit.
  #
  # The pattern allows the "{" to follow the ">" with no space in between,
  # in which case {$1} would be the whole line.  So the locator is cut at
  # the first ">" instead; the pattern guarantees that this is the
  # character that closes the locator.
  check_locus(substr($0, 1, index($0, ">")));
  next;
}

/^<[^<>; ]*[;][A-Z][>]/ {
  # Text line: a locator "<...;T>" ending with a one-letter transcriber
  # code {T}, followed by the transcribed text.

  # Split into locator {loc} and text {txt}.  The pattern guarantees
  # that the first ">" of the line is the one that closes the locator,
  # so the split does not depend on a space following the locator
  # or on the text starting at a fixed column:
  len = index($0, ">");
  loc = substr($0, 1, len);
  txt = substr($0, len+1);

  # Remove transcriber code {t} (and the preceding ";") from {loc}:
  t = substr(loc, len-1,1);
  loc = ( substr(loc, 1,len-3) ">" );

  # Dump and reset the subtotals if this line starts a new page or unit,
  # then make sure that {t} is a known transcriber:
  check_locus(loc);
  cert_guy(t);

  # Count this line at all three levels (0 = file, 1 = page, 2 = unit).
  # Only the characters that survive {clean_text} are counted:
  nch = length(clean_text(txt));
  for (k = 0; k < 3; k++)
    { ng[k]++;
      rt[k,t]++;
      ct[k,t] += nch;
    }
}

END{
  # If an error was flagged, print no report and exit with that status:
  if (abort >= 0) { exit abort; }

  # {check_locus} dumps a group only when a line of the next group
  # shows up, so the last unit and the last page are still pending here.
  # Flush them in the same order used by {check_locus} (unit, then page).
  # {dmp} prints nothing for a unit or page that has no lines.
  if (perUnit) { dmp(2, cur_u); }
  if (perPage) { dmp(1, cur_p); }

  # Whole-file totals, followed by the table footer in horizontal format:
  dmp(0, "TOTAL");
  if (horFmt) { dash_hor();  head_hor(); }
}

function check_locus(loc,   n,fld,p,u,grp)
{
  # Called with the locator {loc} of each header or text line, with
  # the transcriber code already removed.  If the line belongs to a
  # different unit or page than the previous one, dumps the pending
  # subtotals of the old group and clears them.  Then records the
  # new page and unit in {cur_p} and {cur_u}.

  # Strip the angle brackets and break the locator at the periods:
  gsub(/[<>]/, "", loc);
  n = split(loc, fld, /[.]/);

  # A locator must have between one and three fields.
  # A bad one is reported, but processing continues:
  if (! ((n >= 1) && (n <= 3)))
    { data_error(("bad locator \"" loc "\"")); }

  # The page {p} is the first field, or "*" if there is none:
  if (n >= 1)
    { p = fld[1]; }
  else
    { p = "*"; }

  # The unit {u} is the page plus the second field, or plus "*" if there is none:
  if (n >= 2)
    { u = (p "." fld[2]); }
  else
    { u = (p "." "*"); }

  # Close the previous unit first, then the previous page:
  if (perUnit && (u != cur_u))
    { dmp(2, cur_u);
      clr(2);
    }
  if (perPage && (p != cur_p))
    { dmp(1, cur_p);
      clr(1);
    }

  cur_p = p;
  cur_u = u;
}

function cert_guy(t)
{
  # Makes sure that transcriber code {t} is registered: if it is new,
  # appends it to {guy[0..nguys-1]} and marks it in {seen}.
  # Does nothing if {t} is already known.
  if (t in seen) { return; }
  guy[nguys] = t;
  nguys++;
  seen[t] = 1;
}

function clr(k,   t)
{
  # Resets all level-{k} counters to zero: the per-transcriber line
  # and char counts of every registered transcriber, and the
  # total line count {ng[k]}.
  for (t in seen)
    { rt[k,t] = 0;
      ct[k,t] = 0;
    }
  ng[k] = 0;
}

function dmp(k,grp) 
{
  # Prints the level-{k} counters, labeled {grp}, in the layout
  # selected by {horFmt}.  A page or unit (level above 0) that has
  # no lines is skipped; the level-0 totals are always printed.
  if ((k > 0) && (ng[k] == 0)) { return; }
  if (! horFmt)
    { dmp_ver(k,grp);
      return;
    }
  dmp_hor(k,grp);
}

function dmp_hor(k,grp,  i,t) 
{
  # Prints the level-{k} counters in horizontal format: one row with
  # the label {grp}, the total line count {ng[k]}, and one column per
  # registered transcriber.  Writes a row of line counts if {countLines}
  # and a row of char counts if {countChars}; if both, a blank line
  # follows the pair.

  if (countLines)
    { # Row of per-transcriber line counts:
      printf "%-12s ", grp;
      printf " %7d ", ng[k];
      i = 0;
      while (i < nguys)
        { t = guy[i];
          printf " %6d", rt[k,t];
          i++;
        }
      printf "\n";
    }

  if (countChars)
    { # Row of per-transcriber char counts (second column is still the line count):
      printf "%-12s ", grp;
      printf " %7d ", ng[k];
      i = 0;
      while (i < nguys)
        { t = guy[i];
          printf " %6d", ct[k,t];
          i++;
        }
      printf "\n";
    }

  if (countLines && countChars)
    { printf "\n"; }
}

function dash_hor (  i)
{
  # Prints the separator row of the horizontal table: dashes cut to the
  # width of the label column, of the line count column, and of one
  # column per registered transcriber.  Does nothing in vertical format.
  if (horFmt)
    { printf "%-12.12s ", "-----------------------";
      printf " %7.7s ", "-----------------------";
      i = 0;
      while (i < nguys)
        { printf " %6.6s", "-----------------";
          i++;
        }
      printf "\n";
    }
}

function head_hor (  i)
{
  # Prints the header row of the horizontal table: the titles of the
  # label and line count columns, then the code of each registered
  # transcriber, in order of registration.
  printf "%-12s ", "unit";
  printf " %7s ", "lines";
  i = 0;
  while (i < nguys)
    { printf " %6s", guy[i];
      i++;
    }
  printf "\n";
}

function dmp_ver(k,grp,  i,t) 
{
  # Prints the level-{k} counters in vertical format: a title line with
  # the label {grp} and the total line count, then a small table with one
  # row per registered transcriber, framed by dash rows.  Each row has the
  # line count if {countLines} and the char count if {countChars}.
  printf "%s (%d lines)\n", grp, ng[k];
  head_ver();
  dash_ver();
  i = 0;
  while (i < nguys)
    { t = guy[i];
      printf "%s ", t;
      if (countLines) { printf " %6d", rt[k,t]; }
      if (countChars) { printf " %6d", ct[k,t]; }
      printf "\n";
      i++;
    }
  dash_ver();
}

function dash_ver (  )
{
  # Prints the separator row of the vertical table: a single dash
  # under the transcriber column, then six dashes for each of the
  # count columns that are enabled.
  printf("%s ", "-");
  if (countLines)
    { printf(" %6.6s", "-----------------------"); }
  if (countChars)
    { printf(" %6.6s", "-----------------------"); }
  printf("\n");
}

function head_ver (  )
{
  # Prints the header row of the vertical table: "T" over the
  # transcriber column, then the title of each count column
  # that is enabled.
  printf("%s ", "T");
  if (countLines)
    { printf(" %6.6s", "lines"); }
  if (countChars)
    { printf(" %6.6s", "chars"); }
  printf("\n");
}

function clean_text(txt  )
{
  # Returns {txt} reduced to the characters that should be counted.
  # The steps are applied in this order:

  # 1. Delete every {}-comment that has no braces inside:
  txt = gensub(/[{][^{}]*[}]/, "", "g", txt);

  # 2. Replace each numeric code "&NNN;" by a single "*":
  txt = gensub(/[&][0-9]+[;]/, "*", "g", txt);

  # 3. Drop the parentheses around each ()-group that has no
  #    parentheses inside, keeping its contents:
  txt = gensub(/[(]([^()]*)[)]/, "\\1", "g", txt);

  # 4. Delete spaces and the filler characters "-", "=", ",", ".", "%", "!":
  txt = gensub(/[-= ,.%!]/, "", "g", txt);

  return txt;
}

function arg_error(msg)
{
  # Reports a command line error: writes {msg} and the usage
  # message to the standard error output, sets {abort} to 1,
  # and exits with that status.
  abort = 1;
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  exit abort;
}

function data_error(msg)
{
  # Reports a problem with the current input line: writes {msg} to the
  # standard error output, prefixed with the input file name and the
  # line number within that file.  Does not set {abort} and does not
  # exit, so processing continues after the message.
  printf("%s:%d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
}