# To be included in extract-reading-tuples, combine-versions
# Last edited on 1998-12-30 10:58:00 by stolfi

function tup_clear_current_batch()
{
  # Resets the global batch buffer to the empty state.  The buffer
  # holds all input lines (versions and comments) collected so far 
  # for a single text line.

  # "tup_batch[0..tup_nb-1]" are the buffered lines; the
  # "split" idiom deletes every element of the array.
  split("", tup_batch);

  # Count of buffered lines, comment lines included:
  tup_nb = 0;

  # Location code shared by the buffered lines (with the line 
  # number); empty while nothing is buffered:
  tup_cur_loc = "";
}

function tup_append_line_to_batch(lin)
{
  # Stores "lin" in the next free slot "tup_batch[tup_nb]" of the 
  # current batch and bumps the line count.  No validation of "lin"
  # is performed here.
  tup_batch[tup_nb] = lin;
  tup_nb += 1;
}

function tup_append_version_to_batch(lin,  k)
{
  # Inserts "lin" right after the last non-comment line of the 
  # current batch.  Any run of comment lines (lines starting with "#")
  # at the end of the batch is moved up one slot to make room, so it
  # still follows the inserted line.  "k" is a local variable.
  for (k = tup_nb; k > 0; k--)
    { 
      # Stop at the last line that is not a comment:
      if (tup_batch[k-1] !~ /^#/) { break; }
      tup_batch[k] = tup_batch[k-1];
    }
  tup_batch[k] = lin;
  tup_nb++;
}

function tup_prepend_version_to_batch(lin,  i,k)
{
  # Inserts "lin" just ahead of the first non-comment line of the 
  # current batch (or at the end, if every line is a comment).
  # Comment lines are those starting with "#".  "i" and "k" are 
  # local variables.

  # Locate the insertion slot "i", skipping the leading comments:
  for (i = 0; i < tup_nb; i++)
    { if (tup_batch[i] !~ /^#/) { break; } }

  # Move lines "i..tup_nb-1" up by one slot, last line first:
  for (k = tup_nb; k > i; k--)
    { tup_batch[k] = tup_batch[k-1]; }

  tup_batch[i] = lin;
  tup_nb++;
}

function tup_process_variant(lin,ignore,  loc,tmp)
{
  # Handles one variant line "lin".  Validates its layout (a locator
  # "<...;X>" at the start, blank padding through column 19, text 
  # beginning in column 20) and stores the line in the batch buffer.
  # If the location (the locator minus the ";X" transcriber suffix)
  # differs from "tup_cur_loc", the buffered batch is first processed
  # by "tup_process_current_batch(ignore)".  "loc" and "tmp" are 
  # local variables.

  if (length(lin) <= 19) { fatal_error("missing text"); }
  
  # The locator must be anchored at the first column.  In the pattern,
  # the line number starts with a digit whereas the text unit code 
  # starts with a letter:
  match(lin, /^<f[0-9]+[vr]?[0-9]?[.](|[A-Z]|[A-Za-z][A-Za-z0-9]?[.])[0-9]+[a-f]?;[A-Z]>/);
  if (RSTART != 1) 
    { 
      # Malformed locator: report it and leave the batch untouched.
      format_error("bad location format"); res = 0; 
      return;
    }

  # Locator contents, without the enclosing "<" and ">":
  loc = substr(lin,RSTART+1,RLENGTH-2);

  # Everything between the locator and column 19 must be blank, 
  # and column 20 must not be:
  if (substr(lin,RLENGTH+1, 19-RLENGTH) !~ /^[ ]*$/) { fatal_error("too few blanks"); }
  if (substr(lin,20,1) == " ") { fatal_error("too many blanks"); }

  # Drop the trailing ";X" (semicolon plus transcriber letter); the
  # pattern above guarantees it is there, so this is a sanity check:
  tmp = length(loc);
  if (substr(loc, tmp-1, 1) != ";") { fatal_error("program error: semicolon"); }
  loc = substr(loc, 1, tmp-2);

  # A change of location closes the batch collected so far:
  if (loc != tup_cur_loc) { tup_process_current_batch(ignore); }
      
  # Buffer this line, then record its location as the current one
  # (in this order: flushing the old batch resets "tup_cur_loc"):
  tup_append_line_to_batch(lin);
  tup_cur_loc = loc;
}

function tup_process_current_batch(ignore,   \
  nv,txt,trn,trseen,len,nc,i,lin,tx,trc,tk)
{
  # Processes the buffered batch of variants and comments for one VMS
  # line, then empties the buffer.
  #
  # "ignore" is a string of transcriber letters; variants whose
  # transcriber letter occurs in it are length-checked but not 
  # collected.  All names after "ignore" are local variables.
  #
  # For every non-comment line the text (column 20 onwards) is 
  # trimmed, trailing {}-comments are deleted, and remaining 
  # {}-comments are replaced by "tup_remove_comments".  All texts 
  # in the batch must then have the same length.
  #
  # Calls the client functions 
  #   "process_batch_texts(tup_cur_loc,txt,trn,nv,nc)"
  #      (only when at least one variant was collected), and
  #   "process_batch_lines(tup_batch,tup_nb)" (always).
  
  nv = 0;
  split("", txt);  # "txt[i]", "i=0..nv-1" are the variant texts.
  split("", trn);  # "trn[i]" the corresponding transcriber codes, 1..26.
  split("", trseen); # "trseen[tr] = 1" means transcriber "tr" has occurred.
        
  nc = -1; # length of text in lines of current batch (-1 = none seen yet).
  
  for(i=0; i<tup_nb; i++)
    { lin = tup_batch[i];
      if (lin !~ /^#/)
        { # Get EVA text;
          tx = substr(lin,20);
          gsub(/^[ ]+/, "", tx);
          gsub(/[ ]+$/, "", tx);

          # Remove trailing comments, if any
          while (gsub(/{[^{}]*}$/, "", tx)) { }

          # Convert any remaining comments to "!"s:  
          tx = tup_remove_comments(tx);
          
          # Check line length:
          len = length(tx);
          if ((nc != -1) && (len != nc))
            { fatal_error(("inconsistent line lengths (" nc ":" len ")")); }
          nc = len;

          # Get transcriber code (the letter between ";" and ">" 
          # in the locator, which lies within the first 19 columns):
          if (! match(substr(lin,1,19), /[;][A-Z][>]/))
            { fatal_error("program error: tr code"); }
          trc = substr(lin, RSTART+1, 1);

          if (trc in trseen) 
            { 
              # NOTE(review): "res" is not a local of this function, so
              # this assignment writes a global; kept unchanged in case
              # the including program reads it -- confirm.
              fatal_error("repeated transcription code"); res = 0; 
            }
          else if (index(ignore, trc) == 0)
            { trseen[trc] = 1;
              # Save for later tuple extraction:
              txt[nv] = tx; 
              # Map the letter "A".."Z" to 1..26 ("tk" is local):
              tk = index("ABCDEFGHIJKLMNOPQRSTUVWXYZ", trc);
              if (tk == 0) { fatal_error("program error: trc"); }
              trn[nv] = tk; 
              nv++;
            }
        }
    }
    
  if (nv > 0)
    { # Call client function to process VMS texts
      process_batch_texts(tup_cur_loc,txt,trn,nv,nc);
    }

  # Call client function to dispose of batch lines:
  process_batch_lines(tup_batch, tup_nb);
  tup_clear_current_batch();
}

function tup_remove_comments(txt,   i,res)
{
  # Returns a copy of "txt" in which every {}-comment (from a "{" up 
  # to and including the next "}") is replaced by an equal number 
  # of "!"s, so that the length of the string is preserved.
  #
  # If a "{" has no matching "}", reports the problem through 
  # "format_error" and copies the rest of the string unchanged.
  #
  # "i" and "res" are local variables.  The global "tup_bangs" is 
  # a string of "!"s, grown on demand and reused across calls.
  
  res = "";
  while (txt != "")
    { i = index(txt, "{");
      if (i == 0) 
        { # No more comments: copy the remainder verbatim.
          res = (res txt); txt = ""; 
        } 
      else 
        { # Copy the text that precedes the comment:
          res = (res substr(txt, 1, i-1)); 
          txt = substr(txt, i);
          # Now "txt" starts with "{"; "i" becomes the comment length:
          i = index(txt, "}");
          if (i == 0) 
            { format_error("mismatched `{'"); res = (res txt); txt = ""; }
          else
            { # Make sure "tup_bangs" has at least "i" characters:
              while (length(tup_bangs) < i) { tup_bangs = ( tup_bangs tup_bangs "!"); } 
              res = (res substr(tup_bangs, 1,i)); 
              txt = substr(txt, i+1);
            }
        }
    }
  return res;
}

function tup_extract_tuples(txt,trn,nv,nc,tuple,  i,j,k,d,tup,r)
{
  # Builds one 26-character tuple per character position.  For 
  # "j=1..nc", "tuple[j]" has in position "trn[i]" the "j"th
  # character of "txt[i]" ("i=0..nv-1"), and "%" in every position 
  # not assigned by any "trn[i]".
  #
  # The incoming "nc" is overridden by the length of "txt[0]"; all
  # other texts must have that same length.  Names after "tuple" 
  # are local variables.
  
  nc = length(txt[0]);
  i = 1;
  while (i < nv)
    { if (length(txt[i]) != nc) 
        { fatal_error(("diff lengths [" txt[i] "]")); }
      i++;
    }

  split("", tup);
  j = 1;
  while (j <= nc)
    { 
      # Start from an all-"%" tuple:
      for (k=26; k>=1; k--) { tup[k] = "%"; }

      # Drop in the character seen by each transcriber:
      for (i=0; i<nv; i++) 
        { d = substr(txt[i], j,1);
          tup[trn[i]] = d;
        }

      # Concatenate slots 1..26 into the tuple string:
      r = "";
      for (k=1; k<=26; k++) { r = (r tup[k]); }
      tuple[j] = r;
      j++;
    }
}