#! /usr/bin/python3
# Last edited on 2026-01-18 21:43:57 by stolfi

import sys, os, re, random
from sys import stdout as out, stderr as err
from process_funcs import bash
from error_funcs import arg_error, file_line_error, prog_error
import ivtff_format as iv
from math import floor, ceil

def main():

    o = parse_options()

    sys.stdout.reconfigure(encoding='iso-8859-1')
    sys.stdin.reconfigure(encoding='iso-8859-1')

    # Reads an ".ivt" file from {stdin} and writes its tokens to {stdout},
    # one per line.
    #
    # Ignores #-comments, blank lines, and page headers "<[a-z0-9]+> ...".
    # Ignores page breaks, treating them as line breaks.
    #
    # Each data line must have a valid locus ID
    # "<{FNUM}[.]{LSEQ}{TRANS}{POSTY}>", then at least one blank, and then
    # the transcription data {TEXT}.
    #
    # Assumes that a parag has "<%>" at the start of the {TEXT} of its
    # head line and "<$>" at the end of the {TEXT} of its tail line
    # (which may be the same as the head). These are noted and removed.
    # If these markers occur in the file, they must be paired, in that
    # order, and not nested.
    #
    # Tokens must be sequences of letters [A-Za-z], ligature braces '{}',
    # '?' characters, and weirdo codes "&{NNN}", where {NNN} is always
    # three digits; an optional ";" after the {NNN} is allowed. Every
    # weirdo code is turned into a single '?'; everything else in a token
    # is written unchanged to the output.
    #
    # There must be exactly one separator [-.,] between two consecutive
    # tokens in the {TEXT}, and no separator at the front of {TEXT}. A
    # single separator at the end of the {TEXT} is tolerated and ignored.
    #
    # A line break is either a "physical" line break in the file, or a '-'
    # in the middle of the text of a line. A parag break is defined as a
    # transition between two physical lines where either the previous one
    # was a parag tail or the next one is a parag head.
    #
    # If {showLines} is true, input line breaks are translated into a
    # blank line, and input paragraph breaks are translated into two
    # blank lines.
    #
    # If {showLines} is false but {showParags} is true, input line breaks
    # are treated like word spaces, but paragraph breaks are translated
    # into an empty line.
    #
    # If both {showLines} and {showParags} are false (the default), all
    # tokens in the file are written sequentially without any blank
    # lines. That is, line breaks and parag breaks are treated as word
    # spaces.
    #
    # If {omitInitial} is true, tokens immediately following a line or
    # figure break are omitted from the output. The option {omitFinal} is
    # symmetric, and {omitMedial} discards any tokens that are *not*
    # adjacent to a line or figure break. The default is all three false;
    # that is, output all the words of each line.
    #
    # The {dubiousSpaces} parameter should be a number {p} in 0..100. If
    # {p} is 0, all dubious spaces ',' will be ignored. If {p} is 100,
    # they will all be treated as word spaces, same as '.'. Otherwise the
    # script will choose between the two options at random, with
    # probability {p/100}.
    #
    # Each output line has fields
    #
    #   "{LOCID} {IPAR} {ILIN} {ITOK} {TOKEN}"
    #
    # where
    #
    #   {TOKEN} is a token.
    #
    #   {LOCID} is the locus ID of the line containing {TOKEN}
    #     (such as "f82v1.2;U"). If the line has no locus ID,
    #     "-" is printed instead.
    #
    #   {IPAR} is the sequential number of the paragraph in the file,
    #     starting from 1; or 0 if the line is not part of a parag.
    #
    #   {ILIN} is the sequential number of the data line in the file,
    #     starting from 1, not counting #-comments, blank lines, and
    #     page header lines.
    #
    #   {ITOK} is the index of the token in the line, starting from 1.
    #
    # We first reduce the input file to a stream of tokens alternating
    # with single separators (".", "-", or "="). This stream is fed
    # through a filter {process_token} that converts the separators to
    # blank lines, as requested, and omits line-initial, -medial, or
    # -final tokens, as requested.

    inParag = False    # Becomes true within a parag.
    prevTail = False   # True if there was a prev line and it was a parag tail.
    prComma = o['dubiousSpaces'] / 100  # Prob of comma becoming dot.
    random.seed(171717)

    npara = 0  # Total number of parags seen.
    ntoks = 0  # Number of tokens parsed.
    nwrit = 0  # Number of tokens written.

    def process_line(nread, ndata, npage, line, page, lseq, posty, trans, text):
        nonlocal o, npara, ntoks, nwrit, inParag, prComma

        def input_error(msg):
            nonlocal nread, line
            data_error(nread, msg, line)
        # ....................................................................

        def process_text(locid, text):
            nonlocal o, npara, ntoks, nwrit, inParag, prComma, prevTail
            # Updates {npara,ntoks,nwrit,inParag,prevTail}.

            # Remove inline comments "<!...>":
            text = re.sub(r"<![^<>]*>", "", text)
            # Remove blanks:
            text = re.sub(r"[ ]", "", text)
            # Remove rail alignment markers:
            text = re.sub(r"[«=»]", "", text)
            # Reduce all weirdo codes to '?':
            text = re.sub(r"[&][0-9][0-9][0-9][;]?", "?", text)

            if text == "": input_error("line with no tokens")

            # Remove parag markers "<%>" and "<$>",
            # setting {thisHead} and {thisTail} instead:
            if re.match(r"<%>", text):
                if inParag: input_error("missing end-of-parag marker")
                text = re.sub(r"^<[%]>", "", text)
                inParag = True
                thisHead = True
                npara += 1
            else:
                thisHead = False
            if re.search(r"<[$]>$", text):
                if not inParag: input_error("missing start-of-parag marker")
                text = re.sub(r"<[$]>$", "", text)
                thisTail = True
            else:
                thisTail = False

            # The {prevBreak} variable is the delimiter before the current
            # token, either 0 (word space), 1 (line or figure break), or 2
            # (parag break).

            # Drop any trailing separator and append a "-" to simplify parsing:
            text = re.sub(r"[-.,]$", "", text) + "-"

            text = fix_dubious_spaces(text, prComma, input_error)

            # Separator before first token:
            prevBreak = 2 if prevTail or thisHead else 1

            # Loop on tokens:
            itok = 0
            while True:
                m = re.match(r"([a-zA-Z?{}]*)([-.])", text)
                if m == None: break
                ntoks += 1
                itok += 1
                # Isolate the next token with its following delimiter:
                token = m.group(1)
                delim = m.group(2)
                text = text[len(token) + len(delim):]
                # Determine the type of break {nextBreak} after this token.
                # It is 1 at end of text, not 2, even if this is a parag tail.
                if delim == ".":
                    nextBreak = 0
                elif delim == "-":
                    nextBreak = 1
                else:
                    prog_error("token delim")
                # err.write(f"!! {prevBreak} {token} {nextBreak}\n")
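                # Write this token with its locus ID and indices, unless it
                # is excluded by the -omitInitial/-omitMedial/-omitFinal
                # options; {process_token} also emits any blank lines
                # implied by {prevBreak} and the -show options: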
                ilin = ndata
                ipar = npara if inParag else 0
                if token == "": input_error("empty token")
                nwrit = process_token(o, locid, ipar, ilin, itok, prevBreak, token, nextBreak, nwrit)
                prevBreak = nextBreak

            if text != "": input_error(f"invalid char in line '{text[0:1]}'")

            if thisTail: inParag = False
            prevTail = thisTail
            return
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        line = line.strip()
        if page == None:
            # Comment or blank line:
            return
        elif lseq == None:
            # Page header line:
            return
        else:
            # Rebuild locus ID:
            locid = f"{page}.{lseq}"
            if posty != None: locid = locid + "," + posty
            if trans != None: locid = locid + ";" + trans
            process_text(locid, text)
        return
    # ....................................................................

    def data_error(nread, msg, line):
        file_line_error("-", nread, msg, line)
    # ....................................................................

    nread, ndata, npage = iv.line_loop(sys.stdin, process_line, data_error)

    if inParag: data_error(nread, "missing end-of-parag marker", "")

    err.write(f"{nread:5d} lines read\n")
    err.write(f"{ndata:5d} data lines parsed\n")
    err.write(f"{npara:5d} parags seen\n")
    err.write(f"{ntoks:5d} tokens parsed\n")
    err.write(f"{nwrit:5d} tokens written\n")
    return
# ----------------------------------------------------------------------

def fix_dubious_spaces(text, prComma, input_error):
    # Replaces each dubious space ',' by '.' with probability {prComma},
    # else deletes it.
    new_text = ""
    while True:
        m = re.match(r"([-.a-zA-Z?{}]*)[,]", text)
        if m == None: break
        piece = m.group(1)
        if piece == "": input_error("comma at front or doubled")
        text = text[len(piece)+1:]
        new_text += piece
        if random.random() < prComma: new_text += "."
    return new_text + text
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def process_token(o, locid, ipar, ilin, itok, prevBreak, token, nextBreak, nwrit):
    # {prevBreak} must be 0 for word space, 1 for line break, 2 for parag break.
    # {nextBreak} must be 0 for word space, 1 for line break or parag break.
    # {nwrit} must be the number of tokens written so far.
    # Returns {nwrit}, incremented if the token was written.

    # Decide number of blank lines to write:
    if prevBreak == 2:
        blanks = 2 if o['showLines'] else (1 if o['showParags'] else 0)
    elif prevBreak == 1:
        blanks = 1 if o['showLines'] else 0
    else:
        blanks = 0

    if prevBreak == 0 and nextBreak == 0:
        # Token is text-medial:
        omit = o['omitMedial']
    else:
        omit = False
    if prevBreak >= 1:
        # Token is line-initial:
        omit |= o['omitInitial']
    if nextBreak >= 1:
        # Token is line-final:
        omit |= o['omitFinal']

    if not omit:
        # Output token:
        if nwrit > 0:
            for kb in range(blanks): out.write("\n")
        output_token(locid, ipar, ilin, itok, token)
        nwrit += 1
    return nwrit
# ..................................................................

def output_token(locid, ipar, ilin, itok, token):
    # Writes one output line "{LOCID} {IPAR} {ILIN} {ITOK} {TOKEN}" for {token}.
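    # The fields below are blank-padded to fixed widths ("%-12s", "%3d",
    # "%5d", "%3d") so that the columns line up for locus IDs of typical
    # length.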
    if token == "": prog_error("empty token")
    out.write("%-12s " % locid)
    out.write(" %3d" % ipar)
    out.write(" %5d" % ilin)
    out.write(" %3d" % itok)
    out.write(" %s\n" % token)
    return
# ----------------------------------------------------------------------

def parse_options():

    usage = (
        "words_from_ivt.py \\\n"
        "  [ -showLines ] [ -showParags ] \\\n"
        "  [ -omitInitial ] [ -omitFinal ] \\\n"
        "  [ -omitMedial ] \\\n"
        "  [ -dubiousSpaces NUM ] \\\n"
        "  < INFILE > OUTFILE"
    )

    o = dict()
    o['showLines'] = False
    o['showParags'] = False
    o['omitInitial'] = False
    o['omitMedial'] = False
    o['omitFinal'] = False
    o['dubiousSpaces'] = 50

    iarg = 1
    narg = len(sys.argv)
    while iarg < narg:
        arg = sys.argv[iarg]; iarg += 1
        arg = re.sub(r"^--", "-", arg)
        if arg == "-help" or arg == "-info":
            err.write(usage); err.write("\n")
            sys.exit(0)
        elif arg == "-showLines":
            o['showLines'] = True
        elif arg == "-showParags":
            o['showParags'] = True
        elif arg == "-omitInitial":
            o['omitInitial'] = True
        elif arg == "-omitMedial":
            o['omitMedial'] = True
        elif arg == "-omitFinal":
            o['omitFinal'] = True
        elif arg == "-dubiousSpaces":
            arg = sys.argv[iarg]; iarg += 1
            o['dubiousSpaces'] = float(arg)
        else:
            arg_error(f"invalid command line argument {arg}")

    # It simplifies things if {showLines} implies {showParags}:
    if o['showLines']: o['showParags'] = True

    if o['omitInitial'] and o['omitMedial'] and o['omitFinal']:
        arg_error("omitting everything!?")

    return o
# ----------------------------------------------------------------------

main()
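
# Usage example (illustrative; the input and output file names are hypothetical):
#
#   words_from_ivt.py -showParags -dubiousSpaces 30 < text.ivt > words.txt
#
# Each output line then has the fields "{LOCID} {IPAR} {ILIN} {ITOK} {TOKEN}",
# e.g. "f82v1.2;U  3  15  4 daiin" (spacing and values illustrative).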