#! /usr/bin/python3
# Last edited on 2025-07-29 18:26:41 by stolfi
# A {python3} library module for parsing and writing IVTFF files.

from math import inf
import sys, re

def line_loop(rd, process_line, data_error):
  # Loops over the lines of an IVTFF-ish file {rd}, like the implicit
  # loop of {gawk}.  The {rd} object only needs a {readline()} method
  # that returns "" at end-of-file.
  #
  # Each line is stripped of trailing whitespace (including the newline)
  # and then classified by {parse_line} and by a blank/comment pattern
  # into one of these formats:
  #
  #    0. "[ ]*([#].*|)"  (blank or #-comment line)
  #
  #    1. "<{page}>[ ]*{text}[ ]*" (page header line)
  #
  #    2. "<{page}[.]{lseq}(,{posty}|)(;{trans}|)>[ ]*{text}[ ]*" (data line)
  #
  # where
  #
  #    {page} is a page name like "f95v2";
  #
  #    {lseq} is the line's sequence number within the page, e. g. "32";
  #
  #    {posty} is Rene's "locator and type" code defined in the IVTFF
  #      document, e. g. "+P0";
  #
  #    {trans} is a transcriber code, namely any alphameric string
  #      starting with an uppercase letter, like "U2";
  #
  #    {text} is either empty or any string that does not begin or end
  #      with spaces, e. g. "<%>daiin.dor<!s?>..am" or "{$Q=AA $Z=33}".
  #
  # !!! Add support for {quire}+{qpage}+{lseq} loc ID format, like BC012. !!!
  #
  # For each line read from {rd}, this procedure calls
  #
  #  {process_line(nread,ndata,npage, line, page,lseq,posty,trans,text)}
  #
  # where
  #
  #   {nread} is the count of lines read so far, including this one.
  #
  #   {ndata} is the count of data (transcription) lines so far,
  #     including this one if it is a data line.
  #
  #   {npage} is the count of page headers, like "<f85v2> ...", so far,
  #     including this one if it is a page header.
  #
  #   {line} is the whole line, as a string, without trailing blanks or
  #     newline.  It may be "" if the line is empty.
  #
  # The other arguments {page,lseq,posty,trans,text} are the fields
  # parsed from the line:
  #
  #   If the line was blank, a '#'-comment (case 0), or invalid, they
  #   are all {None}.
  #
  #   If the line was a page header (case 1), {page} and {text} are
  #   strings (the latter may be ""), and {lseq}, {posty}, {trans} are
  #   {None}.
  #
  #   If the line was a data line (case 2), {page} and {text} are
  #   strings, {lseq} is an integer, and {posty} and {trans} are each
  #   either a string or {None}.
  #
  # This procedure calls {data_error(nread, msg, line)} if a line does
  # not match any of the formats above.  If {data_error} returns, the
  # invalid line is then passed to {process_line} as if it were a
  # comment.
  #
  # The procedure exits when {rd} hits end-of-file, returning the
  # counts {nread,ndata,npage}.
  #
  nread = 0
  ndata = 0
  npage = 0
  while True:
    line = rd.readline()
    if line == "":
      # End of file (a blank line would still carry its newline):
      return nread, ndata, npage
    nread += 1
    line = line.rstrip()
    page, lseq, posty, trans, text = parse_line(line, data_error)
    if page is None:
      # Comment, blank, or invalid line:
      if re.match(r"[ ]*([#]|$)", line) is None:
        # Argument order is {nread, msg, line}, as documented above.
        data_error(nread, "invalid line format", line)
      # {parse_line} must return all-{None} when {page} is {None}:
      if lseq is not None or text is not None: prog_error("parsing bug (1)", line)
      if posty is not None or trans is not None: prog_error("parsing bug (2)", line)
      process_line(nread, ndata, npage, line, None, None, None, None, None)
    elif lseq is None:
      # Page header:
      if posty is not None or trans is not None: prog_error("parsing bug (3)", line)
      npage += 1
      process_line(nread, ndata, npage, line, page, None, None, None, text)
    else:
      # Data (transcription) line:
      ndata += 1
      process_line(nread, ndata, npage, line, page, lseq, posty, trans, text)
  # ......................................................................

def parse_line(line, data_error):
  # Parses a single {line} from an IVTFF format file, as in cases 1 and
  # 2 of the comments of {line_loop}.  Assumes that {line} is a string
  # with no trailing spaces or newline.
  #
  # Returns a tuple {page,lseq,posty,trans,text}:
  #
  #   In case 1 (page header, "<{page}>..."), {page} and {text} are
  #   strings (the latter may be ""), and {lseq}, {posty}, {trans} are
  #   all {None}.
  #
  #   In case 2 (data line, "<{page}.{lseq}...>..."), {page} and {text}
  #   are strings (the latter may be ""), and {lseq} is an integer.
  #   The {posty} and {trans} results are the corresponding fields
  #   without their leading ',' or ';', or {None} if absent.
  #
  #   In both cases {text} has leading and trailing whitespace removed.
  #
  #   If the {line} does not fit case 1 or case 2 -- including when it
  #   fits case 0 (blank or '#'-comment), and when a locator without
  #   {lseq} carries a {posty} or {trans} field -- the result is
  #   {None,None,None,None,None}.  Telling comments apart from invalid
  #   lines, and reporting the latter, is left to the caller.
  #
  # The {data_error} argument is currently not used; it is kept so that
  # the signature stays compatible with existing callers.
  #
  # Only page names of the form "f{digits}[rv]{optional digit}", like
  # "f95v2", are recognized.
  #
  # !!! Add support for {quire}+{qpage}+{lseq} loc ID, like "BC012". !!!

  assert line is not None, "The {line} arg must not be {None}"

  pat_page = r"f[0-9]+[rv][0-9]?"            # Page f-number.
  pat_lseq = r"[.][0-9]+"                    # Locus seq in page, with '.'.
  pat_posty = r"[,][@+*=&~/!][A-Z][a-z0-9]"  # Locus position and type, with ','
  pat_trans = r"[;][A-Z][A-Za-z0-9]*"        # Transcriber code, with ';'
  pat_locid = f"<({pat_page})({pat_lseq}|)({pat_posty}|)({pat_trans}|)>"

  not_parsed = (None, None, None, None, None)

  m = re.fullmatch(f"{pat_locid} *(.*)", line)
  if m is None:
    # Not a page header or data line:
    return not_parsed

  # All five groups can match the empty string, so the last one always
  # participates in a successful match:
  if m.lastindex != 5: prog_error("num fields = %d" % m.lastindex, line)

  page = m.group(1)
  # Optional fields come back as "" when absent; drop the 1-char delimiter.
  lseq = int(m.group(2)[1:]) if m.group(2) != "" else None
  posty = m.group(3)[1:] if m.group(3) != "" else None
  trans = m.group(4)[1:] if m.group(4) != "" else None

  if lseq is None and (posty is not None or trans is not None):
    # A page header cannot have a position/type or transcriber code.
    # This is a defect of the input data, not of the program, so it is
    # reported as "does not fit" instead of failing an assertion.
    return not_parsed

  text = m.group(5).strip()
  return page, lseq, posty, trans, text
  # ----------------------------------------------------------------------

def write_line(wr, fs, prefix):
  # Writes to file {wr} the line described by the field dict {fs},
  # preceded by the string {prefix}.  Does NOT write a final newline.
  #
  # The dict {fs} must not be {None} and must have the keys 'line',
  # 'page', 'lseq', 'posty', 'trans', and 'text', with values as
  # produced by {parse_line} for the last five.  The entry {fs['line']}
  # must not be {None}.
  #
  # If {fs['page']} is {None} (comment, blank, or other non-text line),
  # then the other parsed fields must all be {None}, and the string
  # {fs['line']} is written as it is after the {prefix}.
  #
  # If {fs['page']} is not {None} but {fs['lseq']} is {None}, then
  # {fs['posty']} and {fs['trans']} must be {None} too; writes a page
  # header line "<{page}>" followed by {fs['text']}.  Ignores
  # {fs['line']}.
  #
  # If both {fs['page']} and {fs['lseq']} are not {None}, writes a data
  # line with locator "<{page}.{lseq},{posty};{trans}>", where the
  # ",{posty}" and ";{trans}" parts are omitted if the respective field
  # is {None}, followed by {fs['text']}.  Ignores {fs['line']}.
  #
  # In the last two cases {fs['text']} must not be {None}, and the
  # {prefix} plus locator is left-justified and padded with blanks to
  # 18 characters, so that the text starts on column 19 whenever the
  # prefix plus locator fits in that width.
  #
  # !!! Add support for {quire}+{qpage}+{lseq} loc ID format, like BC012. !!!
  #
  assert fs is not None, "arg {fs} must not be none"
  assert fs['line'] is not None
  page = fs['page']
  lseq = fs['lseq']
  posty = fs['posty']
  trans = fs['trans']
  text = fs['text']
  if page is None:
    # Line must be comment, blank, or other non-text, non-header.
    assert lseq is None, "spurious fields {page,lseq}"
    assert posty is None and trans is None, "spurious fields {posty,trans}"
    assert text is None, "spurious {text} field"
    wr.write(prefix + fs['line'])
    return
  assert text is not None
  if lseq is None:
    # Page header:
    assert posty is None and trans is None, "spurious fields {posty,trans}"
    locid = f"<{page}>"
  else:
    # Data line; absent optional fields contribute nothing to the locator.
    posty_part = "" if posty is None else "," + posty
    trans_part = "" if trans is None else ";" + trans
    locid = f"<{page}.{lseq}{posty_part}{trans_part}>"
  # Left-justify, so that the '<' stays at the start of the line:
  wr.write("%-18s%s" % (prefix + locid, text))
  return
  # ----------------------------------------------------------------------

def prog_error(msg, line=""):
  # Reports an internal inconsistency (a bug in the program, as opposed
  # to an error in the input data).  Writes "** {msg}" to {sys.stderr}
  # if {msg} is not empty, then the offending {line}, indented by two
  # spaces, if it is not empty.  Then raises {AssertionError} with
  # message {msg}; this procedure never returns.
  #
  # The exception is raised explicitly instead of through an {assert}
  # statement, because the latter is removed when Python runs with
  # the "-O" option, which would let this procedure return silently.
  if msg: sys.stderr.write(f"** {msg}\n")
  if line: sys.stderr.write(f"  {line}\n")
  raise AssertionError(msg)
  # ----------------------------------------------------------------------

def strip_comments(text):
  # Returns a copy of the IVTFF text string {text} without its inline
  # comments "<!...>" and without one-character markers in angle
  # brackets, like "<%>".  Any run of the characters "-«=».," at
  # either end of the result is then trimmed off too.
  #
  # The patterns are applied in this order: comments first, then the
  # one-character markers.
  for pattern in (r"[<][!][^<>]*[>]", r"[<].[>]"):
    text = re.sub(pattern, "", text)
  return text.strip("-«=».,")
  # ----------------------------------------------------------------------