#! /usr/bin/python3
# Last edited on 2025-07-29 18:26:41 by stolfi

# A {python3} library module for parsing and writing IVTFF files.

from math import inf
import re
import sys

# Building blocks of the locator "<{page}[.{lseq}][,{posty}][;{trans}]>".
_PAT_PAGE = r"f[0-9]+[rv][0-9]?"                # Page f-number.
_PAT_LSEQ = r"[.][0-9]+"                        # Locus seq in page, with '.'.
_PAT_POSTY = r"[,][@+*=&~/!][A-Z][a-z0-9]"      # Locus position and type, with ','.
_PAT_TRANS = r"[;][A-Z][A-Za-z0-9]*"            # Transcriber code, with ';'.
_PAT_LOCID = f"<({_PAT_PAGE})({_PAT_LSEQ}|)({_PAT_POSTY}|)({_PAT_TRANS}|)>"

# A whole header or data line: locator, optional blanks, then the text.
_LOCID_LINE_RE = re.compile(f"{_PAT_LOCID} *(.*)")

# Prefix of a blank line or '#'-comment line (case 0 of {line_loop}).
_BLANK_OR_COMMENT_RE = re.compile(r"[ ]*([#]|$)")


def line_loop(rd, process_line, data_error):
    # Loops on lines of an IVTFF-ish file {rd}, like the
    # implicit loop of {gawk}.
    #
    # Each line is stripped of trailing whitespace and then matched
    # against these formats:
    #
    #   0. "[ ]*([#].*|)"  (blank or #-comment line)
    #
    #   1. "<{page}>[ ]*{text}[ ]*"  (page header line)
    #
    #   2. "<{page}[.]{lseq}(,{posty}|)(;{trans}|)>[ ]*{text}[ ]*"  (data line)
    #
    # where
    #
    #   {page} is a page f-number, like "f95v2";
    #
    #   {lseq} is an integer starting from 1 for each page, e. g. "32";
    #
    #   {posty} is Rene's "locator and type" code defined in the IVTFF
    #   document, e. g. "+P0";
    #
    #   {trans} is a transcriber code, namely any alphameric string
    #   starting with uppercase letter, like "U2";
    #
    #   {text} is either empty or any string that does not begin or end
    #   with spaces, e. g. "<%>daiin.dor..am" or "{$Q=AA $Z=33}".
    #
    # !!! Add support for {quire}+{qpage}+{lseq} loc ID format, like BC012. !!!
    #
    # For each line read from {rd}, this procedure calls
    #
    #   {process_line(nread,ndata,npage, line, page,lseq,posty,trans,text)}
    #
    # where
    #
    #   {nread} is the count of lines read so far, including the one just read.
    #
    #   {ndata} is the count of data (transcription) lines among those lines.
    #
    #   {npage} is the count of page headers, like "<f95v2> ...", among
    #   those lines.
    #
    #   {line} is the whole line, as a string, without the trailing
    #   blanks or newline.  It may be "" if the line is empty.
    #
    # The other arguments {page,lseq,posty,trans,text} are the fields
    # parsed from the line, as per the formats above:
    #
    #   If the line was blank or a '#'-comment (case 0), they are all {None}.
    #
    #   If the line was a page header (case 1), {page} and {text} will be
    #   strings (but the latter may be ""), while {lseq}, {posty} and
    #   {trans} will be {None}.
    #
    #   If the line was a data (transcription) line (case 2), {page} and
    #   {text} will be strings, and {lseq} will be an integer.  The
    #   arguments {posty} and {trans} may be strings or {None}.
    #
    # If a line does not match any of the formats above, this procedure
    # calls {data_error(nread, line, "invalid line format")}.  If
    # {data_error} returns, the invalid line is then treated as a comment.
    #
    # NOTE(review): earlier comments described the callback as
    # {data_error(nread, msg, line)}; the code has always passed the
    # line before the message, and that order is preserved here so as
    # not to break existing callbacks.  Confirm which order is intended.
    #
    # The procedure exits when {rd} hits end-of-file.  Then it returns the
    # counts {nread,ndata,npage}.
    nread = 0
    ndata = 0
    npage = 0
    while True:
        line = rd.readline()
        if line == "":
            # End of file (a blank line would still have its newline):
            return nread, ndata, npage
        nread += 1
        line = line.rstrip()
        page, lseq, posty, trans, text = parse_line(line, data_error)
        if page is None:
            # Comment, blank, or invalid line:
            if _BLANK_OR_COMMENT_RE.match(line) is None:
                data_error(nread, line, "invalid line format")
            if lseq is not None or text is not None:
                prog_error("parsing bug (1)", line)
            if posty is not None or trans is not None:
                prog_error("parsing bug (2)", line)
            process_line(nread, ndata, npage, line, None, None, None, None, None)
        elif lseq is None:
            # Page header:
            if posty is not None or trans is not None:
                prog_error("parsing bug (3)", line)
            npage += 1
            process_line(nread, ndata, npage, line, page, None, None, None, text)
        else:
            # Data (transcription) line:
            ndata += 1
            process_line(nread, ndata, npage, line, page, lseq, posty, trans, text)
    # ......................................................................


def parse_line(line, data_error):
    # Parses a line {line} from an IVTFF format file, as in cases 1 and 2
    # of the comments of {line_loop}.  Assumes that {line} has no trailing
    # spaces.
    #
    # In case 1, returns {page,None,None,None,text}.  The {page} and {text}
    # will be strings.  The latter may be "".
    #
    # In case 2, returns {page,lseq,posty,trans,text}.  The {page} and
    # {text} will be strings.  The latter may be "".  The {lseq} will be
    # an integer.  The {posty} and/or {trans} results will be non-empty
    # strings if present on the input, otherwise either or both will be
    # {None}.  The {text} will be either "" or a string without any
    # leading or trailing whitespace.
    #
    # If the {line} does not fit case 1 or case 2, including when it fits
    # case 0, this procedure returns {None,None,None,None,None}.  That
    # includes a locator that has {posty} or {trans} but no {lseq}, like
    # "<f1r;H>", which is neither a valid page header nor a valid data line.
    #
    # The {data_error} argument is currently not used by this procedure.
    #
    # !!! Add support for {quire}+{qpage}+{lseq} loc ID, like "BC012". !!!
    assert line is not None, "The {line} arg must not be {None}"

    m = _LOCID_LINE_RE.fullmatch(line)
    if m is None:
        # Not a header or data line:
        return None, None, None, None, None

    if m.lastindex != 5:
        prog_error("num fields = %d" % m.lastindex, line)

    # Each optional group matches "" when the field is absent; otherwise
    # it includes the leading delimiter ('.', ',' or ';'), removed here.
    page = m.group(1)
    lseq = m.group(2)
    lseq = None if lseq == "" else int(lseq[1:])
    posty = m.group(3)
    posty = None if posty == "" else posty[1:]
    trans = m.group(4)
    trans = None if trans == "" else trans[1:]
    text = m.group(5).strip()

    if lseq is None and (posty is not None or trans is not None):
        # A page header cannot have position/type or transcriber code:
        return None, None, None, None, None

    return page, lseq, posty, trans, text
    # ----------------------------------------------------------------------


def write_line(wr, fs, prefix):
    # Writes to file {wr} the line described by the field dict {fs},
    # whose keys are 'line', 'page', 'lseq', 'posty', 'trans', and 'text'
    # (with the meanings of the like-named results of {parse_line}).
    # The string {prefix} is written before the line proper.  Does NOT
    # write a final newline.
    #
    # The {fs} must not be {None} and at least {fs['line']} must not be
    # {None}.
    #
    # If {fs['page']} is {None}, then the other fields must be all {None}
    # and the string {fs['line']} is printed as it is.
    #
    # If {fs['page']} is not {None} but {fs['lseq']} is {None}, then
    # {fs['posty']} and {fs['trans']} must be {None}; prints the page
    # header line with the text {fs['text']}.  Ignores {fs['line']}.
    #
    # If both {fs['page']} and {fs['lseq']} are not {None}, prints
    # both, plus {fs['posty']} and/or {fs['trans']} if not {None}, and the
    # text {fs['text']}.  Ignores {fs['line']}.
    #
    # In the last two cases {fs['text']} must not be {None}, and the
    # prefix plus locator are left-justified and padded with blanks to 18
    # columns, so that the text starts on column 19 whenever they fit.
    #
    # !!! Add support for {quire}+{qpage}+{lseq} loc ID format, like BC012. !!!
    assert fs is not None, "arg {fs} must not be none"
    line = fs['line']
    assert line is not None, "field {fs['line']} must not be {None}"
    page = fs['page']
    lseq = fs['lseq']
    posty = fs['posty']
    trans = fs['trans']
    text = fs['text']

    if page is None:
        # Line must be comment, blank, or other non-text, non-header.
        assert lseq is None, "spurious fields {page,lseq}"
        assert posty is None and trans is None, "spurious fields {posty,trans}"
        assert text is None, "spurious {text} field"
        wr.write(prefix + line)
        return

    if lseq is None:
        # Page header:
        assert posty is None and trans is None, "spurious fields {posty,trans}"
        locid = f"<{page}>"
    else:
        # Data line; absent optional fields contribute nothing:
        posty_part = "" if posty is None else "," + posty
        trans_part = "" if trans is None else ";" + trans
        locid = f"<{page}.{lseq}{posty_part}{trans_part}>"

    assert text is not None
    # Left-justify, so that the locator's '<' is not preceded by blanks
    # and the output can be read back by {parse_line}:
    wr.write("%-18s%s" % (prefix + locid, text))
    # ----------------------------------------------------------------------


def prog_error(msg, line):
    # Reports an internal (programming) error: writes {msg} and {line}
    # to {sys.stderr}, omitting either if it is "", then raises
    # {AssertionError} with {msg}.  Never returns.
    if msg != "":
        sys.stderr.write(f"** {msg}\n")
    if line != "":
        sys.stderr.write(f" {line}\n")
    # Explicit raise, so that the check is not disabled by "python -O":
    raise AssertionError(msg)
    # ----------------------------------------------------------------------


def strip_comments(text):
    # Removes from an IVTFF text string all inline comments
    # and markers like "<%>", "=", etc.
    #
    # Specifically, deletes every "<!...>" comment that has no '<' or '>'
    # inside it, then every three-character "<c>" tag (any single
    # character {c} between '<' and '>'), and finally strips any of the
    # characters "-«=».," from both ends of the result.
    text = re.sub(r"[<][!][^<>]*[>]", "", text)
    text = re.sub(r"[<].[>]", "", text)
    text = text.strip("-«=».,")
    return text
    # ----------------------------------------------------------------------