#! /usr/bin/python3
# Last edited on 2026-01-18 21:43:57 by stolfi

import sys, os, re, random
from sys import stdout as out, stderr as err
from process_funcs import bash
from error_funcs import arg_error, file_line_error, prog_error
import ivtff_format as iv
from math import floor, ceil

def main():

    o = parse_options()

    sys.stdout.reconfigure(encoding='iso-8859-1')
    sys.stdin.reconfigure(encoding='iso-8859-1')

    # Reads an ".ivt" file from {stdin} and writes its tokens to {stdout},
    # one per line.
    #
    # Ignores #-comments, blank lines, and page headers "<[a-z0-9]+> ...".
    # Ignores page breaks, treating them as line breaks.
    #
    # Each data line must have a valid locus ID
    # "<{FNUM}[.]{LSEQ}{TRANS}{POSTY}>", then at least one blank, and then
    # the transcription data {TEXT}.
    #
    # Assumes that a parag has "<%>" at the start of the {TEXT} of its
    # head line and "<$>" at the end of the {TEXT} of its tail line
    # (which may be the same as the head). These are noted and removed.
    # If these markers occur in the file, they must be paired, in that
    # order, and not nested.
    #
    # Tokens must be sequences of letters [A-Za-z], ligature braces '{}',
    # '?' characters, and weirdo codes "&{NNN}", where {NNN} is always
    # three digits; an optional ";" after the {NNN} is allowed. Every
    # weirdo code is turned into a single '?'; everything else in a token
    # is written unchanged to the output.
    #
    # There must be exactly one separator [-.,] between two consecutive
    # tokens in the {TEXT}, and no separator at the front of {TEXT}. A
    # single separator at the end of the {TEXT} is tolerated and ignored.
    #
    # A line break is either a "physical" line break in the file, or a '-'
    # in the middle of the text of a line. A parag break is defined as a
    # transition between two physical lines where either the previous one
    # was a parag tail or the next one is a parag head.
    #
    # If {showLines} is true, input line breaks are translated into a
    # blank line, and input paragraph breaks are translated into two
    # blank lines.
    #
    # If {showLines} is false but {showParags} is true, input line breaks
    # are treated like word spaces, but paragraph breaks are translated
    # into an empty line.
    #
    # If both {showLines} and {showParags} are false (the default), all
    # tokens in the file are written sequentially without any blank
    # lines. That is, line breaks and parag breaks are treated as word
    # spaces.
    #
    # If {omitInitial} is true, tokens immediately following a line or
    # figure break are omitted from the output. The option {omitFinal} is
    # symmetric, and {omitMedial} discards any tokens that are *not*
    # adjacent to a line or figure break. The default is all three false;
    # that is, output all the words of each line.
    #
    # The {dubiousSpaces} parameter should be a number {p} in 0..100. If
    # {p} is 0, all dubious spaces ',' will be ignored. If {p} is 100,
    # they will all be treated as word spaces, same as '.'. Otherwise the
    # script will choose between the two options at random, with
    # probability {p/100}.
    #
    # Each output line has fields
    #
    #   "{LOCID} {IPAR} {ILIN} {ITOK} {TOKEN}"
    #
    # where
    #
    #   {TOKEN} is a token.
    #
    #   {LOCID} is the locus ID of the line containing {TOKEN}
    #     (such as "f82v1.2;U"). If the line has no locus ID,
    #     "-" is printed instead.
    #
    #   {IPAR} is the sequential number of the paragraph in the file,
    #     starting from 1; or 0 if the line is not part of a parag.
    #
    #   {ILIN} is the sequential number of the data line in the file,
    #     starting from 1, not counting #-comments, blank lines, and
    #     page header lines.
    #
    #   {ITOK} is the index of the token in the line, starting from 1.
    #
    # We first reduce the input file to a stream of tokens alternating
    # with single separators (".", "-", or "="). This stream is fed
    # through a filter {process_token} that converts the separators to
    # blank lines, as requested, and omits line-initial, -medial, or
    # -final tokens, as requested.

    inParag = False    # Becomes true within a parag.
    prevTail = False   # True if there was a prev line and it was a parag tail.
    prComma = o['dubiousSpaces'] / 100  # Prob of comma becoming dot.
    random.seed(171717)

    npara = 0  # Total number of parags seen.
    ntoks = 0  # Number of tokens parsed.
    nwrit = 0  # Number of tokens written.

    def process_line(nread, ndata, npage, line, page, lseq, posty, trans, text):
        nonlocal o, npara, ntoks, nwrit, inParag, prComma

        def input_error(msg):
            nonlocal nread, line
            data_error(nread, msg, line)
        # ....................................................................

        def process_text(locid, text):
            nonlocal o, npara, ntoks, nwrit, inParag, prComma, prevTail
            # Updates {npara,ntoks,nwrit,inParag,prevTail}.

            # Remove inline comments "<!...>":
            text = re.sub(r"<![^<>]*>", "", text)
            # Remove blanks:
            text = re.sub(r"[ ]", "", text)
            # Remove rail alignment markers:
            text = re.sub(r"[«=»]", "", text)
            # Reduce all weirdo codes to '?':
            text = re.sub(r"[&][0-9][0-9][0-9][;]?", "?", text)

            if text == "": input_error("line with no tokens")

            # Remove parag markers "<%>" and "<$>",
            # setting {thisHead} and {thisTail} instead:
            if re.match(r"<%>", text):
                if inParag: input_error("missing end-of-parag marker")
                text = re.sub(r"^<[%]>", "", text)
                inParag = True
                thisHead = True
                npara += 1
            else:
                thisHead = False
            if re.search(r"<[$]>$", text):
                if not inParag: input_error("missing start-of-parag marker")
                text = re.sub(r"<[$]>$", "", text)
                thisTail = True
            else:
                thisTail = False

            # The {prevBreak} variable is the delimiter before the current
            # token, either 0 (word space), 1 (line or figure break), or 2
            # (parag break).

            # Drop any trailing separator and append a "-" to simplify parsing:
            text = re.sub(r"[-.,]$", "", text) + "-"

            text = fix_dubious_spaces(text, prComma, input_error)

            # Separator before first token:
            prevBreak = 2 if prevTail or thisHead else 1

            # Loop on tokens:
            itok = 0
            while True:
                m = re.match(r"([a-zA-Z?{}]*)([-.])", text)
                if m == None: break
                ntoks += 1
                itok += 1
                # Isolate the next token with its following delimiter:
                token = m.group(1)
                delim = m.group(2)
                text = text[len(token) + len(delim):]
                # Determine the type of break {nextBreak} after this token.
                # It is 1 at end of text, not 2, even if this is a parag tail.
                if delim == ".":
                    nextBreak = 0
                elif delim == "-":
                    nextBreak = 1
                else:
                    prog_error("token delim")
                # err.write(f"!! {prevBreak} {token} {nextBreak}\n")
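                # Write this token with its locus ID and indices, unless it
                # is excluded by the -omitInitial/-omitMedial/-omitFinal
                # options; {process_token} also emits any blank lines
                # implied by {prevBreak} and the -show options: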
                ilin = ndata
                ipar = npara if inParag else 0
                if token == "": input_error("empty token")
                nwrit = process_token(o, locid, ipar, ilin, itok, prevBreak, token, nextBreak, nwrit)
                prevBreak = nextBreak

            if text != "": input_error(f"invalid char in line '{text[0:1]}'")

            if thisTail: inParag = False
            prevTail = thisTail
            return
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        line = line.strip()
        if page == None:
            # Comment or blank line:
            return
        elif lseq == None:
            # Page header line:
            return
        else:
            # Rebuild locus ID:
            locid = f"{page}.{lseq}"
            if posty != None: locid = locid + "," + posty
            if trans != None: locid = locid + ";" + trans
            process_text(locid, text)
        return
    # ....................................................................

    def data_error(nread, msg, line):
        file_line_error("-", nread, msg, line)
    # ....................................................................

    nread, ndata, npage = iv.line_loop(sys.stdin, process_line, data_error)

    if inParag: data_error(nread, "missing end-of-parag marker", "")

    err.write(f"{nread:5d} lines read\n")
    err.write(f"{ndata:5d} data lines parsed\n")
    err.write(f"{npara:5d} parags seen\n")
    err.write(f"{ntoks:5d} tokens parsed\n")
    err.write(f"{nwrit:5d} tokens written\n")
    return
# ----------------------------------------------------------------------

def fix_dubious_spaces(text, prComma, input_error):
    # Replaces each dubious space ',' by '.' with probability {prComma},
    # else deletes it.
    new_text = ""
    while True:
        m = re.match(r"([-.a-zA-Z?{}]*)[,]", text)
        if m == None: break
        piece = m.group(1)
        if piece == "": input_error("comma at front or doubled")
        text = text[len(piece)+1:]
        new_text += piece
        if random.random() < prComma: new_text += "."
    return new_text + text
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def process_token(o, locid, ipar, ilin, itok, prevBreak, token, nextBreak, nwrit):
    # {prevBreak} must be 0 for word space, 1 for line break, 2 for parag break.
    # {nextBreak} must be 0 for word space, 1 for line break or parag break.
    # {nwrit} must be the number of tokens written so far.
    # Returns {nwrit}, incremented if the token was written.

    # Decide number of blank lines to write:
    if prevBreak == 2:
        blanks = 2 if o['showLines'] else (1 if o['showParags'] else 0)
    elif prevBreak == 1:
        blanks = 1 if o['showLines'] else 0
    else:
        blanks = 0

    if prevBreak == 0 and nextBreak == 0:
        # Token is text-medial:
        omit = o['omitMedial']
    else:
        omit = False
    if prevBreak >= 1:
        # Token is line-initial:
        omit |= o['omitInitial']
    if nextBreak >= 1:
        # Token is line-final:
        omit |= o['omitFinal']

    if not omit:
        # Output token:
        if nwrit > 0:
            for kb in range(blanks): out.write("\n")
        output_token(locid, ipar, ilin, itok, token)
        nwrit += 1
    return nwrit
# ..................................................................

def output_token(locid, ipar, ilin, itok, token):
    # Writes one output line "{LOCID} {IPAR} {ILIN} {ITOK} {TOKEN}" for {token}.
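    # The fields below are blank-padded to fixed widths ("%-12s", "%3d",
    # "%5d", "%3d") so that the columns line up for locus IDs of typical
    # length.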
    if token == "": prog_error("empty token")
    out.write("%-12s " % locid)
    out.write(" %3d" % ipar)
    out.write(" %5d" % ilin)
    out.write(" %3d" % itok)
    out.write(" %s\n" % token)
    return
# ----------------------------------------------------------------------

def parse_options():

    usage = (
        "words_from_ivt.py \\\n"
        "  [ -showLines ] [ -showParags ] \\\n"
        "  [ -omitInitial ] [ -omitFinal ] \\\n"
        "  [ -omitMedial ] \\\n"
        "  [ -dubiousSpaces NUM ] \\\n"
        "  < INFILE > OUTFILE"
    )

    o = dict()
    o['showLines'] = False
    o['showParags'] = False
    o['omitInitial'] = False
    o['omitMedial'] = False
    o['omitFinal'] = False
    o['dubiousSpaces'] = 50

    iarg = 1
    narg = len(sys.argv)
    while iarg < narg:
        arg = sys.argv[iarg]; iarg += 1
        arg = re.sub(r"^--", "-", arg)
        if arg == "-help" or arg == "-info":
            err.write(usage); err.write("\n")
            sys.exit(0)
        elif arg == "-showLines":
            o['showLines'] = True
        elif arg == "-showParags":
            o['showParags'] = True
        elif arg == "-omitInitial":
            o['omitInitial'] = True
        elif arg == "-omitMedial":
            o['omitMedial'] = True
        elif arg == "-omitFinal":
            o['omitFinal'] = True
        elif arg == "-dubiousSpaces":
            arg = sys.argv[iarg]; iarg += 1
            o['dubiousSpaces'] = float(arg)
        else:
            arg_error(f"invalid command line argument {arg}")

    # It simplifies things if {showLines} implies {showParags}:
    if o['showLines']: o['showParags'] = True

    if o['omitInitial'] and o['omitMedial'] and o['omitFinal']:
        arg_error("omitting everything!?")

    return o
# ----------------------------------------------------------------------

main()
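
# Usage example (illustrative; the input and output file names are hypothetical):
#
#   words_from_ivt.py -showParags -dubiousSpaces 30 < text.ivt > words.txt
#
# Each output line then has the fields "{LOCID} {IPAR} {ILIN} {ITOK} {TOKEN}",
# e.g. "f82v1.2;U  3  15  4 daiin" (spacing and values illustrative).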