#! /usr/bin/python3
# Last edited on 2025-11-01 10:16:52 by stolfi

# Reads an IVTFF file from standard input. Extracts plain parag text,
# excluding labels, titles, circular and radial text, etc. Removes the
# figure intrusion marks "<->".
#
# Writes to standard output one line per word, in the format
#
#   "{FNUM} {LSEQ} {IW} {JW} {WORD}"
#
# where {IW} is the word index from start of line, {JW} is index from
# end of line, {WORD} is a single word or "=" for parag break or "-"
# for line break.
#
# A lone "=" is also written after the last parag line of each page,
# including at the end of the file.

import os
import sys
import re
import ivtff_format as iv
import error_funcs as ef

# Locus types (the {posty} field) that start a new paragraph:
_HEAD_POSTYS = ("@P0", "@P1", "*P0")

# Locus types that continue the current paragraph:
_BODY_POSTYS = ("+P0", "+P1", "=P0")

# Second character of {posty} for labels, radial text, circular text:
_NON_PARAG_KINDS = ("L", "R", "C")

# Tails {posty[1:]} for titles, sector parag lines, radial lines:
_NON_PARAG_TAILS = ("Pt", "Pb", "Pc", "Pr")

# Hack fixes for individual loci. Maps {(page, lseq)} to the pair
# {(expected, replacement)}: the {posty} that the input is expected
# to have at that locus, and the {posty} to use instead.
_POSTY_FIXES = {
    ("f11v", 6): ("=P0", "+P0"),
    ("f36v", 5): ("+P1", "@P1"),
    ("f41v", 2): ("+P0", "@P0"),
    ("f49v", 2): ("=P0", "@P0"),
}

# "Weirdo" codes that are not really weirdos, and the plain letters
# that replace them. Keys are the numeric part of "&{NNN};".
_WEIRDO_EQUIVS = {
    "152": "d",
    "176": "k",
    "206": "r",
    "221": "a",
    "222": "y",
}


def _fix_posty(page, lseq, posty):
    # Returns the {posty} to use for locus {lseq} of {page}, applying
    # the hard-wired hack fixes. Fails with {AssertionError} if a locus
    # listed in {_POSTY_FIXES} does not have the expected {posty}.
    fix = _POSTY_FIXES.get((page, lseq))
    if fix is not None:
        expected, replacement = fix
        assert posty == expected, f"unexpected {posty = } at {page = } {lseq = }"
        return replacement
    if page == "f75v" and 5 <= lseq <= 12 and posty == "*P0":
        return "+P0"
    return posty
    # ....................................................................


def _clean_text(text):
    # Returns {text} with "<@...>"/"<&...>" constructs deleted, the
    # separators "." and "," and the figure intrusion marks "<->"
    # turned into blanks, the codes in {_WEIRDO_EQUIVS} turned into
    # plain letters, and leading and trailing whitespace removed.
    text = text.replace("@", "&")
    text = re.sub(r"<[&][^<>]*>", "", text)
    text = re.sub(r"[.,]", " ", text)
    text = text.replace("<->", " ")
    # Normalize weirdos that are not weirdos:
    for code, letter in _WEIRDO_EQUIVS.items():
        text = text.replace(f"&{code};", letter)
    return text.strip()
    # ....................................................................


def main():
    # Reads the IVTFF file from {sys.stdin} with {iv.line_loop}, writes
    # the word lines to {sys.stdout}, and writes the line counts
    # returned by {iv.line_loop} to {sys.stderr}.

    # Page, locus number, and word of the last line written to stdout.
    # All three are {None} at the start and just after a page was closed.
    last_page = None
    last_lseq = None
    last_word = None

    def write_word(nread, line, page, lseq, iw, jw, word):
        # Writes one line with data {page,lseq,iw,jw,word}.
        # The {nread,line} are for error messages.
        #
        # Reports an error if {word} is a break ("=" or "-") that
        # immediately follows another break on the same page. The line
        # is written anyway if {data_error} returns.
        nonlocal last_page, last_lseq, last_word
        breaks = ("=", "-")
        if word in breaks and page == last_page and last_word in breaks:
            data_error(nread, f"multiple line/parag breaks {page = } {last_page = } {last_word = }", line)
        sys.stdout.write("%-6s %3d %3d %3d %s\n" % (page, lseq, iw, jw, word))
        last_page = page
        last_lseq = lseq
        last_word = word
        return
        # ....................................................................

    def proc_line(nread, ndata, npage, line, page, lseq, posty, trans, text):
        # Line handler passed to {iv.line_loop}. Writes the words of one
        # parag text line, preceded by "=" (parag head line) or "-"
        # (parag body line). Ignores labels, titles, and radial and
        # circular text.
        #
        # Does nothing unless both {page} and {lseq} are given. The
        # {nread,line} are used for error messages; {ndata,npage,trans}
        # are not used here.
        nonlocal last_page, last_lseq, last_word
        if page is None or lseq is None:
            return

        posty = _fix_posty(page, lseq, posty)
        text = _clean_text(text)

        # Ensure each page ends with "=":
        if last_page is not None and last_lseq is not None and last_page != page:
            write_word(nread, None, last_page, last_lseq + 1, 0, 0, "=")
            last_page = None
            last_lseq = None
            last_word = None

        # Select parag text lines and prefix them with the break marker:
        if posty is None:
            data_error(nread, "posty is None", line)
        elif posty in _HEAD_POSTYS:
            # Parag head line:
            text = "= " + text
        elif posty in _BODY_POSTYS:
            # Parag body line:
            if last_page is None or last_page != page:
                data_error(nread, f"body line without head line", line)
            text = "- " + text
        elif posty[1:2] in _NON_PARAG_KINDS:
            # Label, radial text, circular text:
            text = None
        elif posty[1:] in _NON_PARAG_TAILS:
            # Title, sector parag line, radial line:
            text = None
        else:
            # Slicing above (rather than indexing) ensures that a
            # too-short {posty} ends up here instead of raising.
            data_error(nread, f"invalid {posty = }", line)

        # Write the words, if any:
        if text is not None and text != "" and text != "-":
            words = text.split()
            nw = len(words)
            for iw, word in enumerate(words):
                write_word(nread, line, page, lseq, iw, nw - 1 - iw, word)
        return
        # ......................................................................

    nread, ndata, npage = iv.line_loop(sys.stdin, proc_line, data_error)
    sys.stderr.write(f"{nread = }\n")
    sys.stderr.write(f"{ndata = }\n")
    sys.stderr.write(f"{npage = }\n")

    # Ensure last page ends with "=":
    if last_page is not None and last_lseq is not None:
        write_word(nread + 1, None, last_page, last_lseq + 1, 0, 0, "=")
    sys.stdout.flush()
    return
    # ----------------------------------------------------------------------


def data_error(lnum, msg, line):
    # Reports the error {msg} about line number {lnum} of the input,
    # whose contents is {line} (may be {None}), through
    # {ef.file_line_error}, with "-" as the file name.
    # NOTE(review): whether {ef.file_line_error} returns or aborts is
    # not visible from this file; callers here tolerate either.
    ef.file_line_error("-", lnum, msg, line)
    return
    # ----------------------------------------------------------------------


main()