#! /usr/bin/env python3
## Last edited on 2003-11-18 23:09:55 by stolfi
##
## prescript.py 0.1 -- a PostScript to text converter
## Copyright (C) 1996 Todd Reed
##
## Changed 2003-11-18 by Jorge Stolfi, Unicamp
##   Now looks for "prescript.ps" in sister "lib"
##   directory if PRESCRIPT_DIR is not set.
##
## Ported to Python 3: the long-removed "regex" and "regsub" modules
##   are replaced by "re", and several latent bugs are fixed (see the
##   NOTE comments in the code).
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
##

"""Convert PostScript text-fragment data into plain text or HTML.

The conversion runs in two passes over a ".dat" file of text fragments
(produced by running Ghostscript on "prescript.ps"): the first pass
gathers statistics (character width, line width, line spacing), the
second pass assembles fragments into lines and pages, which are then
classified, de-hyphenated and rendered by a formatter.
"""

import functools
import os
import posixpath
import re
import subprocess
import sys
from urllib.parse import unquote

# The data file is byte oriented (the ligature table below is written in
# terms of byte values), so it is decoded with a one-byte-per-character
# encoding that maps every byte to the code point of the same value.
_DATA_ENCODING = "latin-1"


class Fragment:
    """A unit of text together with its start and end points."""

    def __init__(self, x0, y0, s, x1, y1):
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.string = s

    def averageCharWidth(self):
        """Return the integer average width of one character.

        Returns 0 for an empty string instead of dividing by zero.
        """
        if not self.string:
            return 0
        # Floor division keeps the integer semantics of the original "/".
        return int((self.x1 - self.x0) // len(self.string))

    def concat(self, fragment):
        """Append `fragment` to this one, adopting its end point."""
        self.x1 = fragment.x1
        self.y1 = fragment.y1
        self.string = self.string + fragment.string

    def __str__(self):
        return "(" + repr(self.x0) + ", " + repr(self.y0) + ", " + self.string + ")"


def _readShowContinuation(data):
    """Skip nested "S" records and return the [x1, y1] fields that follow.

    Returns None when the file ends first or when the first non-"S"
    record does not consist of exactly two tab-separated fields.
    """
    while True:
        record = data.readline()
        if not record:
            return None
        if not record.startswith("S"):
            break
    fields = record.rstrip("\n").split("\t")
    if len(fields) != 2:
        return None
    return fields


##  Opens filename, a file containing PostScript text fragment data, and
##  calls worker.newPage() for each P data item and worker.textFragment()
##  for each S data item
##
def ReadPostScriptDataFile(filename, worker):
    """Feed the records of `filename` to `worker`.

    `worker` must provide newPage(), textFragment(fragment) and done().
    Records that cannot be parsed are reported with "Bad data line." and
    skipped; worker.done() is called once after the last record.
    """
    with open(filename, encoding=_DATA_ENCODING) as data:
        # readline() is used throughout because _readShowContinuation
        # consumes additional records from the same file object.
        for record in iter(data.readline, ""):
            if record.startswith("P"):
                worker.newPage()
                continue
            fields = record.rstrip("\n").split("\t")[:6]
            if not record.startswith("S") or len(fields) != 6:
                print("Bad data line.")
                continue
            _tag, x0, y0, string, x1, y1 = fields
            # If x1 is 'S', then some funny recursive font stuff has
            # happened.  Ignore the recursive stuff, and search for the
            # rest of this line.
            if x1 == "S":
                tail = _readShowContinuation(data)
                if tail is None:
                    print("Bad data line.")
                    continue
                x1, y1 = tail
            try:
                coords = (int(x0), int(y0), int(x1), int(y1))
            except ValueError:
                print("Bad data line.")
                continue
            string = unquote(string, encoding=_DATA_ENCODING)
            if string:
                worker.textFragment(
                    Fragment(coords[0], coords[1], string, coords[2], coords[3]))
    worker.done()


# x and y are tuples of the form (value, freq) from a histogram.
def CmpRVItems(x, y):
    """Three-way comparison of two histogram items by frequency."""
    if x[1] < y[1]:
        return -1
    elif x[1] > y[1]:
        return 1
    else:
        return 0


class RandomVariable:
    """A statistical random variable (rv).

    This implementation only builds a histogram of observations since
    no additional functionality is needed.
    """

    def __init__(self):
        self.histogram = {}
        self.data = []

    def addObservation(self, x):
        """Record one observation of value `x`."""
        self.data.append(x)
        self.histogram[x] = self.histogram.get(x, 0) + 1

    def computeStats(self):
        """Build self.observations: (value, freq) pairs, most frequent first.

        Values with equal frequency keep the order in which they were
        first observed.
        """
        self.observations = sorted(
            self.histogram.items(),
            key=functools.cmp_to_key(CmpRVItems),
            reverse=True)
        # After computing the stats, we don't need some attributes
        del self.histogram
        del self.data

    def mode(self, n):
        """Return the n-th most frequent value (n = 0 is the mode)."""
        return self.observations[n][0]


def _firstModeAtLeast(rv, threshold):
    """Return the magnitude of the most frequent value of `rv` whose
    magnitude is at least `threshold`.

    `rv` must already have had computeStats() called.  When no value
    qualifies, the largest observed magnitude is returned, or 0 when
    there are no observations at all (the original code raised
    IndexError in both cases).
    """
    magnitudes = [abs(value) for value, _freq in rv.observations]
    for magnitude in magnitudes:
        if magnitude >= threshold:
            return magnitude
    return max(magnitudes) if magnitudes else 0


#
# The first pass worker object -- gathers statistics about the text fragments
#
class FirstPassWorker:
    """Collects character-width and fragment-displacement statistics."""

    def __init__(self):
        self.fragment = None
        self.DeltaX = RandomVariable()
        self.DeltaY = RandomVariable()
        self.CharWidth = RandomVariable()

    def done(self):
        pass

    def newPage(self):
        # Displacements are not measured across a page boundary.
        self.fragment = None

    def textFragment(self, fragment):
        """Record the displacement from the previous fragment, if any."""
        if self.fragment is not None:
            deltaY = fragment.y0 - self.fragment.y1
            if deltaY != 0:
                self.DeltaY.addObservation(deltaY)
            # NOTE(review): the indentation of this file was lost; the
            # deltaX bookkeeping is assumed to be independent of the
            # deltaY test above -- confirm against a pristine copy.
            deltaX = fragment.x0 - self.fragment.x1
            if deltaX < 0:
                # A backwards jump in x: record its size.
                self.DeltaX.addObservation(-deltaX)
        self.CharWidth.addObservation(fragment.averageCharWidth())
        self.fragment = fragment

    def computeThresholds(self):
        """Return (charWidth, lineWidth, lineSpacing) estimates.

        charWidth is the most common average character width, lineWidth
        the most common backwards x jump of at least 5 character widths,
        and lineSpacing the most common y jump of at least one character
        width.  All three are 0 for a document without text.
        """
        self.CharWidth.computeStats()
        self.DeltaX.computeStats()
        self.DeltaY.computeStats()

        # Compute character width
        charWidth = self.CharWidth.mode(0) if self.CharWidth.observations else 0
        # Compute line width
        lineWidth = _firstModeAtLeast(self.DeltaX, 5 * charWidth)
        # Compute line spacing
        lineSpacing = _firstModeAtLeast(self.DeltaY, charWidth)

        return charWidth, lineWidth, lineSpacing


class Line:
    """A line of text, constructed from Fragments.

    The y coordinates of all Fragments are saved to determine the best
    value for the baseline.
    """

    # Line classifications to be assigned
    Header = 1
    Footer = 2
    PageNo = 3
    TOCEntry = 4
    BibEntry = 5
    Plain = 7

    def __init__(self):
        self.string = None
        self.baseline = []
        self.x0 = None
        self.y0 = None
        self.x1 = None
        self.y1 = None
        self.type = None
        # Additional attributes may be added later:
        #   self.newPara   if this line starts a new paragraph
        #   self.pageNo    if a page no was found
        #   self.lineBreak if the line has an explicit line break at the end

    def concat(self, fragment):
        """Append `fragment`, separated by one space from existing text."""
        if self.string:
            self.string = self.string + " " + fragment.string
        else:
            # First fragment: it also fixes the start point of the line.
            self.string = fragment.string
            self.x0 = fragment.x0
            self.y0 = fragment.y0
        self.x1 = fragment.x1
        self.y1 = fragment.y1
        self.baseline.append(fragment.y0)
        self.baseline.append(fragment.y1)

    # Determines the best y coordinate for the baseline
    #   On input, self.baseline is a list of observed y values.
    #   On output, self.baseline is assigned the median of self.baseline
    #   (changing the type of self.baseline from a list to a scalar)
    def computeBestBaseline(self):
        self.baseline.sort()
        self.baseline = self.baseline[len(self.baseline) // 2]

    def length(self):
        """Return the horizontal extent of the line."""
        return self.x1 - self.x0


class SecondPassWorker:
    """Assembles fragments into Lines, and Lines into pages (self.pages)."""

    def __init__(self, charWidth):
        self.charWidth = charWidth
        self.fragment = None
        self.line = Line()
        self.lines = []
        self.pages = []

    def done(self):
        """Flush the last page and settle the baseline of every line."""
        self.newPage()
        for page in self.pages:
            for line in page:
                line.computeBestBaseline()

    def _finishLine(self):
        # Close the current line with the pending fragment and start a
        # fresh one.
        self.line.concat(self.fragment)
        self.lines.append(self.line)
        self.line = Line()

    def newPage(self):
        """Flush the pending fragment/line and close the current page."""
        if self.fragment is not None:
            self._finishLine()
            self.fragment = None
        if self.lines:
            self.pages.append(self.lines)
            self.lines = []

    def textFragment(self, fragment):
        """Merge `fragment` into the pending word, the current line, or a
        new line, depending on its displacement from the pending fragment.
        """
        fragment.string = TranslateChars(fragment.string)
        pending = self.fragment
        if pending is None:
            self.fragment = fragment
            return

        jumpLimit = 2 * self.charWidth
        if (pending.x1 - fragment.x0 > jumpLimit
                or abs(fragment.y0 - pending.y1) > jumpLimit):
            # fragment starts a new line
            self._finishLine()
            self.fragment = fragment
            return

        avgCharWidth = min(pending.averageCharWidth(),
                           fragment.averageCharWidth())
        if fragment.x0 - pending.x1 <= 0.3 * avgCharWidth:
            # Close enough to be part of the same word.
            pending.concat(fragment)
        else:
            self.line.concat(pending)
            self.fragment = fragment


# Known ligatures and special characters (mostly for TeX sources), applied
# in this order.  Keys are byte values written as octal escapes.
_CHAR_TRANSLATIONS = (
    ('\013', 'ff'),
    ('\014', 'fi'),
    ('\015', 'fl'),
    ('\016', 'ffi'),
    ('\017', 'ffl'),
    ('\024', '<='),
    ('\025', '>='),
    ('\027A', 'AA'),
    ('\027a', 'aa'),
    ('\031', 'ss'),
    ('\032', 'ae'),
    ('\033', 'oe'),
    ('\034', 'o'),
    ('\035', 'AE'),
    ('\036', 'OE'),
    ('\037', 'O'),
    ('\256', 'fi'),
    ('\257', 'fl'),
    ('\366', 'fi'),
    ('\377', 'fl'),
)

# Anything outside printable ASCII (space .. tilde) once the table above
# has been applied.
_UNPRINTABLE = re.compile(r'[^ -~]')


#
# Translate some characters that are known ligatures (mostly for TeX sources)
#
def TranslateChars(string):
    """Expand known ligatures; replace other non-printables with '?'."""
    for old, new in _CHAR_TRANSLATIONS:
        string = string.replace(old, new)
    return _UNPRINTABLE.sub('?', string)


class Document:
    """Page-level markup: line classification, paragraph starts,
    explicit line breaks and de-hyphenation.
    """

    tocPattern = re.compile(r"(([.:] ?)+) *[0-9]+ *$")
    bibPattern = re.compile(r"^\[[A-Za-z0-9+]+\] *[A-Z]")
    # NOTE: the original wrote "\1" in a non-raw string, i.e. chr(1), so
    # this pattern could never match.  It is now a real backreference
    # ("- 12 -", "12", ...), anchored at the start of the line so that it
    # only matches lines consisting of a page number and leaves the
    # header/footer patterns reachable.
    pageNoPattern = re.compile(
        r"^ *(?P<decoration>[^0-9]*) *(?P<page>[0-9]+) *(?P=decoration) *$")
    headerPatternA = re.compile(r"^ *(?P<page>[0-9]+) +CHAPTER +[0-9]+")
    # NOTE(review): the symbolic group names were missing from the source
    # this was restored from; the trailing number is assumed to be the
    # page number -- confirm.
    headerPatternB = re.compile(
        r"^ *[0-9]+\.([0-9]+\.)* +([A-Za-z]+ +)+ *(?P<page>[0-9]+) *$")
    footerPattern = headerPatternB
    startParaPattern = re.compile(r"^ *[A-Z]")
    hyphenHeadPattern = re.compile(r"[a-z]-$")
    hyphenTailPattern = re.compile(r"^([^ ]+)[ ]*")

    def __init__(self, charWidth, lineWidth, lineSpacing):
        self.charWidth = charWidth
        self.minLineWidth = 0.8 * lineWidth
        self.maxLineSpacing = 1.1 * lineSpacing

    def markupPage(self, lines):
        """Annotate the Lines of one page in place.

        Lines emptied by de-hyphenation are removed from `lines`.
        """
        if not lines:
            return

        lastIndex = len(lines) - 1

        # Label lines, and mark those that start new paragraphs
        for i, line in enumerate(lines):
            self.classifyLine(lines, line, i, lastIndex)
            self.markParagraphs(lines, line, i, lastIndex)

        # Mark lines that have explicit line breaks
        for i, line in enumerate(lines):
            self.markLineBreaks(lines, line, i, lastIndex)

        # De-hyphenate lines.  This could cause some lines to disappear.
        blankLines = []
        for i, line in enumerate(lines):
            if not line.string:
                blankLines.append(i)
            else:
                self.deHyphenate(lines, line, i, lastIndex)

        # Delete from the end so the remaining indices stay valid.
        for index in reversed(blankLines):
            del lines[index]

    def classifyLine(self, lines, line, i, lastIndex):
        """Set line.type (and line.pageNo when a page number is found)."""
        text = line.string
        if self.tocPattern.search(text):
            line.type = Line.TOCEntry
            return
        if self.bibPattern.search(text):
            line.type = Line.BibEntry
            return

        # Page furniture is only looked for on the first/last line of a
        # page; the order below is the precedence of the checks.
        candidates = []
        if i == 0 or i == lastIndex:
            candidates.append((self.pageNoPattern, Line.PageNo))
        if i == 0:
            candidates.append((self.headerPatternA, Line.Header))
            candidates.append((self.headerPatternB, Line.Header))
        if i == lastIndex:
            # NOTE: the original assigned Line.Footer() here, which calls
            # an int and raises TypeError.
            candidates.append((self.footerPattern, Line.Footer))

        for pattern, lineType in candidates:
            match = pattern.search(text)
            if match:
                line.type = lineType
                line.pageNo = int(match.group('page'))
                return

        line.type = Line.Plain

    def markParagraphs(self, lines, line, i, lastIndex):
        """Set line.newPara if the line should start a new paragraph."""
        if i == 0:
            line.newPara = 1
            return

        previous = lines[i - 1]
        if (abs(line.baseline - previous.baseline) > self.maxLineSpacing
                or (line.type == Line.TOCEntry
                    and previous.type != Line.TOCEntry)
                or line.type == Line.BibEntry):
            line.newPara = 1
        elif i < lastIndex:
            # Even though the line spacing doesn't indicate a new
            # paragraph, we may start a new para.
            # A line looks like the beginning of a new para if:
            #    it is further right than the line above and below,
            #    and the line below is the longest line of the three,
            #    and the line starts with a capital.
            following = lines[i + 1]
            indent = 2 * self.charWidth
            longest = max(previous.length(), line.length(), following.length())
            if (line.x0 > previous.x0 + indent
                    and line.x0 > following.x0 + indent
                    and following.length() == longest
                    and line.length() >= 0.9 * following.length()
                    and self.startParaPattern.search(line.string)):
                line.newPara = 1

    def markLineBreaks(self, lines, line, i, lastIndex):
        """Set line.lineBreak on short lines inside a paragraph."""
        if (i < lastIndex
                and line.length() < self.minLineWidth
                and not hasattr(lines[i + 1], 'newPara')):
            line.lineBreak = 1

    def deHyphenate(self, lines, line, i, lastIndex):
        """Move the first word of the next line up to complete a word
        hyphenated at the end of `line`.
        """
        if i >= lastIndex or hasattr(lines[i + 1], 'newPara'):
            return
        if not self.hyphenHeadPattern.search(line.string):
            return
        following = lines[i + 1]
        tail = self.hyphenTailPattern.search(following.string)
        if not tail:
            return
        # Drop the hyphen, then append the rest of the word.
        line.string = line.string[:-1] + tail.group(1)
        # The pattern is anchored at the start, so removing the match is
        # the same as slicing it off.
        following.string = following.string[tail.end():]


class Formatter:
    """Base class for output formatters; every hook does nothing."""

    def start(self):
        pass

    def end(self):
        pass

    def endPage(self):
        pass

    def line(self, l):
        pass

    def startParagraph(self):
        pass

    def lineBreak(self):
        pass

    def explicitLineBreak(self):
        pass


_HTML_ENTITIES = {'<': "&lt;", '>': "&gt;", '&': "&amp;", '"': "&quot;"}


def HTMLQuote(s):
    """Return `s` with the characters < > & " replaced by HTML entities."""
    return ''.join(_HTML_ENTITIES.get(c, c) for c in s)


class HTMLFormatter(Formatter):
    """Writes the document as simple HTML markup to `out`.

    NOTE(review): the tag strings were stripped from the source this was
    restored from and have been reconstructed (<HR>, <P>, <BR>, and <B>
    for headers/footers) -- confirm against a pristine copy.
    """

    def __init__(self, out):
        self.out = out

    def start(self):
        pass

    def end(self):
        pass

    def endPage(self):
        self.out.write("\n<HR>\n")

    def line(self, l):
        text = HTMLQuote(l.string)
        if l.type == Line.Header or l.type == Line.Footer:
            # Headers and footers are emphasised, and written once (the
            # original fell through and wrote the text a second time).
            self.out.write("<B>")
            self.out.write(text)
            self.out.write("</B>")
        else:
            self.out.write(text)

    def startParagraph(self):
        self.out.write("\n<P>")

    def lineBreak(self):
        self.out.write("\n")

    def explicitLineBreak(self):
        self.out.write("<BR>\n")


class PlainTextFormatter(Formatter):
    """Writes the document as plain text to `out`."""

    def __init__(self, out):
        self.out = out

    def start(self):
        pass

    def end(self):
        pass

    def endPage(self):
        pass

    def line(self, l):
        self.out.write(l.string)

    def startParagraph(self):
        self.out.write("\n")

    def lineBreak(self):
        self.out.write("\n")

    def explicitLineBreak(self):
        self.out.write("\n")


def RenderPage(formatter, page):
    """Send the marked-up Lines of one page to `formatter`."""
    for line in page:
        if hasattr(line, 'newPara'):
            formatter.startParagraph()
        formatter.line(line)
        if hasattr(line, 'lineBreak'):
            formatter.explicitLineBreak()
        else:
            formatter.lineBreak()
    formatter.endPage()


def RenderDocument(document, formatter, pages):
    """Mark up and render every page, bracketed by start()/end()."""
    formatter.start()
    for page in pages:
        document.markupPage(page)
        RenderPage(formatter, page)
    formatter.end()


def CalculateTextStats(file):
    """First pass: return (charWidth, lineWidth, lineSpacing) for `file`."""
    pass1 = FirstPassWorker()
    print("Computing statistics...")
    ReadPostScriptDataFile(file, pass1)
    return pass1.computeThresholds()


def FormatDocument(file, charWidth, lineWidth, lineSpacing):
    """Second pass: return the pages of `file` as lists of Lines.

    Only charWidth is used here; lineWidth and lineSpacing are accepted
    for symmetry with CalculateTextStats.
    """
    pass2 = SecondPassWorker(charWidth)
    print("Formatting...")
    ReadPostScriptDataFile(file, pass2)
    return pass2.pages


def MakeFilename(sourceName, newExt):
    """Return the base name of `sourceName` with its extension replaced by
    `newExt`; the directory part is dropped.
    """
    _head, tail = posixpath.split(sourceName)
    root, _ext = posixpath.splitext(tail)
    return root + newExt


# start of program

def PostScriptToPSData(psFilename, datFilename):
    """Run Ghostscript on "prescript.ps" to extract the text fragments of
    `psFilename` into `datFilename`.

    "prescript.ps" is looked up in $PRESCRIPT_DIR, or else in the "lib"
    directory that is a sister of the directory holding this script.
    """
    if 'PRESCRIPT_DIR' in os.environ:
        libdir = os.environ['PRESCRIPT_DIR']
    else:
        # join() copes with an empty dirname (script started from its own
        # directory), where plain concatenation produced "/../lib".
        libdir = posixpath.join(posixpath.dirname(sys.argv[0]), "..", "lib")
    # An argument list (no shell) keeps file names containing spaces or
    # shell metacharacters from being misinterpreted.
    status = subprocess.call([
        "gs", "-q", "-dNODISPLAY",
        "-soutfile=%s" % datFilename,
        posixpath.join(libdir, "prescript.ps"),
        psFilename,
        "quit.ps",
    ])
    if status != 0:
        print("Warning: gs exited with status %d" % status, file=sys.stderr)


def PSDataToText(datFilename, formatter):
    """Convert the fragment data in `datFilename` using `formatter`."""
    charWidth, lineWidth, lineSpacing = CalculateTextStats(datFilename)
    pages = FormatDocument(datFilename, charWidth, lineWidth, lineSpacing)
    document = Document(charWidth, lineWidth, lineSpacing)
    RenderDocument(document, formatter, pages)


def main():
    """Command-line entry point: prescript format input [output]."""
    if len(sys.argv) < 3:
        print("Usage: prescript format input [output]")
        print("  format is either html or plain")
        sys.exit(1)

    format = sys.argv[1]
    inputFilename = sys.argv[2]

    # Validate the format before any file is created.
    knownFormats = {
        'html': (HTMLFormatter, ".html"),
        'plain': (PlainTextFormatter, ".txt"),
    }
    if format not in knownFormats:
        print("Unknown format %s." % format)
        sys.exit(1)
    formatterClass, defaultExt = knownFormats[format]

    generatedDat = inputFilename.endswith(".ps")
    if generatedDat:
        # Make a .dat file from the .ps file
        datFilename = MakeFilename(inputFilename, ".dat")
        print("Running %s" % inputFilename)
        PostScriptToPSData(inputFilename, datFilename)
    else:
        datFilename = inputFilename

    if len(sys.argv) == 4:
        outFilename = sys.argv[3]
    else:
        outFilename = MakeFilename(datFilename, defaultExt)

    try:
        with open(outFilename, "w") as outFile:
            PSDataToText(datFilename, formatterClass(outFile))
    finally:
        # Only remove the intermediate file we created ourselves; the
        # original also deleted a .dat file supplied by the user.
        if generatedDat and os.path.exists(datFilename):
            os.unlink(datFilename)


if __name__ == '__main__':
    main()