#! /usr/local/bin/python
## Last edited on 2003-11-18 23:09:55 by stolfi
## 
##     prescript.py 0.1 -- a PostScript to text converter
##     Copyright (C) 1996  Todd Reed
##
##     Changed 2003-11-18 by Jorge Stolfi, Unicamp
##       Now looks for "prescript.ps" in sister "lib"
##       directory if PRESCRIPT_DIR is not set.
##  
##     This program is free software; you can redistribute it and/or modify
##     it under the terms of the GNU General Public License as published by
##     the Free Software Foundation; either version 2 of the License, or
##     (at your option) any later version.
##  
##     This program is distributed in the hope that it will be useful,
##     but WITHOUT ANY WARRANTY; without even the implied warranty of
##     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##     GNU General Public License for more details.
##  
##     You should have received a copy of the GNU General Public License
##     along with this program; if not, write to the Free Software
##     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
## 
import os, sys, regex, posixpath, posix
from string import split, atoi, find
from regsub import gsub, sub
from urllib import unquote

class Fragment:
   """A run of text together with the points where it starts and ends.

   Attributes:
      x0, y0 -- coordinates of the start of the text
      x1, y1 -- coordinates of the end of the text
      string -- the text itself
   """

   def __init__(self, x0, y0, s, x1, y1):
      self.string = s
      self.x0, self.y0 = x0, y0
      self.x1, self.y1 = x1, y1

   def averageCharWidth(self):
      """Return the horizontal extent divided by the number of
      characters, converted to an int.

      Raises ZeroDivisionError when the string is empty.
      """
      extent = self.x1-self.x0
      return int(extent/len(self.string))

   def concat(self, fragment):
      """Append `fragment' to this one: take over its end point and
      add its text (no separator) to this fragment's text."""
      self.string = self.string+fragment.string
      self.x1 = fragment.x1
      self.y1 = fragment.y1

   def __str__(self):
      # repr() of the start coordinates, followed by the raw text.
      return "".join(["(", repr(self.x0), ", ", repr(self.y0), ", ",
                      self.string, ")"])

## Opens filename, a file containing PostScript text fragment data, and
## calls worker.newPage() for each P data item and worker.textFragment()
## for each S data item
## 
def ReadPostScriptDataFile(filename, worker):
   """Read the text fragment data in `filename' and feed it to `worker'.

   Each line is one record:

      * a line starting with "P" triggers worker.newPage();
      * a line starting with "S" that has at least six tab-separated
        fields (tag, x0, y0, string, x1, y1) triggers
        worker.textFragment() with a Fragment built from it.  The
        string is passed through unquote() first and records whose
        string is empty are skipped; the coordinates are converted
        to integers;
      * anything else is reported with "Bad data line." and skipped.

   Malformed records (wrong field count in a continuation line,
   non-numeric coordinates, end of file in the middle of a record) are
   reported the same way instead of raising.

   The file is closed even when the worker raises, and worker.done()
   is called once the whole file has been read.
   """
   data = open(filename)
   try:
      while 1:
         record = data.readline()
         if not record:
            break
         if record[0] == "P":
            worker.newPage()
            continue

         # Drop the line terminator only if there is one, so that a final
         # line without a newline does not lose its last character.
         if record[-1:] == "\n":
            record = record[:-1]
         fields = record.split('\t')[0:6]
         if record[:1] != "S" or len(fields) != 6:
            print("Bad data line.")
            continue
         x0, y0, string, x1, y1 = fields[1:]

         # If x1 is 'S', then some funny recursive font stuff has happened.
         # Ignore the recursive stuff, and search for the rest of this line
         if x1 == "S":
            while 1:
               record = data.readline()
               # record[:1] is "" at end of file, which also ends the skip.
               if record[:1] != "S":
                  break
            if record[-1:] == "\n":
               record = record[:-1]
            rest = record.split('\t')
            if len(rest) != 2:
               # Truncated file or unexpected continuation record.
               print("Bad data line.")
               continue
            x1, y1 = rest

         string = unquote(string)
         if len(string) == 0:
            continue

         # Convert before calling the worker so that only conversion
         # errors, never errors raised by the worker, are reported here.
         try:
            coords = [int(x0), int(y0), int(x1), int(y1)]
         except ValueError:
            print("Bad data line.")
            continue
         worker.textFragment(Fragment(coords[0], coords[1], string,
                                      coords[2], coords[3]))
   finally:
      data.close()
   worker.done()

# x and y are tuples of the form (value, freq) from a histogram.
def CmpRVItems(x, y):
   """Three-way comparison of two (value, freq) histogram items by
   their frequency: -1, 0 or 1."""
   return (x[1] > y[1]) - (x[1] < y[1])

class RandomVariable:
   """A statistical random variable, reduced to what this program
   needs: a histogram of the observed values.

   Usage: call addObservation() for every sample, then computeStats()
   once, then mode(n) to read the values from most to least frequent.
   """

   def __init__(self):
      self.histogram = {}   # observed value -> number of occurrences
      self.data = []        # every observation, in arrival order

   def addObservation(self, x):
      """Record one observation of the value x."""
      self.data.append(x)
      self.histogram[x] = self.histogram.get(x, 0)+1

   def computeStats(self):
      """Build self.observations, the list of (value, freq) pairs
      ordered by decreasing frequency, and discard the raw data.

      addObservation() can no longer be used afterwards because
      self.histogram and self.data are deleted.
      """
      ranked = self.histogram.items()
      ranked.sort(CmpRVItems)
      ranked.reverse()
      self.observations = ranked

      # After computing the stats, we don't need some attributes
      del self.histogram
      del self.data

   def mode(self, n):
      """Return the n-th most frequent value (n = 0 is the mode).

      Only valid after computeStats(); raises IndexError when there
      are not that many distinct values.
      """
      value, freq = self.observations[n]
      return value

#
# The first pass worker object -- gathers statistics about the text fragments
#
class FirstPassWorker:
   """First pass worker: gathers statistics about the text fragments.

   For every fragment it records the average character width and, when
   a previous fragment exists on the same page, the vertical and the
   backward horizontal distance between the two.  computeThresholds()
   turns these histograms into the document-wide measurements.
   """

   def __init__(self):
      self.fragment = None             # previous fragment on this page
      self.DeltaX = RandomVariable()   # sizes of backward jumps in x
      self.DeltaY = RandomVariable()   # non-zero changes in y
      self.CharWidth = RandomVariable()

   def done(self): pass

   def newPage(self):
      # Distances are never measured across a page boundary.
      self.fragment = None

   def textFragment(self, fragment):
      """Record the statistics contributed by `fragment'."""
      if self.fragment:
         deltaY = fragment.y0-self.fragment.y1
         if deltaY != 0:
            self.DeltaY.addObservation(deltaY)
         deltaX = fragment.x0-self.fragment.x1
         if deltaX < 0:
            # The fragment starts to the left of where the previous one
            # ended; the size of that jump is stored as a positive number.
            self.DeltaX.addObservation(-deltaX)
      self.CharWidth.addObservation(fragment.averageCharWidth())
      self.fragment = fragment

   def _firstModeAtLeast(self, rv, threshold):
      # Return the absolute value of the most frequent observation of
      # rv whose absolute value is at least `threshold', or 0 when there
      # is none.  rv.computeStats() must already have been called.
      for n in range(len(rv.observations)):
         value = abs(rv.mode(n))
         if value >= threshold:
            return value
      return 0

   def computeThresholds(self):
      """Return the tuple (charWidth, lineWidth, lineSpacing).

      charWidth   -- the most frequent average character width
      lineWidth   -- the most frequent backward jump in x that is at
                     least five character widths long
      lineSpacing -- the most frequent change in y (absolute value)
                     that is at least one character width

      A measurement for which the document provides no usable
      observation is returned as 0 instead of raising IndexError.

      Must be called only once, after all fragments have been seen,
      because it consumes the histograms.
      """
      # Compute character width

      self.CharWidth.computeStats()
      if self.CharWidth.observations:
         charWidth = self.CharWidth.mode(0)
      else:
         charWidth = 0

      # Compute line width

      self.DeltaX.computeStats()
      lineWidth = self._firstModeAtLeast(self.DeltaX, 5*charWidth)

      # Compute line spacing

      self.DeltaY.computeStats()
      lineSpacing = self._firstModeAtLeast(self.DeltaY, charWidth)

      return charWidth, lineWidth, lineSpacing

class Line:
   """A line of text assembled from Fragments.

   The y coordinates of every Fragment are collected in self.baseline
   so that computeBestBaseline() can later pick the best single value.
   """

   # Line classifications to be assigned
   Header = 1
   Footer = 2
   PageNo = 3
   TOCEntry = 4
   BibEntry = 5
   Plain = 7

   def __init__(self):
      self.string = None
      self.baseline = []
      self.x0 = self.y0 = None
      self.x1 = self.y1 = None
      self.type = None

      # Additional attributes may be added later:
      # self.newPara if this line starts a new paragraph
      # self.pageNo if a page no was found
      # self.lineBreak if the line has an explicit line break at the end

   def concat(self, fragment):
      """Append `fragment' to the line, separated from the existing
      text by one space.  The first fragment also fixes the start
      point; every fragment moves the end point."""
      if not self.string:
         self.string = fragment.string
         self.x0 = fragment.x0
         self.y0 = fragment.y0
      else:
         self.string = self.string+" "+fragment.string
      self.x1, self.y1 = fragment.x1, fragment.y1
      self.baseline.extend([fragment.y0, fragment.y1])

   def computeBestBaseline(self):
      """Replace the list of observed y values in self.baseline by a
      single value: the element in the middle of the sorted list.

      This changes the type of self.baseline from a list to a scalar,
      so it must be called exactly once per line.
      """
      observed = self.baseline
      observed.sort()
      self.baseline = observed[len(observed)//2]

   def length(self):
      """Return the horizontal extent of the line."""
      return self.x1-self.x0

class SecondPassWorker:
   """Second pass worker: joins fragments into Lines and Lines into
   pages, using the character width measured by the first pass.

   After done(), self.pages is a list of pages, each a list of Line
   objects whose baseline has been reduced to a single value.
   """

   def __init__(self, charWidth):
      self.charWidth = charWidth
      self.fragment = None   # fragment still being extended
      self.line = Line()     # line still being assembled
      self.lines = []        # finished lines of the current page
      self.pages = []        # finished pages

   def _finishLine(self):
      # Move the pending fragment into the current line, store the
      # line on the current page and start a fresh one.
      self.line.concat(self.fragment)
      self.lines.append(self.line)
      self.line = Line()

   def done(self):
      """Flush the last page and settle the baseline of every line."""
      self.newPage()
      for page in self.pages:
         for line in page:
            line.computeBestBaseline()

   def newPage(self):
      """Close the current line and page, if they hold anything."""
      if self.fragment:
         self._finishLine()
         self.fragment = None
      if self.lines:
         self.pages.append(self.lines)
         self.lines = []

   def textFragment(self, fragment):
      """Add `fragment' to the page: glue it to the pending fragment,
      start a new word on the same line, or start a new line."""
      fragment.string = TranslateChars(fragment.string)
      pending = self.fragment
      if pending is None:
         self.fragment = fragment
         return

      limit = 2*self.charWidth
      if pending.x1-fragment.x0 > limit or \
         abs(fragment.y0-pending.y1) > limit:
         # fragment starts a new line
         self._finishLine()
         self.fragment = fragment
         return

      avgCharWidth = min(pending.averageCharWidth(),
                         fragment.averageCharWidth())
      if fragment.x0-pending.x1 <= 0.3*avgCharWidth:
         # Close enough to be part of the same word.
         pending.concat(fragment)
      else:
         # A visible gap: the pending fragment is complete.
         self.line.concat(pending)
         self.fragment = fragment

#
# Translate some characters that are known ligatures (mostly for TeX sources)
#
def TranslateChars(string):
   """Return `string' with known ligature and special character codes
   spelled out in ASCII and every remaining control or 8-bit
   character replaced by "?"."""
   # Applied in this order; the two-character entries must come before
   # the final pass, which would otherwise turn their first character
   # into "?".
   spellings = (
      ('\013', 'ff'), ('\014', 'fi'), ('\015', 'fl'),
      ('\016', 'ffi'), ('\017', 'ffl'),
      ('\024', '<='), ('\025', '>='),
      ('\027A', 'AA'), ('\027a', 'aa'),
      ('\031', 'ss'), ('\032', 'ae'), ('\033', 'oe'), ('\034', 'o'),
      ('\035', 'AE'), ('\036', 'OE'), ('\037', 'O'),
      ('\256', 'fi'), ('\257', 'fl'),
      ('\366', 'fi'), ('\377', 'fl'),
   )
   for code, spelling in spellings:
      string = string.replace(code, spelling)

   # Whatever is still in the ranges 000-037 or 177-377 is unknown.
   cleaned = []
   for c in string:
      if c <= '\037' or '\177' <= c <= '\377':
         c = '?'
      cleaned.append(c)
   return ''.join(cleaned)

class Document:
   """Page markup heuristics.

   markupPage() takes the list of Line objects of one page and, in
   place, classifies every line, marks the lines that start a
   paragraph or end with an explicit line break, joins words that were
   hyphenated across lines and removes lines left without text.
   """

   # The patterns are written for the `regex' module: groups are
   # delimited by backslash-parenthesis and regex.symcomp() additionally
   # accepts named groups.  A search stores its result in the pattern
   # object itself, so group() must be called before the same pattern
   # is used for another search.

   # Text, then a run of dots or colons, then a number at the end.
   tocPattern = regex.compile("\(\([.:] ?\)+\) *[0-9]+ *$")
   # A bracketed citation key at the start, followed by a capital.
   bibPattern = regex.compile("^\[[A-Za-z0-9\+]+\] *[A-Z]")
   # NOTE(review): in this non-raw literal "\1" is an octal escape, so
   # the compiled pattern contains the character chr(1) rather than a
   # back-reference to the first group.  Line text built by
   # SecondPassWorker has been through TranslateChars, which turns
   # control characters into "?", so this pattern presumably never
   # matches.  Left unchanged on purpose: the pattern is not anchored at
   # the start, so a real back-reference would let it match any first or
   # last line that ends in digits and pre-empt the header and footer
   # tests in classifyLine -- confirm the intent before changing it.
   pageNoPattern = regex.symcomp(" *\([^0-9]*\) *\(<page>[0-9]+\) *\1 *$")
   headerPatternA = regex.symcomp("^ *\(<page>[0-9]+\) +CHAPTER +[0-9]+")
   headerPatternB = regex.symcomp("^ *[0-9]+\.\([0-9]+\.\)* +\([A-Za-z]+ +\)+ *\(<page>[0-9]+\) *$")
   footerPattern = headerPatternB
   startParaPattern = regex.compile("^ *[A-Z]")
   hyphenHeadPattern = regex.compile("[a-z]-$")
   hyphenTailPattern = regex.compile("^\([^ ]+\)[ ]*")

   def __init__(self, charWidth, lineWidth, lineSpacing):
      """Derive the markup thresholds from the measurements returned
      by the first pass."""
      self.charWidth = charWidth
      # Lines shorter than this may end with an explicit line break.
      self.minLineWidth = 0.8*lineWidth
      # A larger distance between baselines starts a new paragraph.
      self.maxLineSpacing = 1.1*lineSpacing

   def markupPage(self, lines):
      """Mark up the Line objects in `lines' (one page) in place.

      Every line must already have a scalar baseline.  Lines whose
      text became empty through de-hyphenation are deleted from the
      list.
      """
      if not lines: return
      lastIndex = len(lines)-1

      # Label lines, and mark those that start new paragraphs.  Both
      # happen in one loop because markParagraphs looks at the type of
      # the previous line.
      for i in range(len(lines)):
         self.classifyLine(lines, lines[i], i, lastIndex)
         self.markParagraphs(lines, lines[i], i, lastIndex)

      # Mark lines that have explicit line breaks;
      for i in range(len(lines)):
         self.markLineBreaks(lines, lines[i], i, lastIndex)

      # De-hyphenate lines.  This could cause some lines to disappear:
      # a line emptied while handling its predecessor is found blank
      # when its own turn comes.
      blankLines = []
      for i in range(len(lines)):
         if len(lines[i].string) == 0:
            blankLines.append(i)
         else:
            self.deHyphenate(lines, lines[i], i, lastIndex)

      # Delete from the end so that the remaining indices stay valid.
      blankLines.reverse()
      for i in blankLines:
         del lines[i]

   def classifyLine(self, lines, line, i, lastIndex):
      """Set line.type to one of the Line classifications and, where
      the matching pattern captures one, line.pageNo.

      Headers are only looked for on the first line of the page,
      footers on the last, page numbers on either.
      """
      if Document.tocPattern.search(line.string) >= 0:
         line.type = Line.TOCEntry
      elif Document.bibPattern.search(line.string) >= 0:
         line.type = Line.BibEntry
      elif (i == 0 or i == lastIndex) and \
           Document.pageNoPattern.search(line.string) >= 0:
         line.type = Line.PageNo
         line.pageNo = atoi(Document.pageNoPattern.group('page'))
      elif i == 0 and Document.headerPatternA.search(line.string) >= 0:
         line.type = Line.Header
         line.pageNo = atoi(Document.headerPatternA.group('page'))
      elif i == 0 and Document.headerPatternB.search(line.string) >= 0:
         line.type = Line.Header
         line.pageNo = atoi(Document.headerPatternB.group('page'))
      elif i == lastIndex and Document.footerPattern.search(line.string) >= 0:
         # Line.Footer is a plain constant; it must not be called.
         line.type = Line.Footer
         line.pageNo = atoi(Document.footerPattern.group('page'))
      else:
         line.type = Line.Plain

   def markParagraphs(self, lines, line, i, lastIndex):
      """Set line.newPara when the line should start a new paragraph.

      Requires lines[i-1] to be classified already.
      """
      if i == 0:
         line.newPara = 1
         return

      previous = lines[i-1]
      if abs(line.baseline-previous.baseline) > self.maxLineSpacing or \
         (line.type == Line.TOCEntry and
          previous.type != Line.TOCEntry) or \
         line.type == Line.BibEntry:
         line.newPara = 1
      elif i < lastIndex:

         # Even though the line spacing doesn't indicate a new
         # paragraph, we may start a new para.
         # A line looks like the beginning of a new para if:
         # it is further right than the line above and below,
         # and the line below is the longest line of the three,
         # and the line is at least 0.9 times as long as the line below,
         # and the line starts with a capital.

         following = lines[i+1]
         if line.x0 > previous.x0+2*self.charWidth and \
            line.x0 > following.x0+2*self.charWidth and \
            following.length() == max([previous.length(),
                                       line.length(),
                                       following.length()]) and \
            line.length() >= 0.9*following.length() and \
            Document.startParaPattern.search(line.string) >= 0:
            line.newPara = 1

   def markLineBreaks(self, lines, line, i, lastIndex):
      """Set line.lineBreak on a short line that is followed by a line
      of the same paragraph."""
      if i < lastIndex and line.length() < self.minLineWidth and \
         not hasattr(lines[i+1], 'newPara'):
         line.lineBreak = 1

   def deHyphenate(self, lines, line, i, lastIndex):
      """If the line ends in a hyphenated word and the next line
      continues the paragraph, move the first word of the next line
      up, replacing the hyphen."""
      if i < lastIndex and not hasattr(lines[i+1], 'newPara') and \
         Document.hyphenHeadPattern.search(line.string) >= 0 and \
         Document.hyphenTailPattern.search(lines[i+1].string) >= 0:
         # group(1) refers to the search on the next line just above.
         line.string = line.string[:-1]+Document.hyphenTailPattern.group(1)
         lines[i+1].string = sub(Document.hyphenTailPattern, '', lines[i+1].string)

class Formatter:
   """Base class of the output formatters; every hook does nothing.

   The hooks are the complete set of calls made by RenderDocument and
   RenderPage, so a subclass only overrides the ones it needs.
   """

   def start(self): pass               # before the first page
   def end(self): pass                 # after the last page
   def endPage(self): pass             # after the last line of a page
   def line(self, l): pass             # the text of the Line object l
   def startParagraph(self): pass      # before the first line of a paragraph
   def lineBreak(self): pass           # after a line without explicit break
   def explicitLineBreak(self): pass   # after a line marked with lineBreak

def HTMLQuote(s):
   """Return the string s with the characters <, >, & and " replaced
   by their HTML entities."""
   entities = {'<': "&lt;", '>': "&gt;", '&': "&amp;", '"': "&quot;"}
   return ''.join([entities.get(c, c) for c in s])

class HTMLFormatter(Formatter):
   """Formatter that writes HTML markup to the file-like object `out'."""

   def __init__(self, out):
      self.out = out

   def start(self): pass

   def end(self): pass

   def endPage(self):
      # Pages are separated by a horizontal rule.
      self.out.write("\n<hr>\n")

   def line(self, l):
      """Write the quoted text of the Line l, in italics when it is
      a header or a footer."""
      if l.type == Line.Header or l.type == Line.Footer:
         self.out.write("<i>")
         self.out.write(HTMLQuote(l.string))
         self.out.write("</i>")
      else:
         # Without this else the text of headers and footers was
         # written a second time after the italic version.
         self.out.write(HTMLQuote(l.string))

   def startParagraph(self):
      self.out.write("\n<p>")

   def lineBreak(self):
      # An ordinary end of line is plain white space in HTML.
      self.out.write("\n")

   def explicitLineBreak(self):
      self.out.write("<br>\n")

class PlainTextFormatter(Formatter):
   """Formatter that writes plain text to the file-like object `out'.

   Every line is written as is and followed by a newline; a paragraph
   start adds one more newline in front of its first line.  Page ends
   produce no output.
   """

   def __init__(self, out):
      self.out = out

   def _newline(self):
      # All three kinds of break look the same in plain text.
      self.out.write("\n")

   def start(self): pass

   def end(self): pass

   def endPage(self): pass

   def line(self, l):
      self.out.write(l.string)

   def startParagraph(self):
      self._newline()

   def lineBreak(self):
      self._newline()

   def explicitLineBreak(self):
      self._newline()

def RenderPage(formatter, page):
   """Send the marked-up lines of one page to `formatter'.

   For every line: startParagraph() if the line has a newPara
   attribute, then line(), then explicitLineBreak() if it has a
   lineBreak attribute and lineBreak() otherwise.  endPage() is called
   once at the end, also for an empty page.
   """
   for entry in page:
      if hasattr(entry, 'newPara'):
         formatter.startParagraph()
      formatter.line(entry)
      if not hasattr(entry, 'lineBreak'):
         formatter.lineBreak()
      else:
         formatter.explicitLineBreak()
   formatter.endPage()

def RenderDocument(document, formatter, pages):
   """Mark up and render every page of `pages' with `formatter'.

   document  -- object whose markupPage(page) marks up a page in place
   formatter -- receives start(), the calls made by RenderPage for
                each page, and end()
   pages     -- list of pages, each a list of lines
   """
   formatter.start()
   for page in pages:
      # Mark-up must come first: rendering reads the attributes it sets.
      document.markupPage(page)
      RenderPage(formatter, page)
   formatter.end()

def CalculateTextStats(file):
   """Run the first pass over the data file named `file' and return
   the thresholds computed by FirstPassWorker.computeThresholds()."""
   stats = FirstPassWorker()
   print("Computing statistics...")
   ReadPostScriptDataFile(file, stats)
   thresholds = stats.computeThresholds()
   return thresholds

def FormatDocument(file, charWidth, lineWidth, lineSpacing):
   """Run the second pass over the data file named `file' and return
   the pages collected by the SecondPassWorker.

   Only charWidth is used; lineWidth and lineSpacing are accepted but
   ignored here.
   """
   builder = SecondPassWorker(charWidth)
   print("Formatting...")
   ReadPostScriptDataFile(file, builder)
   pages = builder.pages
   return pages

def MakeFilename(sourceName, newExt):
   """Return the base name of sourceName with its extension replaced
   by newExt.  The directory part of sourceName is dropped."""
   baseName = posixpath.basename(sourceName)
   stem = posixpath.splitext(baseName)[0]
   return stem + newExt

# start of program

def PostScriptToPSData(psFilename, datFilename):
   """Run Ghostscript (gs) with prescript.ps on psFilename, with
   datFilename passed as its `outfile' parameter.

   prescript.ps is looked up in the directory named by the environment
   variable PRESCRIPT_DIR or, when that is not set, in the "lib"
   directory next to the directory that contains this script.

   The exit status of gs is ignored.
   """
   if 'PRESCRIPT_DIR' in os.environ:
      libdir = os.environ['PRESCRIPT_DIR']
   else:
      # dirname() gives "" when the script was started without a
      # directory part; use "." so that the result does not become
      # "/../lib".
      scriptDir = posixpath.dirname(sys.argv[0]) or "."
      libdir = scriptDir + "/../lib"

   # The arguments are passed as a list, without going through a shell,
   # so file names with spaces or shell metacharacters are safe.
   os.spawnvp(os.P_WAIT, "gs",
              ["gs", "-q", "-dNODISPLAY",
               "-soutfile=%s" % datFilename,
               "%s/prescript.ps" % libdir,
               psFilename,
               "quit.ps"])

def PSDataToText(datFilename, formatter):
   """Convert the fragment data in datFilename to text by way of
   `formatter': measure the text, build the pages, then mark them up
   and render them.  The data file is read twice."""
   thresholds = CalculateTextStats(datFilename)
   charWidth, lineWidth, lineSpacing = thresholds
   pages = FormatDocument(datFilename, charWidth, lineWidth, lineSpacing)
   RenderDocument(Document(charWidth, lineWidth, lineSpacing),
                  formatter, pages)

def main():
   """Command line entry point: prescript format input [output].

   format is "html" or "plain".  An input whose name ends in ".ps" is
   first converted to a ".dat" file in the current directory, which is
   removed again after a successful conversion; any other input is
   taken to be such a data file already and is left untouched.
   Without an output argument the output name is derived from the
   data file name.

   Exits with status 1 on a usage error or an unknown format, before
   any file is created.
   """
   if len(sys.argv) < 3:
      print("Usage: prescript format input [output]")
      print("  format is either html or plain")
      sys.exit(1)

   outputFormat = sys.argv[1]
   inputFilename = sys.argv[2]

   # Validate the format before doing any work or touching any file.
   if outputFormat == 'html':
      makeFormatter = HTMLFormatter
      defaultExt = ".html"
   elif outputFormat == 'plain':
      makeFormatter = PlainTextFormatter
      defaultExt = ".txt"
   else:
      print("Unknown format %s." % outputFormat)
      sys.exit(1)

   madeDatFile = inputFilename[-3:] == ".ps"
   if madeDatFile:
      # Make a .dat file from the .ps file
      datFilename = MakeFilename(inputFilename, ".dat")
      print("Running %s" % inputFilename)
      PostScriptToPSData(inputFilename, datFilename)
   else:
      datFilename = inputFilename

   if len(sys.argv) >= 4:
      outFilename = sys.argv[3]
   else:
      outFilename = MakeFilename(datFilename, defaultExt)

   outFile = open(outFilename, "w")
   try:
      PSDataToText(datFilename, makeFormatter(outFile))
   finally:
      outFile.close()

   # Only remove the intermediate file made above, never the user's input.
   if madeDatFile:
      os.unlink(datFilename)

# Run the converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
   main()