# Last edited on 2012-02-13 21:53:51 by stolfilocal

2012-02-10 

  Fetched a scanned PDF of the original 1899 edition from USP Brasiliana
  (saved as "usp/002038_COMPLETO.pdf"; file dated 2010-07-01).  Note that
  some accents may have been erased by the scanner's cleanup filter.

2012-02-11 

  Also obtained an OCR transcription of that PDF file, which was chock
  full of errors.  Fixed it by hand, checking against the PDF images.
  The result is "usp/002038_COMPLETO.txt".

PAIRING WORDS WITH MODERN VERSION

  # Build the word list of this edition: from "main.wds", keep only the
  # lines that begin with "a" and write their second field, one per line,
  # to "old.wds".  (Presumably "a" tags alphabetic-word records in the
  # .wds format -- TODO confirm.)
  gawk '/^[a]/ { print $2 }' main.wds > old.wds
  
  # Build the word list to pair against: same filter as for "old.wds"
  # (lines beginning with "a", second field only), applied to
  # "../csm/main.wds".  (Presumably "../csm" holds the modern version
  # named in the section title -- TODO confirm.)
  gawk '/^[a]/ { print $2 }' ../csm/main.wds > new.wds

  # Align the two word lists with a side-by-side diff, turn tabs into
  # spaces with "expand", and post-process the result with
  # "cleanup-diff-pairs.gawk" (script not shown here; presumably it
  # reduces each diff line to an old/new word pair -- TODO confirm).
  diff --side-by-side old.wds new.wds \
    | expand \
    | cleanup-diff-pairs.gawk \
    > old-new.pairs

  # Count how often each distinct line of "old-new.pairs" occurs
  # (sort, then "uniq -c" prefixes each line with its count) and feed the
  # counted pairs to "select-translations.gawk" (script not shown here;
  # presumably it picks the translation to use for each old word --
  # TODO confirm).  The result is the table "old-new.tbl".
  sort old-new.pairs \
    | uniq -c \
    | select-translations.gawk \
    > old-new.tbl
    
  # Run "old.wds" through "map-field" with the table "old-new.tbl"
  # (field 1 in, field 1 out, "forgiving" mode on), then keep only the
  # first field of every output line.  ("map-field" is not shown here;
  # presumably it replaces each old word by its table entry and passes
  # unknown words through when forgiving=1 -- TODO confirm.)
  map-field \
      -v inField=1 -v outField=1 \
      -v forgiving=1 \
      -v table=old-new.tbl \
      old.wds \
    | gawk '{ print $1 }' \
    > map.wds
    
  # Compare the mapped word list "map.wds" against "new.wds" the same
  # way the original lists were compared: side-by-side diff, tabs
  # expanded to spaces, then "cleanup-diff-pairs.gawk" (script not shown
  # here).  The output "map-new.pairs" presumably shows the differences
  # that remain after mapping -- TODO confirm.
  diff --side-by-side map.wds new.wds \
    | expand \
    | cleanup-diff-pairs.gawk \
    > map-new.pairs