# Last edited on 2004-10-14 00:47:35 by stolfi

2004-10-06 
  
  Received from John E Koontz the file "jod-all.1637DEFANGED-ZIP"
  containing "jd1890.txt" and "jd1891.txt", the Omaha-Ponca 
  corpus collected by James O. Dorsey.  
  
  Concatenated the two files (with separators, minus '\015's) into
  "jd-1890-1891.txt"
  
  Created "main.src", started converting the encoding and cleaning up
  the format.
  
WORKING VERSION

  Checking for lost tokens:
  
    # New side: tokens of main.src -> .toks.new.
    # Keeps the "<>" lines carrying one of the listed two-letter tags, joins
    # the words of every "<>tt" line with "_", strips the "<>xx {" ... "}"
    # wrapper, lower-cases, and writes one token per line; empty tokens and
    # tokens consisting only of "/" are dropped.
    # NOTE(review): "pr" appears twice in the tag alternation; harmless, but
    # possibly a typo for another tag -- confirm.
    # NOTE(review): the accented sets given to tr only pair up if tr sees
    # single-byte characters (e.g. ISO-8859-1) -- confirm the file encoding.
    grep -E -e '^[<][>](tt|st|dt|au|pr|op|pr)' main.src \
      | sed \
          -e '/<>tt/s/ /_/g' \
          -e 's/[<][>][a-z][a-z][_ ]*{[_ ]*//' \
          -e 's/[_ ]*} *$//' \
      | tr 'A-ZÁÉÍÓÚÂÊÎÔÛÄËÏÖÜ' 'a-záéíóúâêîôûäëïöü' \
      | tr ' ' '\012' \
      | grep -E -e '[^/]' \
      > .toks.new

    # Old side: tokens of the original corpus -> .toks.old.
    # The gawk program copies every "\op" or "\ti" line together with the
    # following lines that do not start with a backslash; any other
    # backslash-marker line switches copying off again.  The words of "\ti"
    # lines are joined with "_", the leading marker is stripped, and the
    # result is tokenized like the new side before fix-accents.sed is applied.
    gawk '
        /^[\\](op|ti)/ { f = 1; print; next }
        /^[^\\]/       { if (f) print; next }
        /^[\\]/        { f = 0 }
      ' jd-1890-1891.txt \
      | sed \
          -e '/[\\]ti/s/ /_/g' \
          -e 's/[\\][a-z][a-z][_ ]*//' \
          -e 's/[_ ]*$//' \
      | tr 'A-Z' 'a-z' \
      | tr ' ' '\012' \
      | grep -E -e '[^/]' \
      | sed -f fix-accents.sed \
      > .toks.old

    # Context diff (one line of context) of the two token lists.
    diff -C 1 .toks.old .toks.new > .toks.diff

  Checking line numbers:
  
    # New side: every line of main.src that contains "<>rf".
    grep -e '<>rf' main.src > .refs.new

    # Old side: the "\rf" lines of the original corpus, rewritten into the
    # "<>rf {...}" form so that the two lists can be compared directly.
    grep -E -e '^[\\]rf' jd-1890-1891.txt \
      | sed -e 's/[\\]rf */<>rf {/' -e 's/$/}/' \
      > .refs.old

    # Any output here is a reference line that differs between the two files.
    diff .refs.old .refs.new
  
  Checking glosses: 
  
    # Run check-glosses on main.src, passing main.syn and main.gpr through its
    # synFile/gprFile variables (presumably output files, since both are read
    # by the commands below -- confirm).
    check-glosses \
          -v synFile=main.syn \
          -v gprFile=main.gpr \
          main.src
    gzip -c main.syn > main.syn.gz

    # First dictionary: main.gpr formatted as is, sorted on its first field.
    # "-k1,1" replaces the obsolete "+0 -1" key syntax, which current sort
    # implementations no longer accept (they take "+0" as a file name).
    format-glosses < main.gpr \
      | sort -b -k1,1 \
      > op-en.dic

    # Second dictionary: same data with the first two fields swapped; lines
    # starting with "#" are passed through unchanged, empty lines are dropped.
    gawk '/^[#]/{print;next;} /./{print $2,$1;}' main.gpr \
      | format-glosses \
      | sort -b -k1,1 \
      > en-op.dic

    gzip -c op-en.dic > op-en.dic.gz
    gzip -c en-op.dic > en-op.dic.gz

  Checking locator order:
  
    # Verify that the "<>rf" locators of main.src are in increasing order.
    # The first sed turns the first "}" into ":}" and the first "." into ":"
    # so that sort can split the line on ":"; fields 2, 3 and 4 are then
    # compared numerically.
    # "-k2,2n -k3,3n -k4,4n" replaces the obsolete "+1 -2n +2 -3n +3 -4n" key
    # syntax, which current sort implementations no longer accept.
    # NOTE(review): "sort -c" only checks; it writes nothing to stdout and
    # reports the first out-of-order line on stderr, so the final sed (which
    # undoes the ":" rewriting) receives no input -- confirm whether the
    # diagnostic was meant to be piped through it.
    grep -E -e '^[<][>]rf' main.src \
      | sed -e 's/[}]/:}/' -e 's/[.]/:/' \
      | sort -c -t: -k2,2n -k3,3n -k4,4n \
      | sed -e 's/[:][}]/}/' -e 's/[:]\(.*\)[}]/.\1/'
 
  Counting characters in OP lines:
  
    # Character counts over the "<>op" lines of main.src -> main.chn.
    # Before counting, sed strips the "<>xx {" ... "}" wrapper, deletes every
    # «...» span and deletes every "/".
    # "-k2,2" replaces the obsolete "+1 -2" key syntax, which current sort
    # implementations no longer accept.
    # NOTE(review): the third sed expression replaces each "&"-prefixed run
    # with "&", which in a sed replacement means "the whole match", so it
    # changes nothing; if such runs were meant to count as one character the
    # replacement should probably be "\&" -- confirm before changing.
    grep -E -e '^[<][>]op' main.src \
      | sed \
          -e 's/^[<][>][a-z][a-z] *{ *//' \
          -e 's/ *} *$//' \
          -e 's/[&][^][ .,:;()\!?=-]*/&/g' \
          -e 's/[«][^«»]*[»]//g' \
          -e 's:[/]::g' \
      | ../../tools/raw-count-chars \
      | sort -b -k2,2 \
      > main.chn
  
  Extracting the grammar tags used in main.src:
  
    # List the distinct «...» tags found on the "<>" lines of main.src.
    # Each tag is bracketed with "@" and every "@" is then turned into a
    # newline, so each tag ends up at the start of its own line (a literal "@"
    # in the text would also split the line).  Only lines starting with "«"
    # are kept, then sorted and de-duplicated.
    grep -E -e '^[<][>]' main.src \
      | sed -e 's/[«]/@«/g' -e 's/[»]/»@/g' \
      | tr '@' '\012' \
      | grep -E -e '^[«]' \
      | sort \
      | uniq \
      > .grtags
  
EXPORT VERSION

  Preparing a reference version of JEK's file,
  minus trivial details (deleting final "/"s, joining multilines, 
  normalizing the order of "ý" and "ü" accents, marking up grammar tags):

    # Feed the original corpus through basic-jek-cleanup (on stdin) and then
    # through the markup-grammar-tags.sed script; the result is orig.jek.
    basic-jek-cleanup < jd-1890-1891.txt \
      | sed -f markup-grammar-tags.sed \
      > orig.jek
    
  Restoring file to JEK's format, sort of:
  
    # Convert main.src with restore-jek-format, then keep a compressed copy
    # next to the plain-text result.
    restore-jek-format main.src > main.jek
    gzip -c main.jek > main.jek.gz
      
  Comparing the two versions:

    # NOTE(review): compare-jek-js is called without arguments; presumably it
    # reads orig.jek and main.jek from the current directory -- confirm.
    compare-jek-js
  
  Comparing line numbers:
  
    # New side: the "\rf" lines of the restored file.
    grep -e '^[\\]rf' main.jek > .refs.new

    # Old side: the "\rf" lines of the cleaned-up original.
    grep -E -e '^[\\]rf' orig.jek > .refs.old

    # Differences between the two reference lists are saved in jek-js.rfd.
    diff .refs.old .refs.new > jek-js.rfd