Hacking at the Voynich manuscript
Notebook - volume 11

Warning: these notebooks aren't strictly chronological logs.
  Sometimes I go back and redo things, clarify comments,
  delete garbage, etc.

97-10-09 stolfi
===============

  I have split Landini's file into one chunk per page
  
    csplit \
      --prefix 'chunk-' \
      --suffix '%03d.evt' \
      - '^# *$' '{*}'
  
  and then further edited it manually, splitting each 
  page into homogeneous "textual units" (all normal text,
  all labels, etc.)
  
  The files are L16/fNNN and L16/fNNN.L, where fNNN is the panel
  number (as in f85r1) and L is the location code within that panel.
  Files without location code contain general comments about the panel.
  
  See L16/README for a detailed description of the files and
  my edits.
  
97-10-12 stolfi
===============
  
  Did a preliminary version of the label location map, later redone.
  
97-10-14 stolfi
===============
  
97-10-14 stolfi
===============
  
  Ditto.
  
  An intermezzo: prompted by John Grove, let's compute the frequency of 
  single-letter words in the manuscript:
 
    cat .voyn.fsg \
      | /n/gnu/bin/tr ' =./' '\012\012\012\012' \
      | egrep '^.$' \
      | sort | uniq -c | expand \
      | compute-freqs \
      | sort +0 -1nr \
      > .single.frq

       13 0.245 R
        9 0.170 2
        7 0.132 E
        5 0.094 G
        5 0.094 O
        4 0.075 4
        3 0.057 8
        2 0.038 *
        2 0.038 D
        2 0.038 S
        1 0.019 6

  Now for something else again.

97-10-19 stolfi
===============
  
  I translated the unit files to the ECC encoding:
  
    mkdir L16-ecc
    
    foreach f ( L16/f[0-9]* )
      echo "$f -> L16-ecc/${f:t}"
      cat ${f} \
        | fsg2ecc \
        > L16-ecc/${f:t}
    end

  Let's collect the textual units that contain text or labels
  of each hand:

    cat L16/page-table.dir \
      | egrep '^[^:]*:[^:]*:[^:]*:[^:]*:[^:]*(labels|words):' \
      > .units-labels.dir
    cat L16/page-table.dir \
      | egrep '^[^:]*:[^:]*:[^:]*:[^:]*:[^:]*parags:' \
      > .units-parags.dir
    cat L16/page-table.dir \
      | egrep '^[^:]*:[^:]*:[^:]*:[^:]*:[^:]*(lines|titles):' \
      > .units-lines.dir

    foreach let ( A B X )
      set pat = "${let}"
      if ( "${let}" == "X" ) set pat = '[^:]*\?'
      cat L16/page-table.dir \
        | egrep ':'"${pat}"':[^:]*:[^:]*(labels|words):' \
        > .units-labels-${let}.dir
      cat L16/page-table.dir \
        | egrep ':'"${pat}"':[^:]*:[^:]*parags:' \
        > .units-parags-${let}.dir
      cat L16/page-table.dir \
        | egrep ':'"${pat}"':[^:]*:[^:]*(lines|titles):' \
        > .units-lines-${let}.dir
    end
    
  Let's gather all panel numbers that occur in the text.
  
    cat L16/page-table.dir \
      | sed -e 's/:.*//g' -e 's/\..*$//g' \
      | uniq \
      > .panels.dir
    
    cat L16/page-table.dir \
      | egrep '^[^:]*:[^:]*:[^:]*:[^:]*:[^:]*(labels|words):' \
      | sed -e 's/:.*//g' -e 's/\..*$//g' \
      > .panels-labels.dir
    cat L16/page-table.dir \
      | egrep '^[^:]*:[^:]*:[^:]*:[^:]*:[^:]*parags:' \
      | sed -e 's/:.*//g' -e 's/\..*$//g' \
      > .panels-parags.dir
    cat L16/page-table.dir \
      | egrep '^[^:]*:[^:]*:[^:]*:[^:]*:[^:]*(lines|titles):' \
      | sed -e 's/:.*//g' -e 's/\..*$//g' \
      > .panels-lines.dir

  From them we create a script to convert panel numbers to 
  sequential page numbers (000 to 263):
  
    foreach f ( '' '-parags' '-labels' '-lines' )
      echo '#\! /n/gnu/bin/sed -f' \
        > panel${f}-to-page
      cat .panels${f}.dir \
        | gawk 'BEGIN {pg=0} /./ {printf"s/<%s>/<%03d>/g\n", $1, pg; pg++}' \
        >> panel${f}-to-page
      chmod a+x panel${f}-to-page
    end
  
    --- panel-to-page ------------------------
    #! /n/gnu/bin/sed -f
    s/<f0>/<000>/g
    s/<f1r>/<001>/g
    s/<f1v>/<002>/g
    s/<f2r>/<003>/g
    s/<f2v>/<004>/g
    s/<f3r>/<005>/g
    s/<f3v>/<006>/g
    s/<f4r>/<007>/g
    ...
    s/<f113v>/<260>/g
    s/<f114r>/<261>/g
    s/<f114v>/<262>/g
    s/<f115r>/<263>/g
    s/<f115v>/<264>/g
    s/<f116r>/<265>/g
    s/<f116v>/<266>/g
    ------------------------------------------
     
97-10-20 stolfi
===============
  
  Let's now create a consensus version for each unit.
  
  It turns out that the transcription code ";J>" is already used
  and means Jim Reeds.  I will use ";S>" for my consensus.
  
    mkdir L16-ecc-x
    
    foreach f ( L16-ecc/f[0-9]* )
      set g = "L16-ecc-x/${f:t}"
      echo "$f -> $g"
      cat ${f} \
        | make-consensus-interlin \
        > ${g}~~
      cat ${g}~~ \
        | egrep ';S>|^#' \
        > ${g}
    end
  
  Let's concatenate all the paragraph locations into a single
  file:
  
    cat .units-parags.dir \
      | sed \
          -e 's/:.*$//g' \
          -e 's:^:L16-ecc-x/:g' \
      > .tmp

    cat `cat .tmp` \
      > .parags-j-ecc.evt
      
    cat .parags-j-ecc.evt \
      | egrep '^<' \
      | sed \
          -e 's/^<.*> *//g' \
          -e 's/  *//g' \
      | dicio-wc
    
     lines   words     bytes
    ------ ------- ---------
      3918    3918    168489

  Note that the count above includes newlines, so we 
  actually have 164571 Voynich characters in the parags file. 
  
  Now let's extract the good label text:
  
    cat .units-labels.dir \
      | sed \
          -e 's/:.*$//g' \
          -e 's:^:L16-ecc-x/:g' \
      > .tmp

    cat `cat .tmp` \
      > .labels-j-ecc.evt
    
    extract-words-from-interlin \
        -chars "8coqHPemrwkij" \
        .labels-j-ecc.evt \
        .labels-j-ecc

     lines   words     bytes file        
    ------ ------- --------- ------------
       986     986      4243 .labels-j-ecc.wds
       277     277      2234 .labels-j-ecc.dic
       282     282      2201 .labels-j-ecc-gut.wds
       225     225      1831 .labels-j-ecc-gut.dic
       652     652      1630 .labels-j-ecc-fun.wds
         2       2         5 .labels-j-ecc-fun.dic
        52      52       412 .labels-j-ecc-bad.wds
        50      50       398 .labels-j-ecc-bad.dic
      2526    2526     12954 total

    Digraph counts:

           TT           8     c     o     q     H     P     e     m     r     k     i     j
        ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- -----
          282     .    21    26   200     4     3     1     1     .    26     .     .     .
      8   108    11     .     6    88     .     .     .     .     .     2     1     .     .
      c   342     1    16   165   120     .    15     8     .     .     8     .     .     9
      o   787   119    48    24    24     .   170    20   156    36   171    19     .     .
      q     4     .     .     .     3     .     1     .     .     .     .     .     .     .
      H   194     .     .    60   131     .     .     .     2     .     1     .     .     .
      P    31     2     .    14    15     .     .     .     .     .     .     .     .     .
      e   159    37    18    21    63     .     4     1     .     1    13     .     1     .
      m    38    23     3     1    10     .     .     .     .     .     1     .     .     .
      r   226    71     2    22   125     .     1     .     .     1     4     .     .     .
      k    20    16     .     3     1     .     .     .     .     .     .     .     .     .
      i     1     .     .     .     .     .     .     1     .     .     .     .     .     .
      j     9     2     .     .     7     .     .     .     .     .     .     .     .     .
        ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- -----
    TOT  2201   282   108   342   787     4   194    31   159    38   226    20     1     9

    Next-symbol probability (× 99):

        TT     8  c  o  q  H  P  e  m  r  k  i  j
        -- -- -- -- -- -- -- -- -- -- -- -- -- --
        99  .  7  9 70  1  1  .  .  .  9  .  .  .
      8 99 10  .  6 81  .  .  .  .  .  2  1  .  .
      c 99  .  5 48 35  .  4  2  .  .  2  .  .  3
      o 99 15  6  3  3  . 21  3 20  5 22  2  .  .
      q 99  .  .  . 74  . 25  .  .  .  .  .  .  .
      H 99  .  . 31 67  .  .  .  1  .  1  .  .  .
      P 99  6  . 45 48  .  .  .  .  .  .  .  .  .
      e 99 23 11 13 39  .  2  1  .  1  8  .  1  .
      m 99 60  8  3 26  .  .  .  .  .  3  .  .  .
      r 99 31  1 10 55  .  .  .  .  .  2  .  .  .
      k 99 79  . 15  5  .  .  .  .  .  .  .  .  .
      i 99  .  .  .  .  .  . 99  .  .  .  .  .  .
      j 99 22  .  . 77  .  .  .  .  .  .  .  .  .
        -- -- -- -- -- -- -- -- -- -- -- -- -- --
    TOT 99 13  5 15 35  0  9  1  7  2 10  1  0  0

    Previous-symbol probability (× 99):

        TT     8  c  o  q  H  P  e  m  r  k  i  j
        -- -- -- -- -- -- -- -- -- -- -- -- -- --
        13  . 19  8 25 99  2  3  1  . 11  .  .  .
      8  5  4  .  2 11  .  .  .  .  .  1  5  .  .
      c 15  . 15 48 15  .  8 26  .  .  4  .  . 99
      o 35 42 44  7  3  . 87 64 97 94 75 94  .  .
      q  0  .  .  .  .  .  1  .  .  .  .  .  .  .
      H  9  .  . 17 16  .  .  .  1  .  .  .  .  .
      P  1  1  .  4  2  .  .  .  .  .  .  .  .  .
      e  7 13 17  6  8  .  2  3  .  3  6  . 99  .
      m  2  8  3  .  1  .  .  .  .  .  .  .  .  .
      r 10 25  2  6 16  .  1  .  .  3  2  .  .  .
      k  1  6  .  1  .  .  .  .  .  .  .  .  .  .
      i  0  .  .  .  .  .  .  3  .  .  .  .  .  .
      j  0  1  .  .  1  .  .  .  .  .  .  .  .  .
        -- -- -- -- -- -- -- -- -- -- -- -- -- --
    TOT 99 99 99 99 99 99 99 99 99 99 99 99 99 99

    Symbol entropy: 2.764

    Next-symbol entropy: 2.020

  Now, let's make a list of all labels therein.  Multiword labels
  (where words are separated by "-") will be entered as a single word,
  as well as separate words.
  
    /bin/rm -f .labels.def
    
  First, the labels without word breaks:
  
    cat .labels-j-ecc.evt  \
      | remove-comments-from-evt \
      | sed \
          -e 's/  *//g' \
          -e 's/;[A-Z]>/>/g' \
          -e 's/[-=]//g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>/> /g' \
      > .labels1.def
      
  Second, the labels split at word boundaries:

    cat .labels-j-ecc.evt \
      | remove-comments-from-evt \
      | /n/gnu/bin/sed \
          -e 's/  *//g' \
          -e 's/;[A-Z]>/>/g' \
          -e 's/[=-]$//g' \
          -e 's/^/@/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \
          -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \
          -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \
          -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \
          -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \
          -e 's/>/> /g' \
      | tr '@' '\012' \
      | egrep '.' \
      > .labels2.def
      
  Now merge the two files, and insert sequential page numbers:
      
    cat .labels1.def .labels2.def \
      | sort | uniq \
      | sed \
          -e 's/<\(.*\)> \(.*\)/<\1> \2 {\1}/g' \
          -e 's/\.[^>]*> */> /g' \
      | panel-to-page \
      | tr '{}' '<>' \
      > .labels.def
      
  Keep only the first definition of each label as its "official position":

    cat .labels.def \
      | sort +1 -2 +0 -1n \
      | gawk 'BEGIN{b=""} /./ {if(b!=$2) {print; b=$2; next}}' \
      | sort \
      > .labels-first.def
      
  Collect the labels proper:
  
    cat .labels.def \
      | gawk '/./ {print $2}' \
      | sort | uniq \
      | egrep -v '\?' \
      > .labels.dic
      
    dicio-wc .labels.dic
    
     lines   words     bytes file        
    ------ ------- --------- ------------
       231     231      1906 .labels.dic

    cat .labels.dic\
      | gawk 'BEGIN{m=0} /./{s=length($0);m=(s>m?s:m);next} END{print ("max len " m)}'
    
    max len 18

  Now let's find all occurrences of the labels in the parags text:

    cat .parags-j-ecc.evt \
      | enum-word-locations .labels.dic \
      | sort -b +2 -3n \
      > .label-occurrences.idx
  
  Let's tabulate the reference frequencies per label:
  
    cat .label-occurrences.idx \
      | gawk '/./ { print $4 }' \
      | sort | uniq -c | expand \
      | compute-freqs \
      | sort +0 -1nr \
      > .label-refs-by-label.frq
      
    --- .label-refs-by-label.frq ------------------------
       4774 0.173 oHo
       2151 0.078 ccoe
       1540 0.056 oHom
       1162 0.042 oHoe
        976 0.035 oHor
        944 0.034 rom
        896 0.033 oHcc8o
        748 0.027 oroe
        737 0.027 cccco
        716 0.026 cccoe
        646 0.023 oror
        612 0.022 8oro
        ... ..... ........
          4 0.000 oHoeoror
          4 0.000 oHoroe8o
          3 0.000 ccPoeo
          3 0.000 ccoccro
          3 0.000 oPcco8oo
          3 0.000 oPccorom
          3 0.000 roPoe
          3 0.000 roeoer
          2 0.000 oHcc8occcHoe
          2 0.000 oHco8oer
          2 0.000 oHcooe8o
          2 0.000 occc8oe8om
          2 0.000 qHoe
          2 0.000 qoHcro
          2 0.000 roromr
          1 0.000 8o8orm
          1 0.000 Hoccorom
          1 0.000 oHcco8oer
          1 0.000 oHco8occcHco
          1 0.000 oHcoeroe
          1 0.000 oHcooeo
          1 0.000 oHcororor
          1 0.000 oHorccok
          1 0.000 oHorco
          1 0.000 oHroe
          1 0.000 oPoeror
          1 0.000 oecccccco
          1 0.000 ororcco8om
          1 0.000 ro8ororo
          1 0.000 roeccror
    -----------------------------------------------------

  Obviously "oHo", "ccoe", "oHom", "oHoe", etc. are not really labels; they are
  either common words (function words? "Star"? "Plant"? "Day"?), or common letter groups
  that got split off by accident.  
  
  Let's list the unreferenced labels:
  
    cat .label-occurrences.idx \
      | gawk '/./ { print $4 }' \
      | sort | uniq \
      | bool 2-1 - .labels.dic \
      > .labels-unref.dic
      
    8ccccorocccPoeom 8cccoe8o 8oHocHc 8oHoecj 8orokcjoe cc8or8omo
    ccoHcco8orr ccorcHcoroe ccoroeiP eHcccPoe eHcccPoeooPcco
    o8orcc8oom oHcc8ccor oHccoroeokcco oHcoHcororo oHcooeoeroroeo
    oHcoorororoeo oHcororoo8o oHe8ok oHeorcjo oHo8rco8occPooeo
    oHoHokom oHoecHo oHoecPoromok oHoeccHco oHoeccoHorokcjo oHoecjor
    oHoeoPccoroe oHoeoorok oHooccooHoeoHcoeor oHorccorrr oHorcjo
    oHorcororo oHorcr oHoroecHoeHoo oHoroecjo oPcc8ocjo oPcccoroe
    oPccoe8k oPccorok oPocPcor oPoeo8om oPoeo8omrr oPoeoro oPoeoror
    oPorHo8oe oPoroeor occcHoroHoem8 occcrororcco occoeccorok
    ocoeoecroror omoHoeoccor oo8cco ooeccccj oomororo qoHoomocHcco
    roeomcccoe roeooHoro8 room8oeo8 rrcHcrcccHo

  Note that these "labels" are distinctly longer than those that do
  occur, and are almost certainly multiword phrases.

  Now let's prepare a map showing for each label its occurrences in
  the running text. First, a block-based map:

    setenv BLOCKSZ 1646
    cat .label-occurrences.idx \
      | sort -b +3 -4 \
      | gawk '/./ { print int($3 / '"$BLOCKSZ"') + 1, $4 }' \
      | make-word-location-map \
         -v MAXLEN=18 \
         -v CTWD=1 \
         -v NBLOCKS=100 \
      > .label-by-block.map
      
  Add a column with the panel where each label was first defined:
  
    cat .labels-first.def \
      | sed -e 's/\.[^ >]*>/>/g' \
      | tr -d '<>' \
      | sort +1 -2 \
      > .foo
    
    join \
        -a 1 -e '000' \
        -j1 4 -j2 2 \
        -o0,2.3,2.1,1.1,1.2,1.3,1.5 \
        .label-by-block.map .foo \
      | gawk \
        '/./ {printf "%-18s %-6s %-3s %5d %5.1f %5.1f %s\n", $1,$2,$3,$4,$5,$6,$7}' \
      > .label-by-block-def.map
  
  Some comments:
  
    * The occurrences of a rarer label in the text
      usually form one or more tight clusters.
      
    * The labels occurring in some pages are relatively common in the
      text; those in other pages do not occur at all in the text.
    
  I computed the number of text characters in each panel:
  
    cat .units-parags.dir \
      | sed -e 's/:.*$//g' \
      > .ups
    
    /bin/rm -f .tmp
    foreach f ( `cat .panels.dir` )
      set pp = ( `egrep $f'[.]' .ups` )
      echo "$f $pp" > /dev/stderr
      echo "<$f> `( cd L16-ecc-x && cat $f $pp ) | count-text-chars`" >> .tmp
    end
    cat .tmp \
      | sort \
      > .panels.nchars

  Let's make a table that gives the range of byte offsets for each panel.
  Each line has the sequential panel number, the physical panel number,
  the first offset, and the last offset plus one.
  
    cat .panels.nchars \
      | sed -e 's/<\(.*\)>/<\1> {\1}/g' \
      | panel-to-page \
      | tr '{}' '<>' \
      | sort +0 -1n \
      | gawk 'BEGIN {a=0} /./ {b = a+$3; print $1, $2, a, b; a=b}' \
      > .panels.chrange
      
  Let's make tables that map block index to panel number and vice-versa:
  
    echo 'block size = '$BLOCKSZ
    cat .panels.chrange \
      | tr -d '<>' \
      | gawk '/./ {printf "s/<%s>/<%03d>/g\n", $2, 1+int($3/'"$BLOCKSZ"')}' \
      > panel-to-block
    chmod a+x panel-to-block
    
    echo 'block size = '$BLOCKSZ
    cat .panels.chrange \
      | grep -v '<f0>' \
      | tr -d '<>' \
      | gawk '/./ {printf "%03d %s\n", 1+int($3/'"$BLOCKSZ"'), $2}' \
      | gawk  'BEGIN {n=0} /./ {while($1>n){n++;printf "s/<%03d>/<%s>/g\n", n,$2}}' \
      > block-to-panel
    chmod a+x block-to-panel
    
  Formatting block-to-panel as a header:
  
    cat block-to-panel \
     | tr '<>/' '  ' \
     | gawk '/./ {print $2, $3}' \
     | sed -e 's/ f\([0-9][0-9]*\)/ f\1 /g' \
     | format-block-map-header \
     > .block-map-header

  Computing frequencies of references per panel:

    cat .label-occurrences.idx \
      | gawk '/./ { print $1 }' \
      | sed -e 's/\..*>/>/g' \
      | sort | uniq -c | expand \
      | compute-freqs \
      | sort +0 -1nr \
      > .label-refs-by-panel.frq

    --- .label-refs-by-panel.frq ------------------------
        636 0.023 <f113v>
        563 0.020 <f86v6>
        546 0.020 <f58r>
        511 0.019 <f107r>
        498 0.018 <f111v>
        494 0.018 <f104v>
        492 0.018 <f113r>
        491 0.018 <f107v>
        482 0.017 <f106r>
        475 0.017 <f106v>
        465 0.017 <f116r>
        ... ..... ........
         30 0.001 <f41r>
         30 0.001 <f7r>
         29 0.001 <f25r>
         28 0.001 <f2v>
         28 0.001 <f38v>
         27 0.001 <f11v>
         27 0.001 <f67r2>
         22 0.001 <f5r>
         20 0.001 <f68r1>
         19 0.001 <f25v>
         19 0.001 <f68v2>
         15 0.001 <f67r1>
         12 0.000 <f27v>
         11 0.000 <f65v>
    -----------------------------------------------------
    
  There are a few pages that are particularly rich in label
  references.
    
  The panels from f103 on are "starred paragraphs", and so is f58r.
  Panel f86v6 is on the back of the big fold-out.

97-10-21 stolfi
===============

  Of particular interest are the labels with intermediate frequency
  (between 1 and 99 occurrences). I separated those into a file 
  .labels-rare.dic.  
  
    8cc8o 8ccoe 8o8orm 8oHccoe 8oHoeo 8oHor 8oeoro 8oero 8orcco8o
    Hoccorom Hoecc8 Horomo Poro cPccor ccPoeo cccHoe cccocHco cccor8o
    cccoro8 cccoroe ccoHoro ccoccro ccoer ccoerom o8r oHcc8occcHoe
    oHcc8oe oHcccc8o oHcccco oHccccor oHccco8o oHccco8or oHcccor
    oHcco8oer oHcco8or oHccocc8o oHccocco oHccooro oHccoror oHccr
    oHco8occcHco oHco8oer oHcoeo oHcoeor oHcoeroe oHcooe8o oHcooeo
    oHcoor oHcoroe oHcororor oHo8oe oHo8or oHo8oro oHoHo oHoHoe
    oHocHco oHocco oHoccor oHoe8oe oHoe8or oHoecc8 oHoecc8o oHoeccoe8o
    oHoeo8 oHoeo8o oHoeoe oHoeoeo oHoeok oHoeom oHoeor oHoeoroe
    oHoeoror oHoer oHoero oHom8om oHomoHom oHorcccHo oHorcco8
    oHorcco8o oHorccok oHorccor oHorco oHoroe8o oHorok oHorom oHoror
    oHororo oHroe oPccco8o oPcccor oPcco8oo oPccor oPccorom oPoeror
    oPorom occc8oe8om occo8o8o occoro oecPco oecccccco oeorom oeoror
    ooPcco oorcccor oorom oroe8 ororcco8om ororoeo qHoe qoHcro rccoor
    ro8or ro8ororo roPoe roe8ok roeP roeccror roeoe roeoer roer roero
    rorccor roroeo roromr rororo rr

  I thought of removing those labels that were superstrings of others
  in the same set, e.g. removing "oHoecc8o" since there is already 
  "oHoecc8". Here is a (rather convoluted) recipe to do that:
  
    cp -p .labels-rare.dic .labs
    @ i = 0
    while ( $i < 4 )
      @ i = $i + 1
      cat .labs \
        | enum-proper-substrings \
        | sort | uniq \
        > .subs.$i
      bool 1.2 .labs .subs.$i \
        > .subs-oc.$i
      if ( -z .subs-oc.$i ) break
      cat .labs \
        | fgrep -v -f .subs-oc.$i \
        > .prop.$i
      cat .subs-oc.$i >> .prop.$i
      cat .prop.$i \
        | sort | uniq \
        > .labs
    end
    
  Here is what it would remove:
      
    bool 1-2 .labels-rare.dic .labs
    
    Hoccorom ccoerom oHcc8occcHoe oHccccor oHccco8or oHcoeor oHo8oro
    oHoHoe oHoccor oHoecc8 oHoecc8o oHoeo8o oHoeoeo oHoeoroe oHoeoror
    oHoero oHorcco8o oHoroe8o oHororo oPccorom oPorom ororoeo ro8ororo
    roeoer roero

  But it seems that those labels are interesting on their own, and
  in some cases much larger than the relevant substrings. So I decided
  to leave them in. 
  
  We should also look at the "rarest" labels (with fewer than 25 occurrences):
  
    8o8orm 8oHccoe 8oHoeo 8oero 8orcco8o Hoccorom cPccor ccPoeo
    cccor8o cccoro8 ccoccro ccoerom o8r oHcc8occcHoe oHcccc8o oHccccor
    oHccco8or oHcco8oer oHccocc8o oHccooro oHccoror oHco8occcHco
    oHco8oer oHcoeor oHcoeroe oHcooe8o oHcooeo oHcoor oHcoroe
    oHcororor oHo8oe oHo8or oHo8oro oHoHoe oHocHco oHoccor oHoeccoe8o
    oHoeo8 oHoeo8o oHoeoeo oHoeok oHoeom oHoeoroe oHoeoror oHoero
    oHom8om oHorcccHo oHorcco8 oHorcco8o oHorccok oHorccor oHorco
    oHoroe8o oHorok oHroe oPccco8o oPcccor oPcco8oo oPccorom oPoeror
    oPorom occc8oe8om occo8o8o oecPco oecccccco oorcccor ororcco8om
    qHoe qoHcro rccoor ro8or ro8ororo roPoe roe8ok roeP roeccror
    roeoer rorccor roromr

  Now let's find all occurrences of these rare labels in the parags text:

    cat .parags-j-ecc.evt \
      | enum-word-locations .labels-rare.dic \
      | sort -b +2 -3n \
      > .label-rare-occurrences.idx
  
    cat .parags-j-ecc.evt \
      | enum-word-locations .labels-rarest.dic \
      | sort -b +2 -3n \
      > .label-rarest-occurrences.idx

  Let's tabulate their frequencies per page:

    foreach f ( '' -rare -rarest )
      cat .label${f}-occurrences.idx \
        | gawk '/./ { print $1 }' \
        | sed -e 's/\..*>/>/g' \
        | sort | uniq -c | expand \
        | compute-freqs \
        | sort +0 -1nr \
        > .label${f}-refs-by-panel.frq
    end
    
    --- .label-rare-refs-by-panel.frq ------------------------
         93 0.030 <f58r>
         78 0.025 <f113v>
         63 0.020 <f104v>
         60 0.019 <f115r>
         60 0.019 <f86v6>
         57 0.018 <f86v5>
         55 0.017 <f106v>
         53 0.017 <f113r>
         51 0.016 <f116r>
         50 0.016 <f108r>
         49 0.016 <f104r>
         49 0.016 <f105v>
         48 0.015 <f107r>
         44 0.014 <f108v>
         42 0.013 <f106r>
         41 0.013 <f107v>
         40 0.013 <f112r>
         39 0.012 <f103v>
         39 0.012 <f85r1>
         37 0.012 <f115v>
         36 0.011 <f111v>
         36 0.011 <f80v>
         35 0.011 <f114r>
         35 0.011 <f76r>
         35 0.011 <f99v>
         34 0.011 <f103r>
         34 0.011 <f111r>
         34 0.011 <f80r>
         33 0.010 <f79r>
         30 0.010 <f105r>
         28 0.009 <f78v>
         27 0.009 <f66r>
         26 0.008 <f76v>
         25 0.008 <f112v>
         25 0.008 <f79v>
         24 0.008 <f114v>
         24 0.008 <f39v>
         24 0.008 <f40r>
         24 0.008 <f84r>
         23 0.007 <f101r1>
         23 0.007 <f78r>
         22 0.007 <f39r>
         22 0.007 <f75v>
         21 0.007 <f102v2>
         21 0.007 <f77r>
         20 0.006 <f50r>
         20 0.006 <f70r2>
         20 0.006 <f75r>
         19 0.006 <f93r>
         18 0.006 <f82r>
         18 0.006 <f94v>
         17 0.005 <f46v>
         17 0.005 <f48v>
         17 0.005 <f55r>
         16 0.005 <f66v>
         16 0.005 <f94r>
         15 0.005 <f100v>
         15 0.005 <f43r>
         15 0.005 <f47v>
         15 0.005 <f77v>
         15 0.005 <f82v>
         15 0.005 <f89r2>
         15 0.005 <f9v>
         14 0.004 <f102r2>
         14 0.004 <f14v>
         14 0.004 <f1v>
         14 0.004 <f23v>
         14 0.004 <f33r>
         14 0.004 <f55v>
         14 0.004 <f81v>
         14 0.004 <f83r>
         13 0.004 <f34v>
         13 0.004 <f81r>
         13 0.004 <f87v>
         13 0.004 <f88v>
         13 0.004 <f8r>
         12 0.004 <f100r>
         12 0.004 <f101v1>
         12 0.004 <f16v>
         12 0.004 <f1r>
         12 0.004 <f22r>
         12 0.004 <f23r>
         12 0.004 <f2r>
         12 0.004 <f3r>
         12 0.004 <f45r>
         12 0.004 <f54r>
         12 0.004 <f56r>
         12 0.004 <f83v>
         12 0.004 <f84v>
         12 0.004 <f86v3>
         12 0.004 <f8v>
         12 0.004 <f90v1>
         12 0.004 <f95v1>
         12 0.004 <f95v2>
         12 0.004 <f99r>
         11 0.003 <f31v>
         11 0.003 <f36r>
         11 0.003 <f40v>
         11 0.003 <f44r>
         11 0.003 <f49r>
         11 0.003 <f87r>
         11 0.003 <f88r>
         11 0.003 <f9r>
         10 0.003 <f19v>
         10 0.003 <f46r>
         10 0.003 <f48r>
         10 0.003 <f52r>
         10 0.003 <f54v>
         10 0.003 <f86v4>
         10 0.003 <f93v>
         10 0.003 <f96r>
          9 0.003 <f13r>
          9 0.003 <f15v>
          9 0.003 <f16r>
          9 0.003 <f17v>
          9 0.003 <f24r>
          9 0.003 <f27r>
          9 0.003 <f29r>
          9 0.003 <f34r>
          9 0.003 <f51v>
          9 0.003 <f95r2>
          8 0.003 <f101v2>
          8 0.003 <f102v1>
          8 0.003 <f21v>
          8 0.003 <f22v>
          8 0.003 <f24v>
          8 0.003 <f35r>
          8 0.003 <f35v>
          8 0.003 <f37r>
          8 0.003 <f42v>
          8 0.003 <f43v>
          8 0.003 <f56v>
          8 0.003 <f57r>
          7 0.002 <f19r>
          7 0.002 <f30v>
          7 0.002 <f41v>
          7 0.002 <f42r>
          7 0.002 <f45v>
          7 0.002 <f6v>
          7 0.002 <f7v>
          7 0.002 <f89v1>
          7 0.002 <f89v2>
          6 0.002 <f10r>
          6 0.002 <f11v>
          6 0.002 <f13v>
          6 0.002 <f15r>
          6 0.002 <f17r>
          6 0.002 <f20r>
          6 0.002 <f20v>
          6 0.002 <f26v>
          6 0.002 <f32v>
          6 0.002 <f33v>
          6 0.002 <f3v>
          6 0.002 <f52v>
          6 0.002 <f67r2>
          6 0.002 <f6r>
          6 0.002 <f90r1>
          6 0.002 <f90r2>
          6 0.002 <f90v2>
          6 0.002 <f95r1>
          6 0.002 <f96v>
          5 0.002 <f102r1>
          5 0.002 <f11r>
          5 0.002 <f18v>
          5 0.002 <f21r>
          5 0.002 <f25r>
          5 0.002 <f25v>
          5 0.002 <f26r>
          5 0.002 <f2v>
          5 0.002 <f30r>
          5 0.002 <f31r>
          5 0.002 <f32r>
          5 0.002 <f36v>
          5 0.002 <f53r>
          5 0.002 <f53v>
          5 0.002 <f5r>
          5 0.002 <f5v>
          5 0.002 <f69r>
          5 0.002 <f7r>
          4 0.001 <f14r>
          4 0.001 <f27v>
          4 0.001 <f28r>
          4 0.001 <f28v>
          4 0.001 <f29v>
          4 0.001 <f37v>
          4 0.001 <f38r>
          4 0.001 <f44v>
          4 0.001 <f47r>
          4 0.001 <f4r>
          4 0.001 <f50v>
          4 0.001 <f51r>
          4 0.001 <f68v2>
          4 0.001 <f89r1>
          3 0.001 <f10v>
          3 0.001 <f68r2>
          3 0.001 <f68v3>
          2 0.001 <f38v>
          1 0.000 <f18r>
          1 0.000 <f41r>
          1 0.000 <f4v>
          1 0.000 <f65v>
          1 0.000 <f67r1>
    ----------------------------------------------------------
    
    --- .label-rarest-refs-by-panel.frq ------------------------
         28 0.041 <f58r>
         18 0.026 <f113v>
         15 0.022 <f115r>
         13 0.019 <f113r>
         13 0.019 <f66r>
         12 0.017 <f86v5>
         12 0.017 <f99v>
         11 0.016 <f105v>
         10 0.014 <f106v>
         10 0.014 <f116r>
         10 0.014 <f78r>
          9 0.013 <f108r>
          9 0.013 <f85r1>
          9 0.013 <f86v6>
          8 0.012 <f102v2>
          8 0.012 <f103r>
          8 0.012 <f105r>
          8 0.012 <f106r>
          8 0.012 <f107r>
          8 0.012 <f46v>
          8 0.012 <f76r>
          7 0.010 <f104v>
          7 0.010 <f111r>
          7 0.010 <f112r>
          7 0.010 <f84r>
          7 0.010 <f9v>
          6 0.009 <f101r1>
          6 0.009 <f103v>
          6 0.009 <f107v>
          6 0.009 <f108v>
          6 0.009 <f114r>
          6 0.009 <f78v>
          6 0.009 <f80r>
          6 0.009 <f93r>
          5 0.007 <f102r2>
          5 0.007 <f104r>
          5 0.007 <f111v>
          5 0.007 <f114v>
          5 0.007 <f14v>
          5 0.007 <f1v>
          5 0.007 <f23r>
          5 0.007 <f24v>
          5 0.007 <f3r>
          5 0.007 <f45r>
          5 0.007 <f52r>
          5 0.007 <f87r>
          5 0.007 <f8v>
          5 0.007 <f95v2>
          4 0.006 <f100r>
          4 0.006 <f100v>
          4 0.006 <f112v>
          4 0.006 <f13v>
          4 0.006 <f22r>
          4 0.006 <f32v>
          4 0.006 <f33r>
          4 0.006 <f34v>
          4 0.006 <f39r>
          4 0.006 <f51v>
          4 0.006 <f54v>
          4 0.006 <f55v>
          4 0.006 <f6v>
          4 0.006 <f75v>
          4 0.006 <f77r>
          4 0.006 <f77v>
          4 0.006 <f80v>
          4 0.006 <f81v>
          4 0.006 <f86v3>
          4 0.006 <f87v>
          4 0.006 <f88v>
          4 0.006 <f89r2>
          4 0.006 <f90r1>
          4 0.006 <f95v1>
          4 0.006 <f99r>
          3 0.004 <f101v2>
          3 0.004 <f102v1>
          3 0.004 <f11v>
          3 0.004 <f16v>
          3 0.004 <f17v>
          3 0.004 <f19r>
          3 0.004 <f23v>
          3 0.004 <f2r>
          3 0.004 <f30v>
          3 0.004 <f34r>
          3 0.004 <f35r>
          3 0.004 <f37r>
          3 0.004 <f39v>
          3 0.004 <f42r>
          3 0.004 <f47v>
          3 0.004 <f50r>
          3 0.004 <f51r>
          3 0.004 <f57r>
          3 0.004 <f66v>
          3 0.004 <f69r>
          3 0.004 <f79r>
          3 0.004 <f79v>
          3 0.004 <f84v>
          3 0.004 <f86v4>
          3 0.004 <f89v1>
          3 0.004 <f89v2>
          3 0.004 <f90r2>
          2 0.003 <f101v1>
          2 0.003 <f10v>
          2 0.003 <f13r>
          2 0.003 <f14r>
          2 0.003 <f16r>
          2 0.003 <f17r>
          2 0.003 <f18v>
          2 0.003 <f1r>
          2 0.003 <f20r>
          2 0.003 <f22v>
          2 0.003 <f26v>
          2 0.003 <f28r>
          2 0.003 <f30r>
          2 0.003 <f31r>
          2 0.003 <f31v>
          2 0.003 <f35v>
          2 0.003 <f36r>
          2 0.003 <f3v>
          2 0.003 <f40r>
          2 0.003 <f42v>
          2 0.003 <f43r>
          2 0.003 <f45v>
          2 0.003 <f47r>
          2 0.003 <f49r>
          2 0.003 <f4r>
          2 0.003 <f52v>
          2 0.003 <f53r>
          2 0.003 <f54r>
          2 0.003 <f55r>
          2 0.003 <f56r>
          2 0.003 <f5v>
          2 0.003 <f70r2>
          2 0.003 <f75r>
          2 0.003 <f7v>
          2 0.003 <f81r>
          2 0.003 <f83v>
          2 0.003 <f89r1>
          2 0.003 <f8r>
          2 0.003 <f90v1>
          2 0.003 <f90v2>
          2 0.003 <f93v>
          2 0.003 <f94r>
          2 0.003 <f95r2>
          2 0.003 <f96r>
          2 0.003 <f96v>
          2 0.003 <f9r>
          1 0.001 <f115v>
          1 0.001 <f11r>
          1 0.001 <f15r>
          1 0.001 <f15v>
          1 0.001 <f19v>
          1 0.001 <f21r>
          1 0.001 <f21v>
          1 0.001 <f25r>
          1 0.001 <f26r>
          1 0.001 <f27r>
          1 0.001 <f27v>
          1 0.001 <f28v>
          1 0.001 <f29v>
          1 0.001 <f2v>
          1 0.001 <f33v>
          1 0.001 <f37v>
          1 0.001 <f43v>
          1 0.001 <f44r>
          1 0.001 <f46r>
          1 0.001 <f48r>
          1 0.001 <f48v>
          1 0.001 <f53v>
          1 0.001 <f67r2>
          1 0.001 <f68v2>
          1 0.001 <f76v>
          1 0.001 <f82r>
          1 0.001 <f82v>
          1 0.001 <f83r>
          1 0.001 <f88r>
          1 0.001 <f94v>
    ------------------------------------------------------------

  Let's compute the density of label references per page (number
  of references per 1000 text characters in the page, rounded):
    
    foreach f ( '' -rare -rarest )
      cat .label${f}-refs-by-panel.frq \
        | sort +2 -3 \
        > .foo
      join \
          -a 1 -e 00 \
          -j1 3 -j2 1 \
          -o0,1.1,1.2,2.2 \
          .foo .panels.nchars \
        > .bar${f}
      cat .bar${f} \
        | gawk '/./ {printf "%-8s %5d %5.3f %5d\n", $1, $2, $3, int(1000*$2/$4 + 0.5)}' \
        | sort -b +3 -4nr \
        > .label${f}-refs-by-panel.rfrq
    end

    --- .label-refs-by-panel.rfrq ------------------------
    <f40r>     146 0.005   348
    <f58r>     546 0.020   294
    <f99v>     178 0.006   293
    <f95v2>     69 0.003   273
    <f113v>    636 0.023   269
    <f94r>      99 0.004   268
    <f86v5>    436 0.016   267
    <f55r>     136 0.005   261
    <f95r2>     97 0.004   261
    <f39v>     152 0.006   256
    <f86v6>    563 0.020   252
    <f70r2>    106 0.004   249
    <f23v>      95 0.003   242
    <f23r>     106 0.004   240
    <f86v3>    131 0.005   234
    <f33r>      76 0.003   229
    <f55v>     100 0.004   224
    <f69r>      37 0.001   224
    <f94v>     105 0.004   224
    <f107v>    491 0.018   223
    <f86v4>     45 0.002   223
    <f107r>    511 0.019   219
    <f50r>     104 0.004   218
    <f100v>     87 0.003   215
    <f105v>    395 0.014   215
    <f36r>      58 0.002   215
    <f9v>       79 0.003   215
    <f106r>    482 0.017   213
    <f19v>      75 0.003   213
    <f106v>    475 0.017   211
    ...        ... .....   ...
    <f7r>       30 0.001    89
    <f32v>      31 0.001    87
    <f4v>       35 0.001    87
    <f5r>       22 0.001    85
    <f89r1>     53 0.002    84
    <f68v2>     19 0.001    82
    <f30r>      38 0.001    80
    <f26r>      33 0.001    76
    <f25v>      19 0.001    73
    <f41r>      30 0.001    54
    <f65v>      11 0.000    49
    <f27v>      12 0.000    39
    ------------------------------------------------------
  
    --- .label-rare-refs-by-panel.rfrq ------------------------
    <f99v>      35 0.011    58
    <f40r>      24 0.008    57
    <f58r>      93 0.030    50
    <f86v4>     10 0.003    50
    <f70r2>     20 0.006    47
    <f95v2>     12 0.004    47
    <f14v>      14 0.004    44
    <f94r>      16 0.005    43
    <f33r>      14 0.004    42
    <f50r>      20 0.006    42
    <f36r>      11 0.003    41
    <f9v>       15 0.005    41
    <f39v>      24 0.008    40
    <f47v>      15 0.005    38
    <f94v>      18 0.006    38
    <f100v>     15 0.005    37
    <f23v>      14 0.004    36
    <f67r2>      6 0.002    36
    <f1v>       14 0.004    35
    <f86v5>     57 0.018    35
    <f16v>      12 0.004    34
    <f113v>     78 0.025    33
    <f55r>      17 0.005    33
    <f46v>      17 0.005    32
    <f15v>       9 0.003    31
    <f45r>      12 0.004    31
    <f52r>      10 0.003    31
    <f55v>      14 0.004    31
    <f21v>       8 0.003    30
    <f39r>      22 0.007    30
    ...        ... .....   ...
    <f38v>       2 0.001     8
    <f50v>       4 0.001     8
    <f51r>       4 0.001     8
    <f83r>      14 0.004     8
    <f89v2>      7 0.002     8
    <f84v>      12 0.004     7
    <f67r1>      1 0.000     6
    <f89r1>      4 0.001     6
    <f65v>       1 0.000     4
    <f18r>       1 0.000     2
    <f41r>       1 0.000     2
    <f4v>        1 0.000     2
    -----------------------------------------------------------

    --- .label-rarest-refs-by-panel.rfrq ------------------------
    <f95v2>      5 0.007    20
    <f99v>      12 0.017    20
    <f9v>        7 0.010    19
    <f69r>       3 0.004    18
    <f14v>       5 0.007    16
    <f46v>       8 0.012    15
    <f52r>       5 0.007    15
    <f58r>      28 0.041    15
    <f86v4>      3 0.004    15
    <f90r2>      3 0.004    15
    <f11v>       3 0.004    14
    <f13v>       4 0.006    14
    <f1v>        5 0.007    13
    <f45r>       5 0.007    13
    <f24v>       5 0.007    12
    <f33r>       4 0.006    12
    <f102v2>     8 0.012    11
    <f23r>       5 0.007    11
    <f32v>       4 0.006    11
    <f90r1>      4 0.006    11
    <f100v>      4 0.006    10
    <f51v>       4 0.006    10
    <f87r>       5 0.007    10
    <f19r>       3 0.004     9
    <f30v>       3 0.004     9
    <f3r>        5 0.007     9
    <f54v>       4 0.006     9
    <f55v>       4 0.006     9
    <f5v>        2 0.003     9
    <f87v>       4 0.006     9
    ...        ... .....   ...
    <f80v>       4 0.006     2
    <f81r>       2 0.003     2
    <f84v>       3 0.004     2
    <f88r>       1 0.001     2
    <f94v>       1 0.001     2
    <f43v>       1 0.001     1
    <f46r>       1 0.001     1
    <f75r>       2 0.003     1
    <f82r>       1 0.001     1
    <f82v>       1 0.001     1
    <f83r>       1 0.001     1
    <f83v>       2 0.003     1
    <f115v>      1 0.001     0
    <f76v>       1 0.001     0
    -------------------------------------------------------------

97-10-22 stolfi
===============

  Let's make a joint file with all three counts:
  
    foreach f ( '' -rare -rarest )
      cat .label${f}-refs-by-panel.rfrq \
        | sort +0 -1 \
        > .foo${f}
    end
    
    join \
        -a 1 -a 2 -e 00 \
        -j1 1 -j2 1 \
        -o0,1.2,1.4,2.2,2.4 \
        .foo .foo-rare \
      > .bar
      
    join \
        -a 1 -a 2 -e 00 \
        -j1 1 -j2 1 \
        -o0,1.2,1.3,1.4,1.5,2.2,2.4 \
        .bar .foo-rarest \
      > .baz
  
    cat .baz \
      | sed -e 's/<\(.*\)>/<\1> {\1}/g' \
      | panel-to-page \
      | tr -d '{}<>' \
      | sort +0 -1n \
      | gawk '/./ {printf "%03d %-6s  %5d %5d  %5d %5d  %5d %5d\n", $1,$2,$3,$4,$5,$6,$7,$8}' \
      > .label-refs-by-panel.jfrq
  
    --- .label-refs-by-panel.jfrq ------------------------
    001 f1r       116   122     12    13      2     2
    002 f1v        78   197     14    35      5    13
    003 f2r        67   157     12    28      3     7
    004 f2v        28   103      5    18      1     4
    005 f3r        90   159     12    21      5     9
    006 f3v        62   160      6    15      2     5
    007 f4r        37   123      4    13      2     7
    008 f4v        35    87      1     2      0     0
    009 f5r        22    85      5    19      0     0
    010 f5v        38   176      5    23      2     9
    ... ......    ...   ...    ...   ...    ...   ...
    262 f114v     289   147     24    12      5     3
    263 f115r     445   188     60    25     15     6
    264 f115v     342   157     37    17      1     0
    265 f116r     465   193     51    21     10     4
    ------------------------------------------------------

  Let's make histograms of those ratios, sorted by page position.
  
    cat .label-refs-by-panel.jfrq \
      | make-label-ref-graphs \
         -v MAX1=348 -v MAX2=58 -v MAX3=20 \
      > .label-refs-by-panel.jhis

    --- .label-refs-by-panel.jhis ------------------------
    001 f1r       116   122 ooo......     12    13 oo.......      2     2 .........
    002 f1v        78   197 ooooo....     14    35 ooooo....      5    13 ooooo....
    003 f2r        67   157 oooo.....     12    28 oooo.....      3     7 ooo......
    004 f2v        28   103 oo.......      5    18 oo.......      1     4 o........
    005 f3r        90   159 oooo.....     12    21 ooo......      5     9 oooo.....
    ... ...       ...   ... ...           ...  ... ...          ...   ... ...      
    261 f114r     382   170 oooo.....     35    16 oo.......      6     3 o........
    262 f114v     289   147 ooo......     24    12 o........      5     3 o........
    263 f115r     445   188 oooo.....     60    25 ooo......     15     6 oo.......
    264 f115v     342   157 oooo.....     37    17 oo.......      1     0 .........
    265 f116r     465   193 oooo.....     51    21 ooo......     10     4 o........
    ------------------------------------------------------
    
  The most label-rich and label-poor pages are 
  
    ALL LABELS
      <f40r>     146 0.005   348
      <f58r>     546 0.020   294
      <f99v>     178 0.006   293

      <f41r>      30 0.001    54
      <f65v>      11 0.000    49
      <f27v>      12 0.000    39
      
    UNDER 100
      <f99v>      35 0.011    58
      <f40r>      24 0.008    57
      <f58r>      93 0.030    50
      <f86v4>     10 0.003    50
      <f70r2>     20 0.006    47
      <f95v2>     12 0.004    47
      <f14v>      14 0.004    44
      <f94r>      16 0.005    43
      <f33r>      14 0.004    42
      <f50r>      20 0.006    42
      <f36r>      11 0.003    41

      <f18r>       1 0.000     2
      <f41r>       1 0.000     2
      <f4v>        1 0.000     2
    
    UNDER 25
      <f95v2>      5 0.007    20
      <f99v>      12 0.017    20
      <f9v>        7 0.010    19
      <f69r>       3 0.004    18
      <f14v>       5 0.007    16
      <f46v>       8 0.012    15
      <f52r>       5 0.007    15
      <f58r>      28 0.041    15
      <f86v4>      3 0.004    15
      <f90r2>      3 0.004    15

      <f43v>       1 0.001     1
      <f46r>       1 0.001     1
      <f75r>       2 0.003     1
      <f82r>       1 0.001     1
      <f82v>       1 0.001     1
      <f83r>       1 0.001     1
      <f83v>       2 0.003     1
      <f115v>      1 0.001     0
      <f76v>       1 0.001     0
    
  The UNDER 100 class is distributed fairly uniformly among the 
  most labelliferous pages.  The UNDER 25 class has a steeper 
  distribution.  The "starred paragraph" pages (including f58r)
  are not exceptionally labelliferous in relative terms; their
  absolute counts are high only because they contain a lot of
  paragraphical text.
  
  Page f40r has many labels of the ALL and UNDER 100 classes, but
  only 2 of the UNDER 25 class. 
  
  On the other hand, f99v is label-rich in all three classes.  The
  same can be said of page f58r, except for a modest drop in the
  UNDER-25 class.
  
  Let's find WHICH labels were mentioned on pages f58r and f99v.
  
  I had previously written a gawk script "show-occurrences" to show
  the occurrences of a bunch of words in a text.  Let's run it,
  just as a check:
  
    foreach f ( f58r f99v )
      cat .units-parags.dir \
        | egrep "^${f}[:.]" \
        | sed \
          -e 's/:.*$//g' \
          -e 's:^:L16-ecc-x/:g' \
      > .tmp
      
      cat `cat .tmp` \
        | find-and-show-occurrences .labels-rarest.dic \
        > .label-rarest-occs2-$f
    end
  
  Let's now generate the same files from the occurrence lists.
  We will add to the latter the label's definition code.
  
    foreach f ( '' '-rare' '-rarest' )
      cat .label${f}-occurrences.idx \
        | sort -b +3 -4 \
        > .occ
        
      cat .labels-first.def \
        | sort -b +1 -2 \
        | tr '<>' '{}' \
        >.def
        
      join \
          -a1 -e '{???}' \
          -j1 4 -j2 2 \
          -o1.1,1.2,1.3,0,2.3 \
          .occ .def \
        | sort -b +2 -3n +3 -4 \
        > .label${f}-occ-def.idx
    end
  
    foreach f ( f58r f99v )
      cat .units-parags.dir \
        | egrep "^${f}[:.]" \
        | sed \
          -e 's/:.*$//g' \
          -e 's:^:L16-ecc-x/:g' \
      > .tmp
      foreach g ( '' '-rare' '-rarest' )

        cat .label${g}-occ-def.idx \
          | egrep "<${f}[.]" \
          > .occ

        cat `cat .tmp` \
          | show-occurrences .occ \
          > .label${g}-occs-$f
      end  
    end
    
  Now let's prepare a similar file in FSG format.  We must use the FSG
  text, and we must replace each ECC label in the occurrence file by
  its corresponding FSG label.
  
  For the second part, let's first extract the labels in FSG notation, 
  just as we did for ECC.  Note that we must remove the "."s from the
  text, but keep the "-"s and "="s:
  
    cat .units-labels.dir \
      | sed \
          -e 's/:.*$//g' \
          -e 's:^:L16/:g' \
      > .tmp

    cat `cat .tmp` \
      > .labels-m-fsg.evt
    
    cat .units-parags.dir \
      | sed \
          -e 's/:.*$//g' \
          -e 's:^:L16/:g' \
      > .tmp

    cat `cat .tmp` \
      > .parags-m-fsg.evt
    
    /bin/rm -f .labels-fsg.def
    
    cat .labels-m-fsg.evt  \
      | remove-comments-from-evt \
      | sed \
          -e 's/  *//g' \
          -e 's/;[A-Z]>/>/g' \
          -e 's/[-=]//g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>/> /g' \
      | egrep ' .' \
      > .labels1.def

    cat .labels-m-fsg.evt \
      | remove-comments-from-evt \
      | /n/gnu/bin/sed \
          -e 's/  *//g' \
          -e 's/;[A-Z]>/>/g' \
          -e 's/[=-]$//g' \
          -e 's/^/@/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/>\(.*\)[.]/>\1/g' \
          -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \
          -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \
          -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \
          -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \
          -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \
          -e 's/>/> /g' \
      | tr '@' '\012' \
      | egrep ' .' \
      > .labels2.def
  
    cat .labels1.def .labels2.def \
      | sort | uniq \
      | sed \
          -e 's/<\(.*\)> \(.*\)/<\1> \2 {\1}/g' \
          -e 's/\.[^>]*> */> /g' \
      | panel-to-page \
      | tr '{}' '<>' \
      > .labels-fsg.def
      
  Now make a file that lists each FSG label with the equivalent ECC label:
  
    cat .labels-fsg.def \
      | tr '<>' '{}' \
      | gawk 'BEGIN{n=0} /./ {n++; printf "<f999.L.%03d>       %s %s\n",n,$2,$3}' \
      > .foo-fsg
      
    cat .foo-fsg \
      | fsg2ecc \
      > .foo-ecc
      
    join \
        -j1 1 -j2 1 \
        -o1.2,2.2,2.3 \
        .foo-ecc .foo-fsg \
      | sort | uniq \
      > .label-ecc-fsg.map
      
  Now let's use that table to "translate" the label occurrence index
  from ECC to FSG.
  
    foreach g ( '' '-rare' '-rarest' )

      cat .label${g}-occ-def.idx \
        | translate-occurrences-ecc-to-fsg \
           .label-ecc-fsg.map \
        > .label-fsg${g}-occ-def.idx
    end
    
  Finally, let's list the occurrences on pages f58r and f99v
  (and also on f95v2, f69r, and f9v).
  We must "fake" entries with transcription code ";S"
  in order for show-occurrences to work.

    foreach f ( f58r f99v f95v2 f69r f9v )
      cat .units-parags.dir \
        | egrep "^${f}[:.]" \
        | sed \
          -e 's/:.*$//g' \
          -e 's:^:L16/:g' \
      > .tmp-$f
    end

    foreach f ( f58r f99v f95v2 f69r f9v )
      foreach g ( '' '-rare' '-rarest' )

        cat .label-fsg${g}-occ-def.idx \
          | egrep "<${f}[.]" \
          > .occ

        cat `cat .tmp-$f` \
          | fake-S-transcription-codes \
          | show-occurrences .occ \
          | egrep -v '^<[^>]*;S> *$' \
          > .label-fsg${g}-occs-$f
      end  
    end
  
      
  Curiously, label f89v1.t.4 is identical to f89v1.b.4.
  There is a comment on the latter:
    
    #looks like 3 plants and 4 names. none of the other "vases" seem to be named.
    
97-10-19 stolfi
===============

  In order to make the results easier to describe, I should
  rename the textual unit files with standard panel numbers 
  (like f77v) rather than modern page numbers (like 114).