# Last edited on 2004-11-19 13:01:13 by stolfi
# Working directory for the interlinear transcription of the VMS

LAST RELEASED VERSION

  text16e6.evt - the interlinear, including page-level comments.
  unit16e6.txt - list of textual units and their attributes
  tcmt16e6.txt - only the page headlines "<f53v>" and page-level comments.
  
** OUT OF DATE - SEE Notes/069 **

CURRENT WORKING VERSION

  text16e7.evt - bare text of the interlinear, with minimum comments
  unit16e7.txt - list of textual units and their attributes

LISTS OF UNITS AND PAGES IN READING ORDER

  cat unit16e7.txt \
    | gawk -v FS=':' '/./{print $2;}' \
    > .units

  cat .units \
    | egrep -v '[.]' \
    > .pages
    
  dicio-wc .pages .units

      lines   words     bytes file        
    ------- ------- --------- ------------
        255     255      1338 .pages
        767     767      5305 .units
    
FREEZING THE UNITS FILES

  From about 1998-12-29 to 2004-10-24, the interlinear was kept and edited
  as several hundred separate files, one for each unit. On 2004-10-24
  these files were moved to the subdirectory UNITS and frozen. The concatenation
  of those files, plus some page header data extracted from the previous
  version ("text16e6.evt") became the new working version.
  
RECOVERING UP-TO-DATE PAGE HEADERS
  
  Since the release of 16e6, the page header files were reduced to
  the VTT header line.  For the reader's convenience, we must
  put back some of that deleted information: at least the 
  physical location, section, and (for believers) the Currier
  language and hand. 
  
  Extracting the page descriptions from the old interlinear:
    
    mkdir HD-16e6
    
    cat text16e6.evt \
      | gawk \
          ' BEGIN { opg = ""; omit = 1; } \
            /^[<][f][0-9rv]+[>]/ { \
              pg = $1; gsub(/[<>]/, "", pg); \
              omit = 0; df = ( "HD-16e6/" pg ); \
              print > df; print pg > "/dev/stderr"; next; \
            } \
            /Description:/ { omit = 1; next; } \
            /Last edited/ { next; } \
            /Colors:/ { next; } \
            /Plant:/ { next; } \
            /([\#][\#]|) *[<][f][0-9rv]+(|[.][A-Z][0-9]*)[>]/ { omit=1; } \
            (omit) { next; } \
            // { print > df; } \
          ' \
      > .foo

  Two of the files (f95r2 and f96r) had to be trimmed by hand since 
  they were missing the "Description:" headers.
  
  Applied the script "fix-subtitles.sed" to all header files, to 
  add "*" in front of the standardized field titles, and to delete
  the plant data (which escaped on the first try).
  
    ( cd HD-16e6 && sed-files -f ../fix-subtitles.sed f*[0-9a-z] ) 

  Sometime after the release of 16e6, the page descriptions were moved
  to a "Notes" directory, with the intention of preparing verbal
  descriptions of all pages. Extracting alternative headers from
  verbal page descriptions:
  
    mkdir HD-n040
    
    foreach pg ( `cat .pages` ) 
      echo ${pg}
      set otf = "HD-n040/${pg}"
      cat UNITS/${pg} >> ${otf}
      cat ../Notes/040/data/${pg} \
        | gawk \
            ' BEGIN { omit = 0; } \
              /Description:/ { omit = 1; next; } \
              /Last edited/ { next; } \
              /Colors:/ { next; } \
              /Plant:/ { next; } \
              (omit) { next; } \
              // { print; } \
            ' \
        >> ${otf}
    end
    
  Concatenating all headers just to check: 
  
    ( cd HD-16e6 && cat `cat ../.pages` ) > .head-16e6.txt
    ( cd HD-n040 && cat `cat ../.pages` ) > .head-n040.txt
    
    diff -Bb .head-n040.txt .head-16e6.txt \
      | prettify-diff-output \
      > .diffh
      
  Conclusion: the headers in HD-n040 are slightly but consistently
  better than those in HD-16e6.
    
REASSEMBLING THE WORKING INTERLINEAR

  Now we can put the working interlinear back together into a single
  file:
  
    set fnew = "text16e7.evt"
    rm -f ${fnew}
    
    foreach un ( `cat .units` )
      printf " %s" "${un}"
      if ( "/${un:r}" == "/${un}" ) then
        cat HD-n040/${un} | egrep -v '^[#] *Last edited ' >> ${fnew}
      else
        cat UNITS/${un} | egrep -v '^[#] *Last edited ' >> ${fnew}
      endif
    end

FIXING PAGE TITLES

  Most pages had "Title: ???". These lines in "text16e7.evt" were manually 
  replaced by "Title: \"Page fNNN\"" with Emacs.
  
  For Herbal pages, we could use the Petersen plant numbers, extracted
  from the Notes/040 files:
  
    set titf = ".plants.txt"
    rm -f ${titf}
    foreach pg ( `cat .pages` ) 
      echo ${pg}
      printf "%s " "${pg}" >> ${titf}
      cat ../Notes/040/data/${pg} \
        | egrep -e '(Plant[:]|Petersen plant)' \
        >> ${titf}
    end
    
  This file was massaged into "insert-plant-titles.sed", which was
  then applied to the interlinear file.

COMPARING WITH OLD VERSION

  In order to compare the current working version with the old one,
  we must remove the page descriptions from the latter, and 
  apply a slight reformatting to the subtitles "Quire:", etc.
  The descriptions start with "Description:" and end at the 
  beginning of the next unit.  We also must protect the VTT
  page locators.
  
    cat text16e6.evt \
      | gawk \
          ' BEGIN { keep=1; } \
            /Description:/ { keep=0; next; } \
            /([\#][\#]|) *[<][f][0-9rv]+(|[.][A-Z][0-9]*)[>]/ { keep=1; } \
            (keep) { print; } \
          ' \
      | sed -f fix-subtitles.sed \
      | egrep -v '^[#] *Last edited ' \
      | sed -e 's/^\([<]f[0-9a-z]*[>]\)/## \1/' \
      > .old.evt
      
  Now we can compare:
  
    diff -Bb .old.evt text16e7.evt \
      | prettify-diff-output \
      > .diff