# Last edited on 2008-06-13 20:35:42 by stolfi

PROJECT

  Unpacking and preprocessing the Saxonov et al. Exon-Intron-DNA (EID)
  database.

GENERAL SETUP

  set progdir = "${STOLFIHOME}/programs/c/DNA/dnabayes"

DATABASE DESCRIPTION

  This is a set of files extracted from GenBank, selected and packaged
  for exon/intron studies:

    An exhaustive database of protein-coding intron-containing genes
    Serge Saxonov, Iraj Daizadeh, Alexei Fedorov, and Walter Gilbert
    Nucleic Acids Res. 2000 Jan 1;28(1):185-190.

  The database consists of several datasets, one for each species of
  organism. Each dataset contains eight data files. The first four
  have one section for each /EID item/, which corresponds to a
  distinct messenger RNA:

    {DATASET}.dEID     Labeled raw DNA sequence for each item (>).
    {DATASET}.hEID     GenBank documentation of each item (>).
    {DATASET}.pEID     Protein synthesized by each item (>).
    {DATASET}.mrnaEID  DNA seq of spliced messenger RNA for each item (>).

  The files marked "(>)" have one header line, beginning with '>',
  before each item. In the ".dEID" file, the introns and exons are
  listed in the header but are also identified by the case of the
  letters (uppercase [ATCG] for exons, lowercase [atcg] for introns).
  The untranslated regions (UTRs) at the beginning and end of the
  concatenated exons can only be identified by the CDS_start and
  CDS_end parameters in the ">" line. Note that the CDS_len parameter
  includes the stop codon.

  Every EID item is associated to one gene, but sometimes there are
  several items for the same gene, due to variant splicing. Each item
  has a unique /item ID/, which comes after the ">", and a non-unique
  /gene ID/, which is the value of the '/gene="..."/' parameter in
  that line.

  The next two files have one section for each intron or exon in the
  above files:

    {DATASET}.exEID    One entry for each exon (>).
    {DATASET}.intrEID  One entry for each intron (>).

  The last two files are summary and logs:

    {DATASET}.sEID     Statistical summary.
    {DATASET}.tEID     Processing logs (?).

FETCHING

  Fetched files from "http://hsc.utoledo.edu/bioinfo/eid/", saved to
  sub-directories of "orig":

     70141229  orig/at2004/at2004.EID.tgz  Arabidopsis thaliana [Thalecress]
     60031227  orig/ce2003/ce2003.EID.tgz  Caenorhabditis elegans [Worm]
     80690970  orig/dm4p1/dm4p1.EID.tgz    Drosophila melanogaster [Fruitfly]
    888139245  orig/hs35p1/hs35p1.EID.tgz  Homo sapiens [Human]
    586699627  orig/mm34p1/mm34p1.EID.tgz  Mus musculus [Mouse]

  The following files were not fetched yet, for lack of space:

    rn3p1.EID.tar.gz   Rattus norvegicus [Black rat] (GenBank 3.1)
    cfa2p1.EID.tar.gz  Canis familiaris [Dog] (GenBank 2.1)
    gga1p1.EID.tar.gz  Gallus gallus [Chicken] (GenBank 1.1)
    drzv4.EID.tar.gz   Danio rerio [Zebrafish] (release Zv4)

  Dataset list:

    set datasets = ( at2004 ce2003 dm4p1 hs35p1 mm34p1 )

DATASET INFORMATION

  Listing files:

    echo ${datasets}
    foreach f ( ${datasets} )
      echo " "; echo $f
      mkdir -p orig/eids/${f}
      tar -tvzf orig/eids/${f}.EID.tgz \
        | tar-tvf-to-sdf \
        > orig/eids/${f}.EID.sdf
    end

UNPACKING THE FILES

  We must unpack each file, recompress it, and then delete the ".tgz"
  original to save space. We use "bzip2" instead of "gzip" because it
  provides noticeably better compression, which in this case means
  almost 1 GB less disk space.
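  The compression gain can be verified afterwards on any of the
  unpacked files; a quick, purely illustrative test (sizes and times
  depend on the machine and on the data):

    # compare gzip and bzip2 on one sample file (hypothetical check):
    foreach z ( gzip bzip2 )
      cp orig/eids/at2004/hEID /tmp/EID.test
      time ${z} /tmp/EID.test
      ls -l /tmp/EID.test*
      rm -f /tmp/EID.test*
    end

  The actual unpacking and recompression: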
    foreach f ( ${datasets} )
      echo " "; echo $f
      ls -l orig/eids/${f}/${f}.EID.tgz
      ( cd orig/eids/${f} && tar -xvzf ${f}.EID.tgz )
      mv -v orig/eids/${f}/${f}.dEID orig/eids/${f}/dEID
      mv -v orig/eids/${f}/${f}.hEID orig/eids/${f}/hEID
      mv -v orig/eids/${f}/${f}.pEID orig/eids/${f}/pEID
      mv -v orig/eids/${f}/${f}.sEID orig/eids/${f}/sEID
      mv -v orig/eids/${f}/${f}.tEID orig/eids/${f}/tEID
      bzip2 orig/eids/${f}/{d,h,p,s,t}EID
      rm -v orig/eids/${f}/${f}.{ex,intr,mrna}EID
      rm -v orig/eids/${f}/${f}.EID.tgz
      ls -l orig/eids/${f}/*EID*
    end

  Listing the data:

    foreach f ( ${datasets} )
      echo " "; echo $f
      ls -l orig/eids/${f}/{d,p,h,t,s}EID.bz2
    end

    at2004
    -rw-r--r-- 1 stolfi staff  17300658 Sep 19  2005 orig/eids/at2004/dEID.bz2
    -rw-r--r-- 1 stolfi staff   8522788 Sep 19  2005 orig/eids/at2004/hEID.bz2
    -rw-r--r-- 1 stolfi staff   6143983 Sep 19  2005 orig/eids/at2004/pEID.bz2
    -rw-r--r-- 1 stolfi staff       722 Sep 19  2005 orig/eids/at2004/sEID.bz2
    -rw-r--r-- 1 stolfi staff    314322 Sep 19  2005 orig/eids/at2004/tEID.bz2

    ce2003
    -rw-r--r-- 1 stolfi staff  17792249 Sep 19  2005 orig/eids/ce2003/dEID.bz2
    -rw-r--r-- 1 stolfi staff   2734012 Sep 19  2005 orig/eids/ce2003/hEID.bz2
    -rw-r--r-- 1 stolfi staff   5512474 Sep 19  2005 orig/eids/ce2003/pEID.bz2
    -rw-r--r-- 1 stolfi staff       719 Sep 19  2005 orig/eids/ce2003/sEID.bz2
    -rw-r--r-- 1 stolfi staff    184146 Sep 19  2005 orig/eids/ce2003/tEID.bz2

    dm4p1
    -rw-r--r-- 1 stolfi staff  28719795 Sep 19  2005 orig/eids/dm4p1/dEID.bz2
    -rw-r--r-- 1 stolfi staff   2127031 Sep 19  2005 orig/eids/dm4p1/hEID.bz2
    -rw-r--r-- 1 stolfi staff   4878410 Sep 19  2005 orig/eids/dm4p1/pEID.bz2
    -rw-r--r-- 1 stolfi staff       721 Sep 19  2005 orig/eids/dm4p1/sEID.bz2
    -rw-r--r-- 1 stolfi staff    370503 Sep 19  2005 orig/eids/dm4p1/tEID.bz2

    hs35p1
    -rw-r--r-- 1 stolfi staff 376095833 Sep  6  2005 orig/eids/hs35p1/dEID.bz2
    -rw-r--r-- 1 stolfi staff   4538853 Sep  6  2005 orig/eids/hs35p1/hEID.bz2
    -rw-r--r-- 1 stolfi staff   7791956 Sep  6  2005 orig/eids/hs35p1/pEID.bz2
    -rw-r--r-- 1 stolfi staff       723 Sep  6  2005 orig/eids/hs35p1/sEID.bz2
    -rw-r--r-- 1 stolfi staff    347643 Sep  6  2005 orig/eids/hs35p1/tEID.bz2

    mm34p1
    -rw-r--r-- 1 stolfi staff 253053245 Aug 30  2005 orig/eids/mm34p1/dEID.bz2
    -rw-r--r-- 1 stolfi staff   4622864 Aug 30  2005 orig/eids/mm34p1/hEID.bz2
    -rw-r--r-- 1 stolfi staff   6810977 Aug 30  2005 orig/eids/mm34p1/pEID.bz2
    -rw-r--r-- 1 stolfi staff       725 Aug 30  2005 orig/eids/mm34p1/sEID.bz2
    -rw-r--r-- 1 stolfi staff    297517 Sep  6  2005 orig/eids/mm34p1/tEID.bz2

LISTING THE ITEMS IN EACH DATASET

  Listing the items in each file of each dataset, and the number of
  non-blank data bytes in each item (excluding the ">" line).
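  In essence, "list-EID-items" amounts to something like the gawk
  sketch below (a hypothetical reconstruction that assumes only the
  '>'-delimited item format described above; the per-item output
  format is a guess, and the real script may count differently):

    #! /usr/bin/gawk -f
    # Writes one "{ITEM} {LINES} {BYTES}" line per item to stdout
    # (format assumed) and a grand-total summary to stderr.
    (/^>/) { prit(); id = $1; sub(/^>/, "", id); nit++; next; }
    (! /^[ \t]*$/) { nln++; nby += length($0); tln++; tby += length($0); }
    END { prit(); printf "%d items %d lines %d bytes\n", nit, tln, tby > "/dev/stderr"; }
    function prit() { if (id != "") { printf "%s %d %d\n", id, nln, nby; } nln = 0; nby = 0; }

  Running the actual lister on each dataset: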
    echo ${datasets}
    foreach f ( ${datasets} )
      echo " "; echo "=== $f ==="
      foreach t ( d h p )
        printf "%-20s" "${f}/${t}EID"
        bzcat orig/eids/${f}/${t}EID.bz2 \
          | list-EID-items \
          > orig/eids/${f}/${t}EID.items
      end
    end

    === at2004 ===
    at2004/dEID   22957 items    737981 lines    58129067 bytes
    at2004/hEID   22957 items   1080233 lines    42704522 bytes
    at2004/pEID   22957 items    138240 lines    10149075 bytes

    === ce2003 ===
    ce2003/dEID   20470 items    789045 lines    62321120 bytes
    ce2003/hEID   20470 items   1573910 lines    64401134 bytes
    ce2003/pEID   20470 items    123787 lines     9088861 bytes

    === dm4p1 ===
    dm4p1/dEID    15624 items   1574157 lines   125311779 bytes
    dm4p1/hEID    15624 items   2394806 lines    96669029 bytes
    dm4p1/pEID    15624 items    128807 lines     9680941 bytes

    === hs35p1 ===
    hs35p1/dEID   24541 items  17946954 lines  1434796138 bytes
    hs35p1/hEID   24541 items   1215169 lines    43174584 bytes
    hs35p1/pEID   24541 items    176711 lines    13142208 bytes

    === mm34p1 ===
    mm34p1/dEID   20890 items  11471587 lines   916904125 bytes
    mm34p1/hEID   20890 items   1896044 lines    87193661 bytes
    mm34p1/pEID   20890 items    145957 lines    10847688 bytes

  Note that, in all datasets, the files "dEID", "hEID", and "pEID"
  have the same number of items.

  Checking again the number of items and "data" bytes in each dataset:

    tabulate-datasets ${datasets}

    Dataset  items        dEID        hEID        pEID
    at2004   22957    58129067    42704522    10149075
    ce2003   20470    62321120    64401134     9088861
    dm4p1    15624   125311779    96669029     9680941
    hs35p1   24541  1434757209    43172816    13141772
    mm34p1   20890   916904125    87193661    10847688

  Note that the number of lines is not a good indication of item
  size. For instance, in the ".mrnaEID" files the whole mRNA is given
  in a single line, whereas the ".pEID" files have the protein split
  over several lines.

IDENTIFYING GOOD ITEMS

  We should probably exclude all items marked "CDS_incomplete" (they
  have a format bug in the "pEID" files).

  It would be nice if we could select only the items whose CDS was
  determined by experimental evidence (rather than computer
  inference). Unfortunately, some datasets don't have such
  annotations. The "ce2003" dataset has the annotation
  "/evidence=experimental" on 12595 of its 20470 items. The datasets
  hs35p1 and mm34p1 have [evidence ...] notes, but it is not clear
  whether that means *experimental* evidence. The datasets at2004 and
  dm4p1 do not seem to have any indication to this effect.

  We should also exclude any items that are the same gene with two or
  more different splicings, since they may have coding regions marked
  as introns. These are characterized by having capital letters in
  the first part of the item ID. E.g. "27_NT_077913" has a single
  splicing, while "28A_NT_077913", "28B_NT_077913", and
  "28C_NT_077913" are three alternative splicings of the same gene.
  (Some H. sapiens genes have 23 variants, "A" thru "W".)

  Here we try to identify the items which are "good" by those
  criteria:

    foreach f ( ${datasets} )
      echo " "; printf "%-10s" "${f}"
      set rex = "1"
      if ( ( "${f}" == "at2004" ) || ( "${f}" == "dm4p1" ) ) set rex = "0"
      bzcat orig/eids/${f}/hEID.bz2 \
        | identify-good-hEID-items \
            -v reqExpr="${rex}" \
        > orig/eids/${f}/hEID.good
    end

    at2004    22957 items  18772 good  4185 bad
    ce2003    20470 items  17173 good  3297 bad
    dm4p1     15624 items   7963 good  7661 bad
    hs35p1    24541 items  17693 good  6848 bad
    mm34p1    20890 items  19551 good  1339 bad

  Actually, it would be better to take *all* items of the same gene
  and combine them into a single `maximally expressed' item: if a
  base of the genome is marked as `exon' in any item, it should be
  marked as `coding' in the output file. Hopefully there would be no
  phase ambiguity. Maybe some other year...

  We must also check whether the DNA and protein sequences are valid,
  since there are items which have invalid characters.
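  Such a check could look like the gawk sketch below (hypothetical;
  it flags as bad any item whose sequence lines contain characters
  outside [ACGTacgt], and writes "{ITEM} {FLAG}" lines in the same
  format that the ".good" files are read with later):

    #! /usr/bin/gawk -f
    # Prints one "{ITEM} {FLAG}" line per item; FLAG = 1 iff every
    # sequence line of the item is pure [ACGTacgt].
    (/^>/) { prit(); id = $1; sub(/^>/, "", id); ok = 1; next; }
    (id != "") { if ($0 !~ /^[ACGTacgt]*$/) ok = 0; }
    END { prit(); }
    function prit() { if (id != "") print id, ok; }

  The actual checks, with the real scripts: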
    foreach f ( ${datasets} )
      echo " "; printf "%-10s" "${f}"
      bzcat orig/eids/${f}/dEID.bz2 \
        | identify-good-dEID-items \
        > orig/eids/${f}/dEID.good
    end

    at2004    22957 items  22951 good     6 bad
    ce2003    20470 items  20443 good    27 bad
    dm4p1     15624 items  15609 good    15 bad
    hs35p1    24541 items  24476 good    65 bad
    mm34p1    20890 items  17311 good  3579 bad

    foreach f ( ${datasets} )
      echo " "; printf "%-10s" "${f}"
      bzcat orig/eids/${f}/pEID.bz2 \
        | identify-good-pEID-items \
        > orig/eids/${f}/pEID.good
    end

    at2004    22957 items  22926 good    31 bad
    ce2003    20470 items  20447 good    23 bad
    dm4p1     15624 items  15588 good    36 bad
    hs35p1    24541 items  23348 good  1193 bad
    mm34p1    20890 items  19832 good  1058 bad

  Obtaining the items that are good by all three programs:

    foreach f ( ${datasets} )
      echo " "; printf "%-10s" "${f}"
      cat orig/eids/${f}/{d,p,h}EID.good \
        | gawk '($2 == 1){ print $1; }' \
        | sort \
        | uniq -c \
        | gawk '($1 == 3){ print $2; }' \
        > orig/eids/${f}/EID.good
    end

    dicio-wc orig/eids/*/EID.good

      lines   words     bytes file
    ------- ------- --------- ------------
      18766   18766    290406 orig/eids/at2004/EID.good
      17146   17146    264351 orig/eids/ce2003/EID.good
       7956    7956    119006 orig/eids/dm4p1/EID.good
      16860   16860    260438 orig/eids/hs35p1/EID.good
      15807   15807    244061 orig/eids/mm34p1/EID.good

SELECTING A SAMPLE OF THE ITEMS

  We create a list of all candidate items in each dataset. Currently
  it is just the ".good" lists produced above:

    mkdir -p pick/eids
    foreach f ( ${datasets} )
      echo " "; printf "%-20s" "${f}"
      set iFile = "orig/eids/${f}/dEID.items"
      set gFile = "orig/eids/${f}/EID.good"
      mkdir -p pick/eids/${f}
      set cFile = "pick/eids/${f}/EID.cands"
      cp -p ${gFile} ${cFile}
    end

    dicio-wc pick/eids/*/EID.cands

      lines   words     bytes file
    ------- ------- --------- ------------
      18766   18766    290406 pick/eids/at2004/EID.cands
      17146   17146    264351 pick/eids/ce2003/EID.cands
       7956    7956    119006 pick/eids/dm4p1/EID.cands
      16860   16860    260438 pick/eids/hs35p1/EID.cands
      15807   15807    244061 pick/eids/mm34p1/EID.cands

SELECTING A RANDOM SAMPLE

  We must randomly select a certain number {N} of items from each
  candidate list. A nice number is 1000 items, but we start with a
  few more to allow for losses along the way.
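  Note that gawk's rand() is seeded deterministically: without a call
  to srand(), every run of the pipeline below picks the same sample,
  which is convenient for reproducibility. For a fresh sample on each
  run, one would add a time-based seed, e.g.:

    # variant of the sampling pipeline below, with time-based seeding
    cat ${cFile} \
      | gawk 'BEGIN{ srand(); } //{ n++; printf "%17.15f %10d %s\n", rand(), n, $0; }' \
      | sort -b -k1,1g \
      | head -${smpSize} \
      | sort -b -k2,2n \
      | gawk '//{ print substr($0,30); }' \
      > ${pFile}

  The actual (deterministic) selection: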
    set smpSize = 1100
    foreach f ( ${datasets} )
      set cFile = "pick/eids/${f}/EID.cands"
      set pFile = "pick/eids/${f}/EID.picks"
      cat ${cFile} \
        | gawk '//{ n++; printf "%17.15f %10d %s\n", rand(), n, $0; }' \
        | sort -b -k1,1g \
        | head -${smpSize} \
        | sort -b -k2,2n \
        | gawk '//{ lin=$0; print substr(lin,30); }' \
        > ${pFile}
    end

    dicio-wc pick/eids/*/EID.picks

      lines   words     bytes file
    ------- ------- --------- ------------
       1100    1100     17032 pick/eids/at2004/EID.picks
       1100    1100     16973 pick/eids/ce2003/EID.picks
       1100    1100     16428 pick/eids/dm4p1/EID.picks
       1100    1100     16989 pick/eids/hs35p1/EID.picks
       1100    1100     17007 pick/eids/mm34p1/EID.picks

  Consistency check:

    foreach f ( ${datasets} )
      set cFile = "pick/eids/${f}/EID.cands"
      set pFile = "pick/eids/${f}/EID.picks"
      echo " "; echo "=== ${cFile} == ${pFile} ==="
      cat ${pFile} | sort > .picks
      cat ${cFile} | sort > .cands
      bool 1-2 .picks .cands
      rm .picks .cands
    end
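  Here "bool 1-2 A B" is assumed to print the set difference (lines
  of A not in B), so the check passes when it prints nothing. With
  standard tools the same test would be:

    # lines of .picks not in .cands (both already sorted);
    # empty output means every pick is a candidate:
    comm -23 .picks .cands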
EXTRACTING THE SELECTED SAMPLE

  Extracting the picked items from the "dEID", "pEID", and "hEID"
  files:

    foreach f ( ${datasets} )
      set pFile = "pick/eids/${f}/EID.picks"
      echo " "; echo "=== ${pFile} ==="
      foreach t ( d h p )
        set iFile = "orig/eids/${f}/${t}EID"
        set xFile = "pick/eids/${f}/${t}EID"
        printf "%-15s\n" "${f}.${t}"
        bzcat ${iFile}.bz2 \
          | extract-EID-items \
              -v itemList=${pFile} \
          > ${xFile}
      end
    end

    === pick/eids/at2004/EID.picks ===
    at2004/dEID  items read = 22957 written = 1100  lines read =   783895 written =  35629
    at2004/hEID  items read = 22957 written = 1100  lines read =  1126147 written =  53726
    at2004/pEID  items read = 22957 written = 1100  lines read =   184154 written =   8728

    === pick/eids/ce2003/EID.picks ===
    ce2003/dEID  items read = 20470 written = 1100  lines read =   829985 written =  39573
    ce2003/hEID  items read = 20470 written = 1100  lines read =  1635320 written =  87707
    ce2003/pEID  items read = 20470 written = 1100  lines read =   164727 written =   8565

    === pick/eids/dm4p1/EID.picks ===
    dm4p1/dEID   items read = 15624 written = 1100  lines read =  1605405 written =  64265
    dm4p1/hEID   items read = 15624 written = 1100  lines read =  2426054 written = 170165
    dm4p1/pEID   items read = 15624 written = 1100  lines read =   160055 written =  10887

    === pick/eids/hs35p1/EID.picks ===
    hs35p1/dEID  items read = 24541 written = 1100  lines read = 17996036 written = 724463
    hs35p1/hEID  items read = 24541 written = 1100  lines read =  1264251 written =  54314
    hs35p1/pEID  items read = 24541 written = 1100  lines read =   225793 written =   9816

    === pick/eids/mm34p1/EID.picks ===
    mm34p1/dEID  items read = 20890 written = 1100  lines read = 11513367 written = 507417
    mm34p1/hEID  items read = 20890 written = 1100  lines read =  1937824 written = 101306
    mm34p1/pEID  items read = 20890 written = 1100  lines read =   187737 written =   9394

  Listing the extracted items:

    foreach f ( ${datasets} )
      echo " "; echo "=== $f ==="
      foreach t ( d h p )
        printf "%-15s" "${f}/${t}EID"
        cat pick/eids/${f}/${t}EID \
          | list-EID-items \
          > pick/eids/${f}/${t}EID.items
      end
    end

    === at2004 ===
    at2004/dEID   1100 items   33429 lines   2629668 bytes
    at2004/hEID   1100 items   51526 lines   2027832 bytes
    at2004/pEID   1100 items    6528 lines    479367 bytes

    === ce2003 ===
    ce2003/dEID   1100 items   37373 lines   2946169 bytes
    ce2003/hEID   1100 items   84407 lines   3450216 bytes
    ce2003/pEID   1100 items    6365 lines    464996 bytes

    === dm4p1 ===
    dm4p1/dEID    1100 items   62065 lines   4920808 bytes
    dm4p1/hEID    1100 items  167965 lines   6770663 bytes
    dm4p1/pEID    1100 items    8687 lines    651986 bytes

    === hs35p1 ===
    hs35p1/dEID   1100 items  722263 lines  57738446 bytes
    hs35p1/hEID   1100 items   52114 lines   1834466 bytes
    hs35p1/pEID   1100 items    7616 lines    565591 bytes

    === mm34p1 ===
    mm34p1/dEID   1100 items  505217 lines  40374578 bytes
    mm34p1/hEID   1100 items   99106 lines   4554467 bytes
    mm34p1/pEID   1100 items    7194 lines    531328 bytes

    dicio-wc pick/eids/*/{d,p,h}EID.items

      lines   words     bytes file
    ------- ------- --------- ------------
       1100    4400     70400 pick/eids/at2004/dEID.items
       1100    4400     70400 pick/eids/ce2003/dEID.items
       1100    4400     70400 pick/eids/dm4p1/dEID.items
       1100    4400     70400 pick/eids/hs35p1/dEID.items
       1100    4400     70400 pick/eids/mm34p1/dEID.items
       1100    4400     70400 pick/eids/at2004/pEID.items
       1100    4400     70400 pick/eids/ce2003/pEID.items
       1100    4400     70400 pick/eids/dm4p1/pEID.items
       1100    4400     70400 pick/eids/hs35p1/pEID.items
       1100    4400     70400 pick/eids/mm34p1/pEID.items
       1100    4400     70400 pick/eids/at2004/hEID.items
       1100    4400     70400 pick/eids/ce2003/hEID.items
       1100    4400     70400 pick/eids/dm4p1/hEID.items
       1100    4400     70400 pick/eids/hs35p1/hEID.items
       1100    4400     70400 pick/eids/mm34p1/hEID.items

  Consistency check:

    foreach f ( ${datasets} )
      echo " "
      foreach t ( d h p )
        set oFile = "orig/eids/${f}/${t}EID.items"
        set xFile = "pick/eids/${f}/${t}EID.items"
        echo "=== ${oFile} ${xFile} ==="
        cat ${oFile} | sort > .orig
        cat ${xFile} | sort > .pick
        bool 1-2 .pick .orig | head -20
        rm .pick .orig
      end
    end

SPLITTING THE FILES INTO ITEMS

  Testing the program:

    mkdir -p temp/{eids,item}/ce2003
    ${progdir}/dbd_digest_EID \
      -maxItems 25 \
      pick/eids/ce2003/dEID \
      pick/eids/ce2003/pEID \
      pick/eids/ce2003/hEID \
      temp/item/ce2003

  Checking the consistency between the DNA base sequence (.bas) and
  the expressed protein (.ama):

    foreach f ( temp/item/ce2003/*.lab )
      echo "${f}"
      catsep -sep " " ${f:r}.bas ${f:r}.lab ${f:r}.ama \
        | merge-bas-lab-ama \
        | ${progdir}/dbd_check_translation
    end

  Doing it for real:

    foreach f ( ${datasets} )
      echo " "
      echo "=== ${f} ==="
      mkdir -p pick/item/${f}
      /bin/rm -f pick/item/${f}/*.bas
      /bin/rm -f pick/item/${f}/*.lab
      /bin/rm -f pick/item/${f}/*.ama
      /bin/rm -f pick/item/${f}/*.doc
      ${progdir}/dbd_digest_EID \
        pick/eids/${f}/dEID \
        pick/eids/${f}/pEID \
        pick/eids/${f}/hEID \
        pick/item/${f}
      printf "items = "; ls pick/item/${f}/*.bas | wc -l
    end

  Checking translation ({dbd_digest_EID} should do it!!!):

    foreach f ( ${datasets} )
      echo " "
      echo "=== ${f} ==="
      foreach lab ( pick/item/${f}/*.lab )
        echo "${lab:r}"
        catsep -sep " " ${lab:r}.bas ${lab:r}.lab ${lab:r}.ama \
          | merge-bas-lab-ama \
          | ${progdir}/dbd_check_translation
      end
    end
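  For a quick independent cross-check of a .bas/.ama pair, note that
  the standard genetic code fits in a single 64-character string. The
  self-contained gawk sketch below translates one uppercase coding
  sequence per input line; it is a hypothetical illustration only,
  and the {dbd_check_translation} pipeline above, which also sees the
  .lab labels, remains the authoritative check:

    #! /usr/bin/gawk -f
    # translate-cds: reads one uppercase CDS per line and prints its
    # translation under the standard genetic code (stops shown as '*').
    BEGIN {
      # amino acids of the 64 codons, bases ordered T,C,A,G,
      # with the first codon base varying slowest:
      aa = "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
      ix["T"] = 0; ix["C"] = 1; ix["A"] = 2; ix["G"] = 3;
    }
    // {
      prot = "";
      for (i = 1; i+2 <= length($0); i += 3) {
        b1 = substr($0,i,1); b2 = substr($0,i+1,1); b3 = substr($0,i+2,1);
        if ((b1 in ix) && (b2 in ix) && (b3 in ix)) {
          prot = prot substr(aa, 16*ix[b1] + 4*ix[b2] + ix[b3] + 1, 1);
        } else {
          prot = prot "?";  # non-[ACGT] base, e.g. a lowercase intron letter
        }
      }
      print prot;
    }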
CREATING ITEM SETS

  We next partition the items into item sets for the purposes of
  statistics gathering and performance evaluation.

  First we create an item set for each species, and a single set of
  all items:

    echo ${datasets}
    mkdir -p sets
    /bin/rm -f sets/all.gset
    foreach f ( ${datasets} )
      echo "=== ${f} ==="
      ( cd pick/item && ls -1 ${f}/*.bas ) \
        | sed -e 's:[.]bas$::' \
        > sets/${f}.gset
      cat sets/${f}.gset \
        >> sets/all.gset
    end

  Since "all.gset" is too big for some purposes, we also create an
  item set "mix.gset" with about 1000 items randomly selected from
  "all.gset":

    echo ${smpSize}
    cat sets/all.gset \
      | gawk '//{ n++; printf "%17.15f %10d %s\n", rand(), n, $0; }' \
      | sort -b -k1,1g \
      | head -${smpSize} \
      | sort -b -k2,2n \
      | gawk '//{ lin=$0; print substr(lin,30); }' \
      > sets/mix.gset

    dicio-wc sets/*.gset

      lines   words     bytes file
    ------- ------- --------- ------------
       1098    1098     24688 sets/at2004.gset
       1100    1100     24673 sets/ce2003.gset
       1099    1099     23007 sets/dm4p1.gset
       1074    1074     24105 sets/hs35p1.gset
       1082    1082     24302 sets/mm34p1.gset
       5453    5453    120775 sets/all.gset
       1100    1100     24361 sets/mix.gset

PROBABILITIES

  Computing the probabilities from all datasets. We leave out the
  "mix" one, since it has too few sequences of each species, and the
  properties vary quite a lot between species.

    mkdir prob
    foreach f ( all ${datasets} )
      echo " "
      foreach wsle ( \
          1.DEFHIJNXYPQRS \
          2.KKKHIJNXYPQRS \
          3.DEFHIJNXYZZZZ \
          6.DZZZZZNZZZZZZ \
        )
        set ws = "${wsle:r}"
        set le = "${wsle:e}"
        echo "=== ${f}-${ws}-${le} ==="
        tabulate-probs \
          ${f} ${ws} ${le} sets pick/item evts prob
      end
    end

    dicio-wc prob/*.prb

      lines   words     bytes file
    ------- ------- --------- ------------
         67     223      1500 prob/at2004-1-DEFHIJNXYPQRS.prb
         67     223      1500 prob/ce2003-1-DEFHIJNXYPQRS.prb
         67     223      1500 prob/dm4p1-1-DEFHIJNXYPQRS.prb
         67     223      1500 prob/hs35p1-1-DEFHIJNXYPQRS.prb
         67     223      1500 prob/mm34p1-1-DEFHIJNXYPQRS.prb
         67     223      1500 prob/all-1-DEFHIJNXYPQRS.prb
         67     223      1500 prob/mix-1-DEFHIJNXYPQRS.prb
        104     399      2829 prob/at2004-2-KKKHIJNXYPQRS.prb
        104     399      2829 prob/ce2003-2-KKKHIJNXYPQRS.prb
        104     399      2829 prob/dm4p1-2-KKKHIJNXYPQRS.prb
        104     399      2829 prob/hs35p1-2-KKKHIJNXYPQRS.prb
        104     399      2829 prob/mm34p1-2-KKKHIJNXYPQRS.prb
        104     399      2829 prob/all-2-KKKHIJNXYPQRS.prb
        104     399      2829 prob/mix-2-KKKHIJNXYPQRS.prb
        327    1295      9740 prob/at2004-3-DEFHIJNXYZZZZ.prb
        327    1295      9740 prob/ce2003-3-DEFHIJNXYZZZZ.prb
        327    1295      9740 prob/dm4p1-3-DEFHIJNXYZZZZ.prb
        327    1295      9740 prob/hs35p1-3-DEFHIJNXYZZZZ.prb
        327    1295      9740 prob/mm34p1-3-DEFHIJNXYZZZZ.prb
        327    1295      9740 prob/all-3-DEFHIJNXYZZZZ.prb
        327    1295      9740 prob/mix-3-DEFHIJNXYZZZZ.prb
       7849   31395    282557 prob/all-6-DZZZZZNZZZZZZ.prb
       7827   31307    281765 prob/at2004-6-DZZZZZNZZZZZZ.prb
       7835   31339    282053 prob/ce2003-6-DZZZZZNZZZZZZ.prb
       7832   31327    281945 prob/dm4p1-6-DZZZZZNZZZZZZ.prb
       7831   31323    281909 prob/hs35p1-6-DZZZZZNZZZZZZ.prb
       7833   31331    281981 prob/mix-6-DZZZZZNZZZZZZ.prb
       7827   31307    281765 prob/mm34p1-6-DZZZZZNZZZZZZ.prb

REGION LENGTHS

  Computing the length distributions of regions by type (coding,
  intron, head UTR, tail UTR):

    foreach f ( ${datasets} )
      echo " "; echo "=== ${f} ==="
      get-region-lengths ${f} sets pick/item prob
    end

  Plotting the length distributions:

    mkdir -p plot
    ${progdir}/dbd_plot_region_length_stats \
      prob plot ${datasets}

  Comparing distributions:

    foreach t ( N K X Y )
      foreach s ( ${datasets} )
        ghostview \
            -notitle \
            -nodate \
            -nolocator \
            -magstep -2 \
          plot/${s}-${t}-lhs.eps &
      end
      xmessage "done?"
    end

    foreach t ( N K X Y )
      foreach p ( lin log )
        ghostview \
            -notitle \
            -nodate \
            -nolocator \
          plot/CMP-${t}-${p}.eps &
      end
      xmessage "done?"
    end
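  In the analysis that follows, the "median" and "min" columns were
  presumably read off the length data and plots, and the
  exponential-tail rates were estimated from two points on the
  log-scale plots: if the normalized frequency drops from p0 to p1
  over a span of dm bases, then tau = -log(p1/p0)/dm, which is what
  entries like "-log(0.05/0.3)/400" denote. Hypothetical helpers,
  assuming a file ${lenFile} with one region length per line (the
  actual names of the files written by get-region-lengths may
  differ):

    # min and median of the region lengths:
    cat ${lenFile} \
      | sort -b -k1,1n \
      | gawk '//{ v[NR] = $1; } END{ printf "min = %s median = %s\n", v[1], v[int((NR+1)/2)]; }'

    # two-point estimate of the exponential tail rate tau:
    echo "0.3 0.05 400" \
      | gawk '//{ printf "tau = %.6f\n", -log($2/$1)/$3; }'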
ANALYSIS OF REGION LENGTHS

  Analyzing the plots of region lengths, we conclude that the length
  distributions vary substantially with the species and region type.
  Specifically:

  "N" regions:

    * There is a definite minimum length, different for each species.
      Less than 1% of the sequences are below this minimum, but the
      counts escalate rapidly above it. The minimum is particularly
      sharp in the ce2003 and dm4p1 sets.

    * There seems to be a relatively narrow Gaussian-like hump, with
      mean just above the minimum, and then a long exponential tail.

                 Overall         Gaussian        Exponential
              --------------  --------------  --------------
    Species   median  min     avg     dev     mlo     tau
    -------   ------  ------  ------  ------  ------  ------
    at2004        99     ~60     ~88
    ce2003        66      40      47
    dm4p1         68      49     ~62
    hs35p1      1523      66     ~82
    mm34p1      1331      66     ~84

    === at2004 [Arabidopsis thaliana] ===
    * median = 99, min ~ 60
    * Gaussian-like hump avg ~ 88 ?
    * Exponential tail mlo ~ ??? tau = -log(0.05/0.3)/400 amp = ???

    === ce2003 ===
    * median = 66, min = 40
    * Gaussian-like hump avg ~ 48 ?
    * Exponential tail mlo ~ ??? tau = -log(0.03/0.3)/1200 amp = ???

    === dm4p1 ===
    * median = 68, min = 48
    * Gaussian-like hump avg ~ 62 ?
    * Exponential tail mlo ~ ??? tau = -log(???)/??? amp = ???

    === hs35p1 ===
    * median = 1523, min ~ 66
    * Gaussian-like hump avg ~ 100
    * Exponential tail mlo ~ ??? tau = -log(???)/??? amp = ???

    === mm34p1 ===
    * median = 1331, min ~ 66
    * Gaussian-like hump avg ~ 88 ??? dev = ???
    * Exponential tail mlo ~ 100 tau = -log(???)/??? amp = ???

  "K" regions:

    * There is no minimum length.

    * The distribution is dominated by a Gaussian-like hump, with a
      weak exponential tail. In hs35p1 and mm34p1, the tail is
      practically absent.

    * There is a sharp peak at ???.

                 Overall         Gaussian        Exponential
              --------------  --------------  --------------
    Species   median  min     avg     dev     mlo     tau
    -------   ------  ------  ------  ------  ------  ------
    at2004       134       1     ~82
    ce2003       147       1    ~110
    dm4p1        233       1    ~140
    hs35p1       126       1    ~110
    mm34p1       123       1    ~110

    === at2004 [Arabidopsis thaliana] ===
    * median = 134, min ~ 0
    * Lognormal (???) distrib, peak at ~ 80 ???
    * Exponential tail, tau ~ -log(0.01/0.1)/800, mlo ~ ???

    === ce2003 ===
    * median = 147, min ~ 0
    * Gaussian hump avg ~ 115, dev ~ ???
    * Exponential tail, tau ~ -log(???)/???, mlo ~ ???

    === dm4p1 ===
    * median = 233, min ~ 0
    * Gaussian hump avg ~ 150, dev ~ ???
    * Exponential tail, tau ~ -log(???)/???, mlo ~ ???

    === hs35p1 ===
    * median = 126, min ~ 0
    * Gaussian hump avg ~ 110, dev ~ ???
    * Exponential tail, tau ~ -log(???)/???, mlo ~ ???

    === mm34p1 ===
    * median = 123, min ~ 0
    * Gaussian hump avg ~ 110, dev ~ ???
    * Sharp peak at ???
    * Weak exponential tail, tau ~ -log(???)/???, mlo ~ ???

  "X" regions:

    * There is no minimum length. Many are empty (perhaps an artifact
      of database construction?).

    * The distribution seems to be an exponential starting at m = 1,
      plus a weak hump.

                 Overall         Gaussian        Exponential
              --------------  --------------  --------------
    Species   median  min     avg     dev     mlo     tau
    -------   ------  ------  ------  ------  ------  ------
    at2004        82       1     ~60
    ce2003        30       1     ~80
    dm4p1        108       1     ~40
    hs35p1        86       1     ~70
    mm34p1        80       1     ~70

    === at2004 [Arabidopsis thaliana] ===
    * median = 82, min = 1
    * Gaussian hump avg ~ 66, dev ~ 66
    * Almost pure exponential tail tau ~ -log(0.05/0.5)/300

    === ce2003 ===
    * very few (52); median = 30, min ~ 0
    * Gaussian hump avg ~ 115, dev ~ ???
    * Almost pure exponential tail tau ~ -log(???)/???, mlo ~ ???

    === dm4p1 ===
    * median = 108, min ~ 0
    * Gaussian hump avg ~ 66, dev ~ ???
    * Sharp drop (-40%) at m ~ 100
    * Almost pure exponential tail tau ~ -log(???)/???, mlo ~ ???

    === hs35p1 ===
    * median = 86, min ~ 0
    * Weak Gaussian hump avg ~ 45, dev ~ ???
    * Almost pure exponential tail tau ~ -log(???)/???, mlo ~ ???

    === mm34p1 ===
    * median = 80, min ~ 0
    * Weak Gaussian hump avg ~ 45, dev ~ ???
    * Sharp drop at ~ 60
    * Almost pure exponential tail tau ~ -log(???)/???, mlo ~ ???

  "Y" regions:

    * There is no minimum length. Many are empty (perhaps an artifact
      of database construction?).

    * The distribution is mostly Gaussian-like; however, for hs35p1
      and mm34p1 it is much broader.

                 Overall         Gaussian        Exponential
              --------------  --------------  --------------
    Species   median  min     avg     dev     mlo     tau
    -------   ------  ------  ------  ------  ------  ------
    at2004       201       1    ~200
    ce2003       192       1    ~192
    dm4p1        173       1
    hs35p1       446       1
    mm34p1       442       1

    === at2004 [Arabidopsis thaliana] ===
    * median = 201, min = 0
    * Gaussian hump avg ~ 100, dev ~ ???
    * Practically no exponential tail.

    === ce2003 ===
    * very few (58); median = 192, min = 0
    * Gaussian hump avg ~ 100, dev ~ ???
    * Practically no exponential tail.

    === dm4p1 ===
    * median = 173, min ~ 0
    * Gaussian hump avg ~ 110, dev ~ ???

    === hs35p1 ===
    * median = 446, min ~ 0
    * Gaussian hump avg ~ ???, dev ~ ???
    * Long exponential tail tau ~ -log(???)/???, mlo ~ ???

    === mm34p1 ===
    * median = 442, min ~ 0
    * No Gaussian hump; avg ~ ???, dev ~ ???
    * Long exponential tail tau ~ -log(???)/???, mlo ~ ???

FITTING STATISTICAL MODELS TO THE OBSERVED DATA

  (To do.)

TO DO

  * This is too much data (we don't even have enough disk). Perhaps
    we can unpack it, convert it to our format, and burn a few DVDs.

  * Must convert the ".trn" format to be compatible with Reese's
    ("{CODON} {AMINO}" pairs, one per line).

  * Should we try to get homologous sequences?

----------------------------------------------------------------------
RENAMING FILES

    foreach f ( ${datasets} )
      echo ${f}
      mv -v pick/${f}/item pick/item/${f}
      mv -v pick/${f} pick/eids/${f}
    end

======================================================================
EXCEPTIONAL THINGS THAT HAD TO BE DONE ONCE
======================================================================

CHANGING COMPRESSOR

    foreach f ( ${datasets} )
      echo " "
      foreach t ( d p h t s )
        echo "${f}/${t}EID"
        ( cd orig/eids/${f} && gunzip ${t}EID.gz && bzip2 ${t}EID )
      end
    end

RENAMING

    foreach f ( ${datasets} )
      echo " "
      mv -v orig/${f}/${f}.EID.sdf     orig/eids/${f}/EID.sdf
      mv -v orig/${f}/${f}.dEID.items  orig/eids/${f}/dEID.items
      mv -v orig/${f}/${f}.dGood       orig/eids/${f}/dEID.good
      mv -v orig/${f}/${f}.good        orig/eids/${f}/EID.good
      mv -v orig/${f}/${f}.hEID.items  orig/eids/${f}/hEID.items
      mv -v orig/${f}/${f}.hGood       orig/eids/${f}/hEID.good
      mv -v orig/${f}/${f}.pEID.items  orig/eids/${f}/pEID.items
      mv -v orig/${f}/${f}.pGood       orig/eids/${f}/pEID.good
    end