# Last edited on 2008-06-13 20:35:42 by stolfi

PROJECT

  Unpacking and preprocessing the Saxonov et al. Exon-Intron-DNA (EID)
  database.

GENERAL SETUP

  set progdir = "${STOLFIHOME}/programs/c/DNA/dnabayes"

DATABASE DESCRIPTION

  This is a set of files extracted from GenBank, selected and packaged
  for exon/intron studies:

    An exhaustive database of protein-coding intron-containing genes
    Serge Saxonov, Iraj Daizadeh, Alexei Fedorov, and Walter Gilbert
    Nucleic Acids Res. 2000 Jan 1;28(1):185-190.

  The database consists of several datasets, one for each species of
  organism. Each dataset contains eight data files. The first four
  have one section for each /EID item/, which corresponds to a
  distinct messenger RNA:

    {DATASET}.dEID     Labeled raw DNA sequence for each item (>).
    {DATASET}.hEID     GenBank documentation of each item (>).
    {DATASET}.pEID     Protein synthesized by each item (>).
    {DATASET}.mrnaEID  DNA seq of spliced messenger RNA for each item (>).

  The files marked "(>)" have one header line, beginning with '>',
  before each item. In the ".dEID" file, the introns and exons are
  listed in the header but are also identified by the case of the
  letters (uppercase [ATCG] for exons, lowercase [atcg] for introns).
  The untranslated regions (UTRs) at the beginning and end of the
  concatenated exons can only be identified by the CDS_start and
  CDS_end parameters in the ">" line. Note that the CDS_len parameter
  includes the stop codon.

  Every EID item is associated to one gene, but sometimes there are
  several items for the same gene, due to variant splicing. Each item
  has a unique /item ID/, which comes after the ">", and a non-unique
  /gene ID/, which is the value of the '/gene="..."/' parameter in
  that line.

  The next two files have one section for each intron or exon in the
  above files:

    {DATASET}.exEID    One entry for each exon (>).
    {DATASET}.intrEID  One entry for each intron (>).

  The last two files are summary and logs:

    {DATASET}.sEID     Statistical summary.
    {DATASET}.tEID     Processing logs (?).

FETCHING

  Fetched files from "http://hsc.utoledo.edu/bioinfo/eid/", saved to
  sub-directories of "orig":

     70141229  orig/at2004/at2004.EID.tgz  Arabidopsis thaliana [Thalecress]
     60031227  orig/ce2003/ce2003.EID.tgz  Caenorhabditis elegans [Worm]
     80690970  orig/dm4p1/dm4p1.EID.tgz    Drosophila melanogaster [Fruitfly]
    888139245  orig/hs35p1/hs35p1.EID.tgz  Homo sapiens [Human]
    586699627  orig/mm34p1/mm34p1.EID.tgz  Mus musculus [Mouse]

  The following files were not fetched yet, for lack of space:

    rn3p1.EID.tar.gz   Rattus norvegicus [Black rat] (GenBank 3.1)
    cfa2p1.EID.tar.gz  Canis familiaris [Dog] (GenBank 2.1)
    gga1p1.EID.tar.gz  Gallus gallus [Chicken] (GenBank 1.1)
    drzv4.EID.tar.gz   Danio rerio [Zebrafish] (release Zv4)

  Dataset list:

    set datasets = ( at2004 ce2003 dm4p1 hs35p1 mm34p1 )

DATASET INFORMATION

  Listing files:

    echo ${datasets}
    foreach f ( ${datasets} )
      echo " "; echo $f
      mkdir -p orig/eids/${f}
      tar -tvzf orig/eids/${f}.EID.tgz \
        | tar-tvf-to-sdf \
        > orig/eids/${f}.EID.sdf
    end

UNPACKING THE FILES

  We must unpack each file, recompress it, and then delete the ".tgz"
  original to save space. We use "bzip2" instead of "gzip" because it
  provides noticeably better compression, which in this case means
  almost 1 GB less disk space.
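  The compression gain can be verified afterwards on any of the
  unpacked files; a quick, purely illustrative test (sizes and times
  depend on the machine and on the data):

    # compare gzip and bzip2 on one sample file (hypothetical check):
    foreach z ( gzip bzip2 )
      cp orig/eids/at2004/hEID /tmp/EID.test
      time ${z} /tmp/EID.test
      ls -l /tmp/EID.test*
      rm -f /tmp/EID.test*
    end

  The actual unpacking and recompression: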
    foreach f ( ${datasets} )
      echo " "; echo $f
      ls -l orig/eids/${f}/${f}.EID.tgz
      ( cd orig/eids/${f} && tar -xvzf ${f}.EID.tgz )
      mv -v orig/eids/${f}/${f}.dEID orig/eids/${f}/dEID
      mv -v orig/eids/${f}/${f}.hEID orig/eids/${f}/hEID
      mv -v orig/eids/${f}/${f}.pEID orig/eids/${f}/pEID
      mv -v orig/eids/${f}/${f}.sEID orig/eids/${f}/sEID
      mv -v orig/eids/${f}/${f}.tEID orig/eids/${f}/tEID
      bzip2 orig/eids/${f}/{d,h,p,s,t}EID
      rm -v orig/eids/${f}/${f}.{ex,intr,mrna}EID
      rm -v orig/eids/${f}/${f}.EID.tgz
      ls -l orig/eids/${f}/*EID*
    end

  Listing the data:

    foreach f ( ${datasets} )
      echo " "; echo $f
      ls -l orig/eids/${f}/{d,p,h,t,s}EID.bz2
    end

    at2004
    -rw-r--r-- 1 stolfi staff  17300658 Sep 19  2005 orig/eids/at2004/dEID.bz2
    -rw-r--r-- 1 stolfi staff   8522788 Sep 19  2005 orig/eids/at2004/hEID.bz2
    -rw-r--r-- 1 stolfi staff   6143983 Sep 19  2005 orig/eids/at2004/pEID.bz2
    -rw-r--r-- 1 stolfi staff       722 Sep 19  2005 orig/eids/at2004/sEID.bz2
    -rw-r--r-- 1 stolfi staff    314322 Sep 19  2005 orig/eids/at2004/tEID.bz2

    ce2003
    -rw-r--r-- 1 stolfi staff  17792249 Sep 19  2005 orig/eids/ce2003/dEID.bz2
    -rw-r--r-- 1 stolfi staff   2734012 Sep 19  2005 orig/eids/ce2003/hEID.bz2
    -rw-r--r-- 1 stolfi staff   5512474 Sep 19  2005 orig/eids/ce2003/pEID.bz2
    -rw-r--r-- 1 stolfi staff       719 Sep 19  2005 orig/eids/ce2003/sEID.bz2
    -rw-r--r-- 1 stolfi staff    184146 Sep 19  2005 orig/eids/ce2003/tEID.bz2

    dm4p1
    -rw-r--r-- 1 stolfi staff  28719795 Sep 19  2005 orig/eids/dm4p1/dEID.bz2
    -rw-r--r-- 1 stolfi staff   2127031 Sep 19  2005 orig/eids/dm4p1/hEID.bz2
    -rw-r--r-- 1 stolfi staff   4878410 Sep 19  2005 orig/eids/dm4p1/pEID.bz2
    -rw-r--r-- 1 stolfi staff       721 Sep 19  2005 orig/eids/dm4p1/sEID.bz2
    -rw-r--r-- 1 stolfi staff    370503 Sep 19  2005 orig/eids/dm4p1/tEID.bz2

    hs35p1
    -rw-r--r-- 1 stolfi staff 376095833 Sep  6  2005 orig/eids/hs35p1/dEID.bz2
    -rw-r--r-- 1 stolfi staff   4538853 Sep  6  2005 orig/eids/hs35p1/hEID.bz2
    -rw-r--r-- 1 stolfi staff   7791956 Sep  6  2005 orig/eids/hs35p1/pEID.bz2
    -rw-r--r-- 1 stolfi staff       723 Sep  6  2005 orig/eids/hs35p1/sEID.bz2
    -rw-r--r-- 1 stolfi staff    347643 Sep  6  2005 orig/eids/hs35p1/tEID.bz2

    mm34p1
    -rw-r--r-- 1 stolfi staff 253053245 Aug 30  2005 orig/eids/mm34p1/dEID.bz2
    -rw-r--r-- 1 stolfi staff   4622864 Aug 30  2005 orig/eids/mm34p1/hEID.bz2
    -rw-r--r-- 1 stolfi staff   6810977 Aug 30  2005 orig/eids/mm34p1/pEID.bz2
    -rw-r--r-- 1 stolfi staff       725 Aug 30  2005 orig/eids/mm34p1/sEID.bz2
    -rw-r--r-- 1 stolfi staff    297517 Sep  6  2005 orig/eids/mm34p1/tEID.bz2

LISTING THE ITEMS IN EACH DATASET

  Listing the items in each file of each dataset, and the number of
  non-blank data bytes in each item (excluding the ">" line).
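  In essence, "list-EID-items" amounts to something like the gawk
  sketch below (a hypothetical reconstruction that assumes only the
  '>'-delimited item format described above; the per-item output
  format is a guess, and the real script may count differently):

    #! /usr/bin/gawk -f
    # Writes one "{ITEM} {LINES} {BYTES}" line per item to stdout
    # (format assumed) and a grand-total summary to stderr.
    (/^>/) { prit(); id = $1; sub(/^>/, "", id); nit++; next; }
    (! /^[ \t]*$/) { nln++; nby += length($0); tln++; tby += length($0); }
    END { prit(); printf "%d items %d lines %d bytes\n", nit, tln, tby > "/dev/stderr"; }
    function prit() { if (id != "") { printf "%s %d %d\n", id, nln, nby; } nln = 0; nby = 0; }

  Running the actual lister on each dataset: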
    echo ${datasets}
    foreach f ( ${datasets} )
      echo " "; echo "=== $f ==="
      foreach t ( d h p )
        printf "%-20s" "${f}/${t}EID"
        bzcat orig/eids/${f}/${t}EID.bz2 \
          | list-EID-items \
          > orig/eids/${f}/${t}EID.items
      end
    end

    === at2004 ===
    at2004/dEID   22957 items    737981 lines    58129067 bytes
    at2004/hEID   22957 items   1080233 lines    42704522 bytes
    at2004/pEID   22957 items    138240 lines    10149075 bytes

    === ce2003 ===
    ce2003/dEID   20470 items    789045 lines    62321120 bytes
    ce2003/hEID   20470 items   1573910 lines    64401134 bytes
    ce2003/pEID   20470 items    123787 lines     9088861 bytes

    === dm4p1 ===
    dm4p1/dEID    15624 items   1574157 lines   125311779 bytes
    dm4p1/hEID    15624 items   2394806 lines    96669029 bytes
    dm4p1/pEID    15624 items    128807 lines     9680941 bytes

    === hs35p1 ===
    hs35p1/dEID   24541 items  17946954 lines  1434796138 bytes
    hs35p1/hEID   24541 items   1215169 lines    43174584 bytes
    hs35p1/pEID   24541 items    176711 lines    13142208 bytes

    === mm34p1 ===
    mm34p1/dEID   20890 items  11471587 lines   916904125 bytes
    mm34p1/hEID   20890 items   1896044 lines    87193661 bytes
    mm34p1/pEID   20890 items    145957 lines    10847688 bytes

  Note that, in all datasets, the files "dEID", "hEID", and "pEID"
  have the same number of items.

  Checking again the number of items and "data" bytes in each dataset:

    tabulate-datasets ${datasets}

    Dataset  items        dEID        hEID        pEID
    at2004   22957    58129067    42704522    10149075
    ce2003   20470    62321120    64401134     9088861
    dm4p1    15624   125311779    96669029     9680941
    hs35p1   24541  1434757209    43172816    13141772
    mm34p1   20890   916904125    87193661    10847688

  Note that the number of lines is not a good indication of item
  size. For instance, in the ".mrnaEID" files the whole mRNA is given
  in a single line, whereas the ".pEID" files have the protein split
  over several lines.

IDENTIFYING GOOD ITEMS

  We should probably exclude all items marked "CDS_incomplete" (they
  have a format bug in the "pEID" files).

  It would be nice if we could select only the items whose CDS was
  determined by experimental evidence (rather than computer
  inference). Unfortunately, some datasets don't have such
  annotations. The "ce2003" dataset has the annotation
  "/evidence=experimental" on 12595 of its 20470 items. The datasets
  hs35p1 and mm34p1 have [evidence ...] notes, but it is not clear
  whether that means *experimental* evidence. The datasets at2004 and
  dm4p1 do not seem to have any indication to this effect.

  We should also exclude any items that are the same gene with two or
  more different splicings, since they may have coding regions marked
  as introns. These are characterized by having capital letters in
  the first part of the item ID. E.g. "27_NT_077913" has a single
  splicing, while "28A_NT_077913", "28B_NT_077913", and
  "28C_NT_077913" are three alternative splicings of the same gene.
  (Some H. sapiens genes have 23 variants, "A" thru "W".)

  Here we try to identify the items which are "good" by those
  criteria:

    foreach f ( ${datasets} )
      echo " "; printf "%-10s" "${f}"
      set rex = "1"
      if ( ( "${f}" == "at2004" ) || ( "${f}" == "dm4p1" ) ) set rex = "0"
      bzcat orig/eids/${f}/hEID.bz2 \
        | identify-good-hEID-items \
            -v reqExpr="${rex}" \
        > orig/eids/${f}/hEID.good
    end

    at2004    22957 items  18772 good  4185 bad
    ce2003    20470 items  17173 good  3297 bad
    dm4p1     15624 items   7963 good  7661 bad
    hs35p1    24541 items  17693 good  6848 bad
    mm34p1    20890 items  19551 good  1339 bad

  Actually, it would be better to take *all* items of the same gene
  and combine them into a single `maximally expressed' item: if a
  base of the genome is marked as `exon' in any item, it should be
  marked as `coding' in the output file. Hopefully there would be no
  phase ambiguity. Maybe some other year...

  We must also check whether the DNA and protein sequences are valid,
  since there are items which have invalid characters.
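  Such a check could look like the gawk sketch below (hypothetical;
  it flags as bad any item whose sequence lines contain characters
  outside [ACGTacgt], and writes "{ITEM} {FLAG}" lines in the same
  format that the ".good" files are read with later):

    #! /usr/bin/gawk -f
    # Prints one "{ITEM} {FLAG}" line per item; FLAG = 1 iff every
    # sequence line of the item is pure [ACGTacgt].
    (/^>/) { prit(); id = $1; sub(/^>/, "", id); ok = 1; next; }
    (id != "") { if ($0 !~ /^[ACGTacgt]*$/) ok = 0; }
    END { prit(); }
    function prit() { if (id != "") print id, ok; }

  The actual checks, with the real scripts: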
    foreach f ( ${datasets} )
      echo " "; printf "%-10s" "${f}"
      bzcat orig/eids/${f}/dEID.bz2 \
        | identify-good-dEID-items \
        > orig/eids/${f}/dEID.good
    end

    at2004    22957 items  22951 good     6 bad
    ce2003    20470 items  20443 good    27 bad
    dm4p1     15624 items  15609 good    15 bad
    hs35p1    24541 items  24476 good    65 bad
    mm34p1    20890 items  17311 good  3579 bad

    foreach f ( ${datasets} )
      echo " "; printf "%-10s" "${f}"
      bzcat orig/eids/${f}/pEID.bz2 \
        | identify-good-pEID-items \
        > orig/eids/${f}/pEID.good
    end

    at2004    22957 items  22926 good    31 bad
    ce2003    20470 items  20447 good    23 bad
    dm4p1     15624 items  15588 good    36 bad
    hs35p1    24541 items  23348 good  1193 bad
    mm34p1    20890 items  19832 good  1058 bad

  Obtaining the items that are good by all three programs:

    foreach f ( ${datasets} )
      echo " "; printf "%-10s" "${f}"
      cat orig/eids/${f}/{d,p,h}EID.good \
        | gawk '($2 == 1){ print $1; }' \
        | sort \
        | uniq -c \
        | gawk '($1 == 3){ print $2; }' \
        > orig/eids/${f}/EID.good
    end

    dicio-wc orig/eids/*/EID.good

      lines   words     bytes file
    ------- ------- --------- ------------
      18766   18766    290406 orig/eids/at2004/EID.good
      17146   17146    264351 orig/eids/ce2003/EID.good
       7956    7956    119006 orig/eids/dm4p1/EID.good
      16860   16860    260438 orig/eids/hs35p1/EID.good
      15807   15807    244061 orig/eids/mm34p1/EID.good

SELECTING A SAMPLE OF THE ITEMS

  We create a list of all candidate items in each dataset. Currently
  it is just the ".good" lists produced above:

    mkdir -p pick/eids
    foreach f ( ${datasets} )
      echo " "; printf "%-20s" "${f}"
      set iFile = "orig/eids/${f}/dEID.items"
      set gFile = "orig/eids/${f}/EID.good"
      mkdir -p pick/eids/${f}
      set cFile = "pick/eids/${f}/EID.cands"
      cp -p ${gFile} ${cFile}
    end

    dicio-wc pick/eids/*/EID.cands

      lines   words     bytes file
    ------- ------- --------- ------------
      18766   18766    290406 pick/eids/at2004/EID.cands
      17146   17146    264351 pick/eids/ce2003/EID.cands
       7956    7956    119006 pick/eids/dm4p1/EID.cands
      16860   16860    260438 pick/eids/hs35p1/EID.cands
      15807   15807    244061 pick/eids/mm34p1/EID.cands

SELECTING A RANDOM SAMPLE

  We must randomly select a certain number {N} of items from each
  candidate list. A nice number is 1000 items, but we start with a
  few more to allow for losses along the way.
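  Note that gawk's rand() is seeded deterministically: without a call
  to srand(), every run of the pipeline below picks the same sample,
  which is convenient for reproducibility. For a fresh sample on each
  run, one would add a time-based seed, e.g.:

    # variant of the sampling pipeline below, with time-based seeding
    cat ${cFile} \
      | gawk 'BEGIN{ srand(); } //{ n++; printf "%17.15f %10d %s\n", rand(), n, $0; }' \
      | sort -b -k1,1g \
      | head -${smpSize} \
      | sort -b -k2,2n \
      | gawk '//{ print substr($0,30); }' \
      > ${pFile}

  The actual (deterministic) selection: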
    set smpSize = 1100
    foreach f ( ${datasets} )
      set cFile = "pick/eids/${f}/EID.cands"
      set pFile = "pick/eids/${f}/EID.picks"
      cat ${cFile} \
        | gawk '//{ n++; printf "%17.15f %10d %s\n", rand(), n, $0; }' \
        | sort -b -k1,1g \
        | head -${smpSize} \
        | sort -b -k2,2n \
        | gawk '//{ lin=$0; print substr(lin,30); }' \
        > ${pFile}
    end

    dicio-wc pick/eids/*/EID.picks

      lines   words     bytes file
    ------- ------- --------- ------------
       1100    1100     17032 pick/eids/at2004/EID.picks
       1100    1100     16973 pick/eids/ce2003/EID.picks
       1100    1100     16428 pick/eids/dm4p1/EID.picks
       1100    1100     16989 pick/eids/hs35p1/EID.picks
       1100    1100     17007 pick/eids/mm34p1/EID.picks

  Consistency check:

    foreach f ( ${datasets} )
      set cFile = "pick/eids/${f}/EID.cands"
      set pFile = "pick/eids/${f}/EID.picks"
      echo " "; echo "=== ${cFile} == ${pFile} ==="
      cat ${pFile} | sort > .picks
      cat ${cFile} | sort > .cands
      bool 1-2 .picks .cands
      rm .picks .cands
    end
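  Here "bool 1-2 A B" is assumed to print the set difference (lines
  of A not in B), so the check passes when it prints nothing. With
  standard tools the same test would be:

    # lines of .picks not in .cands (both already sorted);
    # empty output means every pick is a candidate:
    comm -23 .picks .cands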
EXTRACTING THE SELECTED SAMPLE

  Extracting the picked items from the "dEID", "pEID", and "hEID"
  files:

    foreach f ( ${datasets} )
      set pFile = "pick/eids/${f}/EID.picks"
      echo " "; echo "=== ${pFile} ==="
      foreach t ( d h p )
        set iFile = "orig/eids/${f}/${t}EID"
        set xFile = "pick/eids/${f}/${t}EID"
        printf "%-15s\n" "${f}.${t}"
        bzcat ${iFile}.bz2 \
          | extract-EID-items \
              -v itemList=${pFile} \
          > ${xFile}
      end
    end

    === pick/eids/at2004/EID.picks ===
    at2004/dEID  items read = 22957 written = 1100  lines read =   783895 written =  35629
    at2004/hEID  items read = 22957 written = 1100  lines read =  1126147 written =  53726
    at2004/pEID  items read = 22957 written = 1100  lines read =   184154 written =   8728

    === pick/eids/ce2003/EID.picks ===
    ce2003/dEID  items read = 20470 written = 1100  lines read =   829985 written =  39573
    ce2003/hEID  items read = 20470 written = 1100  lines read =  1635320 written =  87707
    ce2003/pEID  items read = 20470 written = 1100  lines read =   164727 written =   8565

    === pick/eids/dm4p1/EID.picks ===
    dm4p1/dEID   items read = 15624 written = 1100  lines read =  1605405 written =  64265
    dm4p1/hEID   items read = 15624 written = 1100  lines read =  2426054 written = 170165
    dm4p1/pEID   items read = 15624 written = 1100  lines read =   160055 written =  10887

    === pick/eids/hs35p1/EID.picks ===
    hs35p1/dEID  items read = 24541 written = 1100  lines read = 17996036 written = 724463
    hs35p1/hEID  items read = 24541 written = 1100  lines read =  1264251 written =  54314
    hs35p1/pEID  items read = 24541 written = 1100  lines read =   225793 written =   9816

    === pick/eids/mm34p1/EID.picks ===
    mm34p1/dEID  items read = 20890 written = 1100  lines read = 11513367 written = 507417
    mm34p1/hEID  items read = 20890 written = 1100  lines read =  1937824 written = 101306
    mm34p1/pEID  items read = 20890 written = 1100  lines read =   187737 written =   9394

  Listing the extracted items:

    foreach f ( ${datasets} )
      echo " "; echo "=== $f ==="
      foreach t ( d h p )
        printf "%-15s" "${f}/${t}EID"
        cat pick/eids/${f}/${t}EID \
          | list-EID-items \
          > pick/eids/${f}/${t}EID.items
      end
    end

    === at2004 ===
    at2004/dEID   1100 items   33429 lines   2629668 bytes
    at2004/hEID   1100 items   51526 lines   2027832 bytes
    at2004/pEID   1100 items    6528 lines    479367 bytes

    === ce2003 ===
    ce2003/dEID   1100 items   37373 lines   2946169 bytes
    ce2003/hEID   1100 items   84407 lines   3450216 bytes
    ce2003/pEID   1100 items    6365 lines    464996 bytes

    === dm4p1 ===
    dm4p1/dEID    1100 items   62065 lines   4920808 bytes
    dm4p1/hEID    1100 items  167965 lines   6770663 bytes
    dm4p1/pEID    1100 items    8687 lines    651986 bytes

    === hs35p1 ===
    hs35p1/dEID   1100 items  722263 lines  57738446 bytes
    hs35p1/hEID   1100 items   52114 lines   1834466 bytes
    hs35p1/pEID   1100 items    7616 lines    565591 bytes

    === mm34p1 ===
    mm34p1/dEID   1100 items  505217 lines  40374578 bytes
    mm34p1/hEID   1100 items   99106 lines   4554467 bytes
    mm34p1/pEID   1100 items    7194 lines    531328 bytes

    dicio-wc pick/eids/*/{d,p,h}EID.items

      lines   words     bytes file
    ------- ------- --------- ------------
       1100    4400     70400 pick/eids/at2004/dEID.items
       1100    4400     70400 pick/eids/ce2003/dEID.items
       1100    4400     70400 pick/eids/dm4p1/dEID.items
       1100    4400     70400 pick/eids/hs35p1/dEID.items
       1100    4400     70400 pick/eids/mm34p1/dEID.items
       1100    4400     70400 pick/eids/at2004/pEID.items
       1100    4400     70400 pick/eids/ce2003/pEID.items
       1100    4400     70400 pick/eids/dm4p1/pEID.items
       1100    4400     70400 pick/eids/hs35p1/pEID.items
       1100    4400     70400 pick/eids/mm34p1/pEID.items
       1100    4400     70400 pick/eids/at2004/hEID.items
       1100    4400     70400 pick/eids/ce2003/hEID.items
       1100    4400     70400 pick/eids/dm4p1/hEID.items
       1100    4400     70400 pick/eids/hs35p1/hEID.items
       1100    4400     70400 pick/eids/mm34p1/hEID.items

  Consistency check:

    foreach f ( ${datasets} )
      echo " "
      foreach t ( d h p )
        set oFile = "orig/eids/${f}/${t}EID.items"
        set xFile = "pick/eids/${f}/${t}EID.items"
        echo "=== ${oFile} ${xFile} ==="
        cat ${oFile} | sort > .orig
        cat ${xFile} | sort > .pick
        bool 1-2 .pick .orig | head -20
        rm .pick .orig
      end
    end

SPLITTING THE FILES INTO ITEMS

  Testing the program:

    mkdir -p temp/{eids,item}/ce2003
    ${progdir}/dbd_digest_EID \
      -maxItems 25 \
      pick/eids/ce2003/dEID \
      pick/eids/ce2003/pEID \
      pick/eids/ce2003/hEID \
      temp/item/ce2003

  Checking the consistency between the DNA base sequence (.bas) and
  the expressed protein (.ama):

    foreach f ( temp/item/ce2003/*.lab )
      echo "${f}"
      catsep -sep " " ${f:r}.bas ${f:r}.lab ${f:r}.ama \
        | merge-bas-lab-ama \
        | ${progdir}/dbd_check_translation
    end

  Doing it for real:

    foreach f ( ${datasets} )
      echo " "
      echo "=== ${f} ==="
      mkdir -p pick/item/${f}
      /bin/rm -f pick/item/${f}/*.bas
      /bin/rm -f pick/item/${f}/*.lab
      /bin/rm -f pick/item/${f}/*.ama
      /bin/rm -f pick/item/${f}/*.doc
      ${progdir}/dbd_digest_EID \
        pick/eids/${f}/dEID \
        pick/eids/${f}/pEID \
        pick/eids/${f}/hEID \
        pick/item/${f}
      printf "items = "; ls pick/item/${f}/*.bas | wc -l
    end

  Checking translation ({dbd_digest_EID} should do it!!!):

    foreach f ( ${datasets} )
      echo " "
      echo "=== ${f} ==="
      foreach lab ( pick/item/${f}/*.lab )
        echo "${lab:r}"
        catsep -sep " " ${lab:r}.bas ${lab:r}.lab ${lab:r}.ama \
          | merge-bas-lab-ama \
          | ${progdir}/dbd_check_translation
      end
    end
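  For a quick independent cross-check of a .bas/.ama pair, note that
  the standard genetic code fits in a single 64-character string. The
  self-contained gawk sketch below translates one uppercase coding
  sequence per input line; it is a hypothetical illustration only,
  and the {dbd_check_translation} pipeline above, which also sees the
  .lab labels, remains the authoritative check:

    #! /usr/bin/gawk -f
    # translate-cds: reads one uppercase CDS per line and prints its
    # translation under the standard genetic code (stops shown as '*').
    BEGIN {
      # amino acids of the 64 codons, bases ordered T,C,A,G,
      # with the first codon base varying slowest:
      aa = "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
      ix["T"] = 0; ix["C"] = 1; ix["A"] = 2; ix["G"] = 3;
    }
    // {
      prot = "";
      for (i = 1; i+2 <= length($0); i += 3) {
        b1 = substr($0,i,1); b2 = substr($0,i+1,1); b3 = substr($0,i+2,1);
        if ((b1 in ix) && (b2 in ix) && (b3 in ix)) {
          prot = prot substr(aa, 16*ix[b1] + 4*ix[b2] + ix[b3] + 1, 1);
        } else {
          prot = prot "?";  # non-[ACGT] base, e.g. a lowercase intron letter
        }
      }
      print prot;
    }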
CREATING ITEM SETS

  We next partition the items into item sets for the purposes of
  statistics gathering and performance evaluation.

  First we create an item set for each species, and a single set of
  all items:

    echo ${datasets}
    mkdir -p sets
    /bin/rm -f sets/all.gset
    foreach f ( ${datasets} )
      echo "=== ${f} ==="
      ( cd pick/item && ls -1 ${f}/*.bas ) \
        | sed -e 's:[.]bas$::' \
        > sets/${f}.gset
      cat sets/${f}.gset \
        >> sets/all.gset
    end

  Since "all.gset" is too big for some purposes, we also create an
  item set "mix.gset" with about 1000 items randomly selected from
  "all.gset":

    echo ${smpSize}
    cat sets/all.gset \
      | gawk '//{ n++; printf "%17.15f %10d %s\n", rand(), n, $0; }' \
      | sort -b -k1,1g \
      | head -${smpSize} \
      | sort -b -k2,2n \
      | gawk '//{ lin=$0; print substr(lin,30); }' \
      > sets/mix.gset

    dicio-wc sets/*.gset

      lines   words     bytes file
    ------- ------- --------- ------------
       1098    1098     24688 sets/at2004.gset
       1100    1100     24673 sets/ce2003.gset
       1099    1099     23007 sets/dm4p1.gset
       1074    1074     24105 sets/hs35p1.gset
       1082    1082     24302 sets/mm34p1.gset
       5453    5453    120775 sets/all.gset
       1100    1100     24361 sets/mix.gset

PROBABILITIES

  Computing the probabilities from all datasets. We leave out the
  "mix" one, since it has too few sequences of each species, and the
  properties vary quite a lot between species.

    mkdir prob
    foreach f ( all ${datasets} )
      echo " "
      foreach wsle ( \
          1.DEFHIJNXYPQRS \
          2.KKKHIJNXYPQRS \
          3.DEFHIJNXYZZZZ \
          6.DZZZZZNZZZZZZ \
        )
        set ws = "${wsle:r}"
        set le = "${wsle:e}"
        echo "=== ${f}-${ws}-${le} ==="
        tabulate-probs \
          ${f} ${ws} ${le} sets pick/item evts prob
      end
    end

    dicio-wc prob/*.prb

      lines   words     bytes file
    ------- ------- --------- ------------
         67     223      1500 prob/at2004-1-DEFHIJNXYPQRS.prb
         67     223      1500 prob/ce2003-1-DEFHIJNXYPQRS.prb
         67     223      1500 prob/dm4p1-1-DEFHIJNXYPQRS.prb
         67     223      1500 prob/hs35p1-1-DEFHIJNXYPQRS.prb
         67     223      1500 prob/mm34p1-1-DEFHIJNXYPQRS.prb
         67     223      1500 prob/all-1-DEFHIJNXYPQRS.prb
         67     223      1500 prob/mix-1-DEFHIJNXYPQRS.prb
        104     399      2829 prob/at2004-2-KKKHIJNXYPQRS.prb
        104     399      2829 prob/ce2003-2-KKKHIJNXYPQRS.prb
        104     399      2829 prob/dm4p1-2-KKKHIJNXYPQRS.prb
        104     399      2829 prob/hs35p1-2-KKKHIJNXYPQRS.prb
        104     399      2829 prob/mm34p1-2-KKKHIJNXYPQRS.prb
        104     399      2829 prob/all-2-KKKHIJNXYPQRS.prb
        104     399      2829 prob/mix-2-KKKHIJNXYPQRS.prb
        327    1295      9740 prob/at2004-3-DEFHIJNXYZZZZ.prb
        327    1295      9740 prob/ce2003-3-DEFHIJNXYZZZZ.prb
        327    1295      9740 prob/dm4p1-3-DEFHIJNXYZZZZ.prb
        327    1295      9740 prob/hs35p1-3-DEFHIJNXYZZZZ.prb
        327    1295      9740 prob/mm34p1-3-DEFHIJNXYZZZZ.prb
        327    1295      9740 prob/all-3-DEFHIJNXYZZZZ.prb
        327    1295      9740 prob/mix-3-DEFHIJNXYZZZZ.prb
       7849   31395    282557 prob/all-6-DZZZZZNZZZZZZ.prb
       7827   31307    281765 prob/at2004-6-DZZZZZNZZZZZZ.prb
       7835   31339    282053 prob/ce2003-6-DZZZZZNZZZZZZ.prb
       7832   31327    281945 prob/dm4p1-6-DZZZZZNZZZZZZ.prb
       7831   31323    281909 prob/hs35p1-6-DZZZZZNZZZZZZ.prb
       7833   31331    281981 prob/mix-6-DZZZZZNZZZZZZ.prb
       7827   31307    281765 prob/mm34p1-6-DZZZZZNZZZZZZ.prb

REGION LENGTHS

  Computing the length distributions of regions by type (coding,
  intron, head UTR, tail UTR):

    foreach f ( ${datasets} )
      echo " "; echo "=== ${f} ==="
      get-region-lengths ${f} sets pick/item prob
    end

  Plotting the length distributions:

    mkdir -p plot
    ${progdir}/dbd_plot_region_length_stats \
      prob plot ${datasets}

  Comparing distributions:

    foreach t ( N K X Y )
      foreach s ( ${datasets} )
        ghostview \
            -notitle \
            -nodate \
            -nolocator \
            -magstep -2 \
          plot/${s}-${t}-lhs.eps &
      end
      xmessage "done?"
    end

    foreach t ( N K X Y )
      foreach p ( lin log )
        ghostview \
            -notitle \
            -nodate \
            -nolocator \
          plot/CMP-${t}-${p}.eps &
      end
      xmessage "done?"
    end
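  In the analysis that follows, the "median" and "min" columns were
  presumably read off the length data and plots, and the
  exponential-tail rates were estimated from two points on the
  log-scale plots: if the normalized frequency drops from p0 to p1
  over a span of dm bases, then tau = -log(p1/p0)/dm, which is what
  entries like "-log(0.05/0.3)/400" denote. Hypothetical helpers,
  assuming a file ${lenFile} with one region length per line (the
  actual names of the files written by get-region-lengths may
  differ):

    # min and median of the region lengths:
    cat ${lenFile} \
      | sort -b -k1,1n \
      | gawk '//{ v[NR] = $1; } END{ printf "min = %s median = %s\n", v[1], v[int((NR+1)/2)]; }'

    # two-point estimate of the exponential tail rate tau:
    echo "0.3 0.05 400" \
      | gawk '//{ printf "tau = %.6f\n", -log($2/$1)/$3; }'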
ANALYSIS OF REGION LENGTHS

  Analyzing the plots of region lengths, we conclude that the length
  distributions vary substantially with the species and region type.
  Specifically:

  "N" regions:

    * There is a definite minimum length, different for each species.
      Less than 1% of the sequences are below this minimum, but the
      counts escalate rapidly above it. The minimum is particularly
      sharp in the ce2003 and dm4p1 sets.

    * There seems to be a relatively narrow Gaussian-like hump, with
      mean just above the minimum, and then a long exponential tail.

                 Overall         Gaussian        Exponential
              --------------  --------------  --------------
    Species   median  min     avg     dev     mlo     tau
    -------   ------  ------  ------  ------  ------  ------
    at2004        99     ~60     ~88
    ce2003        66      40      47
    dm4p1         68      49     ~62
    hs35p1      1523      66     ~82
    mm34p1      1331      66     ~84

    === at2004 [Arabidopsis thaliana] ===
    * median = 99, min ~ 60
    * Gaussian-like hump avg ~ 88 ?
    * Exponential tail mlo ~ ??? tau = -log(0.05/0.3)/400 amp = ???

    === ce2003 ===
    * median = 66, min = 40
    * Gaussian-like hump avg ~ 48 ?
    * Exponential tail mlo ~ ??? tau = -log(0.03/0.3)/1200 amp = ???

    === dm4p1 ===
    * median = 68, min = 48
    * Gaussian-like hump avg ~ 62 ?
    * Exponential tail mlo ~ ??? tau = -log(???)/??? amp = ???

    === hs35p1 ===
    * median = 1523, min ~ 66
    * Gaussian-like hump avg ~ 100
    * Exponential tail mlo ~ ??? tau = -log(???)/??? amp = ???

    === mm34p1 ===
    * median = 1331, min ~ 66
    * Gaussian-like hump avg ~ 88 ??? dev = ???
    * Exponential tail mlo ~ 100 tau = -log(???)/??? amp = ???

  "K" regions:

    * There is no minimum length.

    * The distribution is dominated by a Gaussian-like hump, with a
      weak exponential tail. In hs35p1 and mm34p1, the tail is
      practically absent.

    * There is a sharp peak at ???.

                 Overall         Gaussian        Exponential
              --------------  --------------  --------------
    Species   median  min     avg     dev     mlo     tau
    -------   ------  ------  ------  ------  ------  ------
    at2004       134       1     ~82
    ce2003       147       1    ~110
    dm4p1        233       1    ~140
    hs35p1       126       1    ~110
    mm34p1       123       1    ~110

    === at2004 [Arabidopsis thaliana] ===
    * median = 134, min ~ 0
    * Lognormal (???) distrib, peak at ~ 80 ???
    * Exponential tail, tau ~ -log(0.01/0.1)/800, mlo ~ ???

    === ce2003 ===
    * median = 147, min ~ 0
    * Gaussian hump avg ~ 115, dev ~ ???
    * Exponential tail, tau ~ -log(???)/???, mlo ~ ???

    === dm4p1 ===
    * median = 233, min ~ 0
    * Gaussian hump avg ~ 150, dev ~ ???
    * Exponential tail, tau ~ -log(???)/???, mlo ~ ???

    === hs35p1 ===
    * median = 126, min ~ 0
    * Gaussian hump avg ~ 110, dev ~ ???
    * Exponential tail, tau ~ -log(???)/???, mlo ~ ???

    === mm34p1 ===
    * median = 123, min ~ 0
    * Gaussian hump avg ~ 110, dev ~ ???
    * Sharp peak at ???
    * Weak exponential tail, tau ~ -log(???)/???, mlo ~ ???

  "X" regions:

    * There is no minimum length. Many are empty (perhaps an artifact
      of database construction?).

    * The distribution seems to be an exponential starting at m = 1,
      plus a weak hump.

                 Overall         Gaussian        Exponential
              --------------  --------------  --------------
    Species   median  min     avg     dev     mlo     tau
    -------   ------  ------  ------  ------  ------  ------
    at2004        82       1     ~60
    ce2003        30       1     ~80
    dm4p1        108       1     ~40
    hs35p1        86       1     ~70
    mm34p1        80       1     ~70

    === at2004 [Arabidopsis thaliana] ===
    * median = 82, min = 1
    * Gaussian hump avg ~ 66, dev ~ 66
    * Almost pure exponential tail tau ~ -log(0.05/0.5)/300

    === ce2003 ===
    * very few (52); median = 30, min ~ 0
    * Gaussian hump avg ~ 115, dev ~ ???
    * Almost pure exponential tail tau ~ -log(???)/???, mlo ~ ???

    === dm4p1 ===
    * median = 108, min ~ 0
    * Gaussian hump avg ~ 66, dev ~ ???
    * Sharp drop (-40%) at m ~ 100
    * Almost pure exponential tail tau ~ -log(???)/???, mlo ~ ???

    === hs35p1 ===
    * median = 86, min ~ 0
    * Weak Gaussian hump avg ~ 45, dev ~ ???
    * Almost pure exponential tail tau ~ -log(???)/???, mlo ~ ???

    === mm34p1 ===
    * median = 80, min ~ 0
    * Weak Gaussian hump avg ~ 45, dev ~ ???
    * Sharp drop at ~ 60
    * Almost pure exponential tail tau ~ -log(???)/???, mlo ~ ???

  "Y" regions:

    * There is no minimum length. Many are empty (perhaps an artifact
      of database construction?).

    * The distribution is mostly Gaussian-like; however, for hs35p1
      and mm34p1 it is much broader.

                 Overall         Gaussian        Exponential
              --------------  --------------  --------------
    Species   median  min     avg     dev     mlo     tau
    -------   ------  ------  ------  ------  ------  ------
    at2004       201       1    ~200
    ce2003       192       1    ~192
    dm4p1        173       1
    hs35p1       446       1
    mm34p1       442       1

    === at2004 [Arabidopsis thaliana] ===
    * median = 201, min = 0
    * Gaussian hump avg ~ 100, dev ~ ???
    * Practically no exponential tail.

    === ce2003 ===
    * very few (58); median = 192, min = 0
    * Gaussian hump avg ~ 100, dev ~ ???
    * Practically no exponential tail.

    === dm4p1 ===
    * median = 173, min ~ 0
    * Gaussian hump avg ~ 110, dev ~ ???

    === hs35p1 ===
    * median = 446, min ~ 0
    * Gaussian hump avg ~ ???, dev ~ ???
    * Long exponential tail tau ~ -log(???)/???, mlo ~ ???

    === mm34p1 ===
    * median = 442, min ~ 0
    * No Gaussian hump; avg ~ ???, dev ~ ???
    * Long exponential tail tau ~ -log(???)/???, mlo ~ ???

FITTING STATISTICAL MODELS TO THE OBSERVED DATA

  (To do.)

TO DO

  * This is too much data (we don't even have enough disk). Perhaps
    we can unpack it, convert it to our format, and burn a few DVDs.

  * Must convert the ".trn" format to be compatible with Reese's
    ("{CODON} {AMINO}" pairs, one per line).

  * Should we try to get homologous sequences?

----------------------------------------------------------------------
RENAMING FILES

    foreach f ( ${datasets} )
      echo ${f}
      mv -v pick/${f}/item pick/item/${f}
      mv -v pick/${f} pick/eids/${f}
    end

======================================================================
EXCEPTIONAL THINGS THAT HAD TO BE DONE ONCE
======================================================================

CHANGING COMPRESSOR

    foreach f ( ${datasets} )
      echo " "
      foreach t ( d p h t s )
        echo "${f}/${t}EID"
        ( cd orig/eids/${f} && gunzip ${t}EID.gz && bzip2 ${t}EID )
      end
    end

RENAMING

    foreach f ( ${datasets} )
      echo " "
      mv -v orig/${f}/${f}.EID.sdf     orig/eids/${f}/EID.sdf
      mv -v orig/${f}/${f}.dEID.items  orig/eids/${f}/dEID.items
      mv -v orig/${f}/${f}.dGood       orig/eids/${f}/dEID.good
      mv -v orig/${f}/${f}.good        orig/eids/${f}/EID.good
      mv -v orig/${f}/${f}.hEID.items  orig/eids/${f}/hEID.items
      mv -v orig/${f}/${f}.hGood       orig/eids/${f}/hEID.good
      mv -v orig/${f}/${f}.pEID.items  orig/eids/${f}/pEID.items
      mv -v orig/${f}/${f}.pGood       orig/eids/${f}/pEID.good
    end