#! /usr/bin/gawk -f # Last edited on 2008-06-07 11:14:03 by stolfi BEGIN { usage = ( "split-sets -v nSets={NSETS} -v outName={OUTNAME} {GENESIZES}" ) # Reads from file {GENESIZES} a list of pairs "{SIZE} {NAME}", one per line. # Writes {NSETS} files called "{OUTNAME}-{KK}.gset", where {KK} is a # two-digit set number ranging from 0 to {NSETS-1}. # The number of lines in the input must me a multiple of {NSETS}. # Each set will have exactly the same number of genes and # approximately the same total size. abort = -1; if (nSets == "") { arg_error(("must define \"nSets\"")); } if (outName == "") { arg_error(("must define \"outName\"")); } nGenes = 0; # Total number of genes in input. totSize = 0; # Sum of all gen sizes as gigen in inpu. split("", gName); # {gName[j]} is the name of gene number {j} in input. split("", gSize); # {gSize[j]} is the size of gene {gName[j]}, as give in the input. split("", gSet); # {gSet[j]} is the index of the set where gene {j} has been assigned to. split("", setSize); # {setSize[k]} is the total size of all genes in set {k}. } /^[ ]*($|[\#])/ { # Blank or comment line, ignore: next; } /^[ ]*[0-9]+[ ]+[-A-Z0-9_]+ *($|[\#])/ { # Looks like a valid data line sz = $1; # Gene size na = $2; # Gene name st = (nGenes % nSets); # Set index (temporary) gName[nGenes] = na; gSize[nGenes] = sz; gSet[nGenes] = st; nGenes ++; totSize += sz; setSize[st] += sz; next; } // { # Invalid format: data_error(("bad format")); next; } END { # Ensure that all sets have the same number of genes: if ((nGenes % nSets) != 0) { data_error(("number of genes = " nGenes " is not a multiple of " nSets "")); } # Try to uniformize the total size of each set: maxPasses = 50; passes = 0; do { # Print initial set sizes and variance avgSize = (totSize + 0.0)/nSets; # Ideal size of each set printf "set sizes after %d passes:\n", passes > "/dev/stderr"; sum2 = 0; for (k = 0; k < nSets; k++) { printf " %10d [%02d]\n", setSize[k], k > "/dev/stderr"; dsz = setSize[k] - avgSize; sum2 += dsz*dsz; } varSize = sum2/nSets; devSize = sqrt(varSize); printf "average size %12.1f deviation %10.1f\n", avgSize, devSize > "/dev/stderr"; printf "\n" > "/dev/stderr"; # Look for a swap that reduces the variance: nswaps = 0; for (j1 = 0; j1 < nGenes; j1++) { gsz1 = gSize[j1]; # Size of gene {j1}. for (j2 = j1+1; j2 < nGenes; j2++) { gsz2 = gSize[j2]; # Size of gene {j2}. st1 = gSet[j1]; st2 = gSet[j2]; # Current sets of genes {j1,j2}. ssz1 = setSize[st1]; ssz2 = setSize[st2]; # Current total size of sets {st1,st2}. dsz1 = ssz1 - avgSize; dsz2 = ssz2 - avgSize; # Excess size of sets {st1,st2}. if ((st1 != st2) && (dsz1*dsz2 < 0)) { # Sets are different and have opposite excess sizes. # Compute the excess sizes if the genes were swapped: esz1 = dsz1 - gsz1 + gsz2; esz2 = dsz2 - gsz2 + gsz1; if (dsz1*dsz1 + dsz2*dsz2 > esz1*esz1 + esz2*esz2) { # Swap the two genes, since it improves the variance: printf "swapping [%02d] %-20s <--> [%02d] %-20s\n", \ st1, gName[j1], st2, gName[j2] \ > "/dev/stderr"; gSet[j1] = st2; gSet[j2] = st1; nswaps++; # Adjust the set sizes to account for the swap: setSize[st1] += (gsz2 - gsz1); setSize[st2] += (gsz1 - gsz2); } } } } passes ++; } while ((passes < maxPasses) && (nswaps > 0)); # Output the data sets: split("", fname); # {fname[k]} is the file name for set {k}. for (k = 0; k < nSets; k++) { fname[k] = sprintf("%s-%02d.gset", outName, k); } for (j = 0; j < nGenes; j++) { st = gSet[j]; na = gName[j]; sz = gSize[j]; printf "%s\n", na > fname[st]; } for (k = 0; k < nSets; k++) { close(fname[k]); printf "%10d %s\n", setSize[k], fname[k] > "/dev/stderr"; } } function arg_error(msg) { printf "** %s\n", msg > "/dev/stderr"; printf "usage: %s\n", usage > "/dev/stderr"; abort = 1; exit(1); } function data_error(msg) { printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr"; printf " «%s»\n", $0 > "/dev/stderr"; abort = 1; exit(1); }