#! /usr/bin/gawk -f
# Last edited on 2008-06-07 11:14:03 by stolfi

BEGIN {
  usage = ( "split-sets -v nSets={NSETS} -v outName={OUTNAME} {GENESIZES}" )
  # Reads from file {GENESIZES} a list of pairs "{SIZE} {NAME}", one per line.
  # Writes {NSETS} files called "{OUTNAME}-{KK}.gset", where {KK} is a 
  # two-digit set number ranging from 0 to {NSETS-1}.
  
  # The number of lines in the input must me a multiple of {NSETS}.
  # Each set will have exactly the same number of genes and 
  # approximately the same total size.
  
  abort = -1;
  if (nSets == "") { arg_error(("must define \"nSets\"")); }
  if (outName == "") { arg_error(("must define \"outName\"")); }
  
  nGenes = 0;   # Total number of genes in input.
  totSize = 0;  # Sum of all gen sizes as gigen in inpu.
  
  split("", gName); # {gName[j]} is the name of gene number {j} in input.
  split("", gSize); # {gSize[j]} is the size of gene {gName[j]}, as give in the input. 
  split("", gSet);  # {gSet[j]} is the index of the set where gene {j} has been assigned to.
  
  split("", setSize);   # {setSize[k]} is the total size of all genes in set {k}. 
}

/^[ ]*($|[\#])/ { 
  # Blank or comment line, ignore: 
  next;
}

/^[ ]*[0-9]+[ ]+[-A-Z0-9_]+ *($|[\#])/ {
  # Looks like a valid data line
  sz = $1; # Gene size
  na = $2; # Gene name
  st = (nGenes % nSets); # Set index (temporary)
  gName[nGenes] = na;
  gSize[nGenes] = sz;
  gSet[nGenes] = st;
  nGenes ++;
  totSize += sz;
  setSize[st] += sz;
  next;
}

// {
  # Invalid format:
  data_error(("bad format")); 
  next;
}

END {
  
  # Ensure that all sets have the same number of genes:
  if ((nGenes % nSets) != 0)
    { data_error(("number of genes = " nGenes " is not a multiple of " nSets "")); }
  
  # Try to uniformize the total size of each set:
  maxPasses = 50;
  passes = 0;
  do
    { 
      # Print initial set sizes and variance
      avgSize = (totSize + 0.0)/nSets; # Ideal size of each set
      printf "set sizes after %d passes:\n", passes > "/dev/stderr";
      sum2 = 0;
      for (k = 0; k < nSets; k++)
        { printf "  %10d [%02d]\n", setSize[k], k > "/dev/stderr";
          dsz = setSize[k] - avgSize;
          sum2 += dsz*dsz;
        }
      varSize = sum2/nSets;
      devSize = sqrt(varSize);
      printf "average size %12.1f  deviation %10.1f\n", avgSize, devSize > "/dev/stderr";
      printf "\n" > "/dev/stderr";

      # Look for a swap that reduces the variance:
      nswaps = 0;
      for (j1 = 0; j1 < nGenes; j1++)
        { gsz1 = gSize[j1];  # Size of gene {j1}.
          for (j2 = j1+1; j2 < nGenes; j2++)
            { gsz2 = gSize[j2];  # Size of gene {j2}.
              st1 = gSet[j1];          st2 = gSet[j2];      # Current sets of genes {j1,j2}.
              ssz1 = setSize[st1];     ssz2 = setSize[st2]; # Current total size of sets {st1,st2}.
              dsz1 = ssz1 - avgSize; dsz2 = ssz2 - avgSize; # Excess size of sets {st1,st2}.
              if ((st1 != st2) && (dsz1*dsz2 < 0))
                { # Sets are different and have opposite excess sizes.
                  # Compute the excess sizes if the genes were swapped:
                  esz1 = dsz1 - gsz1 + gsz2;
                  esz2 = dsz2 - gsz2 + gsz1;
                  if (dsz1*dsz1 + dsz2*dsz2 > esz1*esz1 + esz2*esz2)
                    { # Swap the two genes, since it improves the variance:
                      printf "swapping [%02d] %-20s <--> [%02d] %-20s\n", \
                        st1, gName[j1], st2, gName[j2] \
                        > "/dev/stderr";
                      gSet[j1] = st2; 
                      gSet[j2] = st1;
                      nswaps++;
                      # Adjust the set sizes to account for the swap:
                      setSize[st1] += (gsz2 - gsz1);
                      setSize[st2] += (gsz1 - gsz2);
                    }
                }
            }
        }
      passes ++;
    }
  while ((passes < maxPasses) && (nswaps > 0));
    
  # Output the data sets:
  split("", fname);  # {fname[k]} is the file name for set {k}.
  for (k = 0; k < nSets; k++) { fname[k] = sprintf("%s-%02d.gset", outName, k); }
  for (j = 0; j < nGenes; j++)
    { st = gSet[j];
      na = gName[j]; 
      sz = gSize[j];
      printf "%s\n", na > fname[st];
    }
  for (k = 0; k < nSets; k++) 
    { close(fname[k]);
      printf "%10d %s\n", setSize[k], fname[k] > "/dev/stderr";
    }
}    
 
function arg_error(msg)
{
  printf "** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit(1);
}

function data_error(msg)
{
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "  Ť%sť\n", $0 > "/dev/stderr";
  abort = 1;
  exit(1);
}