#! /usr/bin/gawk -f
# Last edited on 2013-10-14 20:11:18 by stolfilocal

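# Expects input lines that are nucleotide sequences (letters A, T, C, G, U,
# possibly with blanks).  Applies a small random mutation at the beginning
# of each such line: a short prefix is replaced by a short random string of
# bases, and blanks are removed.  Other lines (headers, comments) are
# copied to the output unchanged.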

/^ *[ATCGU ]+ *$/ {
  # Sequence line: strip blanks, then replace the first {p} bases by
  # {q} random bases, and print the result.  Both {p} and {q} are in 1..3.
  #
  # NOTE: rand() is not seeded in this rule.  Unless srand() is called
  # elsewhere, gawk generates the same random stream on every run, so
  # the mutations are reproducible from run to run.
  lin = $0;
  gsub(/[ ]/, "", lin);
  n = length(lin);
  if (n == 0)
    { # The pattern also matches lines consisting only of blanks.
      # Those are not sequences: copy them unchanged instead of
      # turning them into random bases.
      print;
      next;
    }
  # Choose length {p} of subseq to replace.  Taking the minimum of
  # three uniform draws from 1..3 favors short lengths
  # (probabilities 19/27, 7/27, 1/27 for 1, 2, 3).
  p1 = int(rand()*3) + 1;
  p2 = int(rand()*3) + 1;
  p3 = int(rand()*3) + 1;
  p = (p1 < p2 ? p1 : p2);
  p = (p < p3 ? p : p3);
  # Never remove more bases than the sequence has:
  p = (p > n ? n : p);
  # Choose the replacement length {q}, with the same distribution:
  q1 = int(rand()*3) + 1;
  q2 = int(rand()*3) + 1;
  q3 = int(rand()*3) + 1;
  q = (q1 < q2 ? q1 : q2);
  q = (q < q3 ? q : q3);
  # Choose the alphabet: an RNA sequence (has U, has no T) must get U,
  # not T, in the replacement.  Anything else uses the DNA alphabet.
  bases = ((lin ~ /U/) && (lin !~ /T/) ? "AUCG" : "ATCG");
  # Create the replacement string:
  rep = "";
  for (i = 0; i < q; i++)
    { j = int(rand()*4) + 1;
      b = substr(bases, j, 1);
      rep = (rep b);
    }
  # Replace: drop the first {p} bases and prepend {rep}.
  out = (rep substr(lin, p+1));
  print out;
  next;
}

# Catch-all rule: any record that reaches this point (possibly a header
# or a comment) is copied to the output as it is.
{
  print $0;
}