#! /usr/bin/gawk -f
# Last edited on 2008-06-16 21:07:11 by stolfi

BEGIN {
  # Error flag
  abort = -1; 
  
  USAGE = ( \
    "extract_RefSeq_segment \\\n" \
    "    -v ini={INI} -v fin={FIN} \\\n" \
    "    -v outName={NAME} \\\n" \
    "  [<] {INFILE}.gbk" \
  );

  # Reads from standard input an NCBI RefSeq genomic cDNA file
  # in the full GenBank flatfile format.  Writes several files with
  # data that is relevant to coding/non-coding detection research.
  # 
  # The input format appears to be similar or identical to the EMBL format
  # described in
  #    
  #    http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
  #      
  # The output consists of four output files, two for each strand, called
  # 
  #   "{outName}-{S}.bas" - the sequence of bases [ATCG] on strand {S}. 
  # 
  #   "{outName}-{S}.lab" - the corresponding functional label sequence.
  #      
  # The strand code is "0" for the primary strand (as in the input file) or "1" 
  # for the complementary strand.
  #
  # The {lab} files contain one label letter for each
  # nucleotide in the segment. The letters record the function of the
  # corresponding nucleotide, as inferred from the "FEATURES" section
  # of the input file. Each label may be:
  #   
  #      'D', 'E', 'F'      The three codon positions in coding regions.
  #      'N'                Base of transcribed but non-coding region.
  #      'O'                Gap in sequence.
  #      'Z'                Unknown function.
  #      'X'                Conflicting functions.
  #       
  # The coding labels [DEF] are assigned only if (1) the nucleotide
  # is spanned by a "CDS" feature, and (2) all "CDS" features that
  # span the base agree on its phase (reading frame alignment).
  # 
  # The label 'N' is assigned iff the base (1) is not spanned by
  # any "CDS" feature, but (2) is spanned by one or more transcription
  # features ("mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA").
  #
  # The letter 'O' is assigned if the base is spanned by one or more
  # "gap" features.
  # 
  # Bases that do not fit any of the above rules are assigned
  # label 'Z' (`unknown').
  # 
  # Bases that would receive two or more distinct labels by the above
  # rules are labeled 'X' (`conflict/bug').
  # 
  # All output files begin with a perfunctory header with the format
  #   
  #   "> {ITEMVERS} /strand={S} /ini={INI} /fin={FIN}"
  #   

  # Argument checking and defaulting:
  if (outName == "") { arg_error(("must define {outName}")); }
  if (ini == "") { arg_error(("must define {ini}")); } else { ini += 0; }
  if (fin == "") { arg_error(("must define {fin}")); } else { fin += 0; }
  
  if (ini < 1) { arg_error(("bad {ini} = " ini "")); }
  if (fin < ini) { arg_error(("bad range {ini..fin} = " ini ".." fin "")); }
  
  # Parsing state:
  #   0 before "FEATURES" line
  #   1 between "FEATURES" and "ORIGIN" lines
  #   2 between "ORIGIN" and "//" lines.
  #   3 after "//" line..
  state = 0;
  
  # One result of parsing the "FEATURES" section is a 
  # collection of (possibly overlapping) /labeling ranges/
  # A labeling range is defined by three attributes
  #   
  #   {rng_ini[k]}   First DNA position in range.
  #   {rng_fin[k]}   Last DNA position in range.
  #   {rng_lab[k]}   Labels to be applied to the range.
  #   {rng_cmp[k]}   Strand (0 for primary, 1 for complementary).
  #   
  # The interpretation of {rng_lab} is defined further on. The
  # positions {rng_ini[k],rng_fin[k]} refer to the primary strand,
  # even when {rng_cmp[k]} is 1.
  
  split("", rng_ini);
  split("", rng_fin);
  split("", rng_lab);
  split("", rng_cmp);
  nrng = 0; # The ranges are indexed by {k} in {0..nrng-1}.
  
  # Another result of parsing the "FEATURES" section is 
  # a collection of /transcribed proteins/. Each transcribed 
  # protein consists of two attributes:
  #   
  #   {prt_ama[j]} The 1-letter codes of the protein's aminoacids.
  #   {prt_rng[j]} The index of the first range of its CDS.
  #   
  # The protein's CDS must consist of one or more disjoint ranges with
  # consecutive indices {k} beginning with {k = prt_rng[j]} and
  # spanning at least {3*m+1} bases in all, where {m =
  # length(prt_ama[j])}. The first range must have type 'D' and
  # the subsequent ones must have types [DEF]. The ranges
  # need not be adjacent in the DNA sequence, and may lie on either
  # strand (but must be all on the same strand). The protein's
  # aminoacids must be compatible with the first {3*m} bases spanned
  # by those ranges, and the three bases following the last codon
  # (completed with 'A' nucleotides if necessary) must be a valid
  # stop codon.
  
  split("", prt_ama);
  split("", prt_rng);
  nprt = 0; # The proteins are indexed by {j} in {0..nprt-1}.
  
  # The parsing of the "FEATURES" section keeps only those ranges that
  # are relevant for the range {ini..fin} of positions to be
  # extracted. Note that a range {rng_ini[k]..rng_fin[k]} that is
  # disjoint with {ini..fin} may still be relevant if it is part of a
  # CDS or other multi-range fature that intersects {ini..fin}.
  
  # The parsing of the "ORIGIN" section results in the nucleotide
  # sequence {bas[ini..fin]} of the primary (cDNA) strand.
  # If the DNA is circular, the sequence is assumed to be
  # infintely replicated.  (We assume that there is no
  # `Möbius strip DNA' --- circular DNA that consists of
  # a single closed strand twice looped, with the first 
  # half paired to the second half in the direct sense.)
  
  split("", bas);  # Base sequence will be {bas[ini..fin]}.

  # The complementary strand is just the reverse-complement of
  # {bas[ini..fin]}.
  
  # DNA encoding table
  #  
  #   Symbol  Meaning
  #   ------  -------
  #
  #   a       a; adenine
  #   c       c; cytosine
  #   g       g; guanine
  #   t       t; thymine in DNA; uracil in RNA
  #
  #   k       g or t
  #   m       a or c
  #   r       a or g
  #   s       c or g
  #   w       a or t
  #   y       c or t
  #
  #   b       c or g or t; not a
  #   d       a or g or t; not c
  #   h       a or c or t; not g
  #   v       a or c or g; not t
  #
  #   n       a or c or g or t
  
  # Base complementation table:
  split("", complement);
  complement["a"] = "t";
  complement["c"] = "g";
  complement["g"] = "c";
  complement["t"] = "a";
                  
  complement["k"] = "m";
  complement["m"] = "k";
  complement["r"] = "y";
  complement["s"] = "s";
  complement["w"] = "w";
  complement["y"] = "r";
                  
  complement["b"] = "v";
  complement["d"] = "h";
  complement["h"] = "d";
  complement["v"] = "b";
                  
  complement["n"] = "n";
  
  # Other attributes:
  locus = "";     # NCBI locus identifier from the "LOCUS" line, or "".
  version = "";   # NCBI locus + version identifier from the "VERSION" line, or "". 
  GenBank_id = ""; # GenBank acession number from the "VERSION" line, or "". 

  source_ok = 0; # TRUE iff "source" entry has been found.
}

(abort >= 0) { exit abort; }

# Ignore blank lines:
/^ *$/ { next; }

/[Ťť]/ { 
  data_error(("funny quotes in file")); 
}

# PARSING THE MAIN HEADER LINES

/^LOCUS[ ]/ {
  if (state != 0) { data_error(("misplaced ŤLOCUSť line")); }
  if (locus != "") { data_error(("duplicate ŤLOCUSť line")); }
  
  # Grab attributes from "LOCUS" line:
  locus = $2;       # Locus identifier
  tot_length = $3;  # Total length of sequence. 
  length_unit = $4; # Unit of {tot_length} (should be "bp").
  seq_type = $5;    # Type of sequence (should be "DNA").
  seq_shape = $6;   # "circular" or "linear".
  locus_kind = $7;  # NCBI classifier: "PRI", "CON", etc.
  date = $8;        # Date of file in {DD}-{MMM}-{YYYY} format.
  
  if (locus !~ /^[A-Z0-9_]+$/) { data_error(("invalid locus Ť" locus "ť")); }
  if (tot_length !~ /^[0-9]+$/) { data_error(("invalid length Ť" tot_length "ť")); }
  tot_length = tot_length + 0;
  if (length_unit != "bp") 
    { data_error(("expected size unit Ťbpť, found Ť" length_unit "ť")); }
  if (seq_type != "DNA")
    { data_error(("expected sequence type ŤDNAť, found Ť" seq_type "ť")); }
  if (seq_shape == "linear") 
    { circular = 0; }
  else if (seq_shape == "circular") 
    { circular = 1; }
  else 
    { data_error(("invalid sequence shape Ť" date "ť")); }
  if (date !~ /^[0-9]+[-][A-Z][A-Z][A-Z]-[12][09][0-9][0-9]$/) 
    { data_error(("invalid date format Ť" date "ť")); }
  next;
}

/^VERSION[ ]/ {
  if (state != 0) { data_error(("misplaced ŤVERSIONť line")); }
  if (version != "") { data_error(("duplicate ŤVERSIONť line")); }
  version = $2;
  GenBank_id = $3;
  next;
}

/^FEATURES[ ]/ {
  if (state != 0) { data_error(("misplaced ŤFEATURESť line")); }
  state = 1;
  
  # Print global header data to stderr:
  if (locus == "") { data_error(("missing ŤLOCUSť line")); }
  if (version == "") { data_error(("missing ŤVERSIONť line")); }
  printf "locus = %s  version = %s  date = %s\n", locus, version, date > "/dev/stderr";
  printf "  tot_length = %d\n", tot_length > "/dev/stderr";
  printf "  seq_type = %s  circular = %d \n", seq_type, circular > "/dev/stderr";

  # Prepare to parse the features: 
  key = "";  # Keyword of current feature.
  arg = "";  # Argument string of current feature.
  next;
}

/^ORIGIN[ ]/ {
  if (state != 1) { data_error(("misplaced ŤORIGINť line")); }
  state = 2;
  
  # Finish processing the last feature, if any: 
  if (key != "") { process_feature(key, arg); }
  
  # Check for required features: 
  if (! source_ok) { data_error(("missing Ťsourceť entry")); }
  
  # Prepare to parse the DNA data:
  bas_next = seq_ini; # Position of next base that we expect on input.
  for (i = ini; i <= fin; i++) { bas[i] = "?"; }
  next;
}

/^[\/][\/]/ {
  if (state != 2) { data_error(("misplaced Ť//ť line")); }
  if (bas_next != seq_fin+1) 
    { data_error(("position of last base = " bas_next-1 " should be " seq_fin "")); }
  state = 3;
  next;
}

/^[A-Z]+([ ]|$)/ {
  # Any other header line: 
  if (state != 0) { data_error(("misplaced Ť" $1 "ť header line")); }
  # Ignore extra headers in state 0:
  next;
}

# PARSING THE NON-HEADER LINES

(state == 0) { 
   # Non-header line before the "FEATURES" lines:
  next;
}

(state == 1) {
  # Non-header line between "FEATURES" and "ORIGIN" lines:
  
  # Get keyword in columns 1--20:
  new_key = substr($0, 1, 20);
  gsub(/[ ]/, "", new_key);
  
  # Get argument in columns 21--end:
  new_arg = substr($0, 21); 
  gsub(/^[ ]+/, "", new_arg);
  gsub(/[ ]+$/, "", new_arg);
  if (new_key == "")
    { # Continuation line: 
      if (key == "") { data_error(("missing start of feature")); } 
      arg = (arg " " new_arg);
    }
  else 
    { # Start of a new feature:
      if (key != "") { process_feature(key, arg); }
      key = new_key;
      arg = new_arg;
    }
  next;
}

(state == 2) {
  # Non-header line between "ORIGIN" and "//" lines -- must be nucleotide chunk:
  
  # Parse the position {str_ini} of the first base of the chunk:
  if ($0 !~ /^[ ]*[0-9]+[ ]+[a-z ]+[ ]*$/) { data_error(("bad DNA line Ť" $0 "ť")); }
  str_ini = $1;  
  if (str_ini != bas_next) { data_error(("sequence error Ť" str_ini "ť should be " bas_next "")); }
  
  # Get the nucleotide chunk {str} without any spaces:
  str = $0;
  gsub(/^[ ]*[0-9]+[ ]*/, "", str);
  gsub(/[ ]/, "", str);
  
  # Store it as appropriate:
  process_bases_chunk(str_ini,str);
  bas_next += length(str);
  next;
}

(state == 3) {
  # Non-header line after "//" line:
  data_error(("unexpected line after Ť//ť")); 
  next;
}

END {
  if (abort >= 0) { exit abort; }
  
  # Check whether all necessary information has been parsed:
  if (state != 3) { data_error(("missing Ť//ť line")); }
  
  output_data();
  
}

# DATA PROCESSING FUNCTIONS

function process_feature(key,arg,    loc,qua)
{
  # Processes a feature from the "FEATURES" section,
  # with the given {key} and arguments {arg}.
  
  # The interesting feature keys are:
  # 
  #        at       ce       dr       dm  KEY
  #   -------  -------  -------  -------  ---
  #         7        7       26        7  source
  #   -------  -------  -------  -------  ---
  #         .        .     2458        8  gap
  #   -------  -------  -------  -------  ---
  #     32615    23208    24225    20221  mRNA
  #        14        2        2        2  rRNA
  #        84      204        .      296  ncRNA
  #       577      228      614      339  misc_RNA
  #       689      630       22      314  tRNA
  #         .        .        .        .  tmRNA
  #   -------  -------  -------  -------  ---
  #     32817    23221    24220    20234  CDS
  #   -------  -------  -------  -------  ---
  # 
  # Other uninteresting keys that appear in the NCBI RefSeq files:
  # 
  #        at       ce       dr       dm  KEY
  #   -------  -------  -------  -------  ---
  #        14        .        .        .  promoter
  #     33263    21052    25205    14690  gene
  #       716    16980       12      975  STS
  #      5228        2        .        7  misc_feature
  #        32        .     3482        .  exon
  #        21        .        .        .  intron
  #         .        .        9        .  C_region
  #         .        .        1        .  D-loop
  #         .        .        2        .  V_segment
  #         .        .        1        1  rep_origin
  #         3        .        .     5395  repeat_region
  #         3        .        .        .  repeat_unit
  # 
  
  # Case is not significant for keys, so:
  key = tolower(key);
  
  # Split the argument {arg} into a location {loc}
  # and a qualifier string {qua}:
  
  if (match(arg, /[\/]/))
    { loc = substr(arg,1,RSTART-1); qua = substr(arg,RSTART); }
  else
    { loc = arg; qua = ""; }
    
  # General cleanup of location: 
  loc = tolower(loc);
  gsub(/[ ]/, "", loc);
  
  printf "  key = Ť%sť  loc = Ť%sť\n", key, loc > "/dev/stderr";
  
  # Dispatch on {key}:
  if (key == "source")
    { if (source_ok) { data_error(("more than one Ťsourceť feature")); }
      source_ok = 1;
      process_source(key, loc, qua);
    }
  else
    { # The "source" feature must come first:
      if (! source_ok) { data_error(("missing Ťsourceť feature")); } 
      if (key == "gap")
        { process_gap(key, loc, qua); }
      else if (key ~ /rna$/)
        { process_rna(key, loc, qua); }
      else if (key == "cds")
        { process_cds(key, loc, qua); }
      else if (key ~ /^(gene|sts|promoter|misc_feature|exon|intron|repeat_region|repeat_unit)$/)
        { process_useless_feature(key, loc, qua) }
      else 
        { data_warning(("ignored feature key = Ť" key "ť")); }
    }
}

function process_source(key,loc,qua,   i)
{
  # Processes the "source" feature.
  # Defines the variables {seq_ini} and {seq_fin}.
   
  if (loc !~ /^[0-9]+[.][.][0-9]+$/) { data_error(("funny source location = Ť" loc "ť")); }
  seq_ini = get_range_ini(loc);
  seq_fin = get_range_fin(loc);
  if (seq_ini != 1) { data_error(("source range Ť" loc "ť does not start at 1")); }
  if (seq_fin < seq_ini) { data_error(("source range Ť" loc "ť is empty")); }
  
  # Check whether requested segment is valid: 
  if (! circular)
    { if ((ini < seq_ini) || (fin >= seq_fin))
        { data_error(("requested segment Ť" ini ".." fin "ť falls outside source range Ť" loc "ť")); }
    }
  else
    { if ((fin - ini + 1) > (seq_fin - seq_ini + 1))
        { data_error(("requested segment Ť" ini ".." fin "ť is longer than source Ť" loc "ť")); }
      if ((ini < seq_ini) || (ini > seq_fin))
        { data_error(("requested segment Ť" ini ".." fin "ť should start in source range Ť" loc "ť")); }
    }
}

function process_gap(key,loc,qua,   i)
{
  # Processes the "gap" feature.
  if (loc !~ /^[0-9]+[.][.][0-9]+$/) { data_error(("funny Ťgapť feature")); }
  # A gap applies to both strands: 
  label_range(loc, 0, "O");
  label_range(loc, 1, "O");
}

function process_rna(key,loc,qua,  k,ini,fin,i,note)
{
  # Processes the "*RNA" (transcribed region) features.
  show_feature_note(qua);
  # Just label all bases with 'N':
  label_location(loc, "N");
}

function process_cds(key,loc,qua,  frame,labels,k,ini,fin,i)
{
  # Processes the "CDS" (coding region) feature.
  
  # Essential qualifiers: 
  #   
  #   /codon_start={1|2|3}
  #  
  # Qualifiers relevant for consistency checking: 
  #  
  #  /translation="<aaletter_string>"
  #  /transl_table=<number>
  #  /codon=(seq:"codon-sequence",aa:<amino_acid>)
  #  /exception="text"
  #  /transl_except=(pos:<location>,aa:<amino_acid>)
  #     transl_except=(pos:213..215,aa:Trp)
  #     transl_except=(pos:213..214,aa:TERM) #-- incomplete stop codon
  
  show_feature_note(qua);
  
  # Get the starting offset from the "/codon_start" feature:
  frame = get_numeric_qualifier(qua, "/codon_start", 1);
  labels = substr("DEFDEF", 5 - frame, 3);
  if (frame != 1) { data_warning(("codon start = " frame "")); }
  label_location(loc, labels);

  # !!! Should check the translation
}

function process_useless_feature(key,loc,qua   )
{
  # Processes a common but useless feature.
  # Just ignore it:
  # Done. (That was easy...)
}

function show_feature_note(qua,   note)
{
  # Displays any "/note" qualifier in {qua}, if it looks interesting.
  note = get_text_qualifier("/note",qua,"");
  if (note != "") { printf "    /note=Ť%sť\n", note > "/dev/stderr"; }
}

function label_location(loc,labels,   nrt,rt,j,k,rng,len,strand)
{
  # Appends to the labeling range list one or more entries that
  # label the location {loc} with repeated copies of the string
  # {labels}. 
  
  # The location {loc} specifies a sequence of positions on a specific
  # strand.  For this function, {loc} must be either 
  #
  #   * a single position "{POS}",
  #   * a range of position "{INI}..{FIN}" with {INI <= FIN},
  #   * "complement({LOC})", or
  #   * "join({LOC[1]},{LOC[2]},..{LOC[n]})"
  #
  # where each {LOC} or {LOC[i]} is a location.  However, 
  # cannot be two nested calls to the same operator ("complement" or "join").
  # So the formula has maximum depth 2.  Moreover one cannot join 
  # positions belonging to opposite strands.

  # The "join" operator appends the sequences described by its operands
  # in the order they appear in the argument list. The "complement"
  # perator flips each position to the other strand and reverses the order
  # of the sequence.  Thus, "complement(join({A},{B}))"
  # is equivalent to "join(complement({B}),complement({A}))".
  
  # The procedure applies letters 1,2,... of {labels} (cyclically)
  # to the positions specified by {loc}, in the order specified by {loc}.
  # Thus, for example, the call
  #   
  #   {label_location("complement(join(50..60,30..40))","DEF")}
  #   
  # will start labeling "complement(40)" with 'D', "complement(39)"
  # with 'E', and so on.
  
  # The procedure fails with error if {loc} is malformed or 
  # names any position outside the "source" range.

  split("", rt); # Temp array for arguments of "join".
  if (loc ~ /^complement[(].*[)]$/)
    { # Strip the "complement" operator: 
      loc = substr(loc,12,length(loc)-12);
      # Extract the ranges:
      nrt = split_join_of_ranges(loc,rt);
      # Label them in reverse order:
      for (j = nrt; j > 0; j--)
        { rng = cleanup_range(rt[j]);
          len = label_range(rng, 1, labels);
          # Cyclically shift the labels:
          labels = cycle(labels, len);
        }
    }
  else if (loc ~ /^join[(].*[)]$/) 
    { # Get the joined ranges (or complements thereof): 
      nrt = split_join_of_ranges(loc, rt);
      # Label them in increasing order:
      for (j = 1; j <= nrt; j++)
        { rng = rt[j];
          if (rng ~ /^complement[(].*[)]$/) 
            { # strip the "complement" operator:
              rng = substr(rng,12,length(rng)-12);
              strand = 1;
            }
          else
            { strand = 0; }
          rng = cleanup_range(rng);
          len = label_range(rng, strand, labels);
          # Cyclically shift the labels:
          labels = cycle(labels, len);
        }
    }
  else
    { # Location should be a simple range or position:
      rng = cleanup_range(loc);
      label_range(rng, 0, labels);
    }
}

function cleanup_range(rng   )
{
  # Checks whether {rng} is a well-formed range or single position.
  # If it contains '<' or '>' constructs, prints a warning and
  # removes them. Returns the cleaned-up range.
  # printf "    rng = Ť%sť\n", rng > "/dev/stderr";
  if (rng ~ /[<>]/)
    { data_warning(("indeterminate range Ť" rng "ť ignoring the [<>]")); 
      gsub(/[<>]/, "", rng);
    }
  if (rng !~ /^[0-9]+([.][.][0-9]+|)$/) 
    { data_error(("expected position or range, found Ť" rng "ť")); }
  return rng;
}

function split_join_of_ranges(loc,rt,  nrt)
{
  # Given a location {loc} that is a join of locations, splits
  # its argument list.  Puts the arguments in {rt[1..]} 
  # and returns the number of arguments.  If {loc} is not
  # a "join(...)" construct, sets {rt[1]=loc} and returns 1.
  # The parameter {rt} must be an array.
  
  if (loc ~ /^join[(].*[)]$/)
    { # Remove the "join(...)" wrapper:
      loc = substr(loc, 6, length(loc)-6); 
      # Break at commas:
      nrt = split(loc, rt, ",");
      return nrt; 
    }
  else
    { # Not a "join(...)":
      rt[1] = loc;
      return 1;
    }
}

function label_range(rng,strand,labels,   rini,rfin)
{
  # Like {label_location}, specialized for a single range
  # on the indicated {strand}.  Returns the number of positions
  # labeled.
  
  # Assumes that {rng} is either a position or a
  # simple interval "{INI}..{POS}". Fails if the range is empty.

  rini = get_range_ini(rng);
  rfin = get_range_fin(rng);
  if (rini > rfin) 
    { data_error(("empty range = Ť" rng "ť")); }
  if ((rini < seq_ini) || ((! circular) && (rfin > seq_fin)))
    { data_error(("range Ť" rng "ť falls outside the source sequence")); }
  printf "    @ lab[%d,%s] = %s", strand,rng,labels > "/dev/stderr";
  if (overlaps_segment(rini,rfin))
    { # Range overlaps the region of interest:
      printf " (saved)" > "/dev/stderr";
      rng_ini[nrng] = rini;
      rng_fin[nrng] = rfin;
      rng_lab[nrng] = labels;
      rng_cmp[nrng] = strand;
      nrng++;
    }
  printf "\n" > "/dev/stderr";
  return (rfin - rini + 1);
}

function overlaps_segment(rini,rfin,   rnb,seq_nb)
{
  # Returns 1 iff the ranges {rini..rfin} and {ini..fin} overlap, 
  # taking {circular,seq_ini,seq_fin} into account.
  # Assumes that {1 <= rini <= rfin} and {1 <= ini <= fin}.
  if ((rini <= fin) && (rfin >= ini)) { return 1; }
  if (circular)
    { # May overlap by circularity.
      # Get number of bases in source:
      seq_nb = seq_fin - seq_ini + 1;
      # Reduce {rini,rfin} mod {1..seq_nb}:
      while ((rini > fin) && (rini > seq_nb)) { rini -= seq_nb; rfin -= seq_nb; }
      if ((rini <= fin) && (rfin >= ini)) { return 1; }
      if ((rini+seq_nb <= fin) && (rfin+seq_nb >= ini)) { return 1; }
      if ((rini <= fin+seq_nb) && (rfin >= ini+seq_nb)) { return 1; }
    }
  else
    { return 0; }
}

function inside_segment(i,   rnb,seq_nb)
{
  # Returns 1 iff the ranges {rini..rfin} and {ini..fin} overlap, 
  # taking {circular,seq_ini,seq_fin} into account.
  # Assumes that {1 <= rini <= rfin} and {1 <= ini <= fin}.
  if ((i >= ini) && (i <= fin)) { return 1; }
  if (circular)
    { # May overlap by circularity.
      # Get number of bases in source:
      seq_nb = seq_fin - seq_ini + 1;
      # Reduce {i} mod {1..seq_nb}:
      while ((i > fin) && (i > seq_nb)) { i -= seq_nb; }
      if ((i >= ini) && (i <= fin)) { return 1; }
      if ((i+seq_nb >= ini) && (i+seq_nb <= fin)) { return 1; }
      if ((i >= ini+seq_nb) && (i <= fin+seq_nb)) { return 1; }
    }
  else
    { return 0; }
}

function process_translation(arg,   i)
{
  # Processes the "/translation" (aminoacid sequence) entry.
  # Sets {nama} to its length and stores the base letters into {ama[1..nama]}.

  # Abbreviation    Amino acid name
  # ------------    ---------------
  # 
  # Ala     A       Alanine
  # Arg     R       Arginine
  # Asn     N       Asparagine
  # Asp     D       Aspartic acid (Aspartate)
  # Cys     C       Cysteine
  # Gln     Q       Glutamine
  # Glu     E       Glutamic acid (Glutamate)
  # Gly     G       Glycine
  # His     H       Histidine
  # Ile     I       Isoleucine
  # Leu     L       Leucine
  # Lys     K       Lysine
  # Met     M       Methionine
  # Phe     F       Phenylalanine
  # Pro     P       Proline
  # Pyl     O       Pyrrolysine
  # Ser     S       Serine
  # Sec     U       Selenocysteine
  # Thr     T       Threonine
  # Trp     W       Tryptophan
  # Tyr     Y       Tyrosine
  # Val     V       Valine
  # Asx     B       Aspartic acid or Asparagine
  # Glx     Z       Glutamine or Glutamic acid.
  # Xaa     X       Any amino acid.
  # Xle     J       Leucine or Isoleucine
  # TERM            termination codon
  # 
  data_error(("function {process_translation} not implemented yet")); 
}

function get_range_ini(r)
{
  if (r !~ /^[0-9]+([.][.][0-9]+|)$/) { data_error(("bad range format Ť" r "ť")); }
  gsub(/[.][.][0-9]+$/, "", r);
  r += 0;
  if (r < 1) { data_error(("invalid position Ť" r "ť")); }
  return r;
}

function get_range_fin(r)
{
  if (r !~ /^[0-9]+([.][.][0-9]+|)$/) { data_error(("bad range format Ť" r "ť")); }
  gsub(/^[0-9]+[.][.]/, "", r);
  r += 0;
  if (r < 1) { data_error(("invalid position Ť" r "ť")); }
  return r;
}

function get_numeric_qualifier(name,qua,def,  i)
{
  # Extracts from the string {qua} the value of the numeric-valued qualifier
  # named {name}.  The {name} should include the "/" but not the "=".
  # If there is no such qaulifier in {qua}, returns the {def}
  # value.  Assumes that {name} does not occur inside any text value.
  i = index(qua,name);
  if (i > 0)
    { qua = substr(qua,i + length(name));
      if (! match(qua, /^[ ]*[=][ ]*/)) { data_error(("missing Ť=ť after Ť" name "ť")); }
      qua = substr(qua, RSTART+RLENGTH);
      if (! match(qua, /^[0-9]+/)) { data_error(("missing number after Ť" name "=ť")); }
      return substr(qua,1,RLENGTH) + 0;
    }
  else
    { return def; }
}

function get_text_qualifier(name,qua,def,  i,res,sep)
{
  # Extracts from the string {qua} the value of the text-valued qualifier
  # named {name}.  The {name} should include the "/" but not the "=".
  # The value should be surrounded by double quotes, with any embedded
  # quotes duplicated. The extra quotes are removed from the result.
  # If there is no such qaulifier in {qua}, returns the {def}
  # value.  Assumes that {name} does not occur inside any text value.
  i = index(qua,name);
  if (i > 0)
    { qua = substr(qua,i + length(name));
      if (! match(qua, /^[ ]*[=][ ]*/)) { data_error(("missing Ť=ť after Ť" name "ť")); }
      qua = substr(qua, RSTART+RLENGTH);
      if (! (qua ~ /^["]/))  { data_error(("missing Ť\"ť after Ť" name "=ť")); }
      res = ""; sep = "";
      while (match(qua, /^["][^"]*["]/))
        { if (! match(qua, /^["][^"]*["]/)) { data_error(("malformed value of Ť" name "ť qualifier")); }
          res = ( res sep substr(qua,2,RLENGTH-2) );
          qua = substr(qua, RSTART+RLENGTH);
          if (match(qua, /^[ ]+/)) { qua = substr(qua, RSTART+RLENGTH); }
          sep = "\"";
        }
      return res;
    }
  else
    { return def; }
}

function cycle(str,k,    n)
{
  # Shifts the string {str} cyclically to the left by {k} characters.
  n = length(str);
  k = k % n;
  if (k == 0) 
    { return str; }
  else
    { return (substr(str, 1+k) substr(str,1,k)); }
}
  

function process_bases_chunk(str_ini,str,   n,str_fin,i,ch)
{
  # Processes a nucleotide string {str}, assumed to start 
  # at position {str_ini} of the whole sequence and contain 
  # just letters, without any blanks.
  
  # Stores the chunk whose positions {i} fall in the range {ini..fin}
  # into {bas[i]}.
  
  n = length(str);
  str_fin = str_ini + n - 1;
  if (overlaps_segment(str_ini,str_fin))
    { # Line overlaps the desired range:
      for (i = str_ini; i <= str_fin; i++)
        { if (inside_segment(i))
            { ch = tolower(substr(str, i - str_ini + 1, 1));
              if (ch !~ /^[abcdghkmnrstvwy]$/)
                { data_error(("invalid base letter Ť" ch "ť")); }
              bas[i] = ch;
            }
        }
    }
}

function output_data(    strand,i,j,nb)
{
  # Called at end of parsing to process all 
  # saved data and write the output files.

  setup_label_sequence();
  
  for (strand = 0; strand <= 1; strand++)
    { 
      # Choose the output file names:
      bas_file = ( outName "-" strand ".bas" );
      lab_file = ( outName "-" strand ".lab" );

      # Write minimal header lines:
      printf "> %s /strand=%d /ini=%d /fin=%d\n", locus, strand, ini, fin > bas_file;
      printf "> %s /strand=%d /ini=%d /fin=%d\n", locus, strand, ini, fin > lab_file;

      # Write the base and label sequences: 
      nb = fin - ini + 1;
      for (j = 0; j < nb; j++)
        { i = (strand ? fin - j : ini + j);
          if ((j > 0) && (j % 60 == 0)) 
            { printf "\n" > bas_file; printf "\n" > lab_file; }
          printf "%c", (strand ? complement[bas[i]] : bas[i]) > bas_file;
          printf "%c", lab[strand,i] > lab_file;
        }
      if (nb > 0) 
        { printf "\n" > bas_file;  printf "\n" > lab_file; }

      close(bas_file);
      close(lab_file);
    }
  printf "\n", locus, version, date > "/dev/stderr";
}

function setup_label_sequence(     k,i,j,rini,rfin,rlab,rcmp,m,ch,ex,nbug)
{
  # Fills the array {lab[strand,ini..fin]} with labels,
  # based on the saved labeling ranges. Namely, 
  # {rlab[0,i]} is the label for position {i} on strand 0,
  # and {rlab[1,i]} is the label for the base on strand 1
  # that gets paired to it.

  # printf "  saved labeling ranges:\n" > "/dev/stderr";
  # for (k = 0; k < nrng; k++)
  #   { rlab = rng_lab[k];
  #     rini = rng_ini[k];
  #     rfin = rng_fin[k];
  #     rcmp = rng_cmp[k];
  #     printf "    lab[%d,%d..%d] = %s\n", rcmp, rini, rfin, rlab > "/dev/stderr";
  #   }
      
  # First pass: clear all labels.
  split("", lab);
  for (rcmp = 0; rcmp <= 1; rcmp++)
    { for (i = ini; i <= fin; i++) { lab[rcmp,i] = "Z"; } }
  
  # Second pass pass: label all gaps ('O') and transcribed regions ('N'):
  for (k = 0; k < nrng; k++)
    { rlab = rng_lab[k];
      if ((rlab == "O") || (rlab == "N"))
        { # Paint all positions: 
          rini = rng_ini[k];
          rfin = rng_fin[k];
          rcmp = rng_cmp[k];
          ch = rlab;
          printf "    lab[%d,%d..%d] = %s\n", rcmp, rini, rfin, rlab > "/dev/stderr";
          for (i = rini; i <= rfin; i++)
            { if (inside_segment(i))
                { ex = lab[rcmp,i];
                  if ((ex != "Z") && (ex != ch))
                    { data_warning(("conflicting labels at position [" rcmp "," i "]: Ť" ex "ť, Ť" ch "ť"));
                      lab[rcmp,i] = "X";
                    }
                  else
                    { lab[rcmp,i] = ch; }
                }
            }
          # Kill the range: 
          rng_lab[k] = "";
        }
    }
    
  # Third pass: paint all CDS features: 
  for (k = 0; k < nrng; k++)
    { rlab = rng_lab[k];
      if (rlab ~ /[DEF]/)
        { # Paint all positions, in order: 
          rini = rng_ini[k];
          rfin = rng_fin[k];
          rcmp = rng_cmp[k];
          printf "    lab[%d,%d..%d] = %s\n", rcmp, rini, rfin, rlab > "/dev/stderr";
          m = length(rlab);  # Number of label letters to cycle over.
          j = 1; # Index of next label letter into {rlab}.
          # Scan the positions covered by the range in the appropriate order: 
          nbug = 0; # Number of coding positions outside transcribed regions.
          for (i = (rcmp ? rfin : rini); (rcmp ? i >= rini : i <= rfin); i += (rcmp ? -1 : +1))
            { if (inside_segment(i))
                { ch = substr(rlab, j, 1);
                  ex = lab[rcmp,i];
                  if ((ex != "Z") && (ex != "N") && (ex != ch))
                    { data_warning(("conflicting labels at position [" rcmp "," i "]: Ť" ex "ť, Ť" ch "ť"));
                      lab[rcmp,i] = "X";
                    }
                  else 
                    { # Coding regions should be nested in transcribed regions:
                      if ((nbug == 0) && (ex != "N") && (ex != ch))
                        { data_warning(("position [" rcmp "," i "] is coding but not transcribed"));
                          nbug++;
                        }
                      lab[rcmp,i] = ch;
                    }
                }
              j++; if (j > m) { j = 1; }
            }
          if (nbug > 1)
            { data_warning(("total " nbug " untranscribed coding positions in [" rcmp "," rini ".." rfin "]")); }
          # Kill the range: 
          rng_lab[k] = "";
        }
    }
    
  # Fourth pass: check for funny ranges.
  for (k = 0; k < nrng; k++)
    { rlab = rng_lab[k];
      if (rlab != "")
        { data_error(("leftover range labeling Ť" rlab "ť")); }
    }
}

# AUXILIARY PROCEDURES

function show_codon(title,seq,pos,  j,jj)
{
  # Prints to {stderr} the three letters {seq[pos..pos+2]},
  # with some context, clipped to the range {seq_ini..seq_fin}.
  printf "  %s = ", title > "/dev/stderr";
  for (j = pos - 5; j <= pos + 7; j++)
    { if ((j >= seq_ini) && (circular || (j <= seq_fin)))
        { jj = (circular ? seq_ini + (j - seq_ini) % (seq_fin - seq_ini + 1) : j);
          if (j == pos) { printf "[" > "/dev/stderr"; }
          printf "%s", seq[jj] > "/dev/stderr";
          if (j == pos + 2) { printf "]" > "/dev/stderr"; }
        }
    }
  printf "\n" > "/dev/stderr";
}

function data_error(msg)
{
  printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "  Ť%sť\n", $0 > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_warning(msg)
{
  printf "%s:%s: warning: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "  Ť%sť\n", $0 > "/dev/stderr";
}

function arg_error(msg)
{
  printf "** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", USAGE > "/dev/stderr";
  abort = 1;
  exit abort;
}