#! /usr/bin/gawk -f # Last edited on 2008-06-16 21:07:11 by stolfi BEGIN { # Error flag abort = -1; USAGE = ( \ "extract_RefSeq_segment \\\n" \ " -v ini={INI} -v fin={FIN} \\\n" \ " -v outName={NAME} \\\n" \ " [<] {INFILE}.gbk" \ ); # Reads from standard input an NCBI RefSeq genomic cDNA file # in the full GenBank flatfile format. Writes several files with # data that is relevant to coding/non-coding detection research. # # The input format appears to be similar or identical to the EMBL format # described in # # http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html # # The output consists of four output files, two for each strand, called # # "{outName}-{S}.bas" - the sequence of bases [ATCG] on strand {S}. # # "{outName}-{S}.lab" - the corresponding functional label sequence. # # The strand code is "0" for the primary strand (as in the input file) or "1" # for the complementary strand. # # The {lab} files contain one label letter for each # nucleotide in the segment. The letters record the function of the # corresponding nucleotide, as inferred from the "FEATURES" section # of the input file. Each label may be: # # 'D', 'E', 'F' The three codon positions in coding regions. # 'N' Base of transcribed but non-coding region. # 'O' Gap in sequence. # 'Z' Unknown function. # 'X' Conflicting functions. # # The coding labels [DEF] are assigned only if (1) the nucleotide # is spanned by a "CDS" feature, and (2) all "CDS" features that # span the base agree on its phase (reading frame alignment). # # The label 'N' is assigned iff the base (1) is not spanned by # any "CDS" feature, but (2) is spanned by one or more transcription # features ("mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA"). # # The letter 'O' is assigned if the base is spanned by one or more # "gap" features. # # Bases that do not fit any of the above rules are assigned # label 'Z' (`unknown'). # # Bases that would receive two or more distinct labels by the above # rules are labeled 'X' (`conflict/bug'). # # All output files begin with a perfunctory header with the format # # "> {ITEMVERS} /strand={S} /ini={INI} /fin={FIN}" # # Argument checking and defaulting: if (outName == "") { arg_error(("must define {outName}")); } if (ini == "") { arg_error(("must define {ini}")); } else { ini += 0; } if (fin == "") { arg_error(("must define {fin}")); } else { fin += 0; } if (ini < 1) { arg_error(("bad {ini} = " ini "")); } if (fin < ini) { arg_error(("bad range {ini..fin} = " ini ".." fin "")); } # Parsing state: # 0 before "FEATURES" line # 1 between "FEATURES" and "ORIGIN" lines # 2 between "ORIGIN" and "//" lines. # 3 after "//" line.. state = 0; # One result of parsing the "FEATURES" section is a # collection of (possibly overlapping) /labeling ranges/ # A labeling range is defined by three attributes # # {rng_ini[k]} First DNA position in range. # {rng_fin[k]} Last DNA position in range. # {rng_lab[k]} Labels to be applied to the range. # {rng_cmp[k]} Strand (0 for primary, 1 for complementary). # # The interpretation of {rng_lab} is defined further on. The # positions {rng_ini[k],rng_fin[k]} refer to the primary strand, # even when {rng_cmp[k]} is 1. split("", rng_ini); split("", rng_fin); split("", rng_lab); split("", rng_cmp); nrng = 0; # The ranges are indexed by {k} in {0..nrng-1}. # Another result of parsing the "FEATURES" section is # a collection of /transcribed proteins/. Each transcribed # protein consists of two attributes: # # {prt_ama[j]} The 1-letter codes of the protein's aminoacids. # {prt_rng[j]} The index of the first range of its CDS. # # The protein's CDS must consist of one or more disjoint ranges with # consecutive indices {k} beginning with {k = prt_rng[j]} and # spanning at least {3*m+1} bases in all, where {m = # length(prt_ama[j])}. The first range must have type 'D' and # the subsequent ones must have types [DEF]. The ranges # need not be adjacent in the DNA sequence, and may lie on either # strand (but must be all on the same strand). The protein's # aminoacids must be compatible with the first {3*m} bases spanned # by those ranges, and the three bases following the last codon # (completed with 'A' nucleotides if necessary) must be a valid # stop codon. split("", prt_ama); split("", prt_rng); nprt = 0; # The proteins are indexed by {j} in {0..nprt-1}. # The parsing of the "FEATURES" section keeps only those ranges that # are relevant for the range {ini..fin} of positions to be # extracted. Note that a range {rng_ini[k]..rng_fin[k]} that is # disjoint with {ini..fin} may still be relevant if it is part of a # CDS or other multi-range fature that intersects {ini..fin}. # The parsing of the "ORIGIN" section results in the nucleotide # sequence {bas[ini..fin]} of the primary (cDNA) strand. # If the DNA is circular, the sequence is assumed to be # infintely replicated. (We assume that there is no # `Möbius strip DNA' --- circular DNA that consists of # a single closed strand twice looped, with the first # half paired to the second half in the direct sense.) split("", bas); # Base sequence will be {bas[ini..fin]}. # The complementary strand is just the reverse-complement of # {bas[ini..fin]}. # DNA encoding table # # Symbol Meaning # ------ ------- # # a a; adenine # c c; cytosine # g g; guanine # t t; thymine in DNA; uracil in RNA # # k g or t # m a or c # r a or g # s c or g # w a or t # y c or t # # b c or g or t; not a # d a or g or t; not c # h a or c or t; not g # v a or c or g; not t # # n a or c or g or t # Base complementation table: split("", complement); complement["a"] = "t"; complement["c"] = "g"; complement["g"] = "c"; complement["t"] = "a"; complement["k"] = "m"; complement["m"] = "k"; complement["r"] = "y"; complement["s"] = "s"; complement["w"] = "w"; complement["y"] = "r"; complement["b"] = "v"; complement["d"] = "h"; complement["h"] = "d"; complement["v"] = "b"; complement["n"] = "n"; # Other attributes: locus = ""; # NCBI locus identifier from the "LOCUS" line, or "". version = ""; # NCBI locus + version identifier from the "VERSION" line, or "". GenBank_id = ""; # GenBank acession number from the "VERSION" line, or "". source_ok = 0; # TRUE iff "source" entry has been found. } (abort >= 0) { exit abort; } # Ignore blank lines: /^ *$/ { next; } /[«»]/ { data_error(("funny quotes in file")); } # PARSING THE MAIN HEADER LINES /^LOCUS[ ]/ { if (state != 0) { data_error(("misplaced «LOCUS» line")); } if (locus != "") { data_error(("duplicate «LOCUS» line")); } # Grab attributes from "LOCUS" line: locus = $2; # Locus identifier tot_length = $3; # Total length of sequence. length_unit = $4; # Unit of {tot_length} (should be "bp"). seq_type = $5; # Type of sequence (should be "DNA"). seq_shape = $6; # "circular" or "linear". locus_kind = $7; # NCBI classifier: "PRI", "CON", etc. date = $8; # Date of file in {DD}-{MMM}-{YYYY} format. if (locus !~ /^[A-Z0-9_]+$/) { data_error(("invalid locus «" locus "»")); } if (tot_length !~ /^[0-9]+$/) { data_error(("invalid length «" tot_length "»")); } tot_length = tot_length + 0; if (length_unit != "bp") { data_error(("expected size unit «bp», found «" length_unit "»")); } if (seq_type != "DNA") { data_error(("expected sequence type «DNA», found «" seq_type "»")); } if (seq_shape == "linear") { circular = 0; } else if (seq_shape == "circular") { circular = 1; } else { data_error(("invalid sequence shape «" date "»")); } if (date !~ /^[0-9]+[-][A-Z][A-Z][A-Z]-[12][09][0-9][0-9]$/) { data_error(("invalid date format «" date "»")); } next; } /^VERSION[ ]/ { if (state != 0) { data_error(("misplaced «VERSION» line")); } if (version != "") { data_error(("duplicate «VERSION» line")); } version = $2; GenBank_id = $3; next; } /^FEATURES[ ]/ { if (state != 0) { data_error(("misplaced «FEATURES» line")); } state = 1; # Print global header data to stderr: if (locus == "") { data_error(("missing «LOCUS» line")); } if (version == "") { data_error(("missing «VERSION» line")); } printf "locus = %s version = %s date = %s\n", locus, version, date > "/dev/stderr"; printf " tot_length = %d\n", tot_length > "/dev/stderr"; printf " seq_type = %s circular = %d \n", seq_type, circular > "/dev/stderr"; # Prepare to parse the features: key = ""; # Keyword of current feature. arg = ""; # Argument string of current feature. next; } /^ORIGIN[ ]/ { if (state != 1) { data_error(("misplaced «ORIGIN» line")); } state = 2; # Finish processing the last feature, if any: if (key != "") { process_feature(key, arg); } # Check for required features: if (! source_ok) { data_error(("missing «source» entry")); } # Prepare to parse the DNA data: bas_next = seq_ini; # Position of next base that we expect on input. for (i = ini; i <= fin; i++) { bas[i] = "?"; } next; } /^[\/][\/]/ { if (state != 2) { data_error(("misplaced «//» line")); } if (bas_next != seq_fin+1) { data_error(("position of last base = " bas_next-1 " should be " seq_fin "")); } state = 3; next; } /^[A-Z]+([ ]|$)/ { # Any other header line: if (state != 0) { data_error(("misplaced «" $1 "» header line")); } # Ignore extra headers in state 0: next; } # PARSING THE NON-HEADER LINES (state == 0) { # Non-header line before the "FEATURES" lines: next; } (state == 1) { # Non-header line between "FEATURES" and "ORIGIN" lines: # Get keyword in columns 1--20: new_key = substr($0, 1, 20); gsub(/[ ]/, "", new_key); # Get argument in columns 21--end: new_arg = substr($0, 21); gsub(/^[ ]+/, "", new_arg); gsub(/[ ]+$/, "", new_arg); if (new_key == "") { # Continuation line: if (key == "") { data_error(("missing start of feature")); } arg = (arg " " new_arg); } else { # Start of a new feature: if (key != "") { process_feature(key, arg); } key = new_key; arg = new_arg; } next; } (state == 2) { # Non-header line between "ORIGIN" and "//" lines -- must be nucleotide chunk: # Parse the position {str_ini} of the first base of the chunk: if ($0 !~ /^[ ]*[0-9]+[ ]+[a-z ]+[ ]*$/) { data_error(("bad DNA line «" $0 "»")); } str_ini = $1; if (str_ini != bas_next) { data_error(("sequence error «" str_ini "» should be " bas_next "")); } # Get the nucleotide chunk {str} without any spaces: str = $0; gsub(/^[ ]*[0-9]+[ ]*/, "", str); gsub(/[ ]/, "", str); # Store it as appropriate: process_bases_chunk(str_ini,str); bas_next += length(str); next; } (state == 3) { # Non-header line after "//" line: data_error(("unexpected line after «//»")); next; } END { if (abort >= 0) { exit abort; } # Check whether all necessary information has been parsed: if (state != 3) { data_error(("missing «//» line")); } output_data(); } # DATA PROCESSING FUNCTIONS function process_feature(key,arg, loc,qua) { # Processes a feature from the "FEATURES" section, # with the given {key} and arguments {arg}. # The interesting feature keys are: # # at ce dr dm KEY # ------- ------- ------- ------- --- # 7 7 26 7 source # ------- ------- ------- ------- --- # . . 2458 8 gap # ------- ------- ------- ------- --- # 32615 23208 24225 20221 mRNA # 14 2 2 2 rRNA # 84 204 . 296 ncRNA # 577 228 614 339 misc_RNA # 689 630 22 314 tRNA # . . . . tmRNA # ------- ------- ------- ------- --- # 32817 23221 24220 20234 CDS # ------- ------- ------- ------- --- # # Other uninteresting keys that appear in the NCBI RefSeq files: # # at ce dr dm KEY # ------- ------- ------- ------- --- # 14 . . . promoter # 33263 21052 25205 14690 gene # 716 16980 12 975 STS # 5228 2 . 7 misc_feature # 32 . 3482 . exon # 21 . . . intron # . . 9 . C_region # . . 1 . D-loop # . . 2 . V_segment # . . 1 1 rep_origin # 3 . . 5395 repeat_region # 3 . . . repeat_unit # # Case is not significant for keys, so: key = tolower(key); # Split the argument {arg} into a location {loc} # and a qualifier string {qua}: if (match(arg, /[\/]/)) { loc = substr(arg,1,RSTART-1); qua = substr(arg,RSTART); } else { loc = arg; qua = ""; } # General cleanup of location: loc = tolower(loc); gsub(/[ ]/, "", loc); printf " key = «%s» loc = «%s»\n", key, loc > "/dev/stderr"; # Dispatch on {key}: if (key == "source") { if (source_ok) { data_error(("more than one «source» feature")); } source_ok = 1; process_source(key, loc, qua); } else { # The "source" feature must come first: if (! source_ok) { data_error(("missing «source» feature")); } if (key == "gap") { process_gap(key, loc, qua); } else if (key ~ /rna$/) { process_rna(key, loc, qua); } else if (key == "cds") { process_cds(key, loc, qua); } else if (key ~ /^(gene|sts|promoter|misc_feature|exon|intron|repeat_region|repeat_unit)$/) { process_useless_feature(key, loc, qua) } else { data_warning(("ignored feature key = «" key "»")); } } } function process_source(key,loc,qua, i) { # Processes the "source" feature. # Defines the variables {seq_ini} and {seq_fin}. if (loc !~ /^[0-9]+[.][.][0-9]+$/) { data_error(("funny source location = «" loc "»")); } seq_ini = get_range_ini(loc); seq_fin = get_range_fin(loc); if (seq_ini != 1) { data_error(("source range «" loc "» does not start at 1")); } if (seq_fin < seq_ini) { data_error(("source range «" loc "» is empty")); } # Check whether requested segment is valid: if (! circular) { if ((ini < seq_ini) || (fin >= seq_fin)) { data_error(("requested segment «" ini ".." fin "» falls outside source range «" loc "»")); } } else { if ((fin - ini + 1) > (seq_fin - seq_ini + 1)) { data_error(("requested segment «" ini ".." fin "» is longer than source «" loc "»")); } if ((ini < seq_ini) || (ini > seq_fin)) { data_error(("requested segment «" ini ".." fin "» should start in source range «" loc "»")); } } } function process_gap(key,loc,qua, i) { # Processes the "gap" feature. if (loc !~ /^[0-9]+[.][.][0-9]+$/) { data_error(("funny «gap» feature")); } # A gap applies to both strands: label_range(loc, 0, "O"); label_range(loc, 1, "O"); } function process_rna(key,loc,qua, k,ini,fin,i,note) { # Processes the "*RNA" (transcribed region) features. show_feature_note(qua); # Just label all bases with 'N': label_location(loc, "N"); } function process_cds(key,loc,qua, frame,labels,k,ini,fin,i) { # Processes the "CDS" (coding region) feature. # Essential qualifiers: # # /codon_start={1|2|3} # # Qualifiers relevant for consistency checking: # # /translation="" # /transl_table= # /codon=(seq:"codon-sequence",aa:) # /exception="text" # /transl_except=(pos:,aa:) # transl_except=(pos:213..215,aa:Trp) # transl_except=(pos:213..214,aa:TERM) #-- incomplete stop codon show_feature_note(qua); # Get the starting offset from the "/codon_start" feature: frame = get_numeric_qualifier(qua, "/codon_start", 1); labels = substr("DEFDEF", 5 - frame, 3); if (frame != 1) { data_warning(("codon start = " frame "")); } label_location(loc, labels); # !!! Should check the translation } function process_useless_feature(key,loc,qua ) { # Processes a common but useless feature. # Just ignore it: # Done. (That was easy...) } function show_feature_note(qua, note) { # Displays any "/note" qualifier in {qua}, if it looks interesting. note = get_text_qualifier("/note",qua,""); if (note != "") { printf " /note=«%s»\n", note > "/dev/stderr"; } } function label_location(loc,labels, nrt,rt,j,k,rng,len,strand) { # Appends to the labeling range list one or more entries that # label the location {loc} with repeated copies of the string # {labels}. # The location {loc} specifies a sequence of positions on a specific # strand. For this function, {loc} must be either # # * a single position "{POS}", # * a range of position "{INI}..{FIN}" with {INI <= FIN}, # * "complement({LOC})", or # * "join({LOC[1]},{LOC[2]},..{LOC[n]})" # # where each {LOC} or {LOC[i]} is a location. However, # cannot be two nested calls to the same operator ("complement" or "join"). # So the formula has maximum depth 2. Moreover one cannot join # positions belonging to opposite strands. # The "join" operator appends the sequences described by its operands # in the order they appear in the argument list. The "complement" # perator flips each position to the other strand and reverses the order # of the sequence. Thus, "complement(join({A},{B}))" # is equivalent to "join(complement({B}),complement({A}))". # The procedure applies letters 1,2,... of {labels} (cyclically) # to the positions specified by {loc}, in the order specified by {loc}. # Thus, for example, the call # # {label_location("complement(join(50..60,30..40))","DEF")} # # will start labeling "complement(40)" with 'D', "complement(39)" # with 'E', and so on. # The procedure fails with error if {loc} is malformed or # names any position outside the "source" range. split("", rt); # Temp array for arguments of "join". if (loc ~ /^complement[(].*[)]$/) { # Strip the "complement" operator: loc = substr(loc,12,length(loc)-12); # Extract the ranges: nrt = split_join_of_ranges(loc,rt); # Label them in reverse order: for (j = nrt; j > 0; j--) { rng = cleanup_range(rt[j]); len = label_range(rng, 1, labels); # Cyclically shift the labels: labels = cycle(labels, len); } } else if (loc ~ /^join[(].*[)]$/) { # Get the joined ranges (or complements thereof): nrt = split_join_of_ranges(loc, rt); # Label them in increasing order: for (j = 1; j <= nrt; j++) { rng = rt[j]; if (rng ~ /^complement[(].*[)]$/) { # strip the "complement" operator: rng = substr(rng,12,length(rng)-12); strand = 1; } else { strand = 0; } rng = cleanup_range(rng); len = label_range(rng, strand, labels); # Cyclically shift the labels: labels = cycle(labels, len); } } else { # Location should be a simple range or position: rng = cleanup_range(loc); label_range(rng, 0, labels); } } function cleanup_range(rng ) { # Checks whether {rng} is a well-formed range or single position. # If it contains '<' or '>' constructs, prints a warning and # removes them. Returns the cleaned-up range. # printf " rng = «%s»\n", rng > "/dev/stderr"; if (rng ~ /[<>]/) { data_warning(("indeterminate range «" rng "» ignoring the [<>]")); gsub(/[<>]/, "", rng); } if (rng !~ /^[0-9]+([.][.][0-9]+|)$/) { data_error(("expected position or range, found «" rng "»")); } return rng; } function split_join_of_ranges(loc,rt, nrt) { # Given a location {loc} that is a join of locations, splits # its argument list. Puts the arguments in {rt[1..]} # and returns the number of arguments. If {loc} is not # a "join(...)" construct, sets {rt[1]=loc} and returns 1. # The parameter {rt} must be an array. if (loc ~ /^join[(].*[)]$/) { # Remove the "join(...)" wrapper: loc = substr(loc, 6, length(loc)-6); # Break at commas: nrt = split(loc, rt, ","); return nrt; } else { # Not a "join(...)": rt[1] = loc; return 1; } } function label_range(rng,strand,labels, rini,rfin) { # Like {label_location}, specialized for a single range # on the indicated {strand}. Returns the number of positions # labeled. # Assumes that {rng} is either a position or a # simple interval "{INI}..{POS}". Fails if the range is empty. rini = get_range_ini(rng); rfin = get_range_fin(rng); if (rini > rfin) { data_error(("empty range = «" rng "»")); } if ((rini < seq_ini) || ((! circular) && (rfin > seq_fin))) { data_error(("range «" rng "» falls outside the source sequence")); } printf " @ lab[%d,%s] = %s", strand,rng,labels > "/dev/stderr"; if (overlaps_segment(rini,rfin)) { # Range overlaps the region of interest: printf " (saved)" > "/dev/stderr"; rng_ini[nrng] = rini; rng_fin[nrng] = rfin; rng_lab[nrng] = labels; rng_cmp[nrng] = strand; nrng++; } printf "\n" > "/dev/stderr"; return (rfin - rini + 1); } function overlaps_segment(rini,rfin, rnb,seq_nb) { # Returns 1 iff the ranges {rini..rfin} and {ini..fin} overlap, # taking {circular,seq_ini,seq_fin} into account. # Assumes that {1 <= rini <= rfin} and {1 <= ini <= fin}. if ((rini <= fin) && (rfin >= ini)) { return 1; } if (circular) { # May overlap by circularity. # Get number of bases in source: seq_nb = seq_fin - seq_ini + 1; # Reduce {rini,rfin} mod {1..seq_nb}: while ((rini > fin) && (rini > seq_nb)) { rini -= seq_nb; rfin -= seq_nb; } if ((rini <= fin) && (rfin >= ini)) { return 1; } if ((rini+seq_nb <= fin) && (rfin+seq_nb >= ini)) { return 1; } if ((rini <= fin+seq_nb) && (rfin >= ini+seq_nb)) { return 1; } } else { return 0; } } function inside_segment(i, rnb,seq_nb) { # Returns 1 iff the ranges {rini..rfin} and {ini..fin} overlap, # taking {circular,seq_ini,seq_fin} into account. # Assumes that {1 <= rini <= rfin} and {1 <= ini <= fin}. if ((i >= ini) && (i <= fin)) { return 1; } if (circular) { # May overlap by circularity. # Get number of bases in source: seq_nb = seq_fin - seq_ini + 1; # Reduce {i} mod {1..seq_nb}: while ((i > fin) && (i > seq_nb)) { i -= seq_nb; } if ((i >= ini) && (i <= fin)) { return 1; } if ((i+seq_nb >= ini) && (i+seq_nb <= fin)) { return 1; } if ((i >= ini+seq_nb) && (i <= fin+seq_nb)) { return 1; } } else { return 0; } } function process_translation(arg, i) { # Processes the "/translation" (aminoacid sequence) entry. # Sets {nama} to its length and stores the base letters into {ama[1..nama]}. # Abbreviation Amino acid name # ------------ --------------- # # Ala A Alanine # Arg R Arginine # Asn N Asparagine # Asp D Aspartic acid (Aspartate) # Cys C Cysteine # Gln Q Glutamine # Glu E Glutamic acid (Glutamate) # Gly G Glycine # His H Histidine # Ile I Isoleucine # Leu L Leucine # Lys K Lysine # Met M Methionine # Phe F Phenylalanine # Pro P Proline # Pyl O Pyrrolysine # Ser S Serine # Sec U Selenocysteine # Thr T Threonine # Trp W Tryptophan # Tyr Y Tyrosine # Val V Valine # Asx B Aspartic acid or Asparagine # Glx Z Glutamine or Glutamic acid. # Xaa X Any amino acid. # Xle J Leucine or Isoleucine # TERM termination codon # data_error(("function {process_translation} not implemented yet")); } function get_range_ini(r) { if (r !~ /^[0-9]+([.][.][0-9]+|)$/) { data_error(("bad range format «" r "»")); } gsub(/[.][.][0-9]+$/, "", r); r += 0; if (r < 1) { data_error(("invalid position «" r "»")); } return r; } function get_range_fin(r) { if (r !~ /^[0-9]+([.][.][0-9]+|)$/) { data_error(("bad range format «" r "»")); } gsub(/^[0-9]+[.][.]/, "", r); r += 0; if (r < 1) { data_error(("invalid position «" r "»")); } return r; } function get_numeric_qualifier(name,qua,def, i) { # Extracts from the string {qua} the value of the numeric-valued qualifier # named {name}. The {name} should include the "/" but not the "=". # If there is no such qaulifier in {qua}, returns the {def} # value. Assumes that {name} does not occur inside any text value. i = index(qua,name); if (i > 0) { qua = substr(qua,i + length(name)); if (! match(qua, /^[ ]*[=][ ]*/)) { data_error(("missing «=» after «" name "»")); } qua = substr(qua, RSTART+RLENGTH); if (! match(qua, /^[0-9]+/)) { data_error(("missing number after «" name "=»")); } return substr(qua,1,RLENGTH) + 0; } else { return def; } } function get_text_qualifier(name,qua,def, i,res,sep) { # Extracts from the string {qua} the value of the text-valued qualifier # named {name}. The {name} should include the "/" but not the "=". # The value should be surrounded by double quotes, with any embedded # quotes duplicated. The extra quotes are removed from the result. # If there is no such qaulifier in {qua}, returns the {def} # value. Assumes that {name} does not occur inside any text value. i = index(qua,name); if (i > 0) { qua = substr(qua,i + length(name)); if (! match(qua, /^[ ]*[=][ ]*/)) { data_error(("missing «=» after «" name "»")); } qua = substr(qua, RSTART+RLENGTH); if (! (qua ~ /^["]/)) { data_error(("missing «\"» after «" name "=»")); } res = ""; sep = ""; while (match(qua, /^["][^"]*["]/)) { if (! match(qua, /^["][^"]*["]/)) { data_error(("malformed value of «" name "» qualifier")); } res = ( res sep substr(qua,2,RLENGTH-2) ); qua = substr(qua, RSTART+RLENGTH); if (match(qua, /^[ ]+/)) { qua = substr(qua, RSTART+RLENGTH); } sep = "\""; } return res; } else { return def; } } function cycle(str,k, n) { # Shifts the string {str} cyclically to the left by {k} characters. n = length(str); k = k % n; if (k == 0) { return str; } else { return (substr(str, 1+k) substr(str,1,k)); } } function process_bases_chunk(str_ini,str, n,str_fin,i,ch) { # Processes a nucleotide string {str}, assumed to start # at position {str_ini} of the whole sequence and contain # just letters, without any blanks. # Stores the chunk whose positions {i} fall in the range {ini..fin} # into {bas[i]}. n = length(str); str_fin = str_ini + n - 1; if (overlaps_segment(str_ini,str_fin)) { # Line overlaps the desired range: for (i = str_ini; i <= str_fin; i++) { if (inside_segment(i)) { ch = tolower(substr(str, i - str_ini + 1, 1)); if (ch !~ /^[abcdghkmnrstvwy]$/) { data_error(("invalid base letter «" ch "»")); } bas[i] = ch; } } } } function output_data( strand,i,j,nb) { # Called at end of parsing to process all # saved data and write the output files. setup_label_sequence(); for (strand = 0; strand <= 1; strand++) { # Choose the output file names: bas_file = ( outName "-" strand ".bas" ); lab_file = ( outName "-" strand ".lab" ); # Write minimal header lines: printf "> %s /strand=%d /ini=%d /fin=%d\n", locus, strand, ini, fin > bas_file; printf "> %s /strand=%d /ini=%d /fin=%d\n", locus, strand, ini, fin > lab_file; # Write the base and label sequences: nb = fin - ini + 1; for (j = 0; j < nb; j++) { i = (strand ? fin - j : ini + j); if ((j > 0) && (j % 60 == 0)) { printf "\n" > bas_file; printf "\n" > lab_file; } printf "%c", (strand ? complement[bas[i]] : bas[i]) > bas_file; printf "%c", lab[strand,i] > lab_file; } if (nb > 0) { printf "\n" > bas_file; printf "\n" > lab_file; } close(bas_file); close(lab_file); } printf "\n", locus, version, date > "/dev/stderr"; } function setup_label_sequence( k,i,j,rini,rfin,rlab,rcmp,m,ch,ex,nbug) { # Fills the array {lab[strand,ini..fin]} with labels, # based on the saved labeling ranges. Namely, # {rlab[0,i]} is the label for position {i} on strand 0, # and {rlab[1,i]} is the label for the base on strand 1 # that gets paired to it. # printf " saved labeling ranges:\n" > "/dev/stderr"; # for (k = 0; k < nrng; k++) # { rlab = rng_lab[k]; # rini = rng_ini[k]; # rfin = rng_fin[k]; # rcmp = rng_cmp[k]; # printf " lab[%d,%d..%d] = %s\n", rcmp, rini, rfin, rlab > "/dev/stderr"; # } # First pass: clear all labels. split("", lab); for (rcmp = 0; rcmp <= 1; rcmp++) { for (i = ini; i <= fin; i++) { lab[rcmp,i] = "Z"; } } # Second pass pass: label all gaps ('O') and transcribed regions ('N'): for (k = 0; k < nrng; k++) { rlab = rng_lab[k]; if ((rlab == "O") || (rlab == "N")) { # Paint all positions: rini = rng_ini[k]; rfin = rng_fin[k]; rcmp = rng_cmp[k]; ch = rlab; printf " lab[%d,%d..%d] = %s\n", rcmp, rini, rfin, rlab > "/dev/stderr"; for (i = rini; i <= rfin; i++) { if (inside_segment(i)) { ex = lab[rcmp,i]; if ((ex != "Z") && (ex != ch)) { data_warning(("conflicting labels at position [" rcmp "," i "]: «" ex "», «" ch "»")); lab[rcmp,i] = "X"; } else { lab[rcmp,i] = ch; } } } # Kill the range: rng_lab[k] = ""; } } # Third pass: paint all CDS features: for (k = 0; k < nrng; k++) { rlab = rng_lab[k]; if (rlab ~ /[DEF]/) { # Paint all positions, in order: rini = rng_ini[k]; rfin = rng_fin[k]; rcmp = rng_cmp[k]; printf " lab[%d,%d..%d] = %s\n", rcmp, rini, rfin, rlab > "/dev/stderr"; m = length(rlab); # Number of label letters to cycle over. j = 1; # Index of next label letter into {rlab}. # Scan the positions covered by the range in the appropriate order: nbug = 0; # Number of coding positions outside transcribed regions. for (i = (rcmp ? rfin : rini); (rcmp ? i >= rini : i <= rfin); i += (rcmp ? -1 : +1)) { if (inside_segment(i)) { ch = substr(rlab, j, 1); ex = lab[rcmp,i]; if ((ex != "Z") && (ex != "N") && (ex != ch)) { data_warning(("conflicting labels at position [" rcmp "," i "]: «" ex "», «" ch "»")); lab[rcmp,i] = "X"; } else { # Coding regions should be nested in transcribed regions: if ((nbug == 0) && (ex != "N") && (ex != ch)) { data_warning(("position [" rcmp "," i "] is coding but not transcribed")); nbug++; } lab[rcmp,i] = ch; } } j++; if (j > m) { j = 1; } } if (nbug > 1) { data_warning(("total " nbug " untranscribed coding positions in [" rcmp "," rini ".." rfin "]")); } # Kill the range: rng_lab[k] = ""; } } # Fourth pass: check for funny ranges. for (k = 0; k < nrng; k++) { rlab = rng_lab[k]; if (rlab != "") { data_error(("leftover range labeling «" rlab "»")); } } } # AUXILIARY PROCEDURES function show_codon(title,seq,pos, j,jj) { # Prints to {stderr} the three letters {seq[pos..pos+2]}, # with some context, clipped to the range {seq_ini..seq_fin}. printf " %s = ", title > "/dev/stderr"; for (j = pos - 5; j <= pos + 7; j++) { if ((j >= seq_ini) && (circular || (j <= seq_fin))) { jj = (circular ? seq_ini + (j - seq_ini) % (seq_fin - seq_ini + 1) : j); if (j == pos) { printf "[" > "/dev/stderr"; } printf "%s", seq[jj] > "/dev/stderr"; if (j == pos + 2) { printf "]" > "/dev/stderr"; } } } printf "\n" > "/dev/stderr"; } function data_error(msg) { printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr"; printf " «%s»\n", $0 > "/dev/stderr"; abort = 1; exit abort; } function data_warning(msg) { printf "%s:%s: warning: %s\n", FILENAME, FNR, msg > "/dev/stderr"; printf " «%s»\n", $0 > "/dev/stderr"; } function arg_error(msg) { printf "** %s\n", msg > "/dev/stderr"; printf "usage: %s\n", USAGE > "/dev/stderr"; abort = 1; exit abort; }