#ifndef dnae_seq_H #define dnae_seq_H /* Numerically encoded and filtered DNA sequences */ /* Last edited on 2014-07-28 21:45:07 by stolfilocal */ #define dnae_seq_H_COPYRIGHT \ "Copyright © 2005 by the State University of Campinas (UNICAMP)" \ " and the Federal Fluminense University (UFF)" #define _GNU_SOURCE #include #include #include #include #include #include #include /* This interface defines a discrete representation for signals, as strings of /datum/ elements ({dnae_datum_t}s). */ typedef msm_seq_id_t dnae_seq_id_t; /* A {dnae_seq_id_t} is an internal identifier for a sequence, presently an index into a table of sequences. */ #define dnae_seq_id_none msm_seq_id_none /* A {dnae_seq_id_t} value that means "no sequence". */ typedef struct dnae_seq_t { msm_seq_desc_t sd; /* Abstract sequence attributes for matching. */ char *cmt; /* Comment text. */ dnae_datum_scale_t sfac; /* Per-channel scale factors for decoded samples. */ dnae_datum_vec_t dv; /* The datum array. */ } dnae_seq_t; /* A multichannel signal, sampled at and encoded. */ /* CREATION */ dnae_seq_t dnae_seq_new(int n); /* Allocates a new datum sequence with {n} sample datums, initially all set to {(0,..0)}. The sequence will have {sd.id=dnae_seq_id_none}, {sd.name=cmt=NULL}, {sd.rev=NULL}, {sd.estep=0}, {sd.skip=0}, {sd.size=n}, and {sfac[0..dnae_CHANNELS-1]=1}. */ dnae_seq_t dnae_seq_from_datum_vec ( dnae_seq_id_t id, char *name, bool_t rev, int8_t estep, int skip, char *cmt, dnae_datum_scale_t *sfac, dnae_datum_vec_t dv ); /* Assembles a sequence descriptor {sd} with the given fields. The number of sequence positions {sd.size} is the number of datums in {dv}. */ dnae_seq_t dnae_seq_from_nucleic_string(dnae_seq_id_t id, char *name, bool_t rev, char *cmt, char *bas); /* Converts the DNA/RNA string {bas} into a {dnae_datum_t} sequence. Each base will become one datum, with no filtering or resampling. If {rev} is true, applies reversal-complement to it first. The sequence's numeric and alphabetic identifiers will be {id} and {name}, respectively. The comment text will be {txt}. */ dnae_seq_t dnae_seq_copy(dnae_seq_t *seq); /* Creates a heap copy of {*seq}, including all its internal storage (so that {dnae_seq_free_datums(seq)} will not affect the copy, and vice-versa). */ dnae_seq_t dnae_seq_copy_sub(dnae_seq_t* seq, int ini, int fin); /* Creates a heap copy of the segment of {*seq} from datum number {ini} to datum {fin} inclusive. The datum vector is newly allocated so that {dnae_seq_free_datums(seq)} will not affect the copy, and vice-versa. */ /* EXTRACTING DATA */ int dnae_seq_num_datums(dnae_seq_t *seq); /* Number of datums in sequence {seq}. */ dnae_sample_enc_t *dnae_seq_get_sample_enc_address(dnae_seq_t *seq, int i, int k); dnae_sample_enc_t dnae_seq_get_sample_enc(dnae_seq_t *seq, int i, int k); void dnae_seq_set_sample_enc(dnae_seq_t *seq, int i, int k, dnae_sample_enc_t s); /* These procedures return the address, return the encoded value, and set the encoded value of the sample in channel {k} of datum {i} from the sequence {*seq}. The index {i} must lie in the range {[0..seq->dv.ne-1]}. */ dnae_datum_t *dnae_seq_get_datum_address(dnae_seq_t *seq, int i); dnae_datum_t dnae_seq_get_datum(dnae_seq_t *seq, int i); void dnae_seq_set_datum(dnae_seq_t *seq, int i, dnae_datum_t d); /* These procedures return the address, return the value, and set the value of encoded datum {i} from the sequence {*seq}. The index {i} must lie in the range {[0..seq->dv.ne-1]}. */ dnae_datum_t dnae_seq_eval(dnae_seq_t *seq, double x, bool_t smooth); /* Evaluates the sequence {seq} for argument {x}, which must be in the range {[0 _ seq.size-1]}. If {x} is not an integer, performs linear interpolation of neighboring decoded datums, and encodes the result using the same scale factors {seq->sfac}. */ /* SEQUENCE I/O The procedures in this section read or write a datum sequence {seq} from a file. The file format is described by the string {dnae_seq_file_format_INFO} below. */ #define dnae_seq_type_name "encoded_bio_seq" #define dnae_seq_version "2014-06-14" /* Current format, without {circular}, with {reverse}, 2-byte encoded samples. */ #define dnae_seq_version_old_2 "2013-10-23" /* Old format, without {circular}, with {reverse}, 1-byte encoded samples. */ #define dnae_seq_version_old_1 "2008-01-29" /* Old format, with {circular} and no {reverse}, 1-byte encoded samples. */ #define dnae_seq_file_format_INFO \ "A sequence file consists of \n" \ "\n" \ " a standard header line, \"begin {tname} (version of {vdate})\", where" \ " {tname} is {dnae_seq_type_name = \"" dnae_seq_type_name "\"}" \ " and {vdate} is {dnae_seq_version = \"" dnae_seq_version "\"};" \ "\n" \ " zero or more comment lines, starting with \"|\";\n" \ "\n" \ " a set of sequence parameter lines:\n" \ " \"id = {NUMID}\"\n"\ " \"name = {NAME}\"\n" \ " \"reversed = {REVERSED}\"\n" \ " \"resampling = {RESAMPLING}\"\n" \ " \"offset = {OFFSET}\"\n" \ " \"channels = {NC}\"\n" \ " \"scale = {SFAC[0]} {SFAC[1]} .. SFAC[NC-1]}\"\n" \ " \"datums = {NS}\"\n" \ "\n" \ " then {NS} datum lines, each with {NC} samples"\ " in the format \"{SMP[i,0]} {SMP[i,1] .. {SMP[i,NC-1]}\"\n" \ " for {i} ranging from 0 to {NS-1};\n" \ "\n" \ " a standard footer line, \"end {tname}\".\n" \ "\n" \ " In the paramter lines," \ "\n" \ " {NUMID} (field {seq.sd.id}) is an ID number"\ " of the original biosequence from which this datum"\ " sequence is derived, e.g. an index into some"\ " sequence database.\n"\ "\n" \ " {NAME} (field {seq.sd.name}) is a menemonic tag"\ " for that sequence, without quotes or embedded blanks.\n"\ "\n" \ " {REVERSED} (field {seq.sd.rev} is a logical"\ " value \"F\" or \"T\" (without quotes), specifying"\ " whether this datum sequence is derived from the original"\ " biosequence directly, or after reversing and"\ " complementing it.\n"\ "\n" \ " {RESAMPLING} (field {seq.sd.estep}) is a signed integer"\ " that defines the total amount of subsampling/supersampling"\ " that was applied to the original biosequence. Specifically,"\ " each step of this sequence corresponds to {2^RESAMPLING}"\ " steps of the original. The value is negative for"\ " supersampling, positive for downsampling.\n"\ "\n" \ " {OFFSET} (field {seq.sd.skip}) is a signed integer that"\ " specifies the position of the first datum relative to the first"\ " base of the original biosequence. Specifically, datum {i} of"\ " this sequence corresponds to element {(i - skip)*2^RESAMPLING} of"\ " the original biosequence. Note that if {RESAMPLING} is negative"\ " the corresponding index may be fractionary.\n"\ "\n" \ " {NC} is a positive integer, the number of samples per datum.\n"\ "\n" \ " {SFAC[k]} is a real scale factor to be multiplied into all"\ " samples of the corresponding" \ " channel, after applying {dnae_sample_decode}.\n"\ "\n" \ " {NS} (field {seq.sd.size) is a positive integer, the number of datums"\ " in this sequence.\n"\ "\n" \ " {SMP[i,k]} is a small signed integer, the un-scaled and encoded"\ " sample value in channel {k} of datum {i}.\n" \ "\n" \ " The format changed in "dnae_seq_version "; the previous"\ " versions (" dnae_seq_version_old_1 ", " dnae_seq_version_old_1 ") had"\ " different header format and 1-byte (instead of 2-byte) encoded samples.\n" \ "\n" \ "" dnae_seq_t dnae_seq_read(FILE *rd); /* Reads a numeric DNA/RNA sequence from file {rd}, and turns it into a sequence. The sequence will have {id=dnae_seq_id_none}, {name=NULL}. */ dnae_seq_t dnae_seq_read_named(char *fname, char *tag, char *ext); /* Same as {dnae_seq_read}, but reads from a file named "{fname}{tag}{ext}". If that file name is "-", however, reads from {stdin}. */ void dnae_seq_write(FILE *wr, dnae_seq_t *seq); /* Writes a numeric DNA/RNA sequence to file {wr}. */ void dnae_seq_write_named(char *fname, char *tag, char *ext, dnae_seq_t *seq); /* Same as {dnae_seq_write}, but creates a disk file called "{name}{tag}{ext}". If that file name is "-", however, writes to {stdout}. */ /* DNA/RNA SEQUENCE I/O */ dnae_seq_t dnae_seq_read_from_nucleic_file(FILE *rd, dnae_seq_id_t id, char *name, bool_t rev); /* Reads a DNA/RNA sequence from {rd} and converts it to a numerically encoded {dnae_seq_t}, with no filtering or resampling. The sequence's numeric and alphabetic identifiers will be {id} and {name}, respectively. If {rev} is true, the sequence is reversed and complemented before encoding. */ dnae_seq_t dnae_seq_read_from_nucleic_file_named(char *fname, char *tag, char *ext, dnae_seq_id_t id, char *name, bool_t rev); /* Same as {dnae_seq_read_from_nucleic_file}, but reads from disk file "{fname}{tag}{ext}" */ void dnae_seq_free_datums(dnae_seq_t *seq); /* Reclaims the internal storage of sequence {*seq} (but not {*seq} itself). */ void dnae_seq_free(dnae_seq_t *seq); /* Reclaims the sequence record {*seq} and all its internal storage. */ void dnae_seq_multi_free_datums(dnae_seq_t seq[], int maxLevel); /* Reclaims the internal storage of sequences {seq[0..maxLevel]} (but not the array {*seq} itself). */ void dnae_seq_multi_free(dnae_seq_t *seq[], int maxLevel); /* Reclaims the sequences {*(seq[0..maxLevel])} and all their internal storage. */ /* FILTERING AND INTERPOLATING */ dnae_seq_t dnae_seq_filter(dnae_seq_t *seq, double_vec_t *wtb, int8_t ek, char *wcmt); /* Returns a sequence {new} obtained by filtering a numeric datum sequence {seq} with the kernel {wtb}, and resampling it with step size {2^ek} times its current step. The exponent {ek} must be non-negative. The descriptor of {new} is given by {msm_desc_filter(seq.sd,nw,ek)} where {nw=wtb->ne}. More precisely the datums {pnew[0..nnew-1]} of the new sequence {new} are related to the datums {pseq[0..nseq-1]} of the old ones by the formula {pnew[i] = SUM{ pseq[2^ek*i+j+d] * wtb[j] : j in 0..nw-1}} where {d=new.skip*2^ek-hw} and {hw = (nw-1)/2}. The string {wcmt}, if not NULL, is assumed to be the name or description of the weight table {wtb}, and is appended to the comment {seq->cmt} to make the new string's comment. For each channel {k}, the new scale factor {sfac.f[k]} will be the root-mean-square value of the samples in channel {k} of the filtered sequence. */ dnae_seq_t dnae_seq_interpolate(dnae_seq_t *seq, int8_t ek); /* Returns a sequence {new} obtained by interpolating a numeric datum sequence {seq} with step size {2^ek} times its current step. The exponent {ek} must be zero or negative. The descriptor of {new} is given by {msm_seq_desc_resample(seq.sd,ek)}. More precisely, if {pnew[0..nnew-1]} are the datums of the new sequence {new}, and {pseq[o..nseq-1]} are the datums of {seq}, then {pnew[i*stepn] == pseq[i]} for {i} in {0..nseq-1}. The other elements of {pnew} are interpolated from the adjacent ones by an interpolating C1 cubic spline. Also appends the text "interpolated with step {1/stpn}" to the comment {seq->cmt} to make the new string's comment. For each channel {k}, the new scale factor {sfac.f[k]} will be the root-mean-square value of the samples in channel {k} of the interpolated sequence. Currently uses C1 cubic Hermite interpolation in each interval, with slopes at the original elements estimated by finite differences. */ void dnae_seq_multi_filter ( dnae_seq_t *s, int maxLevel, double_vec_t *wtb0, char *wname0, double_vec_t *wtb1, char *wname1, int8_t ek0, dnae_seq_t sr[] ); /* Places into {sr[0..maxLevel]} various filtered and resampled versions of the sequence {s}. Namely {sr[0]} is a storage-independent copy of {s}, and {sr[r]} is the filtered and possibly resampled version of {sr[r-1]}, for {r} in {1..maxLevel}. Specifically, the filter table {wtb0} is applied to sequence {sr[0]}, and the resulting sequence is up- or downsampled with step {2^ek0} to obtain sequence {sr[1]}. From then on, each sequence {sr[r+1]} is obtained by filtering the previous sequence with filter table {wtb1}, and then downsampling the result with step 2. The comment strings {wname0} and {wname1} are used in the {cmt} field of the resulting sequences, as appropriate. Therefore, sequence {sr[0]} has the same number of datums as {s}, and thereafter each sequence {sr[r+1]} will have at most {(n+1)/2^{ek0+r}} datums (but usually less, to account for the size of the filter tables). Therefore, the sequence {sr[r]} may be empty for {r} beyond a certain value. */ #endif