#ifndef msm_seq_desc_H #define msm_seq_desc_H /* Abstract sequence descriptors. */ /* Last edited on 2017-04-28 16:56:15 by stolfilocal */ #define msm_seq_H_COPYRIGHT \ "Copyright © 2005 by the State University of Campinas (UNICAMP)" #include #include #include /* SEQUENCE DESCRIPTOR The main data type {msm_seq_desc_t} below is a descriptor for a finite sequence of /sample values/ over some /sample space/ {V} that depends on the application. The sequence is assumed to be the result of upsampling or downsampling some /original sequence/ (e.g. RNA/DNA, digitized sound, digitized fragment outline) by some power of two. The descriptor specifies the {size} (number of samples) in the sequence. The valid /sample indices/ are {0..size-1}. This module does not provide for storage of the sample values, which may even be generated on-the-fly by an analytic formula or interpolation of some coarser sequence. The descriptor also specifies a correspondence between the valid sample indices and the sample indices of the original sequence, assuming that it is a simple affine map. (If the correspondence is not an affine map, it must be represented in some other way). */ typedef int msm_seq_id_t; /* A {msm_seq_id_t} is an internal identifier for an original data sequence, e.g. an index into a table of biosequences. */ #define msm_seq_id_none (-1) /* A {msm_seq_id_t} value that means "no original sequence". */ #define msm_seq_desc_estep_MAX 24 /* Should not need more than this much upsampling/downsampling. This limit must be less than 30 so that {2^{abs(estep)}} can be safely computed in 32 bits. */ typedef struct msm_seq_desc_t { /* Original sequence identification: */ msm_seq_id_t id; /* Internal identifier of the sequence. */ char *name; /* External identifier of the sequence. */ bool_t rev; /* TRUE if the original sequence was reversed. */ /* Sequence size: */ int32_t size; /* Number of valid samples in the sequence. */ /* Relation to the original sequence: */ int8_t estep; /* Each sampling step is {2^estep} steps of original seq; may be negative. */ int32_t skip; /* First sample matches sample {skip*(2^estep)} of original seq. */ } msm_seq_desc_t; /* The {id} and/or the {name} fields identify the original sequence from which this sequence was derived. The {rev} flag, if true, indicates that the sequence is derived from the reversal of the original sequence indicated by {id} and {name}. (For some applications, e.g. DNA/RNA or fragment outline matchings, this flag also indicates complementation of sample values.) The valid sample indices for this sequence are {0..size-1}. If {size} is zero,{skip} and {estep} are irrelevant. If {size} is nonzero, then sample index {i} of this sequence corresponds to sample {j = (i + skip)*{2^estep}} of the original sequence (or of the reversed sequence, if {rev} is false). Note that some integer indices of {s} may correspond to fractional indices of the oriiginal sequence, if {estep} is negative. The absolute value of {estep} must not exceed {msm_seq_desc_estep_MAX}. */ msm_seq_desc_t msm_seq_desc_make ( msm_seq_id_t id, /* Internal identifier of the sequence. */ char *name, /* External identifier of the sequence. */ bool_t rev, /* TRUE if the original sequence was reversed. */ int32_t size, /* Number of distinct values in the sequence. */ int8_t estep, /* Each sampling step is {2^estep} samples of original; may be negative. */ int32_t skip /* First sample matches sample {skip*(2^estep)} of original seq. */ ); /* Assembles an {msm_seq_desc_t} from the given fields. Fails if {size} is negative, or the parameters are not valid as specified under {msm_seq_desc_t}. */ /* DESCRIPTOR COMPARISON Each procedures in this section compares two descriptors {sa,sb} in various ways. If the comparison succeeds, the precedure returns TRUE. Otherwise, if {die} is true, the procedure fails with an error message; if {die} is FALSE, it returns FALSE silently. The {name} strings are compared with {strcmp}. */ bool_t msm_seq_desc_same_orig_seq(msm_seq_desc_t *sa, msm_seq_desc_t *sb, bool_t die); /* Returns TRUE if the sequence descriptors {sa} and {sb} are derived from the same original sequence (that is, if they have the same {name}, {id}, and {rev} fields). */ bool_t msm_seq_desc_equal(msm_seq_desc_t *sa, msm_seq_desc_t *sb, bool_t die); /* Returns TRUE if the sequence descriptors {sa} and {sb} are derived from the same original sequence and have the same sample positions relative to it (that is, if they have exactly the same fields. The fields {estep} and {skip} are compared only if the {size} fields match and are positive. */ /* DESCRIPTOR MANIPULATION */ msm_seq_desc_t msm_seq_desc_trim(msm_seq_desc_t *s, int32_t itrim, int32_t ftrim); /* Returns a descriptor for the sequence {t} that would be obtained from sequence {s} by discarding the first {itrim} samples and the last {ftrim} samples. The arguments {itrim} and {ftrim} may be zero, or negative to account for extrapolation of {s}. The procedure copies the fields {t.id}, {t.name}, {t.rev}, and {t.estep} from {s}, and computes the proper values for {t.size} and {t.skip}. If the sequence {s} has fewer than {itrim + ftrim} samples, then {t.size} and {t.skip} are set to zero. */ msm_seq_desc_t msm_seq_desc_resample(msm_seq_desc_t *s, int8_t ek); /* Returns a descriptor for the sequence {t} that would be obtained by resampling a sequence with descriptor {s} with one new sample every {2^ek} samples of {s}. In all cases, the result {t} has the same fields {id}, {name}, and {rev} as {s}, and has {t.estep = s.estep + ek}. If {ek} is zero, the result {t} is the same as {s}. If {s} has zero samples, the result {t} too has zero samples, and {t.skip} is set to zero. Otherwise, the {skip} and {size} of {t} are computed so that the first and last samples of {t} are aligned with samples of {s}. Namely, if {ek} is positive, the procedure assumes that {t} is obtained by taking one every {m} samples of {s}. The sequence {s} is first implicitly trimmed at both ends, if needed, so that {s.skip} and {s.size-1} are multiples of {m}. If {ek} is negative, the procedure assumes that {t} is obtained by inserting additional {m-1} samples between every two consecutive samples of {s}, where {m = 2^{-ek}}. In this case, the first and last sample positions of {t} will correspond to the first and last ones of {s}. That is, {t.size} will be {(s.size - 1)*m + 1}, and {t.skip} will be {s.skip*m}. */ msm_seq_desc_t msm_seq_desc_filter(msm_seq_desc_t *sd, int nw, int8_t ek); /* Returns an abstract sequence descriptor for the result of filtering an abstract sequence with descriptor {sd}, with a filter whose table has {nw} entries, and resampling the result with step {2^ek} times the old step. The output descriptor {fd} will have the same attributes {id,name,rev} as {sd}, and its {estep} will be {sd.estep+ek}. The {size} and {skip} will be adjusted so that the first and last samples of {fd} can be computed by convolution with that kernel fitting entirely inside {sd} and centered over a sample of the original biosequence. */ /* INDEX MAPPING BETWEEN SUCCESSIVE SCALES OF RESOLUTION */ double msm_seq_desc_map_index_to_orig_seq(double is, msm_seq_desc_t *s); /* Maps a sample index {ix} in sequence {s} to the corresponding sample index {ix0} of the original sequence {s0}. The given index {ix} is not required to be in {0..s.size-1} but must be an integer or a dyadic fraction. The result {ix0} may be outside {0..s0.size-1}, and may be a dyadic fraction if {ix} is a fraction or {s.estep} is negative. Even then, the the result will be exact. */ double msm_seq_desc_map_index_from_orig_seq(double ix0, msm_seq_desc_t *s); /* Maps a sample index {ix0} in some original sequence {s0} to the corresponding sample index in sequence {s}, which is assumed to be a filtered and resampled version of {s0}. The given index {ix0} is not required to be in {0..s0.size-1} and must be an integer or a dyadic fraction. The result {ix} may be outside {0..s.size-1} and may be a dyadic fraction, if {ix0} is a fraction or {s.estep} is positive. Even then, the the result will be exact. */ double msm_seq_desc_map_index(double ixa, msm_seq_desc_t *sa, msm_seq_desc_t *sb); /* Maps a sample index {ixa} in a sequence {sa} to the corresponding index {ixb} in another version {sb} of the same original sequence. The two sequences must have the same {id}, {name} and {rev} fields. The given index {ixa} must be an integer or a dyadic fraction. The result {ixb} will be an integer or a dyadic fraction. This operation is exact in any case. */ /* RANDOM SEGMENTS */ void msm_seq_desc_throw_segment(int32_t size, int32_t n, int32_t *iniP, int32_t *finP); /* Selects initial and final sample indices {*ini,*fin} for a random segment spanning {n} positions, taken from a sequence with {size} positions. */ /* PRINTOUT */ void msm_seq_desc_write ( FILE *wr, char *pre, msm_seq_desc_t *s, int idSize, int nameSize, int indexSize, char *suf ); /* Writes to {wr} the sequence descriptor {s}, in a format compatible with {msm_seq_desc_read}; namely, "{pre} {s.id} {s.name} {s.rev} {s.size} {s.estep} {s.skip} {suf}". The delimiters {pre} and {suf} will be omitted if NULL. The fields {s.id} and {s.name} will be blank-padded to {idSize} and {nameSize} chars. The field {s.rev} is "F" for false or "T" for true. The field {s.estep} is zero or has a sign "+" or "-". The fields {s.size} and {s.skip} will be blank-padded to {indexSize} chars. The format changed on 2013-10-16. Formerly it was "{pre} {s.id} {s.name} {s.level} {s.size} {s.circ} {suf}" where {s.level} was a non-negative `filtering level' (similar but not equivalent to {s.estep}) and {s.circ} was a circularity flag. */ msm_seq_desc_t msm_seq_desc_read(FILE *rd, char *pre, char *suf); /* Parses from {rd} a sequence descriptor, in the format "{pre} {id} {name} {rev} {size} {estep} {skip} {suf}". If the arguments {pre} and {suf} are not NULL, they must be present in the input. In general, consecutive fields should be separated by one or more spaces. The {name} field and the {pre,suf} arguments must not contain embedded spaces or other formatting chars. */ bool_t msm_seq_desc_is_valid(msm_seq_desc_t *s, bool_t die); /* Performs several consistency checks in {s}. If it is valid (even if empty), returns TRUE. Otherwise, fails with an error message if {die} is true, returns FALSE silently if {die} is FALSE. */ #endif