#ifndef dm_seq_H #define dm_seq_H /* Filtered DNA sequences */ /* Last edited on 2008-04-18 20:06:36 by stolfi */ #define dm_seq_H_COPYRIGHT \ "Copyright � 2005 by the State University of Campinas (UNICAMP)" \ " and the Federal Fluminense University (UFF)" #include #include #include #include #include #include #include #include /* This interface defines a discrete representation for signals, as strings of /datum/ elements ({dm_datum_t}s). */ typedef msm_seq_id_t dm_seq_id_t; /* A {dm_seq_id_t} is an internal identifier for a sequence, presently an index into a table of sequences. */ #define dm_seq_id_none msm_seq_id_none /* A {dm_seq_id_t} value that means "no sequence". */ typedef struct dm_seq_t { msm_seq_desc_t sd; /* Abstract sequence attributes for matching. */ char *cmt; /* Comment text. */ dm_datum_scale_t sfac; /* Per-channel scale factors for decoded samples. */ dm_datum_vec_t dv; /* The datum array. */ int nsub; /* Number os sequence positions per sample. */ } dm_seq_t; /* A multichannel signal, open or circular, sampled at and encoded. For matching purposes, the {dv.ne} datums in {dv} are assumed to be interpolated to obtain the {sd.npos} nominal sequence values. The interpolation depends on whether the sequence is circular or not. */ dm_seq_t dm_seq_new(int n); /* Allocates a new datum sequence with {n} sample datums, initially all set to {(0,..0)}. The sequence will have {sd.id=dm_seq_id_none}, {sd.name=sd.cmt=NULL}, {sd.circ=FALSE}, {sd.level=0}, {nsub=1}, {sfac[0..dm_CHANNELS-1]=1}, and {sd.npos=n}. */ bool_t dm_seq_is_circular(dm_seq_t *seqp); /* TRUE iff {seqp} is circular. */ int dm_seq_num_positions(dm_seq_t *seqp); /* Number of matching positions in sequence {seqp}, whether open or circular. */ dm_datum_t dm_seq_eval(dm_seq_t *seqp, int x); /* Evaluates the sequence {seqp} for argument {x}. If the sequence is circular, {x} can be any integer number; and the result will be periodic with period {n=dm_seq_num_positions(seqp)}. If the sequence is open, {x} must be in the range {0..seqp.npos-1}. */ int dm_seq_num_datums(dm_seq_t *seqp); /* Number of actual datums in sequence {seqp}, whether open or circular. Note that this may be less than {dm_seq_num_positions}. */ dm_sample_t *dm_seq_get_sample_address(dm_seq_t *seqp, int i, int k); dm_sample_t dm_seq_get_sample(dm_seq_t *seqp, int i, int k); void dm_seq_set_sample(dm_seq_t *seqp, int i, int k, dm_sample_t s); /* These procedures return the address, return the value, and set the value of the sample in channel {k} of datum {i} from the sequence {*seqp}. If the sequence is circular, the index {i} is reduced modulo {seqp->dv.ne}; otherwise {i} must lie in the range {[0..seqp->dv.ne-1]}. Note that {i} is a sample index, not a sequence position. */ dm_datum_t *dm_seq_get_datum_address(dm_seq_t *seqp, int i); dm_datum_t dm_seq_get_datum(dm_seq_t *seqp, int i); void dm_seq_set_datum(dm_seq_t *seqp, int i, dm_datum_t d); /* These procedures return the address, return the value, and set the value of datum {i} from the sequence {*seqp}. If the sequence is circular, the index {i} is reduced modulo {seqp->dv.ne}; otherwise {i} must lie in the range {[0..seqp->dv.ne-1]}. Note that {i} is a sample index, not a sequence position. */ dm_seq_t dm_seq_from_datum_vec ( dm_seq_id_t id, char *name, int level, char *cmt, dm_datum_scale_t *sfac, dm_datum_vec_t dv, int nsub ); /* Assembles a sequence descriptor ({dm_seq_t}) with the given fields. The number of sequence positions is computed from the subsampling factor {nsub}, the number of samples {dv.ne}, and the circularity {circ}, as in {dm_seq_compute_num_positions}. */ dm_seq_t dm_seq_copy_sub(dm_seq_t* sr, int ix_ini, int ix_fin); /* Creates a heap copy of the segment of {*sr} starting from sample {ix_ini} to sample {ix_fin} inclusive. The sample vector is newly allocated so that {dm_seq_free_datums(sr)} will not affect the copy, and vice-versa. The sequence must not be subsampled (that is, {sr->nsub} must be 1). */ dm_seq_t dm_seq_copy_datums(dm_seq_t *sr); /* Creates a heap copy of {*sr}, including all its internal storage (so that {dm_seq_free_datums(sr)} will not affect the copy, and vice-versa). */ int dm_seq_compute_num_positions(int nsmp, int nsub, bool_t circ); /* Computes the number of matching positions in a sequence with {nsmp} samples, subsampled at {nsub} points per sample. If {circ} is TRUE, assumes that sequence is circular, otherwise assumes that it is open. */ /* SEQUENCE I/O The procedures in this section read or write a datum sequence {seq} from a file. The file format is described by the string {dm_seq_file_format_INFO} below. */ #define dm_seq_file_format_INFO \ "The file consists of \n" \ "\n" \ " a standard header line, \"begin {tname} (version of {vdate})\", where" \ " {tname} is {dm_seq_type_name} and {vdate} is {dm_seq_version}; \n" \ "\n" \ " zero or more comment lines, starting with \"|\"; \n" \ "\n" \ " a line \"level = {seq.sd.level}\" \n" \ "\n" \ " a line \"channels = {nc}\", where {nc} is the number of samples per datum (for now," \ " it must be equal to {dm_CHANNELS}); \n" \ "\n" \ " a line \"scale = {sfac[0]} {sfac[1]} .. sfac[nc-1]}\", where each {sfac[i]} is a" \ " fractional scale factor to be multiplied into all samples of the corresponding" \ " channel, after applying {dm_sample_decode}; \n" \ "\n" \ " a line \"circular = F\"; \n" \ "\n" \ " a line \"samples = {nsmp}\", where {nsmp} is the number {seq.dv.ne} of sample datums; \n" \ "\n" \ " {nsmp} lines containing the sample datums, one per line, each consisting" \ " of {dm_CHANNEL} encoded integers; \n" \ "\n" \ " a standard footer line, \"end {tname}\".\n" \ "" #define dm_seq_version "2008-01-29" #define dm_seq_type_name "encoded_bio_seq" dm_seq_t dm_seq_read(FILE *rd, int nsub); /* Reads a numeric DNA/RNA sequence from file {rd}, and turns it into a sequence with {nsub} subsampling points per actual sample. The sequence will have {id=dm_seq_id_none}, {name=NULL}. */ void dm_seq_write(FILE *wr, dm_seq_t *seqp); /* Writes a numeric DNA/RNA sequence to file {wr}. */ void dm_seq_write_named(dm_seq_t *seqp, char *name, char *tag); /* Same as {dm_seq_write}, but creates a disk file called"{name}{tag}.egs". */ /* DNA/RNA SEQUENCE I/O */ dm_seq_t dm_seq_from_nucleic_string(dm_seq_id_t id, char *name, char *cmt, char *bas); /* Converts the DNA/RNA string {bas} into a {dm_datum_t} sequence. The sequence's numeric and alphabetic identifiers will be {id} and {name}, respectively. The comment text will be {txt}. The sequence is assumed to be circular iff {circ} is true. */ dm_seq_t dm_seq_read_from_nucleic_file(dm_seq_id_t id, char *name, char *fname); /* Reads a DNA/RNA sequence from file "{fname}.bas". The sequence's numeric and alphabetic identifiers will be {id} and {name}, respectively. The sequence is assumed to be circular iff {circ} is true. */ dm_seq_t dm_seq_read_from_nucleic_simple(dm_seq_id_t id, char *name, char *fname); /* Same as above, but without the ".bas" junk */ /* POSTSCRIPT PLOTTING */ void dm_seq_postscript_plot(msm_ps_tools_t *dp, dm_seq_t *seqp); /* Generates a Postscript plot of the sequence {seqp}, written out to the Postscript plotting stream {dp}. Each channel is decoded and drawn with a different color. The plot uses the whole plotting area of {dp} (with a bit of slack). */ void dm_seq_postscript_plot_named ( dm_seq_t *seqp, double hSize, double vSize, double fontSize, char *name, char *tag ); /* Generates a Postscript plot of the sequence {seqp}, written out to the Encapsulated Postscript file called "{name}{tag}.eps". Each channel is decoded and drawn with a different color. The figure will have {hSize} by {vSize} millimeters, including a blank margin {msm_EPS_MARGIN_MM} millimeters wide all around. The spectrum plot is scaled to fit in the stated area. */ void dm_seq_free_datums(dm_seq_t *sr); /* Reclaims the internal storage of sequence {*sr} (but not {*sr} itself). */ void dm_seq_free(dm_seq_t *sr); /* Reclaims the sequence record {*sr} and all its internal storage. */ void dm_seq_multi_free_datums(dm_seq_t sr[], int maxLevel); /* Reclaims the internal storage of sequences {sr[0..maxLevel]} (but not the array {*sr} itself). */ void dm_seq_multi_free(dm_seq_t *sr[], int maxLevel); /* Reclaims the sequences {*(sr[0..maxLevel])} and all their internal storage. */ #endif