#define PROG_NAME "dbd_digest_EID" #define PROG_DESC "Extracts DNA and protein sequences from the Saxonov's EID database" #define PROG_VERS "1.0" /* Last edited on 2024-12-21 14:05:07 by stolfi */ #define dbd_digest_EID_C_COPYRIGHT "Copyright © 2006,2008 by IC-UFF and IC-UNICAMP" #define PROG_HELP \ PROG_NAME " \\\n" \ " [ -maxItems {MAX_ITEMS} ] \\\n" \ " " argparser_help_info_HELP " \\\n" \ " {D_EID_FILE} \\\n" \ " {P_EID_FILE} \\\n" \ " {H_EID_FILE} \\\n" \ " {OUTDIR}" #define PROG_INFO \ "NAME\n" \ " " PROG_NAME " - " PROG_DESC "\n" \ "\n" \ "SYNOPSIS\n" \ " " PROG_HELP "\n" \ "\n" \ "DESCRIPTION\n" \ " The program reads from standard input three EID-format dataset" \ " files (Saxonov et al., 2000), which must contain the same items in the" \ " same order. The program writes each valid item as four separate" \ " files (bases,labels, transcription, documentation), in a format" \ " suitable for the Bayesian coding region detection tools (Capua," \ " Leitão and Stolfi, CIARP 2007).\n" \ "\n" \ " An item may be considered invalid for several reasons, e.g." \ " inconsistent attributes found in the command line. The program" \ " writes to {stderr} a short summary of each item, indicating" \ " whether the item was considered invalid and why.\n" \ "\n" \ "OUTPUTS\n" \ " The inputs of the program are the three EID-format files" \ " named in the command line:\n" \ "\n" \ " {D_EID_FILE} -- with \".dEID\" extension, containing" \ " the DNA of whole genes, with introns and exons marked" \ " by upper- and lower-case letters, respectively;\n" \ "\n" \ " {P_EID_FILE} -- with \".pEID\" extension, containing the" \ " transcribed protein sequence; and\n" \ "\n" \ " {H_EID_FILE} -- with \".hEID\" extension, containing the" \ " item descriptions derived from the GenBank header lines.\n" \ "\n" \ "OUTPUTS\n" \ " For each valid item found in the input EID files, program creates" \ " four output files in the directory {OUTDIR}:\n" \ "\n" \ " \"{OUTDIR}/{ITEM}.bas\" -- the nucleotide" \ " sequence in (lowercase [atcg]);\n" \ "\n" \ " \"{OUTDIR}/{ITEM}.lab\" -- the corresponding" \ " functional labels (uppercase [A-Z]); and\n" \ "\n" \ " \"{OUTDIR}/{ITEM}.ama\" -- the aminoacid sequence" \ " encoded by the transcribed regions (uppercase [A-Z]); and\n" \ "\n" \ " \"{OUTDIR}/{ITEM}.doc\" -- the item's documentation" \ " from the \".hEID\" file (as in the input).\n" \ "\n" \ " The {ITEM} string in the file name is the item's" \ " identifier as given in the item's header lines, right" \ " after the initial \">\".\n" \ "\n" \ " Each output file will have a perfunctory '>' header in the" \ " format '> {ITEM} /gene=\"{GENE}\"'.\n" \ "\n" \ " The output label file uses the following encoding:\n" \ "\n" \ " 'D','E','F' -- the first, second, and third base of an aminoacid codon;\n" \ "\n" \ " 'H','I','J' -- the first, second, and third base of the stop codon;\n" \ "\n" \ " 'X' -- a base in the initial non-coding mRNA region (head UTR);\n" \ "\n" \ " 'Y' -- a base in the final non-coding mRNA region (tail UTR);\n" \ "\n" \ " 'P','Q' -- the first two bases in an intron (initial splice-marker);\n" \ "\n" \ " 'P','Q' -- the last two bases of an intron (final splice-marker);\n" \ "\n" \ " 'N' -- any other base of an intron.\n" \ "\n" \ " \"{OUTDIR}/{ITEM}.lab\" -- the corresponding" \ "\n" \ "OPTIONS\n" \ " -maxItems {MAX_ITEMS}\n" \ " This optional argument specifies the maximum number" \ " of items to be written out (not counting any items" \ " that are discarded for any reason).\n" \ "\n" \ "DOCUMENTATION OPTIONS\n" \ argparser_help_info_HELP_INFO "\n" \ "\n" \ "SEE ALSO\n" \ " dbd_tabulate(1), dbd_summarize(1), dbd_paint(1).\n" \ "\n" \ "AUTHOR\n" \ " Created 2005-10-05 by Renatha Oliva Capua, IC-UFF.\n" \ "\n" \ "MODIFICATION HISTORY\n" \ " 2008-06-06 Various changes by by J. Stolfi, IC-UNICAMP.\n" \ "\n" \ "WARRANTY\n" \ argparser_help_info_NO_WARRANTY "\n" \ "\n" \ "RIGHTS\n" \ " " dbd_digest_EID_C_COPYRIGHT ".\n" \ "\n" \ argparser_help_info_STANDARD_RIGHTS #include #include #include #include #include #include #include #include #include #include /* PROTOTYPES */ typedef struct Options_t { int maxItems; /* Max number of items to output. */ char *dFileName; /* Name of input ".dEID" file */ char *pFileName; /* Name of input ".pEID" file */ char *hFileName; /* Name of input ".hEID" file */ char *outDir; /* Name of directory where to put the output files. */ } Options_t; Options_t *get_options(int argc, char **argv); /* Parses the command line, returns them as an {Options_t} record. */ void item_error(char *fName, int lineNum, char *msg, char *etc); /* Prints an error message {msg} about the current item, but goes on. If {etc} is not NULL, prints it using {msg} as the format string. */ void process_files(FILE *dFile, FILE *pFile, FILE *hFile, Options_t *o); /* Extracts each item from the EID files {dFile,pFile,hFile}, writes it to files "{o->outDir}/{itemId}.{ext}" where {ext} is "bas", "lab", "ama", "doc" and the {itemId} is obtained from the item's '>'-lines. */ typedef struct item_data_t /* Global parsing state variables: */ { /* Item parameters read from header lines: */ char *itemId; /* Item identifier, or NULL before first '>' line. */ char *geneId; /* Gene identifier, or NULL before first '>' line. */ int CDS_start; /* Starting index of coding sequence (CDS). */ int CDS_end; /* Final index of coding sequence. */ int CDS_len; /* Num of coding nucleotides in CDS. */ bool_t badItem; /* Should the current item be ignored (TRUE) ou processed (FALSE)*/ } item_data_t; typedef struct parse_data_t /* Global parsing state variables: */ { /* Global parsing state: */ int nItemsRead; /* Number of items read. */ int nItemsGud; /* Number of items written out. */ int nItemsBad; /* Number of items discarded. */ /* Item parsing state: */ int dLineNum; /* Number of lines read from the \"dEID\" file. */ int pLineNum; /* Number of lines read from the \"pEID\" file. */ int hLineNum; /* Number of lines read from the \"hEID\" file. */ } parse_data_t; void read_d_item(FILE *f, char *fName, int *lineNumP, item_data_t *idat, char_vec_t *bas, char_vec_t *lab); void read_p_item(FILE *f, char *fName, int *lineNumP, item_data_t *idat, char_vec_t *ama); void read_h_item(FILE *f, char *fName, int *lineNumP, item_data_t *idat, char_vec_t *doc); /* These functions are called by {read_files} to process the body of one item of an EID file {f} of the specified type. They assume that the global data of {idat} has been initialized, and that the item header has been consumed and processed. They save the items' contents in the specified character vectors, and trim them to the just size. Upon exit, the file {f} is positioned just before EOF or just before the next item's '>' marker. */ bool_t parse_item_header ( FILE *f, char *fName, int *lineNumP, char **itemIdP, char **geneIdP, int *CDS_startP, int *CDS_endP, int *CDS_lenP ); /* Reads an item header from file {f}. Expects that the first character is '>'. Returns FALSE if the file was in EOF, fails with error if it does not find the header or finds a serious syntax error, and returns TRUE otherwise. In the last two cases, increments {*lineNumP}. Expects to find the item's identifier just after the '>' in the header line. If the {*itemIdP} parameter is NULL, sets it to the parsed string; else requires the two strings to agree. Also expects to find the gene identifier as a '/gene="..."' construct, and treats the {geneIdP} parameter in the same way. If any of the parameters "CDS_start", "CDS_end", and "CDS_len" are present on the header line, the procedure parses its value {v}. Then, if the corresponding variable {*CDS_startP}, {*CDS_endP}, and {*CDS_lenP} is {INT_MAX}, it sets that variable to {v}; otherwise it requires that its value be equal to {v}. */ bool_t start_item(FILE *dFile, FILE *pFile, FILE *hFile, item_data_t *idat, parse_data_t *pdat, Options_t *o); /* Reads an item header from each of the given files, updates the {idat} state accordingly. In particular, stores the header attributes into {idat}, initializes {idat.badItem} according to the validity of the headers, and increments the fields {pdat.dLineNum,pdat.pLineNum,pdat.hLineNum,pdat.nItemsRead}. Also prints a line to {stderr} with the item ID and data. Returns FALSE if there are no more items, fails with error on malformed file, returns TRUE on success. */ bool_t finish_item ( char_vec_t *bas, char_vec_t *lab, char_vec_t *ama, char_vec_t *doc, item_data_t *idat, parse_data_t *pdat, char *outDir ); /* Prints an item finalization line. Returns TRUE iff the item is to be written to {stdout}. */ void write_sequence(char_vec_t *seq, char *outDir, char *itemId, char *geneId, char *ext, bool_t wrap); /* Writes biossequence {seq.e[0..n-1]} to file "{outDir}/{itemId}.{kind}". If {wrap} is true, breaks the sequence into lines of reasonable length. */ int main(int argc, char **argv); /* IMPLEMENTATIONS */ int main(int argc, char **argv) { Options_t *o = get_options(argc, argv); FILE *dFile = open_read(o->dFileName, TRUE); FILE *pFile = open_read(o->pFileName, TRUE); FILE *hFile = open_read(o->hFileName, TRUE); process_files(dFile, pFile, hFile, o); return 0; } void process_files(FILE *dFile, FILE *pFile, FILE *hFile, Options_t *o) { item_data_t idat; parse_data_t pdat; pdat.nItemsRead = 0; pdat.nItemsGud = 0; pdat.nItemsBad = 0; pdat.dLineNum = 0; pdat.pLineNum = 0; pdat.hLineNum = 0; char_vec_t bas = char_vec_new(0); /* Nucleotides of item. */ char_vec_t lab = char_vec_new(0); /* Labeling of item. */ char_vec_t ama = char_vec_new(0); /* Aminoacids of encoded protein. */ char_vec_t doc = char_vec_new(0); /* Documentation text. */ /* Loop on items: */ while (start_item(dFile,pFile,hFile,&idat,&pdat,o)) { if (pdat.nItemsGud >= o->maxItems) { item_error(o->dFileName, pdat.dLineNum, "stopped by the \"-maxItems\" limit", NULL); break; } read_d_item(dFile, o->dFileName, &(pdat.dLineNum), &idat, &bas, &lab); read_p_item(pFile, o->pFileName, &(pdat.pLineNum), &idat, &ama); read_h_item(hFile, o->hFileName, &(pdat.hLineNum), &idat, &doc); finish_item(&bas, &lab, &ama, &doc, &idat, &pdat, o->outDir); } fprintf(stderr, "\n"); fprintf(stderr, "items read = %d\n", pdat.nItemsRead); fprintf(stderr, "items written = %d\n", pdat.nItemsGud); fprintf(stderr, "items discarded = %d\n", pdat.nItemsBad); free(bas.e); free(lab.e); free(ama.e); free(doc.e); } bool_t start_item(FILE *dFile, FILE *pFile, FILE *hFile, item_data_t *idat, parse_data_t *pdat, Options_t *o) { auto bool_t parse_hdr(FILE *f, char *fName, int *lineNum); /* Calls {parse_item_header} for file {f}, with line counter {lineNum}. */ bool_t parse_hdr(FILE *f, char *fName, int *lineNum) { return parse_item_header ( f, fName, lineNum, &(idat->itemId), &(idat->geneId), &(idat->CDS_start), &(idat->CDS_end), &(idat->CDS_len) ); } /* All items are good until evidence to the contrary: */ idat->badItem = FALSE; /* Reads header from each file, checks consistency of ids and params: */ idat->itemId = NULL; idat->geneId = NULL; idat->CDS_start = INT_MAX; idat->CDS_end = INT_MAX; idat->CDS_len = INT_MAX; bool_t dOk = parse_hdr(dFile, o->dFileName, &(pdat->dLineNum)); bool_t pOk = parse_hdr(pFile, o->pFileName, &(pdat->pLineNum)); bool_t hOk = parse_hdr(hFile, o->hFileName, &(pdat->hLineNum)); /* Are we at end of file? */ if ((dOk != pOk) || (dOk != hOk)) { if (dOk) { item_error(o->dFileName, pdat->dLineNum, "excess items in file", NULL); } if (pOk) { item_error(o->pFileName, pdat->pLineNum, "excess items in file", NULL); } if (hOk) { item_error(o->hFileName, pdat->hLineNum, "excess items in file", NULL); } exit(1); } if (! dOk) { return FALSE; } /* We have one more item: */ pdat->nItemsRead ++; /* Check whether all parameters are set: */ if (idat->itemId == NULL) { item_error(o->dFileName, pdat->dLineNum, "missing item ID field", NULL); exit(1); } if (idat->geneId == NULL) { item_error(o->dFileName, pdat->dLineNum, "missing gene ID field", NULL); exit(1); } if (idat->CDS_start == INT_MAX) { item_error(o->dFileName, pdat->dLineNum, "missing CDS_start field", NULL); exit(1); } if (idat->CDS_end == INT_MAX) { item_error(o->dFileName, pdat->dLineNum, "missing CDS_end field", NULL); exit(1); } if (idat->CDS_len == INT_MAX) { item_error(o->dFileName, pdat->dLineNum, "missing CDS_len field", NULL); exit(1); } fprintf(stderr, "\n"); fprintf(stderr, "%s:%d: item = %s\n", o->dFileName, pdat->dLineNum, idat->itemId); fprintf(stderr, "%s:%d: item = %s\n", o->pFileName, pdat->pLineNum, idat->itemId); fprintf(stderr, "%s:%d: item = %s\n", o->hFileName, pdat->hLineNum, idat->itemId); fprintf(stderr, " gene = %s\n", idat->geneId); fprintf(stderr, " CDS start = %d end = %d len = %d\n", idat->CDS_start, idat->CDS_end, idat->CDS_len); if ((idat->CDS_len % 3)!= 0) { item_error(o->dFileName, pdat->dLineNum, "fractional codon count", NULL); idat->badItem = TRUE; } return TRUE; } bool_t finish_item ( char_vec_t *bas, char_vec_t *lab, char_vec_t *ama, char_vec_t *doc, item_data_t *idat, parse_data_t *pdat, char *outDir ) { if (! idat->badItem) { /* Check consistency of {lab} and {ama}: */ int nD = 0, nE = 0, nF = 0; int i; for (i = 0; i < lab->ne; i++) { if (lab->e[i] == 'D') { nD++; } else if (lab->e[i] == 'E') { nE++; } else if (lab->e[i] == 'F') { nF++; } } if ((nD != nE) || (nD != nF)) { fprintf(stderr, " unequal D/E/F counts\n"); idat->badItem = TRUE; } else if (nD != ama->ne) { fprintf(stderr, " num codons differs from num aminoacids\n"); idat->badItem = TRUE; } } if (idat->badItem) { pdat->nItemsBad ++; fprintf(stderr, " discarded\n"); return FALSE; } else { pdat->nItemsGud ++; fprintf(stderr, " accepted\n"); /* Write gene bases and labels: */ write_sequence(bas, outDir, idat->itemId, idat->geneId, "bas", TRUE); write_sequence(lab, outDir, idat->itemId, idat->geneId, "lab", TRUE); write_sequence(ama, outDir, idat->itemId, idat->geneId, "ama", TRUE); write_sequence(doc, outDir, idat->itemId, idat->geneId, "doc", FALSE); return TRUE; } } bool_t parse_item_header ( FILE *f, char *fName, int *lineNumP, char **itemIdP, char **geneIdP, int *CDS_startP, int *CDS_endP, int *CDS_lenP ) { auto char *copy_substring(char *pt, char *qt); /* Makes a copy of the string from {*pt} up to but excluding {*qt}. */ auto void set_or_check_string_param(char **varP, char *key, char *pt, char *qt); /* If {*varP} is NULL, sets it to a copy of the substring between {*pt} inclusive and {*qt} exclusive; else requires {*varP} to be equal to that string. Uses {key} in error messages. */ auto void set_or_check_int_param(int *varP, char *key, int val); /* If {*varP} is {INT_MAX}, sets it to {val}, else requires it to be equal to {val}. Uses {key} in error messages. */ auto char *find_key(char *inf, char *key); /* Looks for a keyword "{key}=" in {inf}; if found, returns a pointer after the '=', else returns NULL. */ auto void extract_item_identifier(char *inf, char **varP); /* Sets/checks {*varP} to the first identifier in {*inf}. Fails if there is no such parameter. */ auto void extract_string_param(char *inf, char *key, char **varP); /* Sets/checks {*varP} to the value of string parameter {key}, if that parameter is present in {*inf}. A no-op if there is no such parameter. */ auto void extract_int_param(char *inf, char *key, int *varP); /* Sets/checks {*varP} to the value of integer parameter {key}, if that parameter is present in {*inf}. A no-op if there is no such parameter. */ /* Seek and skip the initial '>': */ int ch = fgetc(f); bool_t bol = TRUE; while (TRUE) { if (ch == EOF) { return FALSE; } if (bol) { (*lineNumP)++; } if (ch == '>') { break; } if ((ch != ' ') && (ch != '\011') && (ch != '\015') && (ch != '\012')) { item_error(fName, *lineNumP, "missing '>'", NULL); exit(1); } bol = (ch == '\n'); ch = fgetc(f); } char *inf = notnull(read_line(f), "failed reading item header"); extract_item_identifier(inf, itemIdP); extract_string_param(inf, "gene", geneIdP); extract_int_param(inf, "CDS_start", CDS_startP); extract_int_param(inf, "CDS_end", CDS_endP); extract_int_param(inf, "CDS_len", CDS_lenP); return TRUE; char *copy_substring(char *pt, char *qt) { char *res = notnull(malloc(qt - pt + 1), "no mem"); strncpy(res, pt, qt - pt); res[qt - pt] = '\000'; return res; } char *find_key(char *inf, char *key) { /* !!! Should require word boundary !!! */ char *pt = strstr(inf,key); /* Look for {key} in {inf}. */ if(pt == NULL) { return NULL; } /* Skip key: */ pt += strlen(key); /* Skip equals sign: */ if ((*pt) != '=') { item_error(fName, *lineNumP, "missing '=' after keyword \"%s\"", key); exit(1); } pt++; return pt; } void extract_item_identifier(char *inf, char **varP) { /* Skip blanks: */ char *pt = inf; while (((*pt) != '\000') &&((*pt) == ' ') && (*pt != '\n')) { pt++; } if ((*pt) == '\000') { /* Missing item ID: */ return; } /* Find end of item ID: */ char *qt = pt; while ((*qt != '\000') && (*qt != ' ') && (*qt != '\n')) { qt++; } set_or_check_string_param(varP, ">", pt, qt); } void extract_string_param(char *inf, char *key, char **varP) { char *pt = find_key(inf,key); /* Look for {key} in {inf}. */ if(pt == NULL) { /* Key not present in header: */ return; } /* Skip quote: */ if ((*pt) != '\"') { item_error(fName, *lineNumP, "missing open quote in \"%s\"", key); exit(1); } pt++; /* Skip the opening '\"'. */ char *qt = pt; while ((*qt != '\"') && (*qt != '\n') && (*qt != '\000')) { qt++; } if (*qt != '\"') { item_error(fName, *lineNumP, "missing close quote in \"%s\"", key); exit(1); } set_or_check_string_param(varP, key, pt, qt); } void extract_int_param(char *inf, char *key, int *varP) { char *pt = find_key(inf,key); /* Look for {key} in {inf}. */ if(pt == NULL) { /* Key not present in header: */ return; } /* Parse and skip algebraic sign {sgn}, if any: */ int sgn = +1; if (*pt == '-') { sgn = -1; pt++; } else if (*pt == '+') { pt++; } /* Parse absolute value {val}: */ int64_t val = 0; if ((*pt < '0') || (*pt > '9')) { item_error(fName, *lineNumP, "missing numeric value of \"%s\"", key); exit(1); } while((*pt >= '0') && (*pt <= '9')) { /* !!! Should check for overflow... */ val = 10*val + ((*pt) - '0'); pt++; } /* Set/check the variable: */ set_or_check_int_param(varP, key, sgn*val); } void set_or_check_string_param(char **varP, char *key, char *pt, char *qt) { if ((*varP) == NULL) { /* Save the parameter value in {*varP}: */ (*varP) = copy_substring(pt, qt); } else { /* Check the parameter value against {*varP}: */ int n = qt - pt; if ((strncmp((*varP), pt, n) != 0) || ((*varP)[n] != '\0')) { item_error(fName, *lineNumP, "inconsistent string attribute \"%s\"", key); exit(1); } } } void set_or_check_int_param(int *varP, char *key, int val) { if ((*varP) == INT_MAX) { /* Save the parameter value in {*varP}: */ (*varP) = val; } else { /* Check the parameter value against {*varP}: */ if ((*varP) != val) { item_error(fName, *lineNumP, "inconsistent int attribute \"%s\"", key); exit(1); } } } } void read_d_item(FILE *f, char *fName, int *lineNumP, item_data_t *idat, char_vec_t *bas, char_vec_t *lab) { int nFound = 0; /* Number of nucleotides scanned so far. */ int nCoding = 0; /* Number of coding nucleotides scanned so far. */ int nExon = 0; /* Number of exon nucleotides scanned so far. */ int nSaved = 0; /* Nucleotides found so far are {{bas,lab}.e[0..nSaved-1]}. */ auto void relabel_intron_splice_markers(void); /* Assumes that all introns of the item are marked with 'N' in the label string {lab}. Changes the labels of the first two and the last two bases of each intron to 'P', 'Q', 'R', and 'S', respectively. Fails if any intron has less than 4 bases. */ auto void relabel_stop_codon(void); /* Assumes that the last codon in the item's CDS is a stop codon. Expects it to be labeled "DEF" in {lab}. Relabels it with "HIJ". */ /* Loop on item body characters: */ bool_t bol = TRUE; int ch = fgetc(f); /* Current char in file. */ while(TRUE) { if (ch == EOF) { break; } if (bol) { /* One more line: */ (*lineNumP) ++; } if (ch == '\n') { /* Note beginning of line, then skip: */ bol = TRUE; } else { /* Character is not EOF nor newline. */ if ((ch == '>') && bol) { /* End of item. */ ungetc(ch, f); /* The '>' belongs to the next item. */ (*lineNumP) --; break; } else if ((ch == '\011') || (ch == ' ') || (ch == '\015')) { /* Blank, just skip: */ } else if (idat->badItem) { /* This item is bad, just skip: */ } else if (((ch >= 'A') && (ch <= 'Z')) || ((ch >= 'a') && ((ch) <= 'z'))) { /* Nucleotide code. */ nFound++; /* Note that {CDS_start,CDS_end} count from 1. */ /* Determine whether the letter is in the CDS and/or in exon: */ bool_t inExon = (ch >= 'A') && (ch <= 'Z'); /* Is this an exon letter? */ /* Determine the base letter {basChar} and its label {labChar}: */ char basChar, labChar; if (inExon) { /* Letter is part of an exon, map to lower case: */ basChar = ch - 'A' + 'a'; /* If we are in the CDS, then the letter is coding: */ if (nFound < idat->CDS_start) { /* Head UTR: */ labChar = 'X'; } else if (nFound > idat->CDS_end) { /* Tail UTR: */ labChar = 'Y'; } else { /* It is a coding letter: */ /* Compute the label for this base letter: */ switch(nCoding % 3) { case 0: labChar = 'D'; break; case 1: labChar = 'E'; break; case 2: labChar = 'F'; break; default: assert(FALSE); labChar = '?'; } /* We processed one more coding nucleotide: */ nCoding++; } /* We processed one more exon nucleotide: */ nExon++; } else { /* Letter is part of an intron, already in lower case: */ basChar = ch; labChar = 'N'; /* All introns are 'N' for now. */ } /* Save the nucleotide and its label: */ char_vec_expand(bas, nSaved); bas->e[nSaved] = basChar; char_vec_expand(lab, nSaved); lab->e[nSaved] = labChar; nSaved++; } else if (ch == '.') { /* We have got an unidentified base in the sequence. */ item_error(fName, (*lineNumP), "unidentified base", NULL); idat->badItem = TRUE; } else { /* Invalid character in file: */ item_error(fName, (*lineNumP), "bad character in \"dEID\" item body\n", NULL); exit(1); } /* The char we just processed was not a newline: */ bol = FALSE; } /* Get next character: */ ch = fgetc(f); } fprintf(stderr, " bases = %d exonic = %d coding = %d\n", nFound, nExon, nCoding); assert(nFound == nSaved); /* Check consistency with header-stated CDS length: */ if (idat->CDS_len != nCoding) { item_error(fName, (*lineNumP), "inconsistent CDS_len", NULL); idat->badItem = TRUE; } /* Trim {bas,lab} to correct size: */ char_vec_trim(bas, nSaved); char_vec_trim(lab, nSaved); /* Relabel splice markers and stop codon: */ relabel_intron_splice_markers(); relabel_stop_codon(); return; void relabel_intron_splice_markers(void) { /* Loop on introns: */ int ib = 0; while (ib < nSaved) { if (lab->e[ib] == 'N') { /* {lab->e[ib]} is start of intron; find the limit index {ie}: */ int ie = ib+1; while ((ie < nSaved) && (lab->e[ie] == 'N')) { ie++; } /* Now the intron is {lab.e[ib..ie-1]}. */ if (ie - ib < 4) { item_error(fName, (*lineNumP), "intron too short", NULL); idat->badItem = TRUE; } /* Relabel: */ lab->e[ib+0] = 'P'; lab->e[ib+1] = 'Q'; lab->e[ie-2] = 'R'; lab->e[ie-1] = 'S'; /* Skip intron: */ ib = ie; } else { ib ++; } } } void relabel_stop_codon(void) { /* Find last coding base: */ int ic = nSaved; do { ic --; } while ((ic >= 0) && ((lab->e[ic] < 'D') || (lab->e[ic] > 'F'))); /* Consistency checks: */ if (ic < 2) { fprintf(stderr, " the last coding base is bas[%d]\n", ic); item_error(fName, (*lineNumP), "not a single whole codon", NULL); idat->badItem = TRUE; } else if ((lab->e[ic-2] != 'D') || (lab->e[ic-1] != 'E') || (lab->e[ic-0] != 'F')) { fprintf(stderr, " the last codon has label lab[%d..%d] = ", ic-2, ic-0); int j; for (j = -5; j <= +3; j++) { if (((ic - j) >= 0) && ((ic - j) < nSaved)) { if (j == -2) { fprintf(stderr, "["); } fprintf(stderr, "%c", lab->e[ic-j]); if (j == 00) { fprintf(stderr, "]"); } } } fprintf(stderr, "\n"); item_error(fName, (*lineNumP), "last codon is truncated or garbled", NULL); idat->badItem = TRUE; } else { /* Relabel: */ lab->e[ic-2] = 'H'; lab->e[ic-1] = 'I'; lab->e[ic-0] = 'J'; } } } void read_p_item(FILE *f, char *fName, int *lineNumP, item_data_t *idat, char_vec_t *ama) { int nFound = 0; /* Number of aminoacids scanned so far. */ int nSaved = 0; /* Aminoacids saved so far are {ama.e[0..nSaved-1]}. */ /* Loop on item body characters: */ bool_t bol = TRUE; int ch = fgetc(f); /* Current char in file. */ while(TRUE) { if (ch == EOF) { break; } if (bol) { /* One more line: */ (*lineNumP)++; } if (ch == '\n') { /* Note beginning of line, then skip: */ bol = TRUE; } else { /* Character is not EOF nor newline. */ if ((ch == '>') && bol) { /* End of item. */ ungetc(ch, f); /* The '>' belongs to the next item. */ (*lineNumP) --; break; } else if ((ch == '\011') || (ch == ' ') || (ch == '\015')) { /* Blank, ignore: */ } else if (idat->badItem) { /* Bad item, just skip everything until the next '>'. */ } else if ((ch >= 'A') && (ch <= 'Z')) { /* Aminoacid code, save it: */ nFound++; /* Note that {CDS_start,CDS_end} count from 1. */ char_vec_expand(ama, nSaved); ama->e[nSaved] = ch; nSaved++; } else if (ch == '.') { /* We have got an unidentified aminaocid in the sequence. */ item_error(fName, (*lineNumP), "unidentified aminoacid", NULL); idat->badItem = TRUE; } else { /* Invalid character in file: */ item_error(fName, (*lineNumP), "bad character\n", NULL); exit(1); } /* The char we just processed was not a newline: */ bol = FALSE; } /* Get next character: */ ch = fgetc(f); } fprintf(stderr, " aminoacids read = %d\n", nFound); assert(nFound == nSaved); /* Trim {ama} to correct size: */ char_vec_trim(ama, nSaved); } void read_h_item(FILE *f, char *fName, int *lineNumP, item_data_t *idat, char_vec_t *doc) { int nFound = 0; /* Number of doc bytes scanned so far. */ int nSaved = 0; /* Doc bytes so far are {doc.e[0..nSaved-1]}. */ /* Loop on item body characters: */ bool_t bol = TRUE; int ch = fgetc(f); /* Current char in file. */ while(TRUE) { if (ch == EOF) { break; } if (bol) { /* One more line: */ (*lineNumP) ++; } if (ch == '\n') { /* Note beginning of line: */ bol = TRUE; } else { /* Character is not EOF nor newline. */ if ((ch == '>') && bol) { /* End of item. */ ungetc(ch, f); /* The '>' belongs to the next item. */ (*lineNumP) --; break; } bol = FALSE; } if (idat->badItem) { /* Bad item, just skip everything until the next '>'. */ } else if (ch == '\015') { /* CR, discard: */ } else { /* Save char in {doc}: */ nFound++; char_vec_expand(doc, nSaved); doc->e[nSaved] = ch; nSaved++; } /* Get next character: */ ch = fgetc(f); } fprintf(stderr, " doc bytes read = %d\n", nFound); assert(nFound == nSaved); /* Trim {doc} to correct size: */ char_vec_trim(doc, nSaved); } void write_sequence(char_vec_t *seq, char *outDir, char *itemId, char *geneId, char *kind, bool_t wrap) { char *fName = jsprintf("%s/%s.%s", outDir, itemId, kind); FILE *f = open_write(fName, FALSE); fprintf(f, "> %s /gene=\"%s\"\n", itemId, geneId); int i; for (i = 0; i < seq->ne; i++) { if (wrap && (i > 0) && (i % 80 == 0)) { fputc('\n', f); } fputc(seq->e[i], f); } if (seq->e[seq->ne-1] != '\n') { fputc('\n', f); } fclose(f); free(fName); } Options_t *get_options(int argc, char **argv) { Options_t *o = notnull(malloc(sizeof(Options_t)), "no mem"); argparser_t *pp = argparser_new(stderr, argc, argv); argparser_set_help(pp, PROG_HELP); argparser_set_info(pp, PROG_INFO); argparser_process_help_info_options(pp); /* Parse keyword parameters: */ if (argparser_keyword_present(pp, "-maxItems")) { o->maxItems = argparser_get_next_int(pp, 0, INT_MAX); } else { o->maxItems = INT_MAX; } /* Get positional parameters: */ argparser_skip_parsed(pp); o->dFileName = argparser_get_next(pp); o->pFileName = argparser_get_next(pp); o->hFileName = argparser_get_next(pp); o->outDir = argparser_get_next(pp); /* Check for spurious arguments: */ argparser_finish(pp); return o; } void item_error(char *fName, int lineNum, char *msg, char *etc) { fprintf(stderr, "%s:%d: **", fName, lineNum); if (etc == NULL) { fprintf(stderr, msg); } else { fprintf(stderr, msg, etc); } fprintf(stderr, "\n"); }