#define PROG_NAME "cfp_parser" #define PROG_DESC "the SELVA bootom-up parser for an arbitrary context-free grammar" #define PROG_VERSION "1.0" /* Copyright © 2003 by the State University of Campinas (UNICAMP). */ /* See the copyright, authorship, and warranty notice at end of file. */ /* Last edited on 2025-03-13 06:34:39 by stolfi */ #define PROG_HELP \ " " PROG_NAME " GRAMMAR.red PHRASE.phr AXIOM > CHART.chr" #define PROG_INFO \ "NAME\n" \ " " PROG_NAME " - " PROG_DESC "\n" \ "\n" \ "SYNOPSIS\n" \ PROG_HELP "\n" \ "\n" \ "DESCRIPTION\n" \ " ???\n" \ "\n" \ "AUTHORS\n" \ " Created apr/2003 by Lucien Fantin and Jorge Stolfi at IC-UNICAMP." #include #include #include #include #include #include #include #include #include #include #include /* PROTOTYPES */ Grammar_t *cfp_read_Grammar(FILE *rd); /* File format: the first line must be standard header "begin grammar (format of ...)"; see "filefmt.h". Then come two lines of the form "symbols = {NS}" and "rules = {NR}". Then come {NS} symbol lines, of the form "{S}: {name}" where {S} is the symbol number in the range {1..NS}, in order, and {name} is either a token without any embedded blanks or '#'. Then come {NR} rule lines of the form "{ir}: {H} {L} {R} {cmt}" where {ir} is a rule number, in {1..NR}; {H}, {L}, and {R} are symbol numbers in the range {1..NS}; and {cmt} is either empty or "#" followed by any string. The {L} field may also be 0, denoting a unary rule. The standard footer "end grammar" indicates the end of the input. Comment lines and/or blank lines are allowed after the header, before the footer, and just before the symbol and rule data sections. */ Phrase_t *cfp_read_Phrase(FILE *rd, int NS, char **sdic); /* File format: the first line must be standard header "begin phrase (format of ...)"; see "filefmt.h". Then come two lines of the form "nodes = {NN}" and "arcs = {NA}". Then come {NA} arc lines of the form "{IA}: {O} {D} {S} {VAL} {CMT}". Here {IA} is the arc number, ranging over {1..NA}, in order; {O,D} are node numbers in the range {1..NN}; {S} is a symbol name, as in the grammar's symbol table {sdic}; {VAL} is either a token without embedded blanks or '#'; and {CMT} is either empty or "#" followed by any string. The standard footer "end phrase" indicates the end of the input. Comment lines and/or blank lines are allowed after the header, before the footer, and just before the arc data section. */ int main(int argc, char **argv); Chart_t *cfp_parse(Phrase_t *P, Grammar_t *G); Grammar_t *cfp_new_Grammar(int NS, int NR); Phrase_t *cfp_new_Phrase(int NN, int NA); Chart_t *cfp_new_Chart(int NN); int cfp_triang_index(PNodeId_t i, PNodeId_t j); void cfp_set_Chart(Chart_t *C, GSymbId_t S, PNodeId_t i, PNodeId_t j, CSymb_t *n); CSymb_t *cfp_get_Chart(Chart_t *C, GSymbId_t S, PNodeId_t i, PNodeId_t j); CSymb_t *cfp_new_CSymb(GSymbId_t S, bool_t term, void *info); CRule_t *cfp_new_CRule(GRuleId_t irule, PNodeId_t cut, CSymb_t *Ln, CSymb_t *Rn, CRule_t *next); void cfp_add_CRule ( Chart_t *C, GRuleId_t irule, GSymbId_t H, GSymbId_t L, GSymbId_t R, CSymb_t *Ln, CSymb_t *Rn, PNodeId_t ini, PNodeId_t cut, PNodeId_t fin ); void cfp_init_Chart(Chart_t *C, Phrase_t *P); bool_t cfp_mark_reachable(Chart_t *C, GSymbId_t N, bool_t value); /* Sets {n->reachable = value} for every {CSymb_t} node that is reachable from the {CSymb_t} node with symbol {N} spanning the whole phrase. */ bool_t cpf_rule_tagging_rule(Grammar_t *G, GRule_t *gr); /* TRUE iff rule {gr} was internally generated by {gpp} to handle the rule-tagging mechanism. Namely, iff {gr} is of the form "N(ARGS) -> N{TAG}(ARGS)" or "N{TAGS}(ARGS) -> N{TAG}(ARGS)" where {N} is a non-terminal symbol, {TAGS} is two or more rule tags, {TAG} is a single rule tag, and ({ARGS}) is a list of marker values (possibly empty or missing). */ /* PLAIN CHART PRINTOUT */ void cfp_write_Chart( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, bool_t numeric, bool_t all ); void cfp_write_Chart_nonterminal( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, PNodeId_t i, PNodeId_t j, CSymb_t *n, bool_t numeric ); void cfp_write_Chart_terminal( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, PNodeId_t i, PNodeId_t j, CSymb_t *n, bool_t numeric ); void cfp_write_Chart_rule( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, PNodeId_t i, PNodeId_t j, GSymbId_t N, CRule_t *cr, bool_t numeric ); char *cfp_plot_span(PNodeId_t i, PNodeId_t m, PNodeId_t j, int NN); /* CHART PRINTOUT WITH AUXILIARY SYMBOL SUPPRESSION */ void cfp_write_Chart_expanded( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, bool_t numeric, bool_t all ); void cfp_write_Chart_nonterminal_expanded( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, PNodeId_t i, PNodeId_t j, CSymb_t *n, bool_t numeric ); void cfp_write_Chart_rule_expanded( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, PNodeId_t i, PNodeId_t j, GSymbId_t N, CRule_t *cr, bool_t numeric ); void cfp_write_Chart_nonterminal_seq( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, GSymbId_t N, /* Head symbol of expanded rule. */ int nS, /* Number of symbols in expanded rule. */ PNodeId_t cut[], /* Cut points of expanded chart rule. */ GSymbId_t S[], /* Symbol {S[i]} was parsed between {cut[i]} and {cut[i+1]}. */ bool_t numeric /* TRUE prints symbol numbers instead of names. */ ); char *cfp_plot_multispan(int n, PNodeId_t p[], int NN); void cfp_print_span(FILE *wr, PNodeId_t i, char *txspan, PNodeId_t j); void cfp_arg_error(char *msg); void cfp_file_error(char *where, int loc, char *msg); bool_t cfp_symbol_is_auxiliary(GSymbId_t N, Grammar_t *G); /* TRUE iff the symbol's name begins with '$' or '*'. */ bool_t cfp_symbol_is_function_labeled(GSymbId_t N, Grammar_t *G); /* TRUE iff the symbol's name has a ':' before the braces or parens. */ GSymbId_t cfp_lookup_symbol(char *name, int NS, char **sdic); /* Finds in {sdic} the index {S} in {1..NS} of the symbol with given {name}. Returns {NOSYMB} if not found. */ char *cfp_read_symbol_name(FILE *rd); /* Skips blanks then reads zero or more nonblank chars, up to (but not including) EOF or the first blank char. Returns a pointer to newly allocated area with the string and a terminating '\0'. */ char *cfp_read_arc_name(FILE *rd); /* Skips blanks then reads zero or more nonblank chars, up to (but not including) EOF or the first blank char. Returns a pointer to the string and a terminating '\0'. The string is newly allocated unless it is "-". */ void cfp_skip_comment_lines(FILE *rd); /* Reads zero or more lines that are either wholly blank or contain '#' as their first non-blank character. Also skips any leading spaces of the first non-comment line found. */ /* IMPLEMENTATIONS */ /* MAIN */ int main(int argc, char **argv) { if (argc != 5) { cfp_arg_error("bad args"); } char *nameG = argv[1]; /* Name of grammar file. */ char *nameP = argv[2]; /* Name of phrase file. */ char *nameO = argv[3]; /* Name prefix of output files. */ char *nameX = argv[4]; /* Name of axiom symbol. */ FILE *fG = open_read(nameG, TRUE); Grammar_t *G = cfp_read_Grammar(fG); FILE *fP = open_read(nameP, TRUE); Phrase_t *P = cfp_read_Phrase(fP, G->NS, G->sname); Chart_t *C = cfp_parse(P, G); GSymbId_t X = cfp_lookup_symbol(nameX, G->NS, G->sname); /* Axiom symbol ID. */ cfp_mark_reachable(C, X, TRUE); bool_t numeric, all; for (all = FALSE; all <= TRUE; all++) { for (numeric = FALSE; numeric <= TRUE; numeric++) { char *allTag = (all ? "-all" : "-axm"); char *numericTag = (numeric ? "-n" : "-s"); char *anTag = txtcat(allTag, numericTag); /* Write plain chart: */ FILE *fO_pl = open_write(txtcat3(nameO, anTag, "-pl.cht"), TRUE); cfp_write_Chart(fO_pl, C, P, G, numeric, all); fclose(fO_pl); /* Write expanded chart: */ FILE *fO_ex = open_write(txtcat3(nameO, anTag, "-ex.cht"), TRUE); cfp_write_Chart_expanded(fO_ex, C, P, G, numeric, all); fclose(fO_ex); } } return 0; } /* PARSING Parsing means building a chart that represents all possible parse trees of a given phrase {P} according to a given grammar {G}. The chart under construction is divided into two sections, the /complete/ entries and the /incomplete/ ones (the /agenda/). Um nó {n} da agenda ainda pode ter {T(n)} incompleto, e podem ser usados em {CRule_t}s ainda não criados. No primeiro caso, eles podem vir a ganhar novos {CRule_t}s filhos. Para um nó completo {n} {T(n)} já está completo, e nenhum descendente de {n} será alterado. */ Chart_t *cfp_parse(Phrase_t *P, Grammar_t *G) { Chart_t *C = cfp_new_Chart(P->NN); /* Agenda: */ PNodeId_t jA, iA; /* The chart consists of all entries {i,j,S} of {C} with {jiA}, or {j=jA}, {i=iA}, and {S>pA.S}. The remaining nodes in {C} constitute the agenda. */ cfp_init_Chart(C, P); for (jA = 2; jA <= C->NN; jA++) for (iA = jA-1; iA >= 1; iA--) { /* Get bucket {iA,jA} from chart: */ CSymb_t *nA = C->ent[cfp_triang_index(iA,jA)]; while (nA != NULL) { /* Scan entries of chart bucket {iA,jA}, in order of decreasing symbol id: */ GSymbId_t N = nA->symb; /* Neste ponto {T(nA)} está completo, {org(nA)=i, dst(nA)=j}. */ /* Process all unary rules {H --> N}: */ GRuleId_t irule = G->firstU[N]; /* A unary rule which uses {N} as the {right} part. */ while (irule != NORULE) { /* Rule number {irule} uses {N} as the {right} part. */ GRule_t *gr = &(G->rule[irule]); GSymbId_t H = gr->H, L = gr->L, R = gr->R; affirm(R == N, "R symb mismatch"); affirm(L == NOSYMB, "L used in unary rule"); cfp_add_CRule(C, irule, H,NOSYMB,N, NULL,nA, iA,0,jA); irule = gr->nextR; } /* Process all binary rules {H --> L N} such that {L} occurs ending at {iA}: */ PNodeId_t k; /* Candidate starting point of {L}. */ for (k = iA-1; k >= 1; k--) { /* !!! Should merge the lists {G->firstR[N]} and {C->ent[k,iA]} !!! */ GRuleId_t irule = G->firstR[N]; /* First rule which uses {N} as the {right} part. */ while (irule != NORULE) { /* Rule number {irule} uses {N} as the {right} part. */ GRule_t *gr = &(G->rule[irule]); GSymbId_t H = gr->H, L = gr->L, R = gr->R; affirm(R == N, "R symb mismatch"); affirm (L != NOSYMB, "null L in binary rule"); /* Neste ponto {cfp_get_Chart(L,k,iA)} está completa (i.e. no chart)! */ CSymb_t *p = cfp_get_Chart(C, L,k,iA); if (p != NULL) { cfp_add_CRule(C, irule, H,L,N, p,nA, k,iA,jA); } irule = gr->nextR; } } nA = nA->next; } } return C; } void cfp_init_Chart(Chart_t *C, Phrase_t *P) { PArcId_t iarc; for (iarc = 1; iarc <= P->NA; iarc++) { PArc_t *arc = P->arc[iarc]; GSymbId_t S = arc->symb; CSymb_t *n = cfp_new_CSymb(S,TRUE,(void*)arc); cfp_set_Chart(C, S, arc->org, arc->dst, n); } } void cfp_add_CRule ( Chart_t *C, GRuleId_t irule, GSymbId_t H, GSymbId_t L, GSymbId_t R, CSymb_t *Ln, CSymb_t *Rn, PNodeId_t ini, PNodeId_t cut, PNodeId_t fin ) { CSymb_t *m = cfp_get_Chart(C, H,ini,fin); if (m == NULL) { m = cfp_new_CSymb(H,FALSE,NULL); cfp_set_Chart(C, H,ini,fin, m); /* This new node should end up automatically in the agenda part of the chart. */ } CRule_t *cr = cfp_new_CRule(irule, cut, Ln, Rn, (CRule_t *)m->info); m->info = (void *)cr; } CSymb_t *cfp_new_CSymb(GSymbId_t S, bool_t term, void *info) { CSymb_t *n = (CSymb_t *)notnull(malloc(sizeof(CSymb_t)), "no mem"); n->symb = S; n->term = term; n->reachable = FALSE; n->info = info; n->next = NULL; return n; } CRule_t *cfp_new_CRule(GRuleId_t irule, PNodeId_t cut, CSymb_t *Ln, CSymb_t *Rn, CRule_t *next) { CRule_t *cr = (CRule_t *)notnull(malloc(sizeof(CRule_t)), "no mem"); cr->irule = irule; cr->cut = cut; cr->Ln = Ln; cr->Rn = Rn; cr->next = next; return cr; } CSymb_t *cfp_get_Chart(Chart_t *C, GSymbId_t S, PNodeId_t i, PNodeId_t j) { CSymb_t *p = C->ent[cfp_triang_index(i,j)]; while ((p != NULL) && (p->symb > S)) { p = p->next; } if ((p == NULL) || (p->symb != S)) { return NULL; } else { return p; } } void cfp_set_Chart(Chart_t *C, GSymbId_t S, PNodeId_t i, PNodeId_t j, CSymb_t *n) { CSymb_t **anext = &(C->ent[cfp_triang_index(i,j)]); CSymb_t *p = *anext; while ((p != NULL) && (p->symb > S)) { anext = &(p->next); p = *anext; } if (p != NULL) { affirm(p->symb != S, "dup symbol"); } affirm(n->next == NULL, "next not null"); n->next = p; (*anext) = n; } int cfp_triang_index(PNodeId_t i, PNodeId_t j) { return (j-1)*(j-2)/2+(j-i-1); } #define GrammarVersion "2004-11-01" Grammar_t *cfp_read_Grammar(FILE *rd) { Grammar_t *G; /* Read header "begin grammar (format of YYYY-MM-DD)" */ filefmt_read_header(rd, "grammar", GrammarVersion); cfp_skip_comment_lines(rd); /* Read number of symbols and number of rules. */ int NS = nget_int32(rd, "symbols"); fget_eol(rd); int NR = nget_int32(rd, "rules"); fget_eol(rd); /* Create grammar and initialize its tables: */ G = cfp_new_Grammar(NS, NR); /* Read symbol data: */ cfp_skip_comment_lines(rd); GSymbId_t S; for (S = 1; S <= NS; S++) { int N = fget_int32(rd); if (! fget_skip_spaces_and_test_char(rd, ':')) { cfp_file_error("symbol", S, "missing ':'"); } if (N != S) { cfp_file_error("symbol", S, "seq error"); } char *name = cfp_read_symbol_name(rd); /* Skip comments and end-of-line: */ fget_comment_or_eol(rd, '#', NULL); /* Store symb name in grammar: */ G->sname[S] = name; } /* Read rule data: */ cfp_skip_comment_lines(rd); GRuleId_t irule; for (irule = 1; irule <= NR; irule++) { int jrule = fget_int32(rd); if (! fget_skip_and_test_char(rd, ':')) { cfp_file_error("rule", irule, "missing ':'"); } if (jrule != irule) { cfp_file_error("rule", irule, "seq error"); } int H = fget_int32(rd); if ((H < 1) || (H > NS)) { cfp_file_error("rule", irule, "bad H field"); } int L = fget_int32(rd); if ((L < 0) || (L > NS)) { cfp_file_error("rule", irule, "bad L field"); } int R = fget_int32(rd); if ((R < 1) || (R > NS)) { cfp_file_error("rule", irule, "bad R field"); } /* Skip comments and end-of-line: */ fget_comment_or_eol(rd, '#', NULL); /* Store rule in grammar: */ if (L == 0) { /* Unary rule - prepend to the {firstU} list. */ L = NOSYMB; /* Just in case {NOSYMB} is not zero... */ if (R <= H) { cfp_file_error("rule", irule, "H >= R in unary rule"); } G->rule[irule] = (GRule_t){H,NOSYMB,R,G->firstU[R]}; G->firstU[R] = irule; } else { /* Binary rule - insert in the {firstR} list, sorting by {L}: */ GRuleId_t this = G->firstR[R], prev = NORULE; while ((this != NORULE) || (G->rule[this].L > L)) { prev = this; this = G->rule[this].nextR; } G->rule[irule] = (GRule_t){H,L,R,this}; if (prev == NORULE) { G->firstR[R] = irule; } else { G->rule[prev].nextR = irule; } } } /* Read footer "end grammar" */ cfp_skip_comment_lines(rd); filefmt_read_footer(rd, "grammar"); return G; } #define PhraseVersion "2004-11-01" Phrase_t *cfp_read_Phrase(FILE *rd, int NS, char **sdic) { Phrase_t *P; /* Read header "begin phrase (format of YYYY-MM-DD)" */ filefmt_read_header(rd, "phrase", PhraseVersion); cfp_skip_comment_lines(rd); /* Read number of nodes and arcs. */ int NN = nget_int32(rd, "nodes"); fget_eol(rd); int NA = nget_int32(rd, "arcs"); fget_eol(rd); /* Create phrase: */ P = cfp_new_Phrase(NN, NA); /* Read arcs: */ cfp_skip_comment_lines(rd); PArcId_t iarc; for (iarc = 1; iarc <= NA; iarc++) { int jarc = fget_int32(rd); if (! fget_skip_and_test_char(rd, ':')) { cfp_file_error("arc", iarc, "missing ':'"); } if (jarc != iarc) { cfp_file_error("arc", iarc, "seq error"); } PNodeId_t org = fget_int32(rd); if ((org < 0) || (org > NN)) { cfp_file_error("arc", iarc, "bad org field"); } PNodeId_t dst = fget_int32(rd); if ((dst < 1) || (dst > NN)) { cfp_file_error("arc", iarc, "bad dst field"); } if (org >= dst) { cfp_file_error("arc", iarc, "backwards arc"); } char *Sname = cfp_read_symbol_name(rd); GSymbId_t S = cfp_lookup_symbol(Sname, NS, sdic); if (S == NOSYMB) { cfp_file_error("arc", iarc, "unknown symbol"); } free(Sname); char *Aname = cfp_read_arc_name(rd); /* Skip comments and end-of-line: */ fget_comment_or_eol(rd, '#', NULL); /* Store arc in phrase: */ PArc_t *a = (PArc_t *)notnull(malloc(sizeof(PArc_t)), "no mem"); (*a) = (PArc_t){S,org,dst,Aname}; P->arc[iarc] = a; } /* Read footer "end phrase" */ cfp_skip_comment_lines(rd); filefmt_read_footer(rd, "phrase"); return P; } char *cfp_read_symbol_name(FILE *rd) { return fget_string(rd); } GSymbId_t cfp_lookup_symbol(char *name, int NS, char **sdic) { GSymbId_t S; for (S = 1; S <= NS; S++) { char *Sn = sdic[S]; if (strcmp(name,Sn) == 0) { return S; } } return NOSYMB; } char *cfp_read_arc_name(FILE *rd) { if (fget_skip_and_test_char(rd, '-')) { return "-"; } else { return fget_string(rd); } } void cfp_skip_comment_lines(FILE *rd) { while(1) { bool_t ok = fget_test_comment_or_eol(rd, '#', NULL); if (! ok) { return; } } } void cfp_file_error(char *where, int loc, char *msg) { fprintf(stderr, "data file error:"); if (where != NULL) { fprintf(stderr, " %s %d:", where, loc); } fprintf(stderr, " ** %s\n", msg); exit(1); } void cfp_arg_error(char *msg) { fprintf(stderr, "** %s: %s\n", PROG_NAME, msg); fprintf(stderr, "usage: %s\n", PROG_HELP); exit(1); } Grammar_t *cfp_new_Grammar(int NS, int NR) { Grammar_t *G = (Grammar_t *)notnull(malloc(sizeof(Grammar_t)), "no mem"); G->NS = NS; G->NR = NR; G->rule = (GRule_t *)notnull(malloc((NR+1)*sizeof(GRule_t)), "no mem"); G->sname = (char **)notnull(malloc((NS+1)*sizeof(char *)), "no mem"); G->firstR = (GRuleId_t *)notnull(malloc((NS+1)*sizeof(GRuleId_t)), "no mem"); G->firstU = (GRuleId_t *)notnull(malloc((NS+1)*sizeof(GRuleId_t)), "no mem"); /* Initialize the rule table, just in case: */ GRuleId_t irule; for (irule = 0; irule <= NR; irule++) { G->rule[irule] = (GRule_t){NOSYMB, NOSYMB, NOSYMB, NORULE}; } /* Initialize the symbol-indexed arrays: */ GSymbId_t S; for (S = 0; S <= NS; S++) { G->firstU[S] = NORULE; G->firstR[S] = NORULE; G->sname[S] = NULL; } return G; } Phrase_t *cfp_new_Phrase(int NN, int NA) { Phrase_t *P = (Phrase_t *)notnull(malloc(sizeof(Phrase_t)), "no mem"); P->NN = NN; P->NA = NA; P->arc = (PArc_t **)notnull(malloc((NA+1)*sizeof(PArc_t *)), "no mem"); return P; } Chart_t *cfp_new_Chart(int NN) { Chart_t *C = (Chart_t *)notnull(malloc(sizeof(Chart_t)), "no mem"); C->NN = NN; C->ent = (CSymb_t **)notnull(malloc(NN*(NN-1)/2*sizeof(CSymb_t *)), "no mem"); /* Initialize the triangular array: */ PNodeId_t i,j; for (i = 1; i <= NN; i++) { for (j = i+1; j <= NN; j++) { int ix = cfp_triang_index(i,j); C->ent[ix] = NULL; } } return C; } bool_t cfp_mark_reachable(Chart_t *C, GSymbId_t N, bool_t value) { auto void mark_all(CSymb_t *n); /* Sets {m->reachable = value} for every node {m} reachable from {n}. */ PNodeId_t i = 1, j = C->NN; /* Get bucket {i,j} from chart: */ CSymb_t *n = C->ent[cfp_triang_index(i,j)]; /* Scan entries of chart bucket {i,j}, in order of decreasing symbol id: */ while ((n != NULL) && (n->symb != N)) { n = n->next; } if (n == NULL) { return FALSE; } else { mark_all(n); return TRUE; } void mark_all(CSymb_t *n) { if ((n != NULL) && (n->reachable != value)) { n->reachable = value; if (! n->term) { CRule_t *cr = (CRule_t *)n->info; affirm(cr != NULL, "CSymb_t with zero rules"); while (cr != NULL) { mark_all(cr->Ln); mark_all(cr->Rn); cr = cr->next; } } } } } void cfp_write_Chart( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, bool_t numeric, bool_t all ) { PNodeId_t i,j; affirm(C->NN == P->NN, "P/C NN mismatch"); for (i = 1; i <= C->NN; i++) { for (j = C->NN; j >= i+1; j--) { /* Get bucket {i,j} from chart: */ CSymb_t *n = C->ent[cfp_triang_index(i,j)]; /* Scan entries of chart bucket {i,j}, in order of decreasing symbol id: */ while (n != NULL) { if (all || n->reachable) { if (n->term) { cfp_write_Chart_terminal(wr, C, P, G, i, j, n, numeric); } else { cfp_write_Chart_nonterminal(wr, C, P, G, i, j, n, numeric); } } n = n->next; } } } } void cfp_write_Chart_terminal( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, PNodeId_t i, PNodeId_t j, CSymb_t *n, bool_t numeric ) { GSymbId_t N = n->symb; PArc_t *a = (PArc_t *)n->info; affirm(a->symb == N, "term symb mismatch"); char *txspan = cfp_plot_span(i, NONODE, j, C->NN); cfp_print_span(wr, i, txspan, j); free(txspan); if (numeric) { fprintf(wr, " s%d --> «%d..%d»", N, a->org, a->dst); } else { fprintf(wr, " %s --> «%s»", G->sname[N], a->aname); } fprintf(wr, "\n"); } void cfp_write_Chart_nonterminal( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, PNodeId_t i, PNodeId_t j, CSymb_t *n, bool_t numeric ) { GSymbId_t N = n->symb; CRule_t *cr = (CRule_t *)n->info; affirm(cr != NULL, "CSymb_t with zero rules"); while (cr != NULL) { cfp_write_Chart_rule(wr, C, P, G, i, j, N, cr, numeric); cr = cr->next; } } void cfp_write_Chart_rule( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, PNodeId_t i, PNodeId_t j, GSymbId_t N, CRule_t *cr, bool_t numeric ) { GRule_t *gr = &(G->rule[cr->irule]); PNodeId_t cut = cr->cut; char *txspan = cfp_plot_span(i,cut,j, C->NN); cfp_print_span(wr, i, txspan, j); free(txspan); affirm (gr->H == N, "H symb mismatch"); if (gr->L == NOSYMB) { /* Instance of a unary rule. */ affirm(cut == NONODE, "nonzero cut in unary rule"); affirm (cr->Ln == NULL, "non-null L node in unary rule"); if (numeric) { fprintf(wr, " s%d --> s%d", N, gr->R); } else { fprintf(wr, " %s --> %s", G->sname[N], G->sname[gr->R]); } } else { /* Instance of a binary rule. */ affirm ((i <= cut) && (cut <= j), "bad cut"); affirm (cr->Ln->symb == gr->L, "L symb mismatch"); if (numeric) { fprintf(wr, " s%d --> s%d [%d] s%d", N, gr->L, cr->cut, gr->R); } else { fprintf(wr, " %s --> %s %s", G->sname[N], G->sname[gr->L], G->sname[gr->R]); } } affirm (cr->Rn->symb == gr->R, "R symb mismatch"); fprintf(wr, "\n"); } void cfp_write_Chart_expanded( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, bool_t numeric, bool_t all ) { PNodeId_t i,j; affirm(C->NN == P->NN, "P/C NN mismatch"); for (i = 1; i <= C->NN; i++) { for (j = C->NN; j >= i+1; j--) { /* Get bucket {i,j} from chart: */ CSymb_t *n = C->ent[cfp_triang_index(i,j)]; /* Scan entries of chart bucket {i,j}, in order of decreasing symbol id: */ while (n != NULL) { if (all || n->reachable) { if (n->term) { cfp_write_Chart_terminal(wr, C, P, G, i, j, n, numeric); } else if ( (! cfp_symbol_is_auxiliary(n->symb, G)) && (! cfp_symbol_is_function_labeled(n->symb, G)) ) { cfp_write_Chart_nonterminal_expanded ( wr, C, P, G, i, j, n, numeric ); } } n = n->next; } } } } void cfp_write_Chart_nonterminal_expanded( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, PNodeId_t i, PNodeId_t j, CSymb_t *n, bool_t numeric ) { GSymbId_t N = n->symb; CRule_t *cr = (CRule_t *)n->info; affirm(cr != NULL, "CSymb_t with zero rules"); while (cr != NULL) { cfp_write_Chart_rule_expanded(wr, C, P, G, i, j, N, cr, numeric); cr = cr->next; } } typedef void cfp_exrule_op_t(void); /* A procedure to be called by enumeration procedures. */ #define MAX_EXP_SYMBOLS 100 void cfp_write_Chart_rule_expanded( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, PNodeId_t i, PNodeId_t j, GSymbId_t N, CRule_t *cr, bool_t numeric ) { /* Pilha de recursão: */ int nS = 0; PNodeId_t cut[MAX_EXP_SYMBOLS + 1]; GSymbId_t S[MAX_EXP_SYMBOLS]; auto void print_rule(void); /* Prints the rule defined by {N,S[0..nS-1],cut[0..nS-1]}. Appends {j} as the last cutpoint. */ auto void expand_rule(CRule_t *cr, cfp_exrule_op_t op); /* Enumerates all expansions of {cr} that contain no internal symbols, appends each expansion to {S[0..nS-1],cut[0..nS]} and calls {op} for each of them. */ auto void expand_symbol(CSymb_t *n, cfp_exrule_op_t op); /* Enumerates all expansions of {n} that contain no internal symbols, appends each expansion to {S[0..nS-1],cut[0..nS]} and calls {op} for each of them. In particular, if {n} is an external symbol, just appends {n->symb}. */ /* Omit non-interesting rules: */ if (cpf_rule_tagging_rule(G, &(G->rule[cr->irule]))) { return; } cut[0] = i; expand_rule(cr, print_rule); return; void expand_rule(CRule_t *cr, cfp_exrule_op_t op) { auto void expand_right(void); /* Enumerates all expansions of {cr->Rn} that contain no internal symbols, appends each expansion to {S[0..nS-1],cut[0..nS]} and calls {op} for each of them. */ if (cr->Ln == NULL) { expand_symbol(cr->Rn, op); } else { expand_symbol(cr->Ln, expand_right); } void expand_right(void) { cut[nS] = cr->cut; expand_symbol(cr->Rn, op); } } void expand_symbol(CSymb_t *n, cfp_exrule_op_t op) { if ((! n->term) && cfp_symbol_is_auxiliary(n->symb, G)) { CRule_t *cr = (CRule_t *)n->info; affirm(cr != NULL, "CSymb_t with zero rules"); while (cr != NULL) { expand_rule(cr, op); cr = cr->next; } } else { demand(nS < MAX_EXP_SYMBOLS, "max rule length exceeded"); S[nS] = n->symb; nS++; op(); nS--; } } void print_rule(void) { cut[nS] = j; cfp_write_Chart_nonterminal_seq(wr, C, P, G, N, nS, cut, S, numeric); } } bool_t cpf_rule_tagging_rule(Grammar_t *G, GRule_t *gr) { if (gr->L != NOSYMB) { /* Not unary: */ return FALSE; } GSymbId_t H = gr->H; char *pH = G->sname[H]; GSymbId_t R = gr->R; char *pR = G->sname[R]; bool_t taggedR = FALSE; /* TRUE iff {R} has a rule tag. */ while (TRUE) { /* Skip {H} rule tag: */ if (*pH == '{') { bool_t commaH = FALSE; /* TRUE iff {H}'s rule tag has commas. */ pH++; /* Skip the opening brace. */ while (*pH != '}') { assert(*pH != '\000'); if (*pH == ',') { commaH = TRUE; } else if (((*pH < 'a') || (*pH > 'z')) && ((*pH < '0') || (*pH > '9'))) { assert(FALSE); } pH++; } pH++; /* Skip the closing brace. */ if (! commaH) { return FALSE; } } /* Skip {R} rule tag: */ if (*pR == '{') { taggedR = TRUE; pR++; /* Skip the opening brace. */ while (*pR != '}') { assert(*pR != '\000'); if (*pR == ',') { return FALSE; } else if (((*pR < 'a') || (*pR > 'z')) && ((*pR < '0') || (*pR > '9'))) { assert(FALSE); } pR++; }; pR++; /* Skip the closing brace. */ } if ((*pH == '\000') && (*pR == '\000')) { return taggedR; } if (*pH != *pR) { return FALSE; } pH++; pR++; } } void cfp_write_Chart_nonterminal_seq( FILE *wr, Chart_t *C, Phrase_t *P, Grammar_t *G, GSymbId_t N, /* Head symbol of expanded rule. */ int nS, /* Number of symbols in expanded rule. */ PNodeId_t cut[], /* Cut points of expanded chart rule. */ GSymbId_t S[], /* Symbol {S[i]} was parsed between {cut[i]} and {cut[i+1]}. */ bool_t numeric /* TRUE prints symbol numbers instead of names. */ ) { /* Plot span and cuts: */ { char *txspan = cfp_plot_multispan(nS, cut, C->NN); cfp_print_span(wr, cut[0], txspan, cut[nS]); free(txspan); } if (numeric) { /* Print rule with numeric symbol codes "sNN --> [N] sNN [N] sNN ... [N]": */ fprintf(wr, " s%d --> [%d]", N, cut[0]); int i; for (i = 0; i < nS; i++) { fprintf(wr, " s%d [%d]", S[i], cut[i+1]); } } else { /* Print rule with full symbol names: */ fprintf(wr, " %s -->", G->sname[N]); int i; for (i = 0; i < nS; i++) { fprintf(wr, " %s", G->sname[S[i]]); } } fprintf(wr, "\n"); } bool_t cfp_symbol_is_auxiliary(GSymbId_t N, Grammar_t *G) { char *p = G->sname[N]; return (p[0] == '$') || (p[0] == '*'); } bool_t cfp_symbol_is_function_labeled(GSymbId_t N, Grammar_t *G) { char *p = G->sname[N]; while (((*p) != '\000') && ((*p) != '{') && ((*p) != '(')) { if ((*p) == ':') { return TRUE; } p++; } return FALSE; } void cfp_print_span(FILE *wr, PNodeId_t i, char *txspan, PNodeId_t j) { char *txi = jsprintf(&txi, "[%d]", i); char *txj = jsprintf(&txj, "[%d]", j); fprintf(wr, " %5s %s %-5s", txi, txspan, txj); free(txi); free(txj); } char *cfp_plot_span(PNodeId_t i, PNodeId_t m, PNodeId_t j, int NN) { affirm(i > 0, "bad i"); affirm(j > i, "bad span"); /* Includes an initial '|', {NN} plot positions, another '|', and a '\000'. */ char *plot = (char *)notnull(malloc((NN+3)*sizeof(char)), "no mem"); int k; for (k = 0; k <= NN; k++) { plot[k] = ' '; } plot[0] = '|'; plot[NN+1] = '|'; plot[i] = '+'; for (k = i+1; k <= j-1; k++) { plot[k] = (k == m ? '+' : '-'); } plot[j] = '+'; plot[NN+2] = '\000'; return plot; } char *cfp_plot_multispan(int n, PNodeId_t p[], int NN) { /* Includes an initial '|', {NN} plot positions, another '|', and a '\000'. */ char *plot = (char *)notnull(malloc((NN+3)*sizeof(char)), "no mem"); { int k; for (k = 0; k <= NN; k++) { plot[k] = ' '; } } plot[0] = '|'; plot[NN+1] = '|'; { int i = p[0], j = p[n]; assert(i > 0); assert(j >= i); assert(j <= NN); plot[i] = '+'; int k; for (k = i+1; k <= j-1; k++) { plot[k] = '-'; } plot[j] = '+'; } { int r; for (r = 1; r < n; r++) { int k = p[r]; assert(k > p[r-1]); assert(k < p[r+1]); plot[k] = '+'; } } plot[NN+2] = '\000'; return plot; }