/* See grr_bypass.h */ /* Last edited on 2024-12-21 11:50:27 by stolfi */ #include #include #include #include #include #include #include #include #include Grammar_t *grr_remove_superfluous_symbols(Grammar_t *G) { fprintf(stderr, "eliminating superfluous passthrough symbols\n"); grr_debug = TRUE; /* Output numerically-coded input grammar for debugging. */ /* grr_write_Grammar(stderr, G); */ /* {scand[1..NSCand]} are the symbols that need to be checked for bypass: */ int NSCand = 0; GSymbId_t scand[G->NS+1]; /* {is_cand[S]} is true iff {S} is in {scand[1..NSCand]}: */ bool_t is_cand[G->NS+1]; /* {selim[S]} is true if symbol {S} is to be eliminated: */ bool_t selim[G->NS+1]; /* {ruleTot[0..NRTot]} are the current rules, incl. eliminated ones: */ int NRTot = 0; GRule_vec_t ruleTot = GRule_vec_new(G->NR+1); /* {relim[ir]} is true if rule {ruleNew[ir]} was discarded: */ bool_vec_t relim = bool_vec_new(G->NR+1); /* Copy the input rules and initialize the {relim} vector: */ { GRuleId_t ir; for (ir = 0; ir <= G->NR; ir++) { ruleTot.e[ir] = G->rule[ir]; relim.e[ir] = FALSE; } NRTot = G->NR; } /* New inverted list start pointers: */ GRuleId_t firstH[G->NS], firstL[G->NS], firstR[G->NS]; /* The inverted lists will grow as new rules get added, and they include all eliminated rules. Note that {firstH[S]} is an index into into {ruleTot}, not {G->rule}, and ditto for {firstL,firstR}. */ /* Number of symbols in the new grammar (excl. elim ones): */ int NSNew = G->NS; /* Number of rules in the new grammar (incl. new ones, excl. elim ones): */ int NRNew = G->NR; auto void grr_push_cand(GSymbId_t S); /* If {S} is not {NOSYMB} and is not yet in the stack {scand}, puts it there. */ auto GSymbId_t grr_pop_cand(void); /* Removes a symbol from the stack, returns it. */ auto void grr_process_cand(GSymbId_t S); /* Checks whether the candidate {S} can be bypassed, and if so performs the bypass. May push some new symbols into the queue. Returns TRUE iff the the bypass succeeded. Updates {ruleTot,relim,NRTot,selim,NSNew,firstH,firstL,firstR} as needed. */ auto bool_t grr_is_internal(GSymbId_t S); /* Returns TRUE iff {S} is an internal symbol. */ auto bool_t grr_has_reflexive_def(GSymbId_t S); /* TRUE iff there is a rule with {S} on both sides of the '->'. */ auto void grr_count_rules_defining(GSymbId_t S, int *ndef, bool_t *is_bin); /* Saves in {ndef} the number of distinct rules that have not been eliminated and have {S} as the LHS. Also sets {is_bin} to TRUE iff any of those rules is a binary one. */ auto int grr_compute_rule_increment(GSymbId_t S, int ndef, bool_t is_bin); /* Computes the number of new rules that would be added to the grammar if symbol {S} were to be replaced by its definitions, in all non-eliminated rules that use {S} in its RHS. Assumes that {S} is defined by {ndef} rules. Note that rules {H -> S}, {H -> S X}, or {H -> X S} generate {ndef} new rules, whereas {H -> S S} gnerates {ndef^2} new rules. The argument {is_bin} should be TRUE iff one of the rules defining {S} is binary. In that case, if {S} is also used in a binary rule, the procedure returns {INT_MAX} to indicate that the substitution cannot be made. */ auto void grr_append_rule(GSymbId_t H, GSymbId_t L, GSymbId_t R); /* Appends the rule {H -> L R} to the grammar, updating the lists {firstH,firstL,firstR}. */ auto void grr_subst_symb_in_rule(GSymbId_t S, GRule_t *r); /* Given a rule {r = (H -> R L)}, where {H != S}, appends to the grammar new rules obtained from it by replacing {S} in {L} and/or {R} by all its non-eliminated definitions. Note that if {S} has {m} such definitions, this procedure may add {0}, {k}, or {k^2} new rules to the grammar. Since this operation can only increase the number of rules definining {H} and using {L} and {R}, it cannot turn a non-viable candidate into a viable one. Therefore, this procedure will not push any symbols back into the {scand} stack. */ void grr_push_cand(GSymbId_t S) { if ((S != NOSYMB) && (! is_cand[S])) { NSCand++; scand[NSCand] = S; is_cand[S] = TRUE; grr_debug_symbol(stderr, G, " pushing ", S, "\n"); } } GSymbId_t grr_pop_cand(void) { demand(NSCand > 0, "stack underflow"); GSymbId_t S = scand[NSCand]; NSCand--; is_cand[S] = FALSE; grr_debug_symbol(stderr, G, " popping ", S, "\n"); return S; } void grr_append_rule(GSymbId_t H, GSymbId_t L, GSymbId_t R) { demand(H != NOSYMB, "null H"); demand(R != NOSYMB, "null R"); /* One more rule in new grammar: */ NRNew++; /* Append at end of {ruleTot}: */ NRTot++; GRuleId_t ir = NRTot; GRule_vec_expand(&ruleTot, ir); GRule_t *r = &(ruleTot.e[ir]); *r = (GRule_t){H,L,R, NORULE,NORULE,NORULE}; grr_debug_rule(stderr, G, " adding ", r, "\n"); /* Mark as not eliminated yet: */ bool_vec_expand(&relim, ir); relim.e[ir] = FALSE; /* Update inverted lists: */ r->nextH = firstH[H]; firstH[H] = ir; if (L != NOSYMB) { r->nextL = firstL[L]; firstL[L] = ir; } r->nextR = firstR[R]; firstR[R] = ir; } bool_t grr_is_internal(GSymbId_t S) { assert(S != NOSYMB); assert(G->sname[S] != NULL); char c = G->sname[S][0]; return ((c == '$') || (c == '*')); } bool_t grr_has_reflexive_def(GSymbId_t S) { GRuleId_t ir; ir = firstH[S]; while (ir != NORULE) { GRule_t *r = &(ruleTot.e[ir]); if (! relim.e[ir]) { assert(r->H == S); if ((r->L == S) || (r->R == S)) { return TRUE; } } ir = r->nextH; } return FALSE; } void grr_count_rules_defining(GSymbId_t S, int *ndef, bool_t *is_bin) { (*ndef) = 0; (*is_bin) = FALSE; GRuleId_t ir = firstH[S]; while (ir != NORULE) { GRule_t *r = &(ruleTot.e[ir]); assert(r->H == S); assert(r->R != NOSYMB); if (! relim.e[ir]) { (*ndef)++; if (r->L != NOSYMB) { (*is_bin) = TRUE; } } ir = r->nextH; } } int grr_compute_rule_increment(GSymbId_t S, int ndef, bool_t is_bin) { int nincr = 0; GRuleId_t ir; /* Scan uses of {S} as the {L} field: */ ir = firstL[S]; while (ir != NORULE) { GRule_t *r = &(ruleTot.e[ir]); /* Rule {r} is {H -> S Y} */ assert(r->L == S); assert(r->R != NOSYMB); if (! relim.e[ir]) { if (is_bin) { return INT_MAX; } if (r->R == S) { /* Rule {H -> S S} */ nincr += ndef*ndef - 1; } else { /* Rule {H -> S Y} */ nincr += ndef - 1; } } ir = r->nextL; } /* Scan uses of {S} as the {R} field: */ ir = firstR[S]; while (ir != NORULE) { GRule_t *r = &(ruleTot.e[ir]); /* Rule {r} is {H -> X S} */ assert(r->R == S); if (! relim.e[ir]) { if (r->L == NOSYMB) { /* Rule {r} is unary {H -> S} */ nincr += ndef - 1; } else if (r->L != S) { /* Rule {r} is binary {H -> X S}, with {X != S} */ if (is_bin) { return INT_MAX; } nincr += ndef; } } ir = r->nextR; } return nincr - ndef; } void grr_subst_symb_in_rule(GSymbId_t S, GRule_t *r) { grr_debug_rule(stderr, G, " replacing in ", r, "\n"); GSymbId_t H = r->H, L = r->L, R = r->R; assert(R != NOSYMB); if ((L == S) && (R == S)) { /* The given rule is {H -> S S} */ /* Loop on defintions for the first {S}: */ GRuleId_t ir0 = firstH[S]; while (ir0 != NORULE) { GRule_t *r0 = &(ruleTot.e[ir0]); /* {S -> X Y} */ if (! relim.e[ir0]) { /* Rule {r0} must be {S -> Y0}: */ assert(r0->H == S); demand(r0->L == NOSYMB, "replacing binary into binary"); GSymbId_t Y0 = r0->R; assert(Y0 != NOSYMB); /* Loop on defintions for the second {S}: */ GRuleId_t ir1 = firstH[S]; while (ir1 != NORULE) { GRule_t *r1 = &(ruleTot.e[ir1]); /* {S -> X Y} */ if (! relim.e[ir1]) { /* Rule {r1} must be {S -> Y1}: */ assert(r1->H == S); demand(r1->L == NOSYMB, "replacing binary into binary"); GSymbId_t Y1 = r1->R; assert(Y1 != NOSYMB); grr_append_rule(H,Y0,Y1); } ir1 = r1->nextH; } } ir0 = r0->nextH; } } else if (L == S) { /* The given rule is {H -> S R} with {R != S} */ GRuleId_t ir0 = firstH[S]; while (ir0 != NORULE) { GRule_t *r0 = &(ruleTot.e[ir0]); /* {S -> X Y} */ if (! relim.e[ir0]) { assert(r0->H == S); /* Rule must be {S -> Y}: */ demand(r0->L == NOSYMB, "replacing binary into binary"); GSymbId_t Y = r0->R; assert(Y != NOSYMB); grr_append_rule(H,Y,R); } ir0 = r0->nextH; } } else if (R == S) { /* The given rule is {H -> L S} with {L != S} */ GRuleId_t ir1 = firstH[S]; while (ir1 != NORULE) { GRule_t *r1 = &(ruleTot.e[ir1]); /* {S -> X Y} */ if (! relim.e[ir1]) { assert(r1->H == S); GSymbId_t X = r1->L; GSymbId_t Y = r1->R; assert(Y != NOSYMB); if (L == NOSYMB) { /* The definition is actually {H -> S}, so add {H -> X Y} */ grr_append_rule(H,X,Y); } else { /* Require {S -> X Y} to be unary: */ demand(X == NOSYMB, "replacing binary into binary"); /* Definition is {S -> Y}, so add {H -> L Y}: */ grr_append_rule(H,L,Y); } } ir1 = r1->nextH; } } } void grr_process_cand(GSymbId_t S) { if (! grr_is_internal(S)) { return; } grr_debug_symbol(stderr, G, " considering ", S, ""); if (grr_has_reflexive_def(S)) { if (grr_debug) { fprintf(stderr, " (reflexive)\n"); } return; } /* How many rules define the symbol {S}? */ int ndef; bool_t is_bin; grr_count_rules_defining(S, &ndef, &is_bin); if (grr_debug) { fprintf(stderr, " %d rules (%s)", ndef, (is_bin ? "some binary" : "all unary")); } /* See whether the bypass would make the grammar smaller. Namely, we compute the net increase {nincr} in the number of rules that would happen if we bypassed the symbol {S}. Note that the number of symbols wwould decrease by 1 in any case, and that {nincr} is {INT_MAX} if the replacement is impossible. */ int nincr = grr_compute_rule_increment(S, ndef, is_bin); if (nincr == INT_MAX) { if (grr_debug) { fprintf(stderr, " (can't replace)\n"); } return; } fprintf(stderr, " increment %+d rules", nincr); if (nincr > ndef) { if (grr_debug) { fprintf(stderr, " (not worth it)\n"); } return; } /* OK, we can and should do the bypass! */ if (grr_debug) { fprintf(stderr, " (eliminating)\n"); } /* Replace all rules that use {S} by their {S}-expansions: */ GRuleId_t ir; ir = firstL[S]; while (ir != NORULE) { GRule_t *r = &(ruleTot.e[ir]); assert(r->H != S); assert(r->L == S); if (! relim.e[ir]) { grr_subst_symb_in_rule(S, r); relim.e[ir] = TRUE; NRNew--; grr_debug_rule(stderr, G, " eliminating ", r, "\n"); } ir = r->nextL; } ir = firstR[S]; while (ir != NORULE) { GRule_t *r = &(ruleTot.e[ir]); assert(r->H != S); assert(r->R == S); if (! relim.e[ir]) { grr_subst_symb_in_rule(S, r); relim.e[ir] = TRUE; NRNew--; grr_debug_rule(stderr, G, " eliminating ", r, "\n"); } ir = r->nextR; } /* Eliminate the rules that define {S}: */ ir = firstH[S]; while (ir != NORULE) { GRule_t *r = &(ruleTot.e[ir]); assert(r->H == S); assert(r->R != NOSYMB); if (! relim.e[ir]) { relim.e[ir] = TRUE; NRNew--; grr_debug_rule(stderr, G, " eliminating ", r, "\n"); /* Eliminating a rule may cause its RHS symbols to become bypassable: */ grr_push_cand(r->L); grr_push_cand(r->R); } ir = r->nextH; } /* Eliminate the symbol {S}: */ selim[S] = TRUE; NSNew--; } /* Initialize the {scand} stack and the {selim} vector: */ { GSymbId_t S; is_cand[NOSYMB] = FALSE; selim[NOSYMB] = FALSE; scand[0] = NOSYMB; /* This entry is not used. */ for (S = 1; S <= G->NS; S++) { selim[S] = FALSE; is_cand[S] = FALSE; grr_push_cand(S); } } /* Copy the input inverted lists start pointers: */ { GSymbId_t S; for (S = 1; S <= G->NS; S++) { firstH[S] = G->firstH[S]; firstL[S] = G->firstL[S], firstR[S] = G->firstR[S]; } } assert(NSCand > 0); while (NSCand > 0) { /* Get some candidate: */ GSymbId_t S = grr_pop_cand(); grr_process_cand(S); } fprintf(stderr, " grammar reduced from (%d,%d) to (%d,%d)\n", G->NS, G->NR, NSNew, NRNew); /* Create new grammar: */ Grammar_t *GNew = grr_new_Grammar(NSNew,NRNew); /* Remove the discarded symbols, build old->new symbol table {dic}: */ GSymbId_t dic[G->NS+1]; { GSymbId_t SNew = 0, SOld; dic[NOSYMB] = NOSYMB; for (SOld = 1; SOld <= G->NS; SOld++) { if (! selim[SOld]) { SNew++; dic[SOld] = SNew; GNew->sname[SNew] = grr_copy_name(G->sname[SOld]); } else { dic[SOld] = NOSYMB; } } assert(NSNew == SNew); } /* Copy valid rules from {ruleTot} to {GNew->rule}, mapping the symbol ids: */ { GNew->rule[NORULE] = (GRule_t){NOSYMB,NOSYMB,NOSYMB, NORULE,NORULE,NORULE}; GRuleId_t irNew = 0; GRuleId_t irTot; for (irTot = 1; irTot <= NRTot; irTot++) { if (! relim.e[irTot]) { GRule_t *rulei = &(ruleTot.e[irTot]); GSymbId_t HOld = rulei->H, HNew = dic[HOld]; GSymbId_t LOld = rulei->L, LNew = dic[LOld]; GSymbId_t ROld = rulei->R, RNew = dic[ROld]; assert(HNew != NOSYMB); assert((LOld == NOSYMB) == (LNew == NOSYMB)); assert(RNew != NOSYMB); irNew++; GNew->rule[irNew] = (GRule_t){HNew,LNew,RNew, NORULE,NORULE,NORULE}; } } assert(irNew == NRNew); } /* Free temporary storage: */ free(ruleTot.e); free(relim.e); /* Rebuild inverted lists: */ grr_build_lists(GNew); return GNew; }