/* See grr_bypass.h */
/* Last edited on 2024-12-21 11:50:27 by stolfi */

#include <limits.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>

#include <vec.h>
#include <affirm.h>

#include <grr_grammar.h>
#include <grr_bypass.h>

Grammar_t *grr_remove_superfluous_symbols(Grammar_t *G)
  {
    fprintf(stderr, "eliminating superfluous passthrough symbols\n"); 
    grr_debug = TRUE; 
    
    /* Output numerically-coded input grammar for debugging. */
    /* grr_write_Grammar(stderr, G); */

    /* {scand[1..NSCand]} are the symbols that need to be checked for bypass: */
    int NSCand = 0;
    GSymbId_t scand[G->NS+1]; 
    /* {is_cand[S]} is true iff {S} is in {scand[1..NSCand]}: */
    bool_t is_cand[G->NS+1];
    
    /* {selim[S]} is true if symbol {S} is to be eliminated: */
    bool_t selim[G->NS+1];

    /* {ruleTot[0..NRTot]} are the current rules, incl. eliminated ones: */
    int NRTot = 0;
    GRule_vec_t ruleTot = GRule_vec_new(G->NR+1);

    /* {relim[ir]} is true if rule {ruleNew[ir]} was discarded: */
    bool_vec_t relim = bool_vec_new(G->NR+1);
    
    /* Copy the input rules and initialize the {relim} vector: */
    { GRuleId_t ir;
      for (ir = 0; ir <= G->NR; ir++)
        { ruleTot.e[ir] = G->rule[ir]; relim.e[ir] = FALSE; }
      NRTot = G->NR;
    }
    
    /* New inverted list start pointers: */
    GRuleId_t firstH[G->NS], firstL[G->NS], firstR[G->NS];
    /* The inverted lists will grow as new rules get added, and they
      include all eliminated rules. Note that {firstH[S]} is an index 
      into into {ruleTot}, not {G->rule}, and ditto for {firstL,firstR}. */
    
    /* Number of symbols in the new grammar (excl. elim ones): */
    int NSNew = G->NS;

    /* Number of rules in the new grammar (incl. new ones, excl. elim ones): */
    int NRNew = G->NR;

    auto void grr_push_cand(GSymbId_t S);
      /* If {S} is not {NOSYMB} and is not yet in the stack {scand}, puts it there. */
       
    auto GSymbId_t grr_pop_cand(void);
      /* Removes a symbol from the stack, returns it. */
    
    auto void grr_process_cand(GSymbId_t S);
      /* Checks whether the candidate {S} can be bypassed, and if so
        performs the bypass. May push some new symbols into the queue.
        Returns TRUE iff the the bypass succeeded. Updates
        {ruleTot,relim,NRTot,selim,NSNew,firstH,firstL,firstR} as
        needed. */

    auto bool_t grr_is_internal(GSymbId_t S);
      /* Returns TRUE iff {S} is an internal symbol. */

    auto bool_t grr_has_reflexive_def(GSymbId_t S);
      /* TRUE iff there is a rule with {S} on both sides of the '->'. */

    auto void grr_count_rules_defining(GSymbId_t S, int *ndef, bool_t *is_bin);
      /* Saves in {ndef} the number of distinct rules that have
        not been eliminated and have {S} as the LHS. Also sets
        {is_bin} to TRUE iff any of those rules is a binary one. */

    auto int grr_compute_rule_increment(GSymbId_t S, int ndef, bool_t is_bin);
    /* Computes the number of new rules that would be added to the 
      grammar if symbol {S} were to be replaced by its definitions,
      in all non-eliminated rules that use {S} in its RHS. 
      Assumes that {S} is defined by {ndef} rules. Note that rules
      {H -> S}, {H -> S X}, or {H -> X S} generate {ndef} new rules,
      whereas {H -> S S} gnerates {ndef^2} new rules. 
      
      The argument {is_bin} should be TRUE iff one of the rules
      defining {S} is binary. In that case, if {S} is also used in a
      binary rule, the procedure returns {INT_MAX} to indicate that
      the substitution cannot be made. */

    auto void grr_append_rule(GSymbId_t H, GSymbId_t L, GSymbId_t R);
    /* Appends the rule {H -> L R} to the grammar, updating the 
      lists {firstH,firstL,firstR}. */

    auto void grr_subst_symb_in_rule(GSymbId_t S, GRule_t *r);
    /* Given a rule {r = (H -> R L)}, where {H != S}, appends to the grammar
      new rules obtained from it by replacing {S} in {L} and/or {R}
      by all its non-eliminated definitions.  Note that if {S} has {m}
      such definitions, this procedure may add {0}, {k}, or {k^2}
      new rules to the grammar. 
      
      Since this operation can only increase the number of rules
      definining {H} and using {L} and {R}, it cannot turn a
      non-viable candidate into a viable one. Therefore, this
      procedure will not push any symbols back into the {scand}
      stack. */

    void grr_push_cand(GSymbId_t S)
      { if ((S != NOSYMB) && (! is_cand[S]))
          { NSCand++; scand[NSCand] = S;
            is_cand[S] = TRUE;
            grr_debug_symbol(stderr, G, "    pushing ", S, "\n");
          }
      }
      
    GSymbId_t grr_pop_cand(void)
      { demand(NSCand > 0, "stack underflow");
        GSymbId_t S = scand[NSCand]; NSCand--; 
        is_cand[S] = FALSE;
        grr_debug_symbol(stderr, G, "  popping ", S, "\n");
        return S;
      }
    
    void grr_append_rule(GSymbId_t H, GSymbId_t L, GSymbId_t R)
      { 
        demand(H != NOSYMB, "null H");
        demand(R != NOSYMB, "null R");
        /* One more rule in new grammar: */
        NRNew++;
        /* Append at end of {ruleTot}: */
        NRTot++; 
        GRuleId_t ir = NRTot;
        GRule_vec_expand(&ruleTot, ir);
        GRule_t *r = &(ruleTot.e[ir]);
        *r = (GRule_t){H,L,R, NORULE,NORULE,NORULE};
        grr_debug_rule(stderr, G, "        adding ", r, "\n"); 
        /* Mark as not eliminated yet: */
        bool_vec_expand(&relim, ir);
        relim.e[ir] = FALSE;
        /* Update inverted lists: */
        r->nextH = firstH[H]; firstH[H] = ir;
        if (L != NOSYMB) { r->nextL = firstL[L]; firstL[L] = ir; }
        r->nextR = firstR[R]; firstR[R] = ir;
      }

    bool_t grr_is_internal(GSymbId_t S)
      { assert(S != NOSYMB);
        assert(G->sname[S] != NULL);
        char c = G->sname[S][0];
        return ((c == '$') || (c == '*'));
      }

    bool_t grr_has_reflexive_def(GSymbId_t S)
      { GRuleId_t ir;
        ir = firstH[S];
        while (ir != NORULE) 
          { GRule_t *r = &(ruleTot.e[ir]);
            if (! relim.e[ir]) 
              { assert(r->H == S); 
                if ((r->L == S) || (r->R == S)) { return TRUE; }
              }
            ir = r->nextH;
          }
        return FALSE;
      }

    void grr_count_rules_defining(GSymbId_t S, int *ndef, bool_t *is_bin)
      { (*ndef) = 0; (*is_bin) = FALSE;
        GRuleId_t ir = firstH[S];
        while (ir != NORULE) 
          { GRule_t *r = &(ruleTot.e[ir]);
            assert(r->H == S);
            assert(r->R != NOSYMB);
            if (! relim.e[ir])
              { (*ndef)++; if (r->L != NOSYMB) { (*is_bin) = TRUE; } }
            ir = r->nextH;
          }
      }

    int grr_compute_rule_increment(GSymbId_t S, int ndef, bool_t is_bin)
      { int nincr = 0;
        GRuleId_t ir;
        /* Scan uses of {S} as the {L} field: */
        ir = firstL[S];
        while (ir != NORULE) 
          { GRule_t *r = &(ruleTot.e[ir]); /* Rule {r} is {H -> S Y} */
            assert(r->L == S); 
            assert(r->R != NOSYMB);
            if (! relim.e[ir])
              { if (is_bin) { return INT_MAX; }
                if (r->R == S) 
                  { /* Rule {H -> S S} */  nincr += ndef*ndef - 1; }
                else 
                  { /* Rule {H -> S Y} */ nincr += ndef - 1; }
              }
            ir = r->nextL;
          }
        /* Scan uses of {S} as the {R} field: */
        ir = firstR[S];
        while (ir != NORULE) 
          { GRule_t *r = &(ruleTot.e[ir]); /* Rule {r} is {H -> X S} */
            assert(r->R == S); 
            if (! relim.e[ir]) 
              { if (r->L == NOSYMB) 
                  { /* Rule {r} is unary {H -> S} */ nincr += ndef - 1; } 
                else if (r->L != S)
                  { /* Rule {r} is binary {H -> X S}, with {X != S} */
                    if (is_bin) { return INT_MAX; }
                    nincr += ndef;
                  }
              }
            ir = r->nextR;
          }
        return nincr - ndef;
      }

    void grr_subst_symb_in_rule(GSymbId_t S, GRule_t *r)
      { grr_debug_rule(stderr, G, "    replacing in ", r, "\n"); 
        GSymbId_t H = r->H, L = r->L, R = r->R;
        assert(R != NOSYMB);
        if ((L == S) && (R == S))
          { /* The given rule is {H -> S S} */
            /* Loop on defintions for the first {S}: */
            GRuleId_t ir0 = firstH[S];
            while (ir0 != NORULE) 
              { GRule_t *r0 = &(ruleTot.e[ir0]); /* {S -> X Y} */
                if (! relim.e[ir0])
                  { /* Rule {r0} must be {S -> Y0}: */
                    assert(r0->H == S);
                    demand(r0->L == NOSYMB, "replacing binary into binary");
                    GSymbId_t Y0 = r0->R; 
                    assert(Y0 != NOSYMB);
                    /* Loop on defintions for the second {S}: */
                    GRuleId_t ir1 = firstH[S];
                    while (ir1 != NORULE) 
                      { GRule_t *r1 = &(ruleTot.e[ir1]); /* {S -> X Y} */
                        if (! relim.e[ir1])
                          { /* Rule {r1} must be {S -> Y1}: */
                            assert(r1->H == S);
                            demand(r1->L == NOSYMB, "replacing binary into binary");
                            GSymbId_t Y1 = r1->R; 
                            assert(Y1 != NOSYMB);
                            grr_append_rule(H,Y0,Y1);
                          }
                        ir1 = r1->nextH;
                      }
                  }
                ir0 = r0->nextH;
              }
          }
        else if (L == S)
          { /* The given rule is {H -> S R} with {R != S} */
            GRuleId_t ir0 = firstH[S];
            while (ir0 != NORULE) 
              { GRule_t *r0 = &(ruleTot.e[ir0]); /* {S -> X Y} */
                if (! relim.e[ir0])
                  { assert(r0->H == S);
                    /* Rule must be {S -> Y}: */
                    demand(r0->L == NOSYMB, "replacing binary into binary");
                    GSymbId_t Y = r0->R; 
                    assert(Y != NOSYMB);
                    grr_append_rule(H,Y,R);
                  }
                ir0 = r0->nextH;
              }
          }
        else if (R == S)
          { /* The given rule is {H -> L S} with {L != S} */
            GRuleId_t ir1 = firstH[S];
            while (ir1 != NORULE) 
              { GRule_t *r1 = &(ruleTot.e[ir1]); /* {S -> X Y} */
                if (! relim.e[ir1])
                  { assert(r1->H == S);
                    GSymbId_t X = r1->L; 
                    GSymbId_t Y = r1->R; 
                    assert(Y != NOSYMB);
                    if (L == NOSYMB)
                      { /* The definition is actually {H -> S}, so add {H -> X Y} */
                        grr_append_rule(H,X,Y);
                      }
                    else
                      { /* Require {S -> X Y} to be unary: */
                        demand(X == NOSYMB, "replacing binary into binary");
                        /* Definition is {S -> Y}, so add {H -> L Y}: */
                        grr_append_rule(H,L,Y);
                      }
                  }
                ir1 = r1->nextH;
              }
          }
      }

    void grr_process_cand(GSymbId_t S)
      { 
        if (! grr_is_internal(S)) 
          { return; }
        grr_debug_symbol(stderr, G, "  considering ", S, ""); 
        if (grr_has_reflexive_def(S))
          { if (grr_debug) { fprintf(stderr, " (reflexive)\n"); } return; }

        /* How many rules define the symbol {S}? */
        int ndef; bool_t is_bin;
        grr_count_rules_defining(S, &ndef, &is_bin);
        if (grr_debug) 
          { fprintf(stderr, " %d rules (%s)", ndef, (is_bin ? "some binary" : "all unary")); }

        /* See whether the bypass would make the grammar smaller.
          Namely, we compute the net increase {nincr} in the number of
          rules that would happen if we bypassed the symbol {S}. Note
          that the number of symbols wwould decrease by 1 in any case,
          and that {nincr} is {INT_MAX} if the replacement is impossible. */
        int nincr = grr_compute_rule_increment(S, ndef, is_bin);
        if (nincr == INT_MAX)
          { if (grr_debug) { fprintf(stderr, " (can't replace)\n"); } return; }
        fprintf(stderr, " increment %+d rules", nincr); 
        if (nincr > ndef)
          { if (grr_debug) { fprintf(stderr, " (not worth it)\n"); } return; }

        /* OK, we can and should do the bypass! */
        if (grr_debug) { fprintf(stderr, " (eliminating)\n"); }
        
        /* Replace all rules that use {S} by their {S}-expansions: */
        GRuleId_t ir;
        ir = firstL[S];
        while (ir != NORULE) 
          { GRule_t *r = &(ruleTot.e[ir]);
            assert(r->H != S); 
            assert(r->L == S); 
            if (! relim.e[ir]) 
              { grr_subst_symb_in_rule(S, r);
                relim.e[ir] = TRUE; NRNew--;
                grr_debug_rule(stderr, G, "    eliminating ", r, "\n");
              }
            ir = r->nextL;
          }
        ir = firstR[S];
        while (ir != NORULE) 
          { GRule_t *r = &(ruleTot.e[ir]);
            assert(r->H != S); 
            assert(r->R == S); 
            if (! relim.e[ir]) 
              { grr_subst_symb_in_rule(S, r);
                relim.e[ir] = TRUE; NRNew--;
                grr_debug_rule(stderr, G, "    eliminating ", r, "\n");
              }
            ir = r->nextR;
          }
          
        /* Eliminate the rules that define {S}: */
        ir = firstH[S];
        while (ir != NORULE) 
          { GRule_t *r = &(ruleTot.e[ir]);
            assert(r->H == S);
            assert(r->R != NOSYMB);
            if (! relim.e[ir]) 
              { relim.e[ir] = TRUE; NRNew--;
                grr_debug_rule(stderr, G, "    eliminating ", r, "\n");
                /* Eliminating a rule may cause its RHS symbols to become bypassable: */
                grr_push_cand(r->L);  
                grr_push_cand(r->R);  
              }
            ir = r->nextH;
          }
          
        /* Eliminate the symbol {S}: */
        selim[S] = TRUE; NSNew--;
      }

    /* Initialize the {scand} stack and the {selim} vector: */
    { GSymbId_t S;
      is_cand[NOSYMB] = FALSE;
      selim[NOSYMB] = FALSE;
      scand[0] = NOSYMB; /* This entry is not used. */
      for (S = 1; S <= G->NS; S++)
        { selim[S] = FALSE; is_cand[S] = FALSE; grr_push_cand(S); }
    }

    /* Copy the input inverted lists start pointers: */
    { GSymbId_t S;
      for (S = 1; S <= G->NS; S++)
        { firstH[S] = G->firstH[S];
          firstL[S] = G->firstL[S],
          firstR[S] = G->firstR[S];
        }
    }

    assert(NSCand > 0);
    while (NSCand > 0)
      { /* Get some candidate: */
        GSymbId_t S = grr_pop_cand();
        grr_process_cand(S);
      }
      
    fprintf(stderr, "  grammar reduced from (%d,%d) to (%d,%d)\n", G->NS, G->NR, NSNew, NRNew); 
    
    /* Create new grammar: */
    Grammar_t *GNew = grr_new_Grammar(NSNew,NRNew);
   
    /* Remove the discarded symbols, build old->new symbol table {dic}: */
    GSymbId_t dic[G->NS+1];
    { GSymbId_t SNew = 0, SOld;
      dic[NOSYMB] = NOSYMB;
      for (SOld = 1; SOld <= G->NS; SOld++)
        { if (! selim[SOld])
            { SNew++; dic[SOld] = SNew; 
              GNew->sname[SNew] = grr_copy_name(G->sname[SOld]);
            }
          else
            { dic[SOld] = NOSYMB; }
        }
      assert(NSNew == SNew);
    }
    
    /* Copy valid rules from {ruleTot} to {GNew->rule}, mapping the symbol ids: */
    { GNew->rule[NORULE] = (GRule_t){NOSYMB,NOSYMB,NOSYMB, NORULE,NORULE,NORULE};
      GRuleId_t irNew = 0;
      GRuleId_t irTot;
      for (irTot = 1; irTot <= NRTot; irTot++)
        { if (! relim.e[irTot])
            { GRule_t *rulei = &(ruleTot.e[irTot]);
              GSymbId_t HOld = rulei->H, HNew = dic[HOld];
              GSymbId_t LOld = rulei->L, LNew = dic[LOld];
              GSymbId_t ROld = rulei->R, RNew = dic[ROld];
              assert(HNew != NOSYMB);
              assert((LOld == NOSYMB) == (LNew == NOSYMB));
              assert(RNew != NOSYMB);
              irNew++;
              GNew->rule[irNew] = (GRule_t){HNew,LNew,RNew, NORULE,NORULE,NORULE};
            }
        }
      assert(irNew == NRNew);
    }
    
    /* Free temporary storage: */
    free(ruleTot.e);
    free(relim.e);
      
    /* Rebuild inverted lists: */
    grr_build_lists(GNew);

    return GNew; 
  }