/* See ref_set.h */
/* Last edited on 2008-01-16 22:48:53 by stolfi */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>

#include <vec.h>
#include <bool.h>
#include <ref.h>
#include <jsrandom.h>
#include <jsmath.h>
#include <affirm.h>

#include <ref_set.h>

/* In the following comments, `{~a}' means `any {ref_t} value {b} such
  that {eq(a,b)} is TRUE'. We also write {sz} instead of {sz(S)} {S->nel}, and
  {S[k]} for {S->el[k mod sz]}.
  
  By {S[i..j]} we denote all entries {S[r]} of the array {S} such that
  {i <= r <= i + ((j-i) mod sz)}, whether NULL or not. Note that if
  {j==i} or {j == i+sz} then {S[i..j]} is just one element, {S[i]}.

 OCCUPANCY RATIO AND PERFORMANCE
    
    The efficiency of the {ulist_t} operations generally depends on its
    /occupancy ratio/ {r(S) == ct(S)/cp(S)}.
    
    The functions {ulist_has}, {ulist_add}, and {ulist_del} are
    efficient as long as the ratio {r(S)} is well below 1. In that
    case, their expected cost is about {K + r(S)}. However, the
    cost of each call grows very quickly, to K*cp(S) or more,
    as {r(S)} approaches 1.
    
    Therefore, to preserve the efficiency of those operations, the
    client should keep track of {ct(S)}, and call {ulist_resize} as
    needed to keep the occupancy ratio {r(S)} below some limit {rmax << 1}.
    
    If the capacity of {S} is expanded or reduced by a fixed percentage
    (*not* a fixed amount) every time, the cost of all these resizes,
    divided by the number of {ulist_add} operations, will amount to
    only a constant overhead per operation. The best value of {rmax}
    depend on the situation, but {rmax==1/2} should be good enough for
    most applications.  In other words, one double the capacity
    of the {ulist_t} whenever it becomes more than half full.

    On the other hand, the expected cost of {ulist_pick} is roughly
    proportional to {cp(S)/(1 + ct(S))}, which is approximately
    {1/r(S)}. Moreover, the cost of {ulist_count} and
    {ulist_elems} is roughly proportional to {cp(S)}, not {ct(S)}.
    Therefore, a client who is concerned about memory usage may want
    to call {ulist_resize} as needed to keep {r(S)} above a certain
    minimum {rmin >> 0} --- say, {rmin == 1/4}. */
  
struct ref_set_t {
  ref_vec_t E;          /* The item storage slots. */
  unsigned int ct;      /* The current count of items (non-NULL slots). */
  ref_set_eq_t *eq;     /* The item equivalence predicate. */
  ref_set_hash_t *hash; /* The item hash function. */
};

/* PROCEDURES THAT DO NOT DEPEND ON ITEM LOCATIONS */

void ref_set_clear_all_slots(ref_vec_t *E);
  /* Sets all slots of {E} to NULL. */

ref_set_t *ref_set_new(int n)
  { ref_set_t *S = (ref_set_t *)notnull(malloc(sizeof(ref_set_t)), "out of mem");
    S->E = ref_vec_new(n);
    ref_set_clear_all_slots(&(S->E));
    S->ct = 0;
    S->eq = &ref_set_addr_eq;
    S->hash = &ref_set_addr_hash;
    return S;
  }

void ref_set_free(ref_set_t *S)
  { free(S->E.el); free(S); }

void ref_set_clear(ref_set_t *S)
  { if (S->ct != 0)
      { ref_set_clear_all_slots(&(S->E));
        S->ct = 0;
      }
  }

void ref_set_clear_all_slots(ref_vec_t *E)
  { unsigned int i;
    for (i = 0; i < E->nel; i++) { E->el[i] = NULL; }
  }

void ref_set_install_eq(ref_set_t *S, ref_set_eq_t *eq)
  { if (eq == NULL) { /* Provide default: */ eq = &ref_set_addr_eq; }
    if (eq != S->eq)
      { demand((S->ct <= 1), "cannot change {eq} of non-trivial set"); 
        S->eq = eq;
      }
  }

void ref_set_install_hash(ref_set_t *S, ref_set_hash_t *hash)
  { if (hash == NULL) { /* Provide default: */ hash = &ref_set_addr_hash; }
    if (hash != S->hash)
      { demand((S->ct == 0), "cannot change {hash} of non-empty set"); 
        S->hash = hash;
      }
  }

unsigned int ref_set_count(ref_set_t *S)
  { return S->ct; }

unsigned int ref_set_size(ref_set_t *S)
  { return S->E.nel; }

ref_vec_t ref_set_items(ref_set_t *S)
  { /* Allocate the result vector: */
    ref_vec_t v = ref_vec_new(S->ct);
    /* Scan {S} and copy the non-{NULL} entries to {v}: */
    unsigned int ct = 0; /* Count of non-NULL items found. */
    unsigned int i;
    ref_t *p; /* Pointer that scans the item slots. */
    for (i = 0, p = S->E.el; i < S->E.nel; i++, p++)
      { ref_t Ei = (*p);
        if (Ei != NULL) { v.el[ct] = Ei; ct++; }
      }
    return v;
  }

/* PROCEDURES THAT DEPEND ON THE ITEM LOCATIONS */

  /* Each new item {a} added to the set is stored in the first NULL
  slot found starting at positions {k}, {k+1}, {k+2}, ..., wrapping
  around modulo {S.nel}; where the starting index {k} is {hash(a,
  S.nel)}.
  
  Thus, the main structure invariant (I1) is: if {S[i] != NULL} then
  all the elements {S[k..i]} are non-NULL, where {k=hash(S[i],sz)}.
  
  A secondary invariant (I2) is that {S->ct} is equal to the 
  number of non-NULL entries in {S->E}.
  
  A third invariant (I3) is that {S->E} does not contain
  two distinct non-NULL entries {a,b} such that {eq(a,b)}.  */

bool_t ref_set_verify(ref_set_t *S, bool_t die);
  /* Checks whether {S} satisfies the invariants I1 and I2. 
    If it satisfies both, prints nothing and returns TRUE.  If it 
    fails one or both of them, prints a diagnostic message;
    then either aborts (if {die} is TRUE) or returns FALSE
    (if {die} is FALSE). */

unsigned int ref_set_locate(ref_set_t *S, ref_t a);
  /* If {~a} is in {S}, returns the index {k} such that {eq(S[k],a)}.
    
    If {~a} is not in {S}, and {S} is not full, returns the index {k},
    of the slot {S[k]} where {~a} should be stored to
    add it to the set; in that case, {S[k]} will be {NULL}.
    
    If {~a} is not in {S} but {S} is full, returns {S.nel}.
    
    Assumes that {S->nel > 0} and {a != NULL}. */

unsigned int ref_set_locate(ref_set_t *S, ref_t a)
  { unsigned int sz = S->E.nel;
    /* Cannot call the hash function with {sz == 0}: */
    if (sz == 0) { return 0; }
    unsigned int h = S->hash(a, sz);
    demand(h < sz, "bad hash value");
    unsigned int i = h;
    do { 
      ref_t b = S->E.el[i];
      if ((b == NULL) || S->eq(b, a)) { return i; }
      i++; if (i == sz) { i = 0; }
    } while (i != h);
    return sz;
  }

ref_t ref_set_find(ref_set_t *S, ref_t a)
  { /* Argument checking: */
    demand(a != NULL, "invalid item");
    /* Get the index {k} of {~a} in {S}: */
    unsigned int k = ref_set_locate(S, a);
    /* Return the {~a} found  there, or NULL: */
    return (k < S->E.nel ? S->E.el[k] : NULL);
  }

bool_t ref_set_has(ref_set_t *S, ref_t a)
  { /* Argument checking: */
    demand(a != NULL, "invalid item");
    /* Get the index {k} of {~a} in {S}: */
    unsigned int k = ref_set_locate(S, a);
    /* Return TRUE iff {~a} was there: */
    return ((k < S->E.nel) && (S->E.el[k] != NULL));
  }

ref_t ref_set_add(ref_set_t *S, ref_t a)
  { /* Argument checking: */
    demand(a != NULL, "invalid item");
    /* Get the index {k} of {~a} in {S}: */
    unsigned int k = ref_set_locate(S, a);
    /* If {k >= S->nel}, then {~a} is not in {S} and {S} is full: */
    demand(k < S->E.nel, "no space for new item");
    /* Let {b} be the {~a} item in {S}, or NULL if it doesn't exist: */
    ref_t b = S->E.el[k];
    S->E.el[k] = a;
    if (b == NULL) { /* Update the item count: */ S->ct++; }
    return b;
  }

ref_t ref_set_del(ref_set_t *S, ref_t a)
  { unsigned int sz = S->E.nel;
    bool_t debug = FALSE;
    /* bool_t debug = ((sz == 200) && (S->ct == 99)); */
    /* Argument checking: */
    demand(a != NULL, "invalid item");
    /* Trivial case: */
    if (S->ct == 0) { return NULL; }
    /* Get the index {k} of {~a} in {S}: */
    unsigned int k = ref_set_locate(S, a);
    if (k >= sz) { /* {~a} is not in {S} */ return NULL; }
    /* Let {b} be the {~a} item in {S}: */
    ref_t b = S->E.el[k];
    if (b == NULL)  { /* {~a} is not in {S} */ return NULL; }
    /* Remove {b} from {S}: */
    S->E.el[k] = NULL; S->ct--;
    if (debug) { fprintf(stderr, "-- start actual deletion --\n"); }
    if (debug) { fprintf(stderr, "  cleared S[%u]\n", k); }
    if (debug) { (void)ref_set_verify(S, FALSE); }
    /* Now entries that follow {S[k]} may need to be deleted and reinserted: */
    unsigned int i = (k + 1) % sz; /* Scans entries after {S[k]}. */
    unsigned int d = 0; /* Always equal to {imod(i - (k+1), sz)}. */ 
    while(TRUE)
      { /* Now entries {S[k+1..i]\S[i]}} are OK and not NULL. */
        ref_t c = S->E.el[i];
        if (c == NULL)
          { /* If our logic is correct, there are no more inconsistencies. */
            if (debug) { fprintf(stderr, "  found null slot S[%u]\n", i); }
            break;
          }
        /* At this point {S[i]} is distinct from {S[k]}. */
        /* Check whether {c} needs to be reinserted: */
        unsigned int hc = S->hash(c, sz);
        demand (hc < sz, "bad hash function");
        unsigned int dhc = imod(hc - (k+1), sz);
        if (debug) { fprintf(stderr, "  S[%u] hashes to %u  dhc = %u  d = %u", i, hc, dhc, d); }
        if (dhc > d) 
          { /* Entry {S[i]} is not OK, must reinsert it. */
            if (debug) { fprintf(stderr, "  presumed BAD\n"); }
            /* Item {c} must go into {S[k]}. */
            S->E.el[k] = c; 
            if (debug) { fprintf(stderr, "  copied S[%u] from S[%u]\n", k, i); }
            /* Now the hole is {S[i]}: */
            S->E.el[i] = NULL;
            if (debug) { fprintf(stderr, "  cleared S[%u]\n", i); }
            k = i; d = 0; 
            if (debug) { (void)ref_set_verify(S, FALSE); }
          }
        else
          { if (debug) { fprintf(stderr, "  presumed OK\n"); } }
        /* Go on to the next entry: */
        d++; i++; if (i == sz) { i = 0; }
      }
    (void)ref_set_verify(S, TRUE);
    if (debug) { fprintf(stderr, "-- end actual deletion --\n"); }
    return b;
  }

ref_t ref_set_pick(ref_set_t *S)
  { unsigned int sz = S->E.nel;
    if (sz == 0) { return NULL; }
    unsigned int k = abrandom(0, sz-1);
    if (S->E.el[k] != NULL) { return S->E.el[k]; }
    if (sz == 1) { return NULL; }
    /* Choose a good stride {d} rel prime to {sz}: */
    unsigned int d = (int)(0.618034 * (double)sz);
    while (gcd(d,sz) != 1) { d--; }
    unsigned int i = k + d % sz;
    do {
      ref_t a = S->E.el[i];
      if (a != NULL) { return a; }
      i = (i + d) % sz;
    } while (i != k);
    return NULL;
    
    /* 
      gawk \
        ' BEGIN { \
            for (n = 2; n < 1000; n++) \
              { d0 = int(0.618034 * n); \
                d = d0; \
                while (gcd(d,n) > 1) { d--; } \
                printf "%5d %5d %5d %5d\n", n, d0, d, d0 - d; \
              } \
          } \
          function gcd(a,b,  r) \
          { while (b != 0) \
              { r = a % b; a = b; b = r; } \
            return a; \
          } \
        '
    */
  }
  
void ref_set_resize(ref_set_t *S, int n, ref_set_hash_t *hash)
  { if (hash == NULL) { /* Provide default: */ hash = &ref_set_addr_hash; }
    unsigned int sz = S->E.nel;
    if ((sz != n) || (S->hash != hash))
      { /* Grab the slot vector {E} of {S}: */
        ref_vec_t E = S->E;
        /* Replace {S->E} by a new address vector with {n} slots: */
        S->E = ref_vec_new(n);
        /* Make {S} empty and change its hash function: */
        ref_set_clear_all_slots(&(S->E));
        S->ct = 0;
        S->hash = hash;
        /* Insert all items of {E} back into {S}. */
        /* We can use equality instead of {eq} since they are all non-equivalent: */
        unsigned int i;
        ref_t *p; /* Pointer that scans the item slots. */
        for (i = 0, p = E.el; i < sz; i++, p++)
          { ref_t a = (*p);
            if (a != NULL) { ref_set_add(S, a); }
          }
        /* Reclaim the old vector: */
        free(E.el);
      }
  }

bool_t ref_set_verify(ref_set_t *S, bool_t die)
  { unsigned int sz = S->E.nel;
    unsigned int ct = 0; /* Count of non-null items */
    int i;
    for (i = 0; i < sz; i++)
      { ref_t Si = S->E.el[i];
        if (Si != NULL)
          { ct++;
            unsigned int j = ref_set_locate(S, Si);
            if (j != i) 
              { fprintf(stderr, "%s:%d: ** invariant violated for S[%u]\n", __FILE__, __LINE__, i); 
                fprintf(stderr, "  sz = %u ct = %u\n", sz, ct); 
                fprintf(stderr, "  h(S[%u]) = %u  locate = %u", i, S->hash(Si, sz), j); 
                if (j < sz) 
                  { ref_t Sj = S->E.el[j];
                    if (Sj == NULL) 
                      { fprintf(stderr, "  S[%u] = NULL\n", j); }
                    else
                      { fprintf(stderr, "  h(S[%u],%u) = %u\n", j, sz, S->hash(Sj, sz)); }
                  }
                else
                  { fprintf(stderr, "  (nowhere)\n"); }
                fail_test(die, "aborting");
              }
          }
      }
    if (ct != S->ct)
      { fprintf(stderr, "%s:%d: ** inconsistent element count\n", __FILE__, __LINE__);
        fprintf(stderr, "  ct = %u S.ct = %u\n", ct, S->ct); 
        fail_test(die, "aborting");
      }
    return TRUE;
  }

/* DEFAULT EQ/HASH FUNCTIONS */

unsigned int ref_set_addr_hash(ref_t a, unsigned int n)
  { demand(a != NULL, "cannot hash NULL");
    demand(n > 0, "cannot hash to empty range");
    /* Let's hope that this works: */
    unsigned int ua = (unsigned int) a;
    unsigned int ub = ua & 8191u;
    unsigned int ur = (4615u * (ua >> 14)) + (417 * (ua >> 9)) + (ub * ub * 471703u);
    return (ur % n);
  }

bool_t ref_set_addr_eq(ref_t a, ref_t b)
  { 
    return (a == b);
  }
  
/* STRING EQUIVALENCE AND HASHING */


unsigned int ref_set_string_hash(ref_t a, unsigned int n)
  { demand(a != NULL, "cannot hash NULL");
    demand(n > 0, "cannot hash to empty range");
    /* Let's hope that this works: */
    unsigned uh = 314159u;
    unsigned char *p = (unsigned char *)a;
    while ((*p) != 0)
      { unsigned int uc = (*p);
        unsigned int ur = 
          (4615u * ((uc & 208u) >> 4)) + \
          (417u * ((uc & 13u) << 13)) + \
          (471703u *((uc & 34u) << 3));
        uh = 27183u * uh + ur + 141421 *(uh >> 16);
        p++;
      }
    return (uh % n);
  }

bool_t ref_set_string_eq(ref_t a, ref_t b)
  { 
    if ((a == NULL) || (b == NULL))
      { return ((a == NULL) && (b == NULL)); }
    else
      { return (strcmp(a,b) == 0); }
  }