/* See ref_set.h */ /* Last edited on 2008-01-16 22:48:53 by stolfi */ #include #include #include #include #include #include #include #include #include #include #include /* In the following comments, `{~a}' means `any {ref_t} value {b} such that {eq(a,b)} is TRUE'. We also write {sz} instead of {sz(S)} {S->nel}, and {S[k]} for {S->el[k mod sz]}. By {S[i..j]} we denote all entries {S[r]} of the array {S} such that {i <= r <= i + ((j-i) mod sz)}, whether NULL or not. Note that if {j==i} or {j == i+sz} then {S[i..j]} is just one element, {S[i]}. OCCUPANCY RATIO AND PERFORMANCE The efficiency of the {ulist_t} operations generally depends on its /occupancy ratio/ {r(S) == ct(S)/cp(S)}. The functions {ulist_has}, {ulist_add}, and {ulist_del} are efficient as long as the ratio {r(S)} is well below 1. In that case, their expected cost is about {K + r(S)}. However, the cost of each call grows very quickly, to K*cp(S) or more, as {r(S)} approaches 1. Therefore, to preserve the efficiency of those operations, the client should keep track of {ct(S)}, and call {ulist_resize} as needed to keep the occupancy ratio {r(S)} below some limit {rmax << 1}. If the capacity of {S} is expanded or reduced by a fixed percentage (*not* a fixed amount) every time, the cost of all these resizes, divided by the number of {ulist_add} operations, will amount to only a constant overhead per operation. The best value of {rmax} depend on the situation, but {rmax==1/2} should be good enough for most applications. In other words, one double the capacity of the {ulist_t} whenever it becomes more than half full. On the other hand, the expected cost of {ulist_pick} is roughly proportional to {cp(S)/(1 + ct(S))}, which is approximately {1/r(S)}. Moreover, the cost of {ulist_count} and {ulist_elems} is roughly proportional to {cp(S)}, not {ct(S)}. Therefore, a client who is concerned about memory usage may want to call {ulist_resize} as needed to keep {r(S)} above a certain minimum {rmin >> 0} --- say, {rmin == 1/4}. */ struct ref_set_t { ref_vec_t E; /* The item storage slots. */ unsigned int ct; /* The current count of items (non-NULL slots). */ ref_set_eq_t *eq; /* The item equivalence predicate. */ ref_set_hash_t *hash; /* The item hash function. */ }; /* PROCEDURES THAT DO NOT DEPEND ON ITEM LOCATIONS */ void ref_set_clear_all_slots(ref_vec_t *E); /* Sets all slots of {E} to NULL. */ ref_set_t *ref_set_new(int n) { ref_set_t *S = (ref_set_t *)notnull(malloc(sizeof(ref_set_t)), "out of mem"); S->E = ref_vec_new(n); ref_set_clear_all_slots(&(S->E)); S->ct = 0; S->eq = &ref_set_addr_eq; S->hash = &ref_set_addr_hash; return S; } void ref_set_free(ref_set_t *S) { free(S->E.el); free(S); } void ref_set_clear(ref_set_t *S) { if (S->ct != 0) { ref_set_clear_all_slots(&(S->E)); S->ct = 0; } } void ref_set_clear_all_slots(ref_vec_t *E) { unsigned int i; for (i = 0; i < E->nel; i++) { E->el[i] = NULL; } } void ref_set_install_eq(ref_set_t *S, ref_set_eq_t *eq) { if (eq == NULL) { /* Provide default: */ eq = &ref_set_addr_eq; } if (eq != S->eq) { demand((S->ct <= 1), "cannot change {eq} of non-trivial set"); S->eq = eq; } } void ref_set_install_hash(ref_set_t *S, ref_set_hash_t *hash) { if (hash == NULL) { /* Provide default: */ hash = &ref_set_addr_hash; } if (hash != S->hash) { demand((S->ct == 0), "cannot change {hash} of non-empty set"); S->hash = hash; } } unsigned int ref_set_count(ref_set_t *S) { return S->ct; } unsigned int ref_set_size(ref_set_t *S) { return S->E.nel; } ref_vec_t ref_set_items(ref_set_t *S) { /* Allocate the result vector: */ ref_vec_t v = ref_vec_new(S->ct); /* Scan {S} and copy the non-{NULL} entries to {v}: */ unsigned int ct = 0; /* Count of non-NULL items found. */ unsigned int i; ref_t *p; /* Pointer that scans the item slots. */ for (i = 0, p = S->E.el; i < S->E.nel; i++, p++) { ref_t Ei = (*p); if (Ei != NULL) { v.el[ct] = Ei; ct++; } } return v; } /* PROCEDURES THAT DEPEND ON THE ITEM LOCATIONS */ /* Each new item {a} added to the set is stored in the first NULL slot found starting at positions {k}, {k+1}, {k+2}, ..., wrapping around modulo {S.nel}; where the starting index {k} is {hash(a, S.nel)}. Thus, the main structure invariant (I1) is: if {S[i] != NULL} then all the elements {S[k..i]} are non-NULL, where {k=hash(S[i],sz)}. A secondary invariant (I2) is that {S->ct} is equal to the number of non-NULL entries in {S->E}. A third invariant (I3) is that {S->E} does not contain two distinct non-NULL entries {a,b} such that {eq(a,b)}. */ bool_t ref_set_verify(ref_set_t *S, bool_t die); /* Checks whether {S} satisfies the invariants I1 and I2. If it satisfies both, prints nothing and returns TRUE. If it fails one or both of them, prints a diagnostic message; then either aborts (if {die} is TRUE) or returns FALSE (if {die} is FALSE). */ unsigned int ref_set_locate(ref_set_t *S, ref_t a); /* If {~a} is in {S}, returns the index {k} such that {eq(S[k],a)}. If {~a} is not in {S}, and {S} is not full, returns the index {k}, of the slot {S[k]} where {~a} should be stored to add it to the set; in that case, {S[k]} will be {NULL}. If {~a} is not in {S} but {S} is full, returns {S.nel}. Assumes that {S->nel > 0} and {a != NULL}. */ unsigned int ref_set_locate(ref_set_t *S, ref_t a) { unsigned int sz = S->E.nel; /* Cannot call the hash function with {sz == 0}: */ if (sz == 0) { return 0; } unsigned int h = S->hash(a, sz); demand(h < sz, "bad hash value"); unsigned int i = h; do { ref_t b = S->E.el[i]; if ((b == NULL) || S->eq(b, a)) { return i; } i++; if (i == sz) { i = 0; } } while (i != h); return sz; } ref_t ref_set_find(ref_set_t *S, ref_t a) { /* Argument checking: */ demand(a != NULL, "invalid item"); /* Get the index {k} of {~a} in {S}: */ unsigned int k = ref_set_locate(S, a); /* Return the {~a} found there, or NULL: */ return (k < S->E.nel ? S->E.el[k] : NULL); } bool_t ref_set_has(ref_set_t *S, ref_t a) { /* Argument checking: */ demand(a != NULL, "invalid item"); /* Get the index {k} of {~a} in {S}: */ unsigned int k = ref_set_locate(S, a); /* Return TRUE iff {~a} was there: */ return ((k < S->E.nel) && (S->E.el[k] != NULL)); } ref_t ref_set_add(ref_set_t *S, ref_t a) { /* Argument checking: */ demand(a != NULL, "invalid item"); /* Get the index {k} of {~a} in {S}: */ unsigned int k = ref_set_locate(S, a); /* If {k >= S->nel}, then {~a} is not in {S} and {S} is full: */ demand(k < S->E.nel, "no space for new item"); /* Let {b} be the {~a} item in {S}, or NULL if it doesn't exist: */ ref_t b = S->E.el[k]; S->E.el[k] = a; if (b == NULL) { /* Update the item count: */ S->ct++; } return b; } ref_t ref_set_del(ref_set_t *S, ref_t a) { unsigned int sz = S->E.nel; bool_t debug = FALSE; /* bool_t debug = ((sz == 200) && (S->ct == 99)); */ /* Argument checking: */ demand(a != NULL, "invalid item"); /* Trivial case: */ if (S->ct == 0) { return NULL; } /* Get the index {k} of {~a} in {S}: */ unsigned int k = ref_set_locate(S, a); if (k >= sz) { /* {~a} is not in {S} */ return NULL; } /* Let {b} be the {~a} item in {S}: */ ref_t b = S->E.el[k]; if (b == NULL) { /* {~a} is not in {S} */ return NULL; } /* Remove {b} from {S}: */ S->E.el[k] = NULL; S->ct--; if (debug) { fprintf(stderr, "-- start actual deletion --\n"); } if (debug) { fprintf(stderr, " cleared S[%u]\n", k); } if (debug) { (void)ref_set_verify(S, FALSE); } /* Now entries that follow {S[k]} may need to be deleted and reinserted: */ unsigned int i = (k + 1) % sz; /* Scans entries after {S[k]}. */ unsigned int d = 0; /* Always equal to {imod(i - (k+1), sz)}. */ while(TRUE) { /* Now entries {S[k+1..i]\S[i]}} are OK and not NULL. */ ref_t c = S->E.el[i]; if (c == NULL) { /* If our logic is correct, there are no more inconsistencies. */ if (debug) { fprintf(stderr, " found null slot S[%u]\n", i); } break; } /* At this point {S[i]} is distinct from {S[k]}. */ /* Check whether {c} needs to be reinserted: */ unsigned int hc = S->hash(c, sz); demand (hc < sz, "bad hash function"); unsigned int dhc = imod(hc - (k+1), sz); if (debug) { fprintf(stderr, " S[%u] hashes to %u dhc = %u d = %u", i, hc, dhc, d); } if (dhc > d) { /* Entry {S[i]} is not OK, must reinsert it. */ if (debug) { fprintf(stderr, " presumed BAD\n"); } /* Item {c} must go into {S[k]}. */ S->E.el[k] = c; if (debug) { fprintf(stderr, " copied S[%u] from S[%u]\n", k, i); } /* Now the hole is {S[i]}: */ S->E.el[i] = NULL; if (debug) { fprintf(stderr, " cleared S[%u]\n", i); } k = i; d = 0; if (debug) { (void)ref_set_verify(S, FALSE); } } else { if (debug) { fprintf(stderr, " presumed OK\n"); } } /* Go on to the next entry: */ d++; i++; if (i == sz) { i = 0; } } (void)ref_set_verify(S, TRUE); if (debug) { fprintf(stderr, "-- end actual deletion --\n"); } return b; } ref_t ref_set_pick(ref_set_t *S) { unsigned int sz = S->E.nel; if (sz == 0) { return NULL; } unsigned int k = abrandom(0, sz-1); if (S->E.el[k] != NULL) { return S->E.el[k]; } if (sz == 1) { return NULL; } /* Choose a good stride {d} rel prime to {sz}: */ unsigned int d = (int)(0.618034 * (double)sz); while (gcd(d,sz) != 1) { d--; } unsigned int i = k + d % sz; do { ref_t a = S->E.el[i]; if (a != NULL) { return a; } i = (i + d) % sz; } while (i != k); return NULL; /* gawk \ ' BEGIN { \ for (n = 2; n < 1000; n++) \ { d0 = int(0.618034 * n); \ d = d0; \ while (gcd(d,n) > 1) { d--; } \ printf "%5d %5d %5d %5d\n", n, d0, d, d0 - d; \ } \ } \ function gcd(a,b, r) \ { while (b != 0) \ { r = a % b; a = b; b = r; } \ return a; \ } \ ' */ } void ref_set_resize(ref_set_t *S, int n, ref_set_hash_t *hash) { if (hash == NULL) { /* Provide default: */ hash = &ref_set_addr_hash; } unsigned int sz = S->E.nel; if ((sz != n) || (S->hash != hash)) { /* Grab the slot vector {E} of {S}: */ ref_vec_t E = S->E; /* Replace {S->E} by a new address vector with {n} slots: */ S->E = ref_vec_new(n); /* Make {S} empty and change its hash function: */ ref_set_clear_all_slots(&(S->E)); S->ct = 0; S->hash = hash; /* Insert all items of {E} back into {S}. */ /* We can use equality instead of {eq} since they are all non-equivalent: */ unsigned int i; ref_t *p; /* Pointer that scans the item slots. */ for (i = 0, p = E.el; i < sz; i++, p++) { ref_t a = (*p); if (a != NULL) { ref_set_add(S, a); } } /* Reclaim the old vector: */ free(E.el); } } bool_t ref_set_verify(ref_set_t *S, bool_t die) { unsigned int sz = S->E.nel; unsigned int ct = 0; /* Count of non-null items */ int i; for (i = 0; i < sz; i++) { ref_t Si = S->E.el[i]; if (Si != NULL) { ct++; unsigned int j = ref_set_locate(S, Si); if (j != i) { fprintf(stderr, "%s:%d: ** invariant violated for S[%u]\n", __FILE__, __LINE__, i); fprintf(stderr, " sz = %u ct = %u\n", sz, ct); fprintf(stderr, " h(S[%u]) = %u locate = %u", i, S->hash(Si, sz), j); if (j < sz) { ref_t Sj = S->E.el[j]; if (Sj == NULL) { fprintf(stderr, " S[%u] = NULL\n", j); } else { fprintf(stderr, " h(S[%u],%u) = %u\n", j, sz, S->hash(Sj, sz)); } } else { fprintf(stderr, " (nowhere)\n"); } fail_test(die, "aborting"); } } } if (ct != S->ct) { fprintf(stderr, "%s:%d: ** inconsistent element count\n", __FILE__, __LINE__); fprintf(stderr, " ct = %u S.ct = %u\n", ct, S->ct); fail_test(die, "aborting"); } return TRUE; } /* DEFAULT EQ/HASH FUNCTIONS */ unsigned int ref_set_addr_hash(ref_t a, unsigned int n) { demand(a != NULL, "cannot hash NULL"); demand(n > 0, "cannot hash to empty range"); /* Let's hope that this works: */ unsigned int ua = (unsigned int) a; unsigned int ub = ua & 8191u; unsigned int ur = (4615u * (ua >> 14)) + (417 * (ua >> 9)) + (ub * ub * 471703u); return (ur % n); } bool_t ref_set_addr_eq(ref_t a, ref_t b) { return (a == b); } /* STRING EQUIVALENCE AND HASHING */ unsigned int ref_set_string_hash(ref_t a, unsigned int n) { demand(a != NULL, "cannot hash NULL"); demand(n > 0, "cannot hash to empty range"); /* Let's hope that this works: */ unsigned uh = 314159u; unsigned char *p = (unsigned char *)a; while ((*p) != 0) { unsigned int uc = (*p); unsigned int ur = (4615u * ((uc & 208u) >> 4)) + \ (417u * ((uc & 13u) << 13)) + \ (471703u *((uc & 34u) << 3)); uh = 27183u * uh + ur + 141421 *(uh >> 16); p++; } return (uh % n); } bool_t ref_set_string_eq(ref_t a, ref_t b) { if ((a == NULL) || (b == NULL)) { return ((a == NULL) && (b == NULL)); } else { return (strcmp(a,b) == 0); } }