""" Implements the Smallest Maximizer Criterion algorithm Context tree selection and linguistic rhythm retrieval from written texts. Galves, A., Galves, C., García, J. E., Garcia, N. L., & Leonardi, F. (2012). Annals of Applied Statistics, 4(1), 186–209. https://doi.org/10.1214/11-AOAS511 """ import logging from .bic import BIC from .context_tree import ContextTree from .smc_base import SMCBase class SMC(SMCBase): def __init__(self, bootstrap_obj, penalty_interval=(0.1, 400), n_sizes=(0.3, 0.9), alpha=0.01, epsilon=0.01, cache_dir=None, df_method='perl', perl_compatible=False, num_cores=None): """Initializes the estimator Arguments: bootstrap_obj {Boostrap} -- A previously configured bootstrap object Keyword Arguments: penalty_interval {tuple} -- The interval of BIC constants to be scanned (default: {(0.1, 400)}) n_sizes {tuple} -- Bootstrap sample sizes. First element of tuple is the smallest size, whereas the second element is the largest one. (default: {(0.3, 0.9)}) alpha {number} -- Stop condition parameter for t-test phase. (default: {0.01}) epsilon {number} -- Stop condition parameter for BIC constant scanning. (default: {0.01}) cache_dir {str} -- Directory for temporary files. The program creates one by itself if none is set (default: {None}) df_method {str} -- Degree of freedom strategy (default: {'perl'}) perl_compatible {bool} -- Makes algorithm compatible with the original perl implementation. (default: {False}) num_cores {int} -- Number of cores for parallel processing (default: {None}) """ super().__init__(bootstrap_obj, cache_dir, num_cores=num_cores, n_sizes=n_sizes, alpha=alpha) self.penalty_interval = penalty_interval self.epsilon = epsilon self.df_method = df_method self.perl_compatible = perl_compatible assert epsilon > 0, 'epsilon must be greater than zero' def fit(self, X): self.context_trees = [] self.estimate_trees(X) # this methods are defined in the superclass (smc_base) self.find_optimal_tree(X) return self def estimate_trees(self, X): """Estimates the set of champion This method performs the algorithm describe in the section 4 of the original paper Arguments: X {Sample} -- A sample Returns: SMC -- This object instance itself """ self.thresholds = [] self.intervals = None #if cache.load_cache(estimator, X): # return estimator self.initial_tree = ContextTree.init_from_sample(X) min_c, max_c = self.penalty_interval self.trees_constructed = 0 tree_a = self.calc_bic(min_c) tree_b = tree_f = self.calc_bic(max_c) self.add(tree_a, min_c) a, b = (min_c, max_c) while not tree_a.equals_to(tree_b): while abs(b - a) > self.epsilon: while not tree_a.equals_to(tree_b) and abs(max_c - min_c) > 10 ** -5: old_b = b old_tree_b = tree_b b = (a + b)/2 tree_b = self.strategy_dynamic(b) a = b b = old_b tree_b = old_tree_b a = b tree_a = tree_b self.add(tree_a, b) b = max_c tree_b = tree_f def calc_bic(self, c): """ Estimate a context tree using the BIC estimator Parameters ---------- estimator : g4l.data.SMC The SMC estimator object c : float The penalty value ( > 0) used by the bayesian information criteria """ sample = self.initial_tree.sample self.trees_constructed += 1 bic_estimator = BIC(c, df_method=self.df_method, keep_data=True, perl_compatible=self.perl_compatible) bic_estimator.fit(sample) return bic_estimator.context_tree def add(self, t, c): """ Appends a context tree to the list of estimated trees Parameters ---------- t : g4l.models.ContextTree The resulting context tree c : float The penalty value used to estimate the tree """ logging.info('%s\t%s' % (c, t.to_str())) self.add_tree(t) self.thresholds.append(c) def strategy_dynamic(self, c): """ This strategy avoids computing trees where the current c is between 2 already computed values of c that have produced the same tree, for speed-up purposes. The original procedure described in the paper is found in `strategy_default` method. Parameters ---------- c : float The penalty value used to estimate the tree """ if self.intervals is None: self.intervals = dict() t = self._cached_trees(c) if not t: t = self.calc_bic(c) self._add_tree(c, t) logging.debug("[new] c=%s; \t\tt=%s" % (round(c, 4), t.to_str())) else: logging.debug("[skip] c=%s; \t\tt=%s" % (round(c, 4), t.to_str())) return t def strategy_default(self, c): # pragma: no cover """ Computes the context tree using the BIC estimator for a given penalty value `c` Parameters ---------- c : float The penalty value used to estimate the tree """ bic = self.calc_bic(c) logging.debug('c=%s; \t\tt=%s' % (round(c, 4), bic.to_str())) return bic def _cached_trees(self, k): """ Skips tree computation if the tree for the given constant is already known (for speed-up purposes) if i1 <= k <= i2, where i1, i2 are constant values that were already computed and tree_{i1} == tree_{i2} """ for a, b in self.intervals.keys(): if k >= a and b >= k: return self.intervals[(a, b)] return None def _add_tree(self, k, t): """ adds the tree to the interval [description] Arguments: k {float} -- BIC constant used to compute the tree t {ContextTree} -- a tree """ for i, t2 in enumerate(self.intervals.values()): if t.equals_to(t2): rng = list(self.intervals)[i] rng2 = (min(rng[0], k), max(rng[1], k)) del(self.intervals[rng]) self.intervals[rng2] = t return self.intervals[(k, k)] = t