/* rn_classif.h --- tools for vector classifiers. */ /* Last edited on 2024-12-21 11:22:38 by stolfi */ #ifndef rn_classif_H #define rn_classif_H #include #include #include #include #include /* VECTOR CLASSIFICATION PROBLEMS */ /* A (discrete) /vector classification function/ is a function that assigns to vectors from some space {\Reals^NA} a /class index/ {cl} in some range {1..NC}. The components of the vector are called /attributes/ and the subset of {\Reals^NA} that maps to a given class {cl} is called the /domain/ of that class. The function need not be defined on the whole {\Reals^NA}. Here the class index 0 is used to indicate `undefined class' or `invalid vector' or `do not care'. The region that maps to class 0 is not considered a class domain but the /background/, complement of all class domains. A (discrete) /vector classification problem/ is a vector classification function with a probability distribution over its valid classes. */ typedef int rn_classif_labeler_t(int NA, int NC, double p[]); /* Type of a vector classification function from {\Reals^NA} with {NC} valid classes. A function of this type should return the index of a class in {1..NC} whose domain contains the given vector {p[0..NA-1]}. If {p} is outside all domains it should return 0. If {p} is close to a domain boundary the result is undefined. (However it should preferably err in favor of domains with smaller dimension, so as to make them thick enough for visualization.) Each combination of {NA} and {NC} defines a different classification function, even with the same {rn_classif_labeler_t} procedure. The procedure may fail for certain combinations of {NA} and {NC}. */ typedef void rn_classif_thrower_t(int i, int NA, int NC, double p[], int *classP); /* Type of a sampler procedure associated to some vector classification function from {\Reals^NA} with {NC} valid classes. A procedure of this type should select a valid class domain {cl} and generate a random vector in {\Reals^NA} in that domain. It should store the attributes into {p[0..NA-1]} and the class {cl} into {*classP}. The attributes of a generated sample depend on the integer {i} and on the current state of the number generator {drandom()}, which must have been properly seeded by the client. Usually each attribute is in the range {U=[-1 _ +1]}. The procedure may choose the class domain to be sampled as a periodic function of {i}, so that if {i} ranges over {1..N} the samples will be more unformly distributed among the classes. Each combination of {NA} and {NC} defines a different sampling function, even with the same {rn_classif_thrower_t} procedure. The procedure may fail for certain combinations of {NA} and {NC}. */ typedef struct rn_classif_problem_t { int NA; /* Total number of attributes per sample. */ int NC; /* Number of classes. Classes are numbered {1..NC}. */ rn_classif_labeler_t *lab; /* Labelling procedure */ rn_classif_thrower_t *gen; /* Generation procedure */ } rn_classif_problem_t; /* A Discrete classification problem. */ void rn_classif_class_count(int NS, int class[], int NC, int num[]); /* Stores into {num[cl]} the number of elements of {class[0..NS-1]} with value {cl}, where {cl} ranges in {0..NC}. Note that {num} must have {NC+1} elements. */ void rn_classif_class_count_print(FILE *wr, int NC, int num[]); /* Prints the class count table {num[0..NC]}. */ void rn_classif_sample_print(FILE *wr, char *pre, int NA, double p[], char *sep, char *suf); /* Writes to {wr} the attributes {p[0..NA-1]}, preceded by {pre}, separated by {sep}, and terminated by {suf}. */ typedef struct rn_classif_dataset_t { int NS; /* Number of samples. Samples are numbered {0..NS-1}. */ int NA; /* Total number of attributes per sample. */ double **smp; /* {smp[i][t]} is attribute {t} of sample {i}. */ } rn_classif_dataset_t; /* A dataset for a vector classifier, cosisting of {NS} sample vectors, each with {NA} attributes. */ rn_classif_dataset_t *rn_classif_dataset_new(int NS, int NA); /* Allocates the storage for a dataset {D} with {NS} samples and {NA} attributes per sample. Allocates the vector {D.smp} but not the individual attribute vectors. */ void rn_classif_dataset_free(rn_classif_dataset_t *D); /* Deallocates all storage of {D} including the header {*D} itself. */ void rn_classif_dataset_label(rn_classif_dataset_t *D, int NAL, int NCL, rn_classif_labeler_t *lab, int class[]); /* Classifies each sample in {D} with the labeler {lab} into classes {0..NCL}, and stores the result in {class[0..D.NS-1]}. The classifier is given the first {NAL} attributes of each sample, so one must have {NAL <= D.NA}. */ void rn_classif_dataset_write(FILE *wr, rn_classif_dataset_t *D, int cl, int class[]); /* Writes {D} to {WR} in a human- and machine-readable format. If {class} is not NULL, then it must have {D.NS} elements, and the procedure writes only the samples {i} such that {class[i] = cl}. */ void rn_classif_compare(int NS, int cold[], int cnew[], int *nsP, int *nfP); /* The vectors {cold} and {cnew} should have {NS} elements each. The procedure computes the number {ns} of `successes' (indices {i} in {0..NS-1} such that {cold[i] == cnew[i]} and the number {nf} of `failures' (all the other cases). These counts are returned in {*nsP} and {*nfP}. */ void rn_classif_cross_matrix_build(int NS, int cold[], int NCold, int cnew[], int NCnew, int **nccP); /* The vectors {cold} and {cnew} should have {NS} elements each, in the ranges {0..NCold} and {0..NCnew}, respectively. The procedure builds a /cross-classification matrix/ {ncc} where each entry {ncc[old,new]} is the number of positions {i} with {cold[i] = old} and {cnew[i] = new}. The `non-class' 0 is counted like any valid class, so the array has {NCnew+1} columns and {NCold+1} rows. The array is linearized by rows, so {ncc[old,new]} is actually in {ncc[old*(NCnew+1)+new]}. The array {ncc} is allocated by the procedure and its address is returned in {*nccP}. */ void rn_classif_cross_matrix_print(FILE *wr, int NCold, int NCnew, int *ncc, bool_t score); /* Prints to {wr} the cross-classification matrix {ncc} which is assumed to have {NCold} rows (`old' classes) and {NCnew} columns (`new' classes). If {score} is true prints also the number of elements in the diagonal (`successes') and off the diagonal (`failures'). */ typedef double rn_classif_pq_dist_t(int NA, double p[], double q[]); typedef double rn_classif_pi_dist_t(int NA, double p[], int i); typedef double rn_classif_ij_dist_t(int i, int j); /* Types of distance functions for samples and elements of sample sets. A function of this type should returns some measure of the distance(not necessarily a metric in the mathematical sense) between the two given samples. The samples may be given as two vectors {p[0..NA-1],q[0..NA-1]} or as a vector {p[0..NA-1]} and an index {i} into some sample list, or two indices {i,j} into two such list (or the same list). */ /* ITERATIVE IMPROVEMENT ("TRAINING") */ void rn_classif_find_nearest_neighbor_index(int NS, int NA, double p[], rn_classif_pi_dist_t *dist, int *iminP, double *dminP); /* Finds in dataset of {NS} samples the sample whose first {NA} attributes are closest to {p[0..NA-1]} in the metric {dist}. Returns the index of said element in {*iminP} and the distance in {*distP}. */ int rn_classif_find_nearest_in_dataset(rn_classif_dataset_t *M, double HM[], int NA, double p[], rn_classif_pq_dist_t *dist); /* Finds the index {i} such that the first {NA} attributes of sample {M.smp[i]} are closest to {p[0..NA-1]}, in the metric {dist}. Requires {NA<=M.NA}. If {HM} is not null it must be a vector of {M.NS} handicaps; the distance {dist} is then replced by {DIST(p,M.smp[i]) = max(dist(p,M.smp[i]),HM[i])} */ void rn_classif_nn_label_dataset ( rn_classif_dataset_t *M, /* Model samples. */ double HM[], /* {HM[i]} is the handicap of sample {M.smp[i]}. */ int classM[], /* {classM[i]} is the class of sample {M.smp[i]}. */ rn_classif_dataset_t *C, /* Samples to be classified. */ rn_classif_pq_dist_t *dist, /* Sample distance function. */ int classC[] /* (OUT) {classC{j]} is the class assigned to {C.smp[j]}. */ ); /* Calls {i = rn_classif_find_nearest_in_dataset(M,HM,M.NA,C.smp[j],dist)} for each sample {C.smp[j]} and sets {classC[j]} to {classM[i]}. The vectors {HM} and {classM} must have {M.NS} elements. The vector {classC} must have {C.NS} elements. */ typedef void rn_classif_handicap_proc_t(rn_classif_dataset_t *M, int classM[], double HM[]); /* Type of a procedure that computes handicaps {HM[0..M.NS-1]} for a 1NN classifier {(M,HM,classM)}, given {M} and {classM}. */ void rn_classif_nn_improve ( rn_classif_dataset_t *M, /* (IN/OUT) Model samples. */ double HM[], /* (IN/OUT) {HM[i]} is the handicap of sample {M.smp[i]}. */ int classM[], /* (IN/OUT) {classM[i]} is the given class of sample {M.smp[i]}. */ rn_classif_dataset_t *R, /* (IN/OUT) Training samples. */ int classR[], /* (IN/OUT) {classR[i]} is the given class of sample {R.smp[i]}. */ int classX[], /* (OUT) {classX[i]} is the class of {R.smp[i]} assigned by {M}. */ rn_classif_pq_dist_t *dist, /* Sample distance function. */ rn_classif_handicap_proc_t *hproc, /* Procedure that computes handicaps for {M}. */ int maxIters, /* Maximum iterations. */ bool_t sameClass, /* TRUE only swaps samples of the same class. */ bool_t verbose /* TRUE to show the progress at each iteration. */ ); /* Try to improve the NN model {M} by replacing some of its elements by elements of {R}. The goal is to make the labeling of {R.smp} by the 1NN classifier {(M,HM,classM)} be as close as possible to the given classification {classR}. Each tentative improvement consists in swapping some sample of {M} by some sample of {R} (usually one that was misclassified by {M}) together with their given classifications in {classM} and {classR}. If the swap does not succeed it is undone, so the final result is never worse than the original one. This basic step is repeated at most {maxIter} times. If {sameClass} is TRUE then the swap is restricted to samples of the same class, unless {M} has no samples with the same class as the chosen {R} sample. The procedure {hproc} is called to compute the handicap vector {HM} upon entry and every time the model {M} is changed. On output, the sets {M,R} and their classifications {classM,classR} may have been partly swapped and rearranged. The vector {HM} will contain the handicaps computed for the final {M} by {hproc}. The vector {classX} will contain the labeling of {R} implied by {(M,HM,classM)}. */ void rn_classif_nn_print_handicaps(FILE *wr, char *title, int NS, double H[]); /* Writes the sample handicap vector {H[0..NS-1]} to the file {wr}, with the given {title}. */ frgb_t *rn_classif_pick_class_colors(int NC); /* Returns an array of {NC+1} colors for classes {0..NC}. Color 0 is always white, the rest has increasing brightness and variable hues. */ #endif