/* Last edited on 2024-12-21 14:04:44 by stolfi */ /* See {dm_cassify.h}. */ #include #include #include #include #include #include #include #include /* CONVERSION FUNCTIONS */ /* A {dm_score_rec_t} count record {s} is mapped to a point of {\RR^nc}, for some {nc}. A {dm_score_rec_t} set of weights {wt} is mapped to an affine hyperplane of {\RR^nc}. */ typedef enum { dm_classify_map_LONG, dm_classify_map_SHORT, dm_classify_map_END } dm_classify_map_t; /* Specifies a mapping from a {dm_score_rec_t} record {sc} to a point {p} of {\RR^nc}: {dm_classify_map_LONG}: {nc=dm_classify_NC_LONG}, currently {p=(sc.eql/den, sc.dif/den, sc.brk/den, sc.skp/den, (sc.eql + sc.dif + sc.skp/2)/den)} where {den = hypot(sc.dif+sc.eql+sc.skp/2, dm_classify_MIN_RUNGS)}. {dm_clasify_map_SHORT}: {nc=dm_classify_NC_SHORT}, currently {p=(sc.dif/eql, sc.brk/eql, sc.skp/eql)}. It also specifiies a mapping from hyperplane {coef,cind} to a {dm_score_rec_t} weights record {wt}, namely {dm_clasify_map_LONG}: currently, {wt.eql=coef[0]}, {wt.dif=coef[1]}, {wt.brk=coef[2], {wt.skp=coef[3]}. {dm_classify_map_SHORT}: currently, {wt.eql=cind}, {wt.dif=coef[0], {wt.brk=coef[1], {wt.skp=coef[2]}. */ #define dm_classify_NC_LONG 5 #define dm_classify_NC_SHORT 3 int dm_classify_map_range_dim(dm_classify_map_t map); /* Returns the size of the point vectors corresponding to a score count record under the given{map}. */ void dm_classify_score_counts_to_point(dm_score_rec_t *sc, dm_classify_map_t map, double p[]); /* Converts the counts {sc} to a point {p[0..nc-1]} by the mapping {map}. The size {nc} of {p} must be the correct one for the map. */ double *dm_classify_candidates_to_points(msm_cand_vec_t *cdv, dnae_seq_t *seq_a, dnae_seq_t *seq_b, dm_classify_map_t map); /* Returns an array with {nc} columns and {np = cdv->ne} rows where each row {i} is the point of {\RR^nc} corresponding to candididate {cdv->e[i]}. The array is linearized by rows. The size {nc} is determined by the {map}. */ void dm_classify_score_weights_to_hyperplane(dm_score_rec_t *wt, dm_classify_map_t map, double coef[], double* cind); /* Converts a set of scoring parameters {*wt,*wK} into a hyperplane equation with coefficients {coef[0..nc-1]} and indep term {*cind}. */ void dm_classify_hyperplane_to_score_weights(double coef[], double cind, dm_classify_map_t map, dm_score_rec_t *wt); /* Converts a hyperplane equation with coefficients {coef[0..nc-1]} and indep term {cind} to a set of scoring parameters {*wt,*wK}. */ /* GEOMETRIC FUNCTIONS */ void dm_classify_barycenter(int np, int nc, double p[], double bp[]); /* Computes the barycenter of {np} points and stores it into {bp[0..nc-1]}. Assumes that the {nc} coordinates of point {i} start at {p[i*nc]}. */ double dm_classify_dist_to_hyperplane(int nc, double p[], double coef[], double cind); /* Computes the Euclidean orthogonal distance from {p[0..nc-1]} and the hyperplane with coefficients {coef[0..nc-1]} and independent term {cind}. */ void dm_classify_distances_to_hyperplane(int np, int nc, double p[], double coef[], double cind, double dist[]); /* Computes the Euclidean orthogonal distances {dist[0..np-1]} from {np} points {p[0..nc*np-1]} and the hyperplane with coefficients {coef[0..nc-1]} and independent term {cind}. Assumes that the {nc} coordinates of point {i} start at {p[i*nc]}. */ void dm_classify_project_onto_hyperplane(int nc, double coef[], double cind, double p[], double pp[]); /* Computes the orthogonal projection {pp[0..nc-1]} of {p[0..nc-1]} onto the hyperplane with coefficients {coef[0..nc-1]} and independent term {cind}. */ double dm_classify_indep_coefficient(int nc, double coef[], double p[]); /* Computes the independent term of hyperplane that has coefficients {coeff[0..nc-1]} and passes through point {p[0..nc-1]}. */ void dm_classify_bisecting_hyperplane(int nc, double b0[], double b1[], double coef[], double *cind); /* Stores into {coef,cind} the coordinates of the hyperplane that prpendicularly bisects the segment from {0} to {p1}. The hyperplane will have {p1} on the positive side. */ /* INTERNAL PROCS */ void dm_classify_get_score_counts ( msm_cand_t *cd, dnae_seq_t *seq_a, dnae_seq_t *seq_b, dm_score_rec_t *sc ); /* Gets the score event counts {sc} from candidate {cd}. */ bool_t dm_classify_try_to_adjust_hyperplane ( int np[], int nc, double *p[], double coef[], double *cind, double *dist[] ); /* Tries to improve the hyperplane {coef[0..nc-1],cind} to better separate the points of classes 0 and 1. Assumes that each class has {np[class]} points, each point has {nc} coordinates, and point {i} of each class starts at {p[class][i*nc]}. */ /* IMPLEMENTATIONS */ #define dm_classify_MIN_RUNGS 20 /* Pairings with fewer than this many rungs are relatively unimportant. */ void dm_classify_determine_score_weights ( msm_cand_vec_t *cdvT, msm_cand_vec_t *cdvF, dnae_seq_t *seq_a, dnae_seq_t *seq_b, dm_score_rec_t *wt ) { dm_classify_map_t map = dm_classify_map_SHORT; int nc = dm_classify_map_range_dim(map); /* Coordinates per point. */ /* Convert the candidates to points: */ int np[2]; /* Number of points in each class is {np[class]}. */ double *p[2]; /* Points in each class are {p[class][0..np*nc-1]}. */ double *b[2]; /* Baryceter of each class is {b[class][0..nc-1]}. */ int class; for (class = 0; class <= 1; class++) { msm_cand_vec_t *cdv = (class == 0 ? cdvF : cdvT); np[class] = cdv->ne; p[class] = dm_classify_candidates_to_points(cdv, seq_a, seq_b, map); dm_classify_barycenter(np[class], nc, p[class], b[class]); } /* Get bisecting hyperplane of barycenters: */ double coef[nc]; double cind; dm_classify_bisecting_hyperplane(nc, b[0], b[1], coef, &cind); /* Debug: */ fprintf(stderr, "bisector of barycenters:\n"); int i; for(i = 0; i < nc; i++) { fprintf(stderr," %9.6lf", coef[i]); } fprintf(stderr," %9.6lf\n",cind); /* Convert to score_weights: */ dm_classify_hyperplane_to_score_weights(coef, cind, map, wt); free(p[0]); free(p[1]); } void dm_classify_adjust_score_weights ( msm_cand_vec_t *cdvT, msm_cand_vec_t *cdvF, dnae_seq_t *seq_a, dnae_seq_t *seq_b, int max_iter, dm_score_rec_t *wt ) { dm_classify_map_t map = dm_classify_map_SHORT; int nc = dm_classify_map_range_dim(map); /* Coordinates per point. */ /* Classes are numbered 0 = false, 1 = true. */ double *p[2]; /* Point {i} starts at {p[class][*nc]}. */ int np[2]; /* Number of points in each class. */ double *dist[2]; /* Signed distances from points to separating hyperplane in each class. */ int class; for (class = 0; class <= 1; class++) { /* Convert the candidates to points: */ msm_cand_vec_t *cdv = (class == 0 ? cdvF : cdvT); p[class] = dm_classify_candidates_to_points(cdv, seq_a, seq_b, map); np[class] = cdv->ne; /* Allocate wording storage: */ dist[class] = notnull(malloc(np[class]*sizeof(double)), "no mem"); } int num_iter = 0; while (num_iter < max_iter) { fprintf(stderr,"iteration %d current weights ", num_iter); dm_score_rec_print(stderr, NULL, wt, "\n"); /* Convert to hyperplane: */ double coef[nc]; double cind; dm_classify_score_weights_to_hyperplane(wt, map, coef, &cind); /* Adjust hyperplane: */ bool_t ok = dm_classify_try_to_adjust_hyperplane(np, nc, p, coef, &cind, dist); if (ok) { break; } /* Convert back to score weights: */ dm_classify_hyperplane_to_score_weights(coef, cind, map, wt); num_iter++; } /* Free working storage: */ free(p[0]); free(dist[0]); free(p[1]); free(dist[1]); } bool_t dm_classify_try_to_adjust_hyperplane ( int np[], int nc, double *p[], double coef[], double *cind, double *dist[] ) { /* The heuristic is to look for points that are misclassified, separate them and the well-classified points that are at about the same distance of the hyperplane, then use the bisector of the two classes within that restricted set. The two hyperplanes are then averaged. */ int np_bad[2]; /* Number of misclassified points in each class. */ int i, class; double sum2 = 0; for (class = 0; class <= 1; class++) { dm_classify_distances_to_hyperplane(np[class], nc, p[class], coef, (*cind), dist[class]); int sgn = 2*class - 1; /* {-1} for false, {+1} for true. */ /* Count how many misclassified points we have, and get their rms distance: */ np_bad[class] = 0; for (i = 0; i < np[class]; i++) { double di = dist[class][i]; if (di*sgn < 0) { np_bad[class]++; sum2 += di*di; } } fprintf(stderr, "there are %d misclassified candidates in class %c\n", np_bad[class], "FT"[class]); } if ((np_bad[0] + np_bad[1]) == 0) { return TRUE; } double sigma = sqrt(sum2/(np_bad[0] + np_bad[1])); /* Average (rms) distance of misclassified points. */ fprintf(stderr, "sigma = %9.6lf\n", sigma); /* Count points within distance {2*sigma} from the hyperplane: */ int nq[2]; for (class = 0; class <= 1; class++) { for (i = 0; i < np[class]; i++) { double di = dist[class][i]; if (fabs(di) <= 2*sigma) { nq[class]++; } } fprintf(stderr, "there are %d candidates within {2*sigma} of current plane\n", nq[class]); } /* Gather points within distance {2*sigma} from the hyperplane. */ /* save them into {q[class][0..nc*nq-1]}, gat their barycenters {b[class]}: */ double *q[2]; /* Selected point {i} of each class starts at {q[class][i*nc]}. */ double *bq[2]; /* Barycenter of selected points of each class is {bq[class][0..nc-1]}. */ for (class = 0; class <= 1; class++) { q[class] = notnull(malloc(nc*nq[class]*sizeof(double)), "no mem"); int k = 0; for (i = 0; i < np[class]; i++) { double di = dist[class][i]; if (fabs(di) <= 2*sigma) { assert (k < nq[class]); double *pi = &(p[class][i*nc]); double *qk = &(q[class][k*nc]); rn_copy(nc, pi, qk); k++; } } bq[class] = notnull(malloc(np[class]*sizeof(double)), "no mem"); dm_classify_barycenter(np[class], nc, q[class], bq[class]); } /* Bisect the two classes: */ double coef_new[nc]; double cind_new; dm_classify_bisecting_hyperplane(nc, bq[0], bq[1], coef_new, &cind_new); /* Mix them in proportion to number of misclassified points: */ double f = ((double)nq[0] + nq[1])/((double)np[0] + np[1]); rn_mix(nc, 1.0, coef, f, coef_new, coef); (*cind) = (*cind) + f*cind_new; /* Normalize {coef} to unit length: */ double cm = rn_dir(nc, coef, coef); (*cind) /= cm; free(q[0]); free(bq[0]); free(q[1]); free(bq[1]); return FALSE; } void dm_classify_write_svm_file ( FILE *wr, msm_cand_vec_t *cdvT, msm_cand_vec_t *cdvF, dnae_seq_t *seq_a, dnae_seq_t *seq_b ) { dm_classify_map_t map = dm_classify_map_LONG; int nc = dm_classify_map_range_dim(map); /* Coordinates per point. */ int i, j, class; for (class = 0; class <= 1; class++) { int sgn = 2*class - 1; /* {-1} for false, {+1} for true. */ msm_cand_vec_t *cdv = (class == 0 ? cdvF : cdvT); double *p = dm_classify_candidates_to_points(cdv, seq_a, seq_b, map); int np = cdv->ne; for (i = 0; i < np; i++) { double *pi = &(p[i*nc]); fprintf(wr,"%+d", sgn); for (j = 0; j < nc; j++) { if(pi[j] != 0) { fprintf(wr," %d:%09.6lf", j+1, pi[j]); } } fprintf(wr,"\n"); } free(p); } fflush(wr); } int dm_classify_map_range_dim(dm_classify_map_t map) { switch (map) { case dm_classify_map_LONG: return dm_classify_NC_LONG; case dm_classify_map_SHORT: return dm_classify_NC_SHORT; default: assert(FALSE); } } void dm_classify_score_counts_to_point(dm_score_rec_t *sc, dm_classify_map_t map, double p[]) { double den = NAN; switch (map) { case dm_classify_map_LONG: den = hypot(sc->eql + sc->dif + 0.5*sc->skp, dm_classify_MIN_RUNGS); p[0] = sc->eql/den; p[1] = sc->dif/den; p[2] = sc->brk/den; p[3] = sc->skp/den; p[4] = (sc->eql + sc->dif)/den; assert (dm_classify_NC_LONG == 5); break; case dm_classify_map_SHORT: den = sc->eql; p[0] = sc->dif/den; p[1] = sc->brk/den; p[2] = sc->skp/den; assert (dm_classify_NC_SHORT == 3); break; default: assert(FALSE); } } void dm_classify_hyperplane_to_score_weights(double coef[], double cind, dm_classify_map_t map, dm_score_rec_t *wt) { switch (map) { case dm_classify_map_LONG: wt->eql = coef[0]; wt->dif = coef[1]; wt->brk = coef[2]; wt->skp = coef[3]; assert (dm_classify_NC_LONG == 5); break; case dm_classify_map_SHORT: wt->eql = cind; wt->dif = coef[0]; wt->brk = coef[1]; wt->skp = coef[2]; assert (dm_classify_NC_SHORT == 3); break; default: assert(FALSE); } } void dm_classify_score_weights_to_hyperplane(dm_score_rec_t *wt, dm_classify_map_t map, double coef[], double* cind) { switch (map) { case dm_classify_map_LONG: coef[0] = wt->eql; coef[1] = wt->dif; coef[2] = wt->brk; coef[3] = wt->skp; assert (dm_classify_NC_LONG == 5); break; case dm_classify_map_SHORT: *cind = wt->eql; coef[0] = wt->dif; coef[1] = wt->brk; coef[2] = wt->skp; assert (dm_classify_NC_SHORT == 3); break; default: assert(FALSE); } } double *dm_classify_candidates_to_points(msm_cand_vec_t *cdv, dnae_seq_t *seq_a, dnae_seq_t *seq_b, dm_classify_map_t map) { int np = cdv->ne; /* Number of points. */ int nc = dm_classify_map_range_dim(map); /* Coordinates per point. */ /* Convert the candidates to points: */ double *p = notnull(malloc(np*nc*sizeof(double)), "no mem"); int i; for(i = 0; i < np; i++){ double *pi = &(p[i*nc]); dm_score_rec_t sc; dm_classify_get_score_counts(&(cdv->e[i]), seq_a, seq_b, &sc); dm_classify_score_counts_to_point(&sc, map, pi); } return p; } void dm_classify_get_score_counts ( msm_cand_t *cd, dnae_seq_t *seq_a, dnae_seq_t *seq_b, dm_score_rec_t *sc ) { msm_rung_vec_t gv = msm_pairing_get_rungs(cd->pr); dm_score_count_clear(sc); dm_score_count_rung_vec(sc, &gv, seq_a, seq_b); dm_score_count_rung_vec_check(sc, &gv); free(gv.e); } double dm_classify_dist_to_hyperplane(int nc, double p[], double coef[], double cind) { int i; double sum2 = 0; double val = 0; for(i = 0; i < nc; i++) { val += coef[i]*p[i]; sum2 += coef[i]*coef[i]; } val += cind; return val/sqrt(sum2); } void dm_classify_project_onto_hyperplane(int nc, double coef[], double cind, double p[], double pp[]) { double t = 0; double sum2 = 0; int i; for(i = 0; i < nc; i++) { t += p[i]*coef[i]; sum2 += coef[i]*coef[i]; } t += cind; t = -t/sqrt(sum2); for(i = 0; i < nc; i++) { pp[i] = p[i] + coef[i]*t; } } double dm_classify_indep_coefficient(int nc, double coef[], double p[]) { double cind = 0; int i; for(i = 0; i < nc; i++) { cind -= coef[i]*p[i]; } return cind; } void dm_classify_barycenter(int np, int nc, double p[], double bp[]) { int i; rn_zero(nc, bp); for(i = 0; i < np; i++){ double *pi = &(p[i*nc]); rn_add(nc, pi, bp, bp); } rn_scale(nc, 1.0/((double)np), bp, bp); } void dm_classify_bisecting_hyperplane ( int nc, double b0[], double b1[], double coef[], double *cind ) { /* Get the midpoint {q} of the barycenters: */ double bm[nc]; rn_mix (nc, 0.5, b0, 0.5, b1, bm); /* Compute the bisectiing hyperlane {coef[0..nc-1],cind}: */ rn_sub (nc, b1, b0, coef); (void)rn_dir(nc, coef, coef); (*cind) = dm_classify_indep_coefficient(nc, coef, bm); } void dm_classify_distances_to_hyperplane(int np, int nc, double p[], double coef[], double cind, double dist[]) { int i; for(i = 0; i < np; i++) { double *pi = &(p[i*nc]); dist[i] = dm_classify_dist_to_hyperplane(nc, pi, coef, cind); } }