#define PROG_NAME "pc_decomp" #define PROG_DESC "Principal component analysis on a set of data points." #define PROG_VERS "2023-02-02" #define pc_decomp_C_COPYRIGHT \ "Copyright © 2023 by the State University of Campinas (UNICAMP)" /* Last edited on 2024-12-01 00:23:34 by stolfi */ #define PROG_HELP \ " " PROG_NAME " \\\n" \ " -dataColumns {DATACOL}.. \\\n" \ " [ -weightColumn {WTCOL} ] \\\n" \ " [ -keyColumn {KEYCOL} ] \\\n" \ " -noise {NOISE} \\\n" \ " [ -centerFile {CENTERFILE} ] \\\n" \ " [ -covarFile {COVARFILE} ] \\\n" \ " [ -compFile {COMPFILE} ] \\\n" \ " [ -decompFile {DECOMPFILE} ] \\\n" \ " [ -format {FORMAT} ] \\\n" \ " [ -verbose {VERBOSE} ] \\\n" \ " " argparser_help_info_HELP " \\\n" \ " < {INFILE}" #define PROG_INFO \ "NAME\n" \ " " PROG_NAME " - " PROG_DESC "\n" \ "\n" \ "SYNOPSIS\n" \ PROG_HELP "\n" \ "\n" \ "DESCRIPTION\n" \ " The program reads from {stdin} a list of {nd} /data points/ each with {nv} coordinates" \ " or /variables/; that is, an array {D} on {nd} rows and {nv} columns, such" \ " that D[id,jv]} is coordinate {jv \in 0..nv-1} of data point {id \in 0..nd-1}.\n" \ "\n" \ " The program interprets that data as a cloud of points in the space {\RR^d}. It computes" \ " the barycenter {d[0..nv-1]} of the cloud, and then finds the {ne} orthogonal" \ " directions {E[0..ne-1]} of the cloud relative to {d} along which the cloud's" \ " standard deviation {e[ie]} is successively maximum, in decreasing order of that" \ " deviation; discarding directions whose {e[i]} is less than a given" \ " threshold {tol}.\n" \ "\n" \ " More precisely, the directions {E[0..ne-1]} are vectors with {nv} coordinates, unit norm," \ " and pairwise orthogonal. The vector {E[0]} is the direction of maximum cloud extent. The" \ " vector {E[1]} is the maximum-extent direction that is perpendicular to {E[0]}; {E[2]} is" \ " the maximum-extent direction perpendicular to {E[0]} and {E[1]}; and so on.\n" \ "\n" \ " These directions are the /principal components/ of the data point set.\n" \ "\n" \ " In other words, the program finds the best model of the cloud {D} that consists of" \ " the barycenter {d} plus the sum of the vectors {E[0..ne-1]}, each {E[ie]} being multiplied" \ " by {e[ie]} times an independent normal random variable.\n" \ "\n" \ " The program also computes the projection {P[id]} of each data vector {D[id]-d} onto the" \ " row space of {E}, that is a linear compbination of {E[0..ne-1]} with {ne} coefficients" \ " {C[id]}; plus the residual {nv}-vector {R[id] = D[id]-d-P[íd]}. These results, as well as the" \ " barycenter {d} and the component data {E,e}, may be written out to designated files.\n" \ "\n" \ " Optionally the point reads also a weight {w[id]} for each data point (row of {D}). The" \ " program then assumes that the data is actually {w[id]} copies of row {id}. The weights" \ " must be non-negative and may be fractional. Only their ratios matter.\n" \ "\n" \ "ALGEBRA\n" \ " In the notation of linear algebra, let {W} be the {nd} by {nd} diagonal matrix" \ " whose element {W[id,id]} is the weight {w[id]}, normalized so that the sum of all" \ " weights is 1. Let {u} be the column {nd}-vector whose elements are all 1.\n" \ "\n" \ " The" \ " barycenter {d} then can be written as the 1 by {nv} (row) vector {u'*W*D}. The covariance" \ " matrix is then {A = V'*W*V} where {V} is the {nd} by {nv} matrix of displacements" \ " from {d} to each {D[id]}, that is, {V = D-u*d}.\n" \ "\n" \ " The pricipal components {E[0..ne-1]} are then the {ne} eigenvectors of the covariance" \ " matrix {A} whose associated eigenvalues are at least {tol^2}. They are best" \ " represented as an array {E} with {ne} rows and {nv} columns, whose rows are" \ " orthonormal (meaning {E*E'} is the {ne} by {ne} identity matrix).\n" \ "\n" \ " The /principal component analysis/ of the data points then consists of" \ " computing the {nd} by {ne} coefficient matrix {C = E'*V}, the {nd} by {nv} projection" \ " matrix {P = C*E}, and the {nd} by {nv} residual matrix {R = V - P}.\n" \ "\n" \ "INPUTS\n" \ " In the input file, blank lines and comments starting with \"#\" to the end" \ " of the line ignored. Each data line normally has the format\n" \ "\n" \ " {K[id]} {D[id,0]} ... {D[id,nv-1]}\n" \ "\n" \ " or, if the data points are weighted,\n" \ "\n" \ " {K[id]} {w[id]} {D[id,0]} ... {D[id,nv-1]} \n" \ "\n" \ " where {K[id]} is a string with no embedded blanks that serves as an" \ " identifier of the data point in the decomposition file. However, the" \ " order of these fields my be varied with the options \"-dataColumns\"," \ " \"-weightColumn\", and \"-keyColumn\" (see below). In general," \ " apart from '#'-comments each non-blank line" \ " must have the same number of non-blank fields separated by blanks.\n" \ "\n" \ "OUTPUTS\n" \ " The program may be asked to write separate files with the results. See" \ " the options \"-centerFile\", \"-compFile\", and \"decompFile\" below.\n" \ "OPTIONS\n" \ " -dataColumns {DATACOL}.. \n" \ " This mandatory argument specifies {nv} indices {col[0..nv-1]} that" \ " indicate which columns of the input datafile are the data" \ " values {D[i,jv]}. Namely {D[id,jv]} will be field {col[jv]} of" \ " line {id} of the file. As per GNU/Linux tradition, columns are" \ " numbered from 1, not 0. Thus each {col[jv]} must be a number" \ " in {1..nf}, where {nf} is the number of fileds in every input line.\n" \ "\n" \ " -weightColumn {WTCOL}\n" \ " This optional argument specifies the index of the column in" \ " the input datafile that contains the weight {w[id]} of the data" \ " point {D[id]}. If given, it must be a number" \ " in {1..nf}. If omitted, the program assumes there is no weight" \ " column in the file, and all data points have the same " \ "weight ({w[id] = 1} for all {id}).\n" \ "\n" \ " -keyColumn {KEYCOL}\n" \ " This optional argument specifies the index of the column in" \ " the input datafile that contains the key {K[id]} identifying eack data" \ " point {D[id]}. If given, it must be a number" \ " in {1..nf}. If omitted, the program that {K[id]} is the string \"P{NNNNNN}\", where" \ " {NNNNNN} is the point index {id} formatted as \"%06d\".\n" \ "\n" \ " -noise {NOISE}\n" \ " This mandatory argument specifies the standard deviation of the" \ " noise expected in each variable {D[i,jv]}. The program will exclude" \ " from the principal components {E[0..ne-1} any vector whose" \ " deviation {e[ie]} is less than or equal to {NOISE} (which" \ " should be non-negative).\n" \ "\n" \ " -centerFile {CENTERFILE}\n" \ " This optional argument asks that the barycenter {d[0..nv-1]} of the" \ " data points be written to the file named {CENTERFILE}. The {nv} coordinates" \ " of this vector will be written, all on the same line, with the format" \ " specified by the \"-format\" option.\n" \ "\n" \ " -covarFile {COVARFILE}\n" \ " This optional argument asks that the covariance matrix {A[0..nv-1,0..nv-1]} of" \ " the data points, relative to the barycenter, be written to the file" \ " named {COVARFILE}. The elements will be written as {nv} lines with {nv} numbers" \ " per line, in the format \"%24.15e\" \n" \ "\n" \ " -compFile {COMPFILE}\n" \ " This optional argument asks that the matrix {E[0..ne-1,0..nv-1]} of principal" \ " component vectors, and the associated deviations {e[0..ne-1]}, be written to the" \ " file named {COMPFILE}. Each row {E[ie]} and the associated deviation {e[ie]} will" \ " be written, in that order, as a line of the file with {nv+1} numbers. The direction" \ " vector coordinates will be printed with fixed format \"%+18.15f\", while the" \ " deviation {e[ie]} will use the format specified by the \"-format\" option.\n" \ "\n" \ " -decompFile {DECOMPFILE}\n" \ " This optional argument asks for an output file named {DECOMPFILE} with the" \ " decomposition of the input data into the computed principal" \ " components {E[0..ne-1]}. Each line of the file have {1 + ne + nv + nv} fields: the" \ " point key {K[id]}, the {ne} coefficients {C[id,0..ne-1]} (projections of {V[id]} along" \ " each vector {E[ie]}), the {nv} components of the projection {P[id]} of {V[id]} on the" \ " row space of {E}, and the {nv} components of the residual {R[id]} orthogonal to" \ " that space. All these numbers will be in the format specified by the \"-format\" option.\n" \ "\n" \ " -format {FORMAT}\n" \ " Specifies a format for printing point coordinates and other commensurate" \ " quantities, including the the coordinates of the barycenter {d} in {CENTERFILE}, the" \ " deviations {e[ie]} in {COMPFILE}, and the elements of {C[id]}, {P[id]}, and" \ " {R[id]} in {DECOMPFILE}. If omitted, defaults to \"-format %+12.5f\".\n" \ "\n" \ " -verbose {VERBOSE}\n" \ " If present, this optional argumet asks the program to print various diagnorstic information.\n" \ "\n" \ "DOCUMENTATION OPTIONS\n" \ argparser_help_info_HELP_INFO "\n" \ "\n" \ "SEE ALSO\n" \ " lineat_fit(JS).\n" \ "\n" \ "AUTHOR\n" \ " Created 2023-02-07 by Jorge Stolfi, IC-UNICAMP.\n" \ "\n" \ "MODIFICATION HISTORY\n" \ " 2023-02-07 J.Stolfi: Created from {linear_fit.c}.\n" \ "\n" \ "WARRANTY\n" \ argparser_help_info_NO_WARRANTY "\n" \ "\n" \ "RIGHTS\n" \ " " pc_decomp_C_COPYRIGHT ".\n" \ "\n" \ argparser_help_info_STANDARD_RIGHTS #define stringify(x) strngf(x) #define strngf(x) #x #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include vec_typedef(fdtype_vec_t, fdype_vec, fget_data_type_t); typedef struct jpc_options_t { int64_vec_t dataColumns; /* Columns of input file with the data point coordinates. */ int32_t keyColumn; /* Column of input file with the point's key, or -1 if none. */ int32_t weightColumn; /* Column of input file with the point's weight, or -1 if none. */ double noise; /* Threshold for component magnitude. */ char *centerFile; /* Name of barycenter output file, or {NULL}. */ char *covarFile; /* Name of output file, or {NULL}. */ char *compFile; /* Name of output file, or {NULL}. */ char *decompFile; /* Name of output file, or {NULL}. */ char *format; /* Format for point coordinates. */ bool_t verbose; /* True to print diagnostics. */ /* Derived column data: */ fdtype_vec_t field_type; /* Type of each data field. */ } jpc_options_t; /* Arguments from command line. */ #define jpc_MAX_VARS 200 /* Max number of variables (point coordinates) */ #define jpc_MAX_FIELDS 1000 /* Max number of data fields to parse or skip. */ void jpc_read_data ( FILE *rd, jpc_options_t *o, int32_t *nd_P, char ***K_P, double **w_P, int32_t *nv_P, double **D_P, bool_t verbose ); /* Reads from {rd} the tuples {K[id]}, {w[id]}, {D[id,0],...D[id,nv-1]}, for {id} in {0..nd-1}, into newly allocated vectors {K[0..nd-1]}, {w[0..nd-1]}, and {D[0..nd*nv-1]}. The array {D} conceptually has {nd} rows and {nv} columns. It is linearized by rows; element {D[id*nv+k]} is conceptually {D[id,jv]} for {jv} in {0..nv-1}. Returns the number of samples {nd} in {*NZp} and the arrays {ID,Z,w,D} {*IDp,*Zp,*Wp,*Xp}. The number of variables {nv} is determined by the option {o->dataColumns}. The number of data points {nd} is the number of lines in the file that are not blank apart from '#'-comments. The variables {K[id]}, {w[id]}, and {D[id,0..nv-1]} are read from the columnd of the file specified by {o->keyColumn}, {o->weightColumn}, and {o->dataColumns}, respectively. These columns should be all distinct and between 1 and {nf} where {nf} is the number of non-blank items in each line (which should be the same for all lines). if {o->keyColumn} is {-1}, the key is set to "P{NNNNNN}" where {NNNNNN} is {id} in the format "%06d". If {o->weightColumn} is {-1}, all weights are set to 1.0. The integers {nv,nd} and the arrays {K,w,D} are returned in {*nv_P,*nd_P,*K_P,*w_P,*D_P}, respectively. */ void jpc_compute_barycenter(int32_t nd, double w[], int32_t nv, double D[], double d[], bool_t verbose); /* Computes the barycenter {d[0..nv-1]}, the average of the rows of {D[0..nd-1,0..nv-1]}, with weights {w{0..nd-1]}. */ void jpc_compute_principal_components ( int32_t nd, double w[], int32_t nv, double D[], double d[], double A[], double noise, int32_t *ne, double E[], double e[], bool_t verbose ); /* Given a point cloud {D[0..nd-1,0..nv-1]}, the point weights {w[0..nd-1]}, and the cloud barycenter {d[0..nv-1]}, computes the covariance matrix {A[0..nv-1,0..nv-1]} as well as the unit eigenvectors {E[0..nv-1,0..nv-1]} and the respective cloud extents (deviations) {e[0..nv-1]}. The eigenvectors (rows of {E}) will be sorted by decreasing extent {e[iv]} The procedure detetermines the number {ne} of eigenvectors whose extents {e[0..ne-1]} are at {noise} or more, and returns that number in {*ne_P). Rows {0..ne} of {E} are the principal components of the cloud. */ void jpc_compute_decomposition ( int32_t nd, int32_t nv, double D[], double d[], int32_t ne, double E[], double **C_P, double **P_P, double **R_P, bool_t verbose ); /* Given a point cloud {D[0..nd-1,0..nv-1]}, the cloud barycenter {d[0..nv-1]} and the principal directions {E[0..ne-1,0..nv-1]}, computes the coefficient matrix {C[0..nd-1,0..ne-1]}, the point projections {P[0..nd-1,0..nv-1]} on the row space of {E}, and the residuals {R[0..nd-1,0..nv-1]}. The matrices {C[0..nd*ne-1]}, {P[0..nd*nv-1]}, and {R[0..nd*nv-1]} are returned in {*C_P,*P_P,*R_P}, respectively. */ void jpc_write_barycenter(char *fname, char *format, int32_t nv, double d[]); void jpc_write_covariance_matrix(char *fname, int32_t nv, double A[]); void jpc_write_principal_components(char *fname, char *format, int32_t ne, int32_t nv, double E[], double e[]); void jpc_write_decomposition(char *fname, char *format, int32_t nd, int32_t ne, int32_t nv, double C[], double P[], double R[]); /* Write the given data tof the file {fname}. */ jpc_options_t *jpc_parse_options(int32_t argc, char **argv); int32_t main (int32_t argc, char **argv) { jpc_options_t *o = jpc_parse_options(argc, argv); bool_t verbose = o->verbose; int32_t nv = -1; /* Number of coordinates per data point. */ int32_t nd = -1; /* Number of data points. */ char **K = NULL; /* Data point ID key {K[0..nd-1]}. */ double *w = NULL; /* Weights of data records are {w[0..nd-1]}. */ double *D = NULL; /* Input independent variable values are {D[0..nd*nv-1]}. */ jpc_read_data(stdin, o, &nd, &K, &w, &nv, &D, verbose); if (verbose) { fprintf(stderr, "read %d data samples {D[id,jv]}\n", nd); } /* Find point barycenter: */ double *d = rn_alloc(nd); /* Point barycenter, {d[0..nv-1]}. */ jpc_compute_barycenter(nd, w, nv, D, d, verbose); if (o->centerFile != NULL) { jpc_write_barycenter(o->centerFile, o->format, nv, d); } /* Allocate covariance matrix {A[0..nv-1,0..nv-1]}: */ double *A = rmxn_alloc(nv,nv); /* Allocate eigenvector matrix {E[0..nv-1,0..nv-1]}: */ double *E = rmxn_alloc(nv,nv); /* Allocate extent vactor {e[0..nv-1]}: */ double *e = rn_alloc(nv); int32_t ne = -1; /* Number of principal components accepted. */ jpc_compute_principal_components(nd, w, nv, D, d, A, o->noise, &ne, E, e, verbose); if (o->covarFile != NULL) { jpc_write_covariance_matrix(o->covarFile, nv, A); } if (o->compFile != NULL) { jpc_write_principal_components(o->compFile, o->format, ne, nv, E, e); } if (o->decompFile != NULL) { /* Compute the PC decomposition: */ double *C = NULL; /* Coefficients of princ comps, {C[0..dn*ne-1]}. */ double *P = NULL; /* Projection {C[0..dn*ne-1]}. */ double *R = NULL; jpc_compute_decomposition(nd, nv, D, d, ne, E, &C, &P, &R, verbose); jpc_write_decomposition(o->decompFile, o->format, nd, ne, nv, C, P, R); } return 0; } void jpc_write_barycenter(char *fname, char *format, int32_t nv, double d[]) { FILE *wr = open_write(fname, TRUE); for (uint32_t jv = 0; jv < nv; jv++) { fprintf(wr, format, d[jv]); if (jv > 0) { fputc(' ', wr); } } fputc('\n', wr); fclose(wr); } void jpc_write_covariance_matrix(char *fname, int32_t nv, double A[]) { FILE *wr = open_write(fname, TRUE); for (uint32_t kv = 0; kv < nv; kv++) { for (uint32_t jv = 0; jv < nv; jv++) { fprintf(wr, "%24.14e", A[kv*nv + jv]); if (jv > 0) { fputc(' ', wr); } } fputc('\n', wr); } fclose(wr); } void jpc_write_principal_components(char *fname, char *format, int32_t ne, int32_t nv, double E[], double e[]) { FILE *wr = open_write(fname, TRUE); for (uint32_t ie = 0; ie < ne; ie++) { for (uint32_t jv = 0; jv < nv; jv++) { fprintf(wr, "%+15.12f", E[ie*nv + jv]); if (jv > 0) { fputc(' ', wr); } } fputs(" ", wr); fprintf(wr, format, e[ie]); fputc('\n', wr); } fclose(wr); } void jpc_write_decomposition(char *fname, char *format, int32_t nd, int32_t nv, int32_t ne, double C[], double P[], double R[]) { FILE *wr = open_write(fname, TRUE); for (uint32_t id = 0; id < nd; id++) { for (uint32_t ie = 0; ie < ne; ie++) { fprintf(wr, format, C[id*ne + ie]); if (ie > 0) { fputc(' ', wr); } } fputs(" ", wr); for (uint32_t jv = 0; jv < nv; jv++) { fprintf(wr, format, P[id*nv + jv]); if (jv > 0) { fputc(' ', wr); } } fputs(" ", wr); for (uint32_t jv = 0; jv < nv; jv++) { fprintf(wr, format, R[id*nv + jv]); if (jv > 0) { fputc(' ', wr); } } fputc('\n', wr); } fclose(wr); } void jpc_read_data ( FILE *rd, jpc_options_t *o, int32_t *nd_P, char ***K_P, double **w_P, int32_t *nv_P, double **D_P, bool_t verbose ) { /* Get the number {nv} and columns of variables (point coordinates): */ int32_t nv = o->dataColumns.ne; assert(nv > 0); /* Get number {nf} of data fields to read/skip, and type table: */ /* Note that there may be no weight of key. */ int32_t nf = o->field_type.ne; assert(nf >= nv); fget_data_type_t *type = o->field_type.e; /* Allocate data arrays: */ int32_t nd0 = 2048; /* Initial allocation; may be expanded. */ string_vec_t K = string_vec_new(nd0); double_vec_t w = double_vec_new(nd0); double_vec_t D = double_vec_new(nv*nd0); /* Arrays {alf,num} for {fget_data_fields} results: */ /* Note that these arrays are indexed {0..nf-1} */ /* because {fget_data_line} numbers the columns from 0. */ char *alf[nf]; double num[nf]; /* Read the data lines: */ int32_t nd = 0; /* Number of data lines seen. */ while (TRUE) { bool_t ok = fget_data_fields(rd, '#', nf, type, alf, num); if (! ok) { break; } int32_t id = nd; /* Row index of {D}, index of {K,w}. */ /* Expand arrays if needed: */ string_vec_expand(&K, id); double_vec_expand(&w, id); double_vec_expand(&D, id*nv + nv-1); /* Get the data point: */ double *Di = &(D.e[id*nv]); for (uint32_t jv = 0; jv < nv; jv++) { int32_t kf = (int32_t)(o->dataColumns.e[jv] - 1); double Dij = num[kf]; demand((! isnan(Dij)) && isfinite(Dij), "invalid data point coordinate"); Di[jv] = Dij; } /* Get or fake the weight: */ double wi; if (o->weightColumn > 0) { int32_t kf = (int32_t)(o->weightColumn - 1); wi = num[kf]; demand((! isnan(wi)) && isfinite(wi) && (wi >= 0), "invalid weight value"); } else { wi = 1.0; } w.e[id] = wi; /* Get or fake the key: */ char *Ki; if (o->keyColumn > 0) { int32_t kf = (int32_t)(o->keyColumn - 1); char *Ki = alf[kf]; assert(Ki != NULL); } else { char *Ki = NULL; char *Ki = jsprintf("P%06d", id); } K.e[id] = Ki; /* Skip rest of line: */ fget_skip_to_eol(rd); } string_vec_trim(&K,nd); double_vec_trim(&w,nd); double_vec_trim(&D,nd*nv); (*nd_P) = nd; (*K_P) = K.e; (*w_P) = w.e; (*nd_P) = nd; (*D_P) = D.e; } void jpc_compute_barycenter(int32_t nd, double w[], int32_t nv, double D[], double d[], bool_t verbose) { double d[nv]; double sum_w = 0; for (uint32_t kv = 0; kv < nv; kv++) { double sum_wD = 1.0e-308; for (int32_t id = 0; id < nd; id ++) { double wi = w[id]; double Dik = D[id*nv + kv]; if (kv == 0) { sum_w += wi; } sum_wD += wi*Dik; } assert(sum_w > 0); d[kv] = sum_wD/sum_w; } } void jpc_compute_principal_components ( int32_t nd, double w[], int32_t nv, double D[], double d[], double A[], double noise, int32_t *ne_P, double E[], double e[], bool_t verbose ) { if ( jspca_compute_components(nd, nv, D, d, w, E, e, verbose); /* Determine the number {ne} of significant components: */ int32_t ne = nv; while ((ne > 0) && (e[ne-1] < noise)) { ne--; } if (A_P != NULL) (*A_P) = A; (*ne_P) = ne; (*E_P) = E; if (A_P != NULL) (*A_P) = A; } void jpc_compute_decomposition ( int32_t nd, int32_t nv, double D[], double d[], int32_t ne, double E[], double **C_P, double **P_P, double **R_P, bool_t verbose ) { } void jpc_build_model ( int32_t nd, double Z[], double w[], int32_t nv, double D[], bool_t unitTerm, int32_t ne, double E[], bool_t verbose ) { double Xave[nv]; /* If {unitTerm}, average value of each {D[id]}. */ double Zave = NAN; /* If {unitTerm}, average value of {Z}. */ if (unitTerm) { demand(ne == nv + 1, "{ne} should be {nv+1}"); if (verbose) { fprintf(stderr, "removing variable averages...\n"); } /* Compute the weighted sums of {D[k],Z} in {Xave[k],Zave}: */ for (uint32_t k = 0; k < nv; k++) { Xave[k] = 0; } Zave = 0; double sumW = 0; for (uint32_t id = 0; id < nd; id++) { double Wi = w[id]; demand(Wi >= 0, "invalid weight {w[id]}"); double *Xi = &(D[id*nv]); for (uint32_t k = 0; k < nv; k++) { Xave[k] += Wi*Xi[k]; } Zave += Wi*Z[id]; sumW += Wi; } /* Convert weighted sums to weighted averages: */ if (sumW > 0) { for (uint32_t k = 0; k < nv; k++) { Xave[k] /= sumW; } Zave /= sumW; } /* Subtract averages from all variables: */ for (uint32_t id = 0; id < nd; id++) { double *Xi = &(D[id*nv]); for (uint32_t k = 0; k < nv; k++) { Xi[k] -= Xave[k]; } Z[id] -= Zave; } } else { demand(ne == nv, "{ne} should be {nv}"); } /* Build the least squares system: */ if (verbose) { fprintf(stderr, "building linear system matrices...\n"); } double *A = rmxn_alloc(nv, nv); double B[nv]; rn_zero(nv, B); rmxn_zero(nv, nv, A); /* Accumulate the matrix and vector of the least squares system: */ for (uint32_t id = 0; id < nd; id++) { double *Xi = &(D[id*nv]); jpc_accum_system(Z[id], w[id], nv, Xi, A, B); } if (verbose) { rmxn_gen_print2(stderr, nv, nv, A, 1, B, "%+18.9f", " ","\n ","\n", "[ "," "," ]", " "); } if (verbose) { fprintf(stderr, "solving system...\n"); } double tiny = 1.0e-8; uint32_t rank; gausol_solve(nv, nv, A, 1, B, E, TRUE,TRUE, tiny, NULL, &rank); if (rank < nv) { fprintf(stderr, "!! warning: system rank = %d\n", rank); } if (verbose) { rmxn_gen_print(stderr, nv, 1, E, "%+18.9f", " ","\n ","\n", "[ "," "," ]"); } if (unitTerm) { if (verbose) { fprintf(stderr, "adding variable averages to indep term...\n"); } double Cunit = Zave; for (uint32_t k = 0; k < nv; k++) { Cunit -= E[k]*Xave[k]; } E[ne-1] = Cunit; } free(A); } void jpc_apply_model(int32_t nd, int32_t nv, double D[], bool_t unitTerm, int32_t ne, double E[], double Y[]) { assert(ne == nv + (unitTerm ? 1 : 0)); for (uint32_t id = 0; id < nd; id++) { double *Xi = &(D[id*nv]); double sum = 0; for (uint32_t k = 0; k < nv; k++) { sum += E[k]*Xi[k]; } if (unitTerm) { sum += E[nv]; } Y[id] = sum; } } void jpc_residual_stats(int32_t nd, double Z[], double w[], double Y[], double *avgP, double *devP) { double sumWD = 0; double sumW = 0; for (uint32_t id = 0; id < nd; id++) { sumWD += w[id]*(Z[id] - Y[id]); sumW += w[id]; } double avg = sumWD/sumW; demand(sumW > 0, "total weight is zero"); double sumWD2 = 0; for (uint32_t id = 0; id < nd; id++) { double Di = (Z[id] - Y[id]) - avg; sumWD2 += w[id]*Di*Di; } double dev = sqrt(sumWD2/sumW); (*avgP) = avg; (*devP) = dev; } void jpc_write_model(char *fname, int32_t nv, char *tName[], bool_t unitTerm, int32_t ne, double E[], double avg, double dev) { assert(ne == nv + (unitTerm ? 1 : 0)); FILE *wr = open_write(fname, TRUE); fprintf(wr, "%d\n", nv); for (uint32_t k = 0; k < nv; k++) { fprintf(wr, "%3d %+24.16e", k, E[k]); if (tName != NULL) { fprintf(wr, " # %s", tName[k]); } fprintf(wr, "\n"); } if (unitTerm) { fprintf(wr, "%3d %+24.16e # 1\n", nv, E[nv]); } fprintf(wr, " %+24.16e\n", avg); fprintf(wr, " %+24.16e\n", dev); fclose(wr); } void jpc_print_model(FILE *wr, int32_t nv, char *tName[], bool_t unitTerm, int32_t ne, double E[], double avg, double dev) { assert(ne == nv + (unitTerm ? 1 : 0)); /* Index sort of {E[0..nv-1]} by absolute value of coefficient (leave {E[nv]} at end): */ int32_t ix[ne]; for (uint32_t k = 0; k < ne; k++) { ix[k] = k; } for (uint32_t k = 0; k < nv; k++) { /* Largest coeffs are {E[ix[id]]} for {id} in {0..k-1}. */ /* Find largest unsorted coeff: */ int32_t imax = k; for (int32_t id = k+1; id < nv; id++) { if (fabs(E[ix[id]]) > fabs(E[ix[imax]])) { imax = id; } } if (imax != k) { int32_t t = ix[k]; ix[k] = ix[imax]; ix[imax] = t; } } fprintf(wr, "fitted model:\n"); for (uint32_t k = 0; k < ne; k++) { int32_t id = ix[k]; fprintf(wr, " %+14.9f", E[id]); if (id < nv) { fprintf(wr, " * D[%2d]", id); if (tName != NULL) { fprintf(wr, " # %s", tName[id]); } } else { fprintf(wr, "%8s", ""); fprintf(wr, " # 1"); } fprintf(wr, "\n"); } fprintf(wr, " %+14.9f # avg residual\n", avg); fprintf(wr, " %+14.9f * RND() # dev residual\n", dev); } void jpc_accum_system(double Zi, double Wi, int32_t nv, double Xi[], double *A, double *B) { for (uint32_t k = 0; k < nv; k++) { B[k] += Wi*Zi*Xi[k]; for (uint32_t jv = 0; jv < nv; jv++) { A[k*nv + jv] += Wi*Xi[k]*Xi[jv]; } } } void jpc_write_data(FILE *wr, int32_t nd, char *K[], double Z[], double Y[], char *fmt) { for (uint32_t id = 0; id < nd; id++) { fprintf(wr, "%s ", K[id]); fprintf(wr, fmt, Z[id]); fprintf(wr, " "); fprintf(wr, fmt, Y[id]); fprintf(wr, "\n"); } fflush(wr); } #define jpc_terms_MAX 50 /* Maximum number of terms (group averages) in model. */ jpc_options_t *jpc_parse_options(int32_t argc, char **argv) { /* Initialize argument parser: */ argparser_t *pp = argparser_new(stderr, argc, argv); argparser_set_help(pp, PROG_NAME " version " PROG_VERS ", usage:\n" PROG_HELP); argparser_set_info(pp, PROG_INFO); argparser_process_help_info_options(pp); /* Allocate the command line argument record: */ jpc_options_t *o = notnull(malloc(sizeof(jpc_options_t)), "no mem"); /* Parse keyword parameters: */ /* Get indices of data file colums where the variables are: */ uint32_t nv_max = jpc_MAX_VARS; uint32_t nf_max = jpc_MAX_FIELDS; o->dataColumns = argparser_get_int_list(pp, 1000, nv_max, "-dataColumns", 1, (int32_t)nf_max); uint32_t nv = o->dataColumns.ne; /* Number of variables. */ if (nv == 0) { argparser_error(pp, "must specify at least one data column"); } /* Clear out the data file column types: */ o->type = fdtype_vec_new(nf_max); for (uint32_t kf = 0; kf < nf_max; kf++) { o->type.e[kf] = fget_data_type_NOT; } /* Set type = numeric for all the columns that contain variables. Remember largest col: */ uint32_t nf = 0; /* Number of data file cols actually used. */ for (uint32_t kv = 0; kv < nv; kv++) { int32_t kf = o->dataColumns.e[kv]; if (kf > 0) { defcol(kf, fget_data_type_NUM, FALSE, o->type.e, &nf); } } /* Get the weight column, if any: */ if (argparser_keyword_present(pp, "-weightColumn")) { o->weightColumn = (int32_t)argparser_get_next_int(pp, 1, jpc_MAX_FIELDS); defcol(o->weightColumn, fget_data_type_NUM, TRUE, o->type.e, &nf); } else { o->weightColumn = -1; }; /* Get the point key column, if any: */ if (argparser_keyword_present(pp, "-keyColumn")) { o->keyColumn = (int32_t)argparser_get_next_int(pp, 1, jpc_terms_MAX); defcol(o->keyColumn, fget_data_type_ALF, FALSE, o->type.e, &nf); } else { o->keyColumn = -1; }; argparser_get_keyword(pp, "-noise"); o->noise = argparser_get_next_double(pp, 0.0, 1.0e200); if (argparser_keyword_present(pp, "-centerFile")) { o->centerFile = argparser_get_next_non_keyword(pp); } else { o->centerFile = NULL; }; if (argparser_keyword_present(pp, "-covarFile")) { o->covarFile = argparser_get_next_non_keyword(pp); } else { o->covarFile = NULL; }; if (argparser_keyword_present(pp, "-compFile")) { o->compFile = argparser_get_next_non_keyword(pp); } else { o->compFile = NULL; }; if (argparser_keyword_present(pp, "-decompFile")) { o->decompFile = argparser_get_next_non_keyword(pp); } else { o->decompFile = NULL; }; if (argparser_keyword_present(pp, "-format")) { o->format = argparser_get_next_non_keyword(pp); } else { o->format = "%+12.5f"; }; if (argparser_keyword_present(pp, "-verbose")) { o->verbose = argparser_get_next_bool(pp); } else { o->verbose = FALSE; }; /* Check for spurious arguments: */ argparser_finish(pp); return o; /* ---------------------------------------------------------------------- */ void defcol(uint64_t col, fget_data_type_t tkf, fget_data_type_t type[], uint32_t *nf_P) { demand(col >= 1, "invalid column"); uint32_t kf = (uint32_t)(col - 1); assert(kf < nf_max); fget_data_set_field_type(kf, tkf, rep_ok, nf_max, type); if (kf >= (*nf_P)) { (*nf_P) = kf + 1; } } } vec_typeimpl(fdtype_vec_t, fdype_vec, fget_data_type_t);