#! /usr/bin/gawk -f
# Last edited on 2014-05-10 16:10:57 by stolfilocal

# Reads a file with Slumber weights, and actual and predicted prices at the Slumber times.
# Writes to {stderr} statistics on the prediction errors.
# Writes a histogram to {stdout}.
#
# Assumes that the prices are integers.
#
# Client must define (with "-v") the histogram bin width {bwid}, an odd integer,
# and the initial and final dates {idate,fdate}.
#
# Input data lines have 17 whitespace-separated fields, with a "|" in every
# odd-numbered field from 3 on:
#
#   date time | hours | weight | lo | hi | mid | slumber_pred | banal_pred |
#
# Output histogram lines have the format "{emin} {emax} {w[0]} .. {w[np-1]}",
# where {emin..emax} is the range of errors covered by the bin and {w[k]} is
# the total weight of predictions of method {k} whose error fell in that bin.

BEGIN {
  # {abort} is negative while all is well; the error functions set it to
  # the desired exit status, which the END rule then honors.
  abort = -1;

  if (idate == "") { arg_error(("must define {idate}")); }
  if (fdate == "") { arg_error(("must define {fdate}")); }
  if (bwid == "") { arg_error(("must define {bwid}")); }
  if ((bwid % 2) != 1) { arg_error(("bin width {bwid} must be odd")); }

  brad = (bwid - 1)/2;  # Bin radius.

  np = 2;     # Number of prediction columns.
  sum_w = 0;  # Sum of weights.
  nd = 0;     # Number of data lines with valid predictions.

  # Indexed {0..np-1}:
  split("", mname);       # Names of prediction methods.
  split("", icol);        # Indices of prediction columns.
  split("", sum_we);      # Sums of weighted errors.
  split("", sum_we2);     # Sums of weighted squared errors.
  split("", nd_best);     # Number of times the predictor was the best.
  split("", sum_w_best);  # Sum of all weights when this predictor was best.

  # Indexed {0..np-1,bmin..bmax}:
  split("", hist);  # Total weight for each prediction and error value.
  bmin = 0;         # Minimum histogram bin index used.
  bmax = 0;         # Maximum histogram bin index used.

  mname[0] = "Slumber"; icol[0] = 14;
  mname[1] = "Banal";   icol[1] = 16;

  for (kp = 0; kp < np; kp++) {
    sum_we[kp] = 0;
    sum_we2[kp] = 0;
    hist[kp,0] = 0;
    nd_best[kp] = 0;
    sum_w_best[kp] = 0;
  }
}

# Stop reading input as soon as any error has been flagged.
(abort >= 0) { exit abort; }

/^[ ]*([#]|$)/ {
  # Comment or blank:
  next;
}

/[!]/ {
  # Table header:
  next;
}

/^20[0-9][0-9][-]/ {
  # Data line:
  if (NF != 17) { data_error(("invalid {NF} = " NF)); }
  for (i = 3; i <= NF; i += 2) {
    if ($(i) != "|") { data_error(("expected '|' on column " i)); }
  }
  dt = $1;   # Date.
  tm = $2;   # Time of start of hourly interval.
  hr = $4;   # Hours since Jan/01, 2014.
  wt = $6;   # Slumber confidence weight.
  lo = $8;   # Low price in hour interval.
  hi = $10;  # High price in hour interval.
  md = $12;  # Mid price {(lo+hi)/2} in interval.
  ps = $14;  # Slumber Method prediction, or 0 if none.
  pb = $16;  # Banal method prediction, or 0 if none.

  # Consider only lines inside the date range where both methods predicted:
  if ((ps != 0) && (pb != 0) && (dt >= idate) && (dt <= fdate)) {
    nd++;
    sum_w += wt;
    erbest = 999999999;  # Min absolute error among predictors.
    kpbest = -1;         # Predictor index that gave the minimum error.
    for (kp = 0; kp < np; kp++) {
      pr = $(icol[kp]);  # Predicted value.
      er = pr - md;      # Signed prediction error.
      sum_we[kp] += wt*er;
      sum_we2[kp] += wt*er*er;

      # Look for best predictor.  The comparison must use the magnitude of
      # the error; with the signed error a large underestimate would always
      # beat a small overestimate.  Ties go to the lower-indexed predictor.
      aer = (er < 0 ? -er : er);
      if (aer < erbest) { erbest = aer; kpbest = kp; }

      # Compute the histogram bin index {ib}, rounding symmetrically
      # around zero so that bin 0 covers errors {-brad..+brad}:
      ib = (er >= 0 ? int((er + brad)/bwid) : -int((brad - er)/bwid));

      # Extend the used bin range as needed, zeroing the new bins
      # for every predictor so the output has no empty entries:
      while (bmin > ib) {
        bmin--;
        for (ip = 0; ip < np; ip++) { hist[ip,bmin] = 0; }
      }
      while (bmax < ib) {
        bmax++;
        for (ip = 0; ip < np; ip++) { hist[ip,bmax] = 0; }
      }
      hist[kp,ib] += wt;
    }
    nd_best[kpbest]++;
    sum_w_best[kpbest] += wt;
  }
  next;
}

END {
  if (abort >= 0) { exit abort; }
  printf "considerd %d predictions\n", nd > "/dev/stderr";

  # Every statistic below divides by {sum_w}; without this check an empty
  # selection (or all-zero weights) ends in a fatal division by zero.
  if (sum_w == 0) { end_error(("total weight is zero, cannot compute statistics")); }

  for (kp = 0; kp < np; kp++) {
    nm = mname[kp];
    ndb = nd_best[kp];                # Num of times {kp} was best.
    pwb = 100*sum_w_best[kp]/sum_w;   # Percent weight when {kp} was best.
    avg = sum_we[kp]/sum_w;           # Average error.
    rms = sqrt(sum_we2[kp]/sum_w);    # RMS error.
    printf "%-20s mean error = %+7.1f rms error = %7.1f", (nm ":"), avg, rms > "/dev/stderr";
    printf " was best %d times (%6.2f%% of weight)\n", ndb, pwb > "/dev/stderr";
  }

  # Write histogram to standard output:
  for (ib = bmin; ib <= bmax; ib++) {
    er = ib*bwid;       # Central error value of bin {ib}.
    emin = er - brad;
    emax = er + brad;
    printf "%5d %5d", emin, emax;
    for (ip = 0; ip < np; ip++) { printf " %8.3f", hist[ip,ib]; }
    printf "\n";
  }
}

function arg_error(msg) {
  # Reports a command-line argument error {msg} to {stderr} and
  # terminates the program with status 1.
  printf "** %s\n", msg > "/dev/stderr";
  # printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1
}

function data_error(msg) {
  # Reports an input data error {msg} to {stderr}, prefixed with the
  # current file name and line number, and terminates with status 1.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1
}

function end_error(msg) {
  # Reports an error {msg} detected during end-of-input processing
  # to {stderr} and terminates the program with status 1.
  printf "** %s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1
}