#! /usr/bin/gawk -f
# Last edited on 2015-02-24 14:59:38 by stolfilocal

# Reads a daily price/volume series file.  Writes to {stdout} a file
# with smoothed daily mean prices.
#
# The user must define (with {export}) the environment variable {TZ="UTC"}.
# The user must load (with "-f") the library {merged_price_functions.gawk}, and
# define (with "-v") the variables
#
#   {inFile} name of input series file.
#   {hrad}, the half-width of the smoothing window.
#
# The input file {inFile} must contain daily price and volume data for some
# exchange in some currency.  Each input line must have the format
#
#   "{DATE} {TIME} | {OPEN} | {HIGH} | {LOW} | {CLOSE} | {VBT} | {VCR} | {WTPRICE}"
#
# where {DATE} is the line's date (UTC), {TIME} must be "00:00:00",
# {VBT} is the total btc volume traded in that day, and {VCR} the total
# volume in the exchange currency.  The other fields are ignored, except for
# validation.  The {DATE}s must be consecutive days; days with
# missing data must be present and have {VBT} and {VCR} both zero.
#
# Writes to standard output one line number {i} per day in the format
# "{DATE} {TIME} | {VBT[i]} | {VCR[i]} | {PMD[i]}", where
#
#   {DATE} is every day in the input file;
#   {TIME} is always "00:00:00";
#   {VBT[i]} and {VCR[i]} are the input daily volumes on the {DATE};
#   {PMD[i]} is a smoothed average of the price {VCR[j]/VBT[j]} in a window
#     of several days around the {DATE}.

BEGIN \
  { if (inFile == "") { arg_error(("must define {inFile}")); }
    if (hrad == "") { arg_error(("must define {hrad}")); }
    if (hrad !~ /^[0-9]+$/) { arg_error(("invalid {hrad}")); }

    pi = 3.1415926;

    # Units in the last place (quantization steps) of the input fields:
    ulp_vbt = 0.0001;  # Input BTC daily volume {vbt}.
    ulp_vcr = 0.0001;  # Input currency daily volume {vcr}.
    ulp_pav = 0.00001; # Input mean price {pav}.
    ulp_phl = 0.00001; # Input {plo,phi}.

    # Series data tables, all indexed by {[DATE]}:
    split("", date_ot); # Set to "1" for dates that exist, undef otherwise.
    split("", vbt_ot);  # BTC volume.
    split("", vcr_ot);  # Currency volume.
    split("", pmd_ot);  # Smoothed average price in window around date.

    # Read the series and store in series data tables:
    read_daily_summary_file(inFile, date_ot,vbt_ot,vcr_ot);

    # Sort by date; {date_ot[1..ndays]} becomes the existing dates, in order:
    ndays = asorti(date_ot);

    compute_local_averages(inFile,ndays,hrad,date_ot,vbt_ot,vcr_ot,pmd_ot);
    write_smoothed_price(ndays,date_ot,vbt_ot,vcr_ot,pmd_ot);
    exit(0);
  }

function read_daily_summary_file \
  ( fname, \
    date_ot,vbt_ot,vcr_ot, \
    nlin,lin,ndays,fld,nfld,date,time,pop,phi,plo,pcl,vbt,vcr,pav,j,odate \
  )
  { # Reads from file {fname} the trade summary data in 1 day intervals.
    # Sets {date_ot[dt]} to 1, and stores the BTC and currency volumes in
    # {vbt_ot[dt],vcr_ot[dt]}, for each date {dt} present in the file.
    # Days with no data must have a line with the corresponding date and
    # zero volumes.
    printf "reading file %s ...\n", fname > "/dev/stderr";
    ERRNO = "";
    # Read the file:
    nlin = 0;   # Number of lines read.
    ndays = 0;  # Number of non-blank, non-header, non-comment lines.
    odate = ""; # Date on previous data line.
    while((getline lin < fname) > 0)
      { nlin++;
        # Remove tabs, inline comments, spurious blanks:
        gsub(/[\011]/, " ", lin);
        gsub(/[\#].*$/, "", lin);
        gsub(/^[ ]+/, "", lin);
        gsub(/[ ]+$/, "", lin);
        gsub(/[ ][ ]+/, " ", lin);
        # Lines containing '!' are headers and are skipped:
        if ((lin != "") && (! match(lin, /[!]/)))
          { nfld = split(lin, fld, " ");
            if (nfld != 16) { file_error(fname, nlin, ("bad summary entry = \"" lin "\"")); }
            # BUG FIX: this loop used {NF}, which is the field count of the
            # current main-input record -- always 0 inside a {getline var}
            # loop in {BEGIN} -- so the '|' separators were never checked.
            # It must use {nfld}, the field count of the line just split.
            for (j = 3; j <= nfld; j = j + 2)
              { if (fld[j] != "|") { file_error(fname, nlin, ("missing '|' in column " j ", line = \"" lin "\"")); } }
            date = mpf_check_date(fname,nlin,fld[1]);
            if (date in date_ot) { file_error(fname, nlin, ("repeated date = \"" date "\"")); }
            time = fld[2];
            if (time != "00:00:00") { file_error(fname, nlin, ("invalid time = \"" time "\"")); }
            pop = mpf_check_num(fname, nlin, fld[4]);
            phi = mpf_check_num(fname, nlin, fld[6]);
            plo = mpf_check_num(fname, nlin, fld[8]);
            pcl = mpf_check_num(fname, nlin, fld[10]);
            vbt = mpf_check_num(fname, nlin, fld[12]);
            vcr = mpf_check_num(fname, nlin, fld[14]);
            pav = mpf_check_num(fname, nlin, fld[16]);
            if ((odate != "") && (! mpf_dates_are_consecutive(odate,date)))
              { file_error(fname,nlin, ("non-consecutive dates \"" odate "\" \"" date "\"")); }
            odate = date;
            # Consistency checks.
            # NOTE(review): {usf_check_prices} is not defined in this file and
            # must come from a loaded library; the local {check_prices} below
            # looks like an unused older version of it -- confirm which one
            # is intended.
            usf_check_prices(fname,nlin, pop,plo,phi,pcl,vbt,vcr,pav, ulp_phl,ulp_vbt,ulp_vcr,ulp_pav);
            # Save in arrays:
            date_ot[date] = 1;
            vbt_ot[date] = vbt;
            vcr_ot[date] = vcr;
            ndays++;
          }
      }
    if ((ERRNO != "0") && (ERRNO != "")) { file_error(fname, nlin, ERRNO); }
    close (fname);
    if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
    printf "%6d input lines read\n", nlin > "/dev/stderr";
    printf "%6d data lines found\n", ndays > "/dev/stderr";
  }

function check_prices(fname,nlin,vbt,vcr,plo,phi,pav, pavc,ploc,phic,dif)
  { # Checks consistency of average, low, high prices on line {nlin} of {fname}.
    # Uses global parameters {ulp_vbt,ulp_vcr,ulp_pav,ulp_phl}.
    # Either both volumes are zero (missing day) or both are nonzero:
    if ((vcr == 0) != (vbt == 0)) { file_error(fname, nlin, ("inconsistent vbt = \"" vbt "\" vcr = \"" vcr "\"")); }
    if (vbt != 0)
      { if ((vbt < ulp_vbt) || (vcr < ulp_vcr))
          { file_error(fname, nlin, ("too small vbt = \"" vbt "\" or vcr = \"" vcr "\"")); }
        pavc = vcr/vbt; # Computed average price.
        # Min/max average price assuming worst-case rounding of {vbt,vcr}:
        ploc = (vcr - 0.50001*ulp_vcr)/(vbt + 0.50001*ulp_vbt);
        phic = (vcr + 0.50001*ulp_vcr)/(vbt - 0.50001*ulp_vbt);
        # BUG FIX: the discrepancy test was one-sided ({pavc - pav > tol}),
        # so a reported {pav} much larger than the computed average went
        # unflagged; use the absolute difference, consistent with the
        # symmetric "excessive error" test below.
        dif = pavc - pav; if (dif < 0) { dif = -dif; }
        if (dif > 0.500001*ulp_pav)
          { printf "%s:%s: !! average price inconsistent", fname, nlin > "/dev/stderr";
            printf " is %.5f should be %.5f\n", pav, pavc > "/dev/stderr";
            if ((phic < pav - 0.500001*ulp_pav) || (ploc > pav + 0.500001*ulp_pav))
              { file_error(fname, nlin, ("excessive error")); }
          }
        if ((pavc < plo - 0.500001*ulp_phl) || (pavc > phi + 0.500001*ulp_phl))
          { printf "%s:%s: !! average price out of lo-hi range", fname, nlin > "/dev/stderr";
            printf " is %.5f should be in [%.5f _ %.5f]\n", pavc, plo, phi > "/dev/stderr";
            if ((phic < plo - 0.500001*ulp_phl) || (ploc > phi + 0.500001*ulp_phl))
              { file_error(fname, nlin, ("excessive overflow")); }
          }
      }
  }

function compute_local_averages \
  ( fname,ndays,hrad,date_ot,vbt_ot,vcr_ot,pmd_ot, \
    kd,date,odate,pmd \
  )
  { # Assumes {date_ot[1..ndays]} are the selected input dates, in order.
    # Computes {pmd_ot[date]} as the average of prices in a window around {date}.
    # Sets {pmd_ot[date]} to zero if the price at the center of the window
    # is missing.
    odate = ""; # Date on previous table line.
    for (kd = 1; kd <= ndays; kd++)
      { date = date_ot[kd];
        if ((odate != "") && (! mpf_dates_are_consecutive(odate,date)))
          { # BUG FIX: {prog_error} was called here with three arguments
            # ({fname,nlin,msg}), although its other use in this file takes a
            # single message (gawk fatals when a call has more arguments than
            # the declaration), and {nlin} was an uninitialized global here.
            prog_error(("non-consecutive dates \"" odate "\" \"" date "\" in " fname));
          }
        pmd = compute_one_local_average(ndays,kd,hrad,date_ot,vbt_ot,vcr_ot);
        pmd_ot[date] = pmd;
        odate = date;
      }
  }

function compute_one_local_average \
  ( ndays,kd,hrad,date_ot,vbt_ot,vcr_ot, \
    rad,jd,id,date,w,vcr,vbt,pmd,wht \
  )
  { # Assumes {date_ot[1..ndays]} are the selected dates, in order.
    # Computes the average of prices in a soft window around {date_ot[kd]}.
    # The window spans {2*hrad+1} days.
    #
    # The procedure fits a polynomial by least squares to the daily mean
    # prices in the window, weighted by a Hann-like window weight and the
    # currency volume, then evaluates the fitted line at the current date.
    # Returns 0.0 if there is not enough data to fit a line.
    #
    # Collect the data points in window, indexed {-hrad..+hrad}:
    split("", pmd); # Mean price per day.
    split("", wht); # Weight (including currency volume) per day.
    for (jd = -hrad; jd <= hrad; jd++)
      { pmd[jd] = 0.0; wht[jd] = 0.0; # By default.
        id = kd + jd;
        if (id in date_ot)
          { date = date_ot[id];
            vcr = (date == "" ? 0.0 : vcr_ot[date]);
            vbt = (date == "" ? 0.0 : vbt_ot[date]);
            # Ignore days whose volumes are too small to give a reliable price:
            if ((vcr >= 3*ulp_vcr) && (vbt >= 3*ulp_vbt))
              { w = 0.5*(1 + cos(pi*jd/(hrad + 0.5))); # Hann-like weight.
                pmd[jd] = vcr/vbt;
                wht[jd] = w;
              }
          }
      }
    # Fit polynomial and evaluate at center:
    return fit_and_eval(1,hrad,pmd,wht);
  }

function fit_and_eval \
  ( deg,hrad,pmd,wht, \
    j,w,p,A00,A01,A11,A0p,A1p,M00,M01,M11,c0,c1,det \
  )
  { # Fits a polynomial {P(j)} of degree {deg} (only {deg <= 1} is implemented)
    # to the data points {(j,log(pmd[j]))} with weight {wht[j]}, then evaluates
    # and returns {exp(P(0))}.
    # Returns 0.0 if there is not enough data to fit the polynomial.
    # FIX: {det} added to the local-variable list above; it was leaking
    # as a global variable.
    if (deg > 1) { prog_error(("degree not implemented")); }
    # Normal-equation sums for the weighted least-squares line fit:
    A00 = 0; # { = SUM{wht[j]} }
    A01 = 0; # { = SUM{wht[j]*j} }
    A11 = 0; # { = SUM{wht[j]*j*j} }
    A0p = 0; # { = SUM{wht[j]*pmd[j]} }
    A1p = 0; # { = SUM{wht[j]*pmd[j]*j} }
    for (j = -hrad; j <= hrad; j++)
      { w = wht[j];
        p = pmd[j];
        if (w > 0.0)
          { if (p <= 0.0) { prog_error(("non-positive price")); }
            p = log(p); # Fit is done in log-price space.
            A00 += w;
            A01 += w*j;
            A11 += w*j*j;
            A0p += w*p;
            A1p += w*p*j;
          }
      }
    if (A00 < 0.000001) { return 0.0; }
    # Solve the 2x2 system {A*c = b} by the adjoint of {A00,A01,A11}:
    det = A00*A11 - A01*A01;
    if ((det >= 0 ? det : -det) < 1.0e-6) { return 0.0; }
    M00 = A11; M01 = -A01; M11 = A00;
    c0 = (M00*A0p + M01*A1p)/det;
    c1 = (M01*A0p + M11*A1p)/det;
    # printf "(%8.5f) + (%8.5f)*j\n", c0, c1 > "/dev/stderr";
    # for (j = -hrad; j <= +hrad; j++)
    #   { w = wht[j];
    #     p = pmd[j];
    #     if (w > 0.0)
    #       { p = log(p);
    #         printf "  %+4d %8.5f %8.5f %8.5f\n", j, w, p, c0+c1*j > "/dev/stderr";
    #       }
    #   }
    # Evaluate at {j = 0}:
    return exp(c0);
  }

function write_smoothed_price \
  ( ndays,date_ot,vbt_ot,vcr_ot,pmd_ot, \
    kd,date,time,vbt,vcr,pmd \
  )
  { # Assumes {date_ot[1..ndays]} are the merged dates, in order.
    # Writes to {stdout} one line per day, in the format described in the
    # file header.
    # (Removed a leftover dead assignment {otst = tst} that referenced
    # two undeclared globals, and the unused local {odate}.)
    # Print header:
    printf "# Created by {compute_smoothed_price.gawk}\n"
    printf "\n";
    for (kd = 1; kd <= ndays; kd++)
      { date = date_ot[kd];
        time = "00:00:00";
        printf "%s %s", date, time;
        vbt = vbt_ot[date];
        vcr = vcr_ot[date];
        pmd = pmd_ot[date];
        printf " | %.2f | %.2f | %.4f", vbt, vcr, pmd;
        printf "\n";
      }
    fflush("/dev/stdout");
  }