#! /usr/bin/gawk -f # Last edited on 2020-11-17 22:57:25 by jstolfi # Reads a data files with volumes and prices at fixed intervals. # Checks the consistency of the prices and recomputes the mean price. # Writes out a file in the same format but with 5 decimals for the prices. # # The user must define (with {export}) the environment variable {TZ="UTC"}, # load (with "-f") the library "useful_functions.gawk", and define with "-v" # the variables # # {dt_step} Spacing of intervals (in seconds). # BEGIN \ { if (ENVIRON["TZ"] != "UTC") { arg_error(("must set TZ to 'UTC'")); } if (dt_step == "") { arg_error(("must define {dt_step}")); } nints = 0; # Number of data lines. # Precision of fields in input (note: sometimes the precision is greater than this): ulp_vbt = 0.01; # Unit in the last place of input {vbt} ulp_vcr = 0.01; # Unit in the last place of input {vcr} ulp_pav = 0.01; # Unit in the last place of input average price {pav} ulp_phl = 0.01; # Unit in the last place of high, low, open and close prices. # Precision of values on output (check output formats): ulp_vbt_out = 0.0001; # Unit in the last place of output {vbt} ulp_vcr_out = 0.0001; # Unit in the last place of output {vcr} ulp_pav_out = 0.00001; # Unit in the last place of output average price {pav} ulp_phl_out = 0.00001; # Unit in the last place of high, low, open and close prices # Minimum volume to keep the line min_vbt = 0.001; # Minimum BTC volume. min_vcr = 0.001; # Minimum currency volume. lost_vbt = 0; # Total BTC volume discarded. lost_vcr = 0; # Total currency volume discarded. # Checks for repeated date: split("", seen_dt); # Value is 1 if datetime {dt} was seen before. # Datetime range in file: dt_ini = "???"; dt_fin = "???"; odt = ""; # Previous datetime. } // \ { # Remove tabs: gsub(/[\011]/, " ", $0); } # Preserve comments and blank lines: /^[ ]*([#]|$)/ \ { print; next; } // \ { # Remove inline comments, spurious blanks gsub(/[#].*$/, "", $0); gsub(/^[ ]+/, "", $0); gsub(/[ ]+$/, "", $0); gsub(/[ ][ ]+/, " ", $0); } # Preserve table headers: /[!]/ \ { print; next; } # Data lines: /^20/ \ { if (NF != 16) { data_error(("wrong field count = " NF "")); } for (j = 3; j <= NF; j = j + 2) { if ($(j) != "|") { data_error(("missing '|' in column " j "")); } } # Get the date and time fields: dy = usf_check_date(FILENAME,FNR,$1); # Day. tm = usf_check_time(FILENAME,FNR,$2); # Time of day. dt = (dy " " tm); # Date consistency checks: if (dt in seen_dt) { data_error(("repeated datetime = \"" dt "\"")); } seen_dt[dt] = 1; if ((odt != "") && (! usf_datetimes_are_consecutive(odt,dt,dt_step))) { file_error(FILENAME,FNR, ("non-consecutive datetimess \"" odt "\" \"" dt "\"")); } odt = dt; # Get value fields: pop = usf_check_num(FILENAME,FNR, $4); # Opening price. phi = usf_check_num(FILENAME,FNR, $6); # High price. plo = usf_check_num(FILENAME,FNR, $8); # Low price. pcl = usf_check_num(FILENAME,FNR, $10); # Closing price. vbt = usf_check_num(FILENAME,FNR, $12); # Trade volume in BTC. vcr = usf_check_num(FILENAME,FNR, $14); # Trade volume in ref currency. pav = usf_check_num(FILENAME,FNR, $16); # Raw mean price. # Adjust {vbt} and {vcr} to match {pav}. if (pav != 0) { if ((vbt == 0) && (vcr == 0)) { # There is no volume at all, clear the line: printf "%s:%s: !! no volume, clearin all prices", FILENAME, FNR > "/dev/stderr"; printf " %.5f [%.5f _ %.5f] %.5f %.5f\n", pop, plo, phi, pcl, pav > "/dev/stderr"; pop = 0; phi = 0; plo = 0; pcl = 0; pav = 0; } else { # Make sure that the other prices are non-zero: usf_check_non_null_value(FILENAME,FNR, pop, "opening price"); usf_check_non_null_value(FILENAME,FNR, phi, "high price"); usf_check_non_null_value(FILENAME,FNR, plo, "low price"); usf_check_non_null_value(FILENAME,FNR, pcl, "closing price"); # Check that opening and closing prices are strictly in the range {plo,phi}: usf_check_price_range(FILENAME,FNR, pop, plo,phi, 0.0, "opening price"); usf_check_price_range(FILENAME,FNR, pcl, plo,phi, 0.0, "closing price"); # The {pav} field often has smaller input precision than {phi,plo} # Check if average price is in the range, except for rounding: usf_check_price_range(FILENAME,FNR, pav, plo,phi, max_price(ulp_pav,ulp_phl), "nominal average price"); # However, adjust {pav} to be in the range too: if ((pav < plo) || (pav > phi)) { # Adjust {pav} to {plo,phi}: printf "%s:%s: !! clipping nominal average price", FILENAME, FNR > "/dev/stderr"; printf " %.5f to [%.5f _ %.5f]", pav, plo, phi > "/dev/stderr"; # Compute the interval {[ploc _ phic]}where {pav} may have been before rounding: tol = 0.50001*ulp_pav; ploc = max_price(plo, pav - tol); phic = min_price(phi, pav + tol); # If that interval is not empty, take the midpoint: if (ploc <= phic) { pav = (ploc + phic)/2; } # In any case, clip to {[plo _ phi]}: pav = min_price(phi, max_price(plo, pav)); printf " result is %.5f\n", pav > "/dev/stderr"; } # ---------------------------------------------------------------------- # ADJUSTMENT OF {vbt,vcr,pav} # Need to adjust {vbt,vcr,pav} because the # input precision was often insufficient: # Convert {vbt,vcr} to log scale distributions vbt_log_avg = estimate_log_average(vbt, ulp_vbt); vbt_log_dev = estimate_log_deviation(vbt, ulp_vbt); vcr_log_avg = estimate_log_average(vcr, ulp_vcr); vcr_log_dev = estimate_log_deviation(vcr, ulp_vcr); pav_log_avg = estimate_log_average(pav, ulp_pav); pav_log_dev = estimate_log_deviation(pav, ulp_pav); # Let {VBT,VCR,PAV} be normal deviates with means {vbt_log_avg,vcr_log_avg,pav_log_avg} # and deviations {vbt_log_dev,vcr_log_dev,pav_log_dev}. We need to find # {VBT*,VCR*,PAV*} such that {VCR* = VBT* + PAV*}. These triples have a # Normal distribution on the plane os that equation so we need to find the # maximum of that distribution. # Transform the scales of the coordinate axes so that {VBT,VCR,PAV} # become {U,V,W} with unit deviation: U_avg = vbt_log_avg/vbt_log_dev; V_avg = vcr_log_avg/vcr_log_dev; W_avg = pav_log_avg/pav_log_dev; # The equation {VCR = VBT + PAV} becomes {cU*U + cV*V + cW*W = 0}: cU = +vbt_log_dev; cV = -vcr_log_dev; cW = +pav_log_dev; # Now project {U_avg,V_avg,W_avg} onto that plane: dd = (cU*U_avg + cV*V_avg + cW*W_avg)/(cU*cU + cV*cV + cW*cW); U_sol = U_avg - dd*cU; V_sol = V_avg - dd*cV; W_sol = W_avg - dd*cW; # printf " dd: (%.6f --> %.6f)\n", dd, cU*U_sol + cV*V_sol + cW*W_sol > "/dev/stderr"; # Convert back to original linear scale: vbt_new = estimate_lin_average(U_sol*vbt_log_dev, ulp_vbt); vcr_new = estimate_lin_average(V_sol*vcr_log_dev, ulp_vcr); pav_new = estimate_lin_average(W_sol*pav_log_dev, ulp_pav); # printf " new: (%.4f --> %.4f) (%.4f --> %.4f) (%.5f --> %.5f)", vbt, vbt_new, vcr, vcr_new, pav, pav_new > "/dev/stderr"; # printf " (%.5f --> %.5f)\n", pav - vcr/vbt, pav_new - vcr_new/vbt_new > "/dev/stderr"; # Check that {pav_new} is in the interval {phi,plo} with some tolerance: usf_check_price_range(FILENAME,FNR, pav_new, plo,phi, 1.5*ulp_phl, "computed average price"); # Check if the differences can be explained by rounding: vbt_max = (vcr_new + 0.50001*ulp_vcr)/pav_new; vbt_min = (vcr_new - 0.50001*ulp_vcr)/(pav_new + ulp_pav); ulp_vbt_xxx = max_price(2*ulp_vbt, vbt_max - vbt_min); usf_check_rounding(FILENAME,FNR, vbt_new, vbt, ulp_vbt_xxx, "BTC volume"); usf_check_rounding(FILENAME,FNR, vcr_new, vcr, 2*ulp_vcr, "currency volume"); usf_check_rounding(FILENAME,FNR, pav_new, pav, 2*ulp_pav, "average price"); # Round to the proper number of digits: vbt_new = usf_round_value(vbt_new, ulp_vbt_out); vcr_new = usf_round_value(vcr_new, ulp_vcr_out); pav_new = usf_round_value(pav_new, ulp_pav_out); # Accumulate the changes: lost_vbt += (vbt - vbt_new); lost_vcr += (vcr - vcr_new); # Update the values: vbt = vbt_new; vcr = vcr_new; pav = pav_new; # Round the other prices too: pop = usf_round_value(pop, ulp_phl_out); phi = usf_round_value(phi, ulp_phl_out); plo = usf_round_value(plo, ulp_phl_out); pcl = usf_round_value(pcl, ulp_phl_out); # Make sure that the range {plo,phi} includes the new average price: phi = max_price(max_price(max_price(phi,pav),pop),pcl); plo = min_price(min_price(min_price(plo,pav),pop),pcl); # ---------------------------------------------------------------------- } } # Final consistency check: usf_check_prices( \ FILENAME,FNR, pop,phi,plo,pcl,vbt,vcr,pav,\ ulp_phl_out,ulp_vbt_out,ulp_vcr_out,ulp_pav_out \ ); # Print result (beware, output precision should match {ulp_vbt_out} etc.: printf "%s %s | %12.5f | %12.5f | %12.5f | %12.5f", dy, tm, pop, phi, plo, pcl; printf " | %16.4f | %16.4f | %12.5f", vbt, vcr, pav; printf "\n"; # Update the date/time range: if (nints == 0) { dt_ini = dt; } dt_fin = dt; nints++; next; } // \ { data_error(("invalid line format")); } END \ { printf "found %d intervals, from %s to %s\n", nints, dt_ini, dt_fin > "/dev/stderr"; if ((lost_vbt > 0) || (lost_vcr > 0)) { printf "discarded %.8f BTC and %.6f currency\n", lost_vbt, lost_vcr > "/dev/stderr"; } exit(0); } function estimate_log_average(val,ulp) { # The nominal expected value of {log(X)} knowing that # the printed value of {X} is {val} and the unit in the last digit is {ulp}. if (val < 0.5*ulp) { val = 0.5*ulp; } return log(val); } function estimate_lin_average(val,ulp) { # The inverse of {estimate_log_average(val,ulp)} # but does not return zero: return max_price(exp(val), 1.0e-8); } function estimate_log_deviation(val,ulp) { # The nominal standard deviation of {log(X)} # the printed value of {X} is {val} and the unit in the last digit is {ulp}. if (val < 0.5*ulp) { val = 0.5*ulp; } return log(val + 0.5*ulp) - log(val); } function min_price(x,y) { # Min of {x,y}, ignoring undefined: if (x+0 == 0) { return y; } else if (y+0 == 0) { return x; } else { return (x+0 < y+0 ? x : y); } } function max_price(x,y) { return (x+0 > y+0 ? x : y); }