#! /usr/bin/gawk -f
# Last edited on 2015-02-21 18:15:52 by stolfilocal

BEGIN {
  # Error flag: -1 means "no error so far".  The {*_error} functions set it
  # to the exit status before calling {exit}; the END rule checks it, because
  # in awk an {exit} inside BEGIN or a main rule still runs the END rule.
  abort = -1;

  # Caller must define (with "-v") {wallet}, {iniDate}, {finDate}, {priceFile}, {usdBins}

  # Reads from {stdin} a file of transactions for a wallet, as produced
  # by {cleanup_wallet_data.gawk}.

  # Each input line represents
  # a transfer into or out of a specified wallet {WAL}, or a
  # transfer between addresses of the same wallet.  The
  # fields in each input line are assumed to be
  #
  #   {DAY} {TIME} | {TXID} | {IOINDEX} | {AMOUNT} | {IOWALLET} | {BALANCE}
  #
  # where {TXID} is a transaction ID, {IOINDEX} is a sequential number
  # of an input or output in that transaction (from 0), {AMOUNT}
  # is the amount transferred, {IOWALLET} is the ID of the
  # wallet at the other end of the transfer, and {BALANCE} is
  # the claimed balance in {WAL} after the transaction.

  # The {DAY} and {TIME} need not be in order (out-of-order records
  # only produce a warning).

  # The program also reads file "{priceFile}".  Each line of this file must have
  # the format
  #
  #   {DAY} {TIME} | {PMED}
  #
  # where {DAY} and {TIME} are as above, and {PMED} is the nominal
  # BTC price in USD for the specified date.

  # The output file will have one line for each range of transaction size.
  # The fields are
  #
  #   {IB} {VMIN} {VMID} {VMAX} {NOPS} {TBTC} {TUSD}
  #
  # where {IB} is a bin index, {VMIN} and {VMAX} are the lower and upper bounds of the
  # histogram bin, {VMID} is their geometric mean, {NOPS} is the number of operations
  # in that bin, {TBTC} is the total BTC in that bin (non-negative even for outputs),
  # and {TUSD} is the total in USD.  If {usdBins} is true, the values {VMIN,VMID,VMAX}
  # will be in USD, else in BTC.
  #
  # Most of the bins have the same relative width, with an integer number of
  # bins spanning each power of 10.  The index {IB} spans {-NB..+NB} for some
  # {NB >= 1}.  Negative bins are "send" (output)
  # operations, positive bins are "receive" (input).
  #
  # There is a bin with {IB=VMIN=VMAX=VMID=0} for internal shuffles.
  # On each side of that, there is a bin that collects all positive
  # (resp. negative) values with absolute value less than some
  # small power of 10.  Then there are the regular bins.
  #

  # Needs the wallet name, to check intra-wallet transactions:
  if (wallet == "") { arg_error(("must define {wallet}")); }
  if (iniDate == "") { arg_error(("must define {iniDate}")); }
  if (finDate == "") { arg_error(("must define {finDate}")); }
  if (priceFile == "") { arg_error(("must define {priceFile}")); }
  if (usdBins == "") { arg_error(("must define {usdBins}")); }

  # Make sure that conversions of numbers to strings and vice-versa preserve 8 decimals after point:
  OFMT = "%.8f";
  CONVFMT = "%.8f";

  # Get the nominal prices:
  split("", price_dt);  # Price of 1 BTC in USD, indexed by "{DAY} {TIME}".
  read_price_file(priceFile, price_dt);

  nb = -1;  # Number of bins from each side of zero (-1 if no bins).

  # Bin attributes indexed by bin number {ib}, from {-nb} to {+nb}:
  split("", vmin_ib);
  split("", vmid_ib);
  split("", vmax_ib);
  split("", nops_ib);
  split("", tbtc_ib);
  split("", tusd_ib);

  # Bin parameters:
  vmin_min = (usdBins ? 0.001 : 0.00001);  # The {vmin} of the first regular bin.
  val_eps = 1e-8;                          # Fudge amount: 1 satoshi if BTC, 1 microcent if USD.
  bins_per_10 = 4;                         # Number of bins per decade.

  # Set up the core bins:
  init_bins(0);
  init_bins(1);

  nread = 0;  # Number of data records read from input.
  nused = 0;  # Number of data records in date range.
  ndays = 0;  # Number of distinct days seen in date range.
  split("", nread_day);  # Number of in-range records per day, indexed by "{DAY}".

  odt = "2009-01-03 00:00:00";  # Previous date and time.
}

/^20[01][0-9]-[01][0-9]/ {
  # A transaction record: validate it, classify it, and add it to the histogram.
  nread++;
  if (NF != 12) { data_error(("invalid field count")); }
  # Every odd field from 3 on must be a "|" separator:
  for (j = 3; j <= NF; j += 2) {
    if ($(j) != "|") { data_error(("field " j " is not \"|\"")); }
  }
  dy = check_day(FILENAME,FNR,$1);        # Day, "{yyyy}-{mm}-{dd}".
  tm = check_time(FILENAME,FNR,$2);       # Time of day, "{HH}:{MM}:{SS}" (UTC).
  tx = $4;                                # Transaction ID.
  iotx = check_nat(FILENAME,FNR,$6);      # Input/output index in transaction.
  val_btc = check_amount(FILENAME,FNR,$8); # BTC amount received/sent from/to that input/output.
  iowal = $10;                            # Wallet identifier as per "http://www.walletexplorer.com".
  eb = check_amount(FILENAME,FNR,$12);    # Wallet balance, as per "http://www.walletexplorer.com".

  dt = (dy " " tm);

  # Check chronological order of day + hour (not essential):
  if (dt < odt) { data_warning(("times out of order")); }
  odt = dt;

  # Ignore records outside the requested date range:
  if ((dy < iniDate) || (dy > finDate)) { next; }

  # Check for new date, and count records per day:
  if (! (dy in nread_day)) { ndays++; nread_day[dy] = 0; }
  nread_day[dy]++;

  # Determine the kind of operation by the sign of the value.
  # The "+0" forces a numeric comparison, since {val_btc} is a formatted string:
  if (val_btc+0 > 0) {
    kind = +1;
  } else if (val_btc+0 < 0) {
    kind = -1;
  } else {
    kind = 0;
    if (iowal != wallet) { data_error(("wrong wallet \"" iowal "\" in internal shuffle")); }
  }
  if (kind*val_btc < 0) { prog_error(("wrong {kind} " val_btc " --> " kind)); }

  # Determine amount in USD.
  # NOTE(review): if {dt} has no entry in {price_dt} the price evaluates to 0,
  # so the USD amount is silently 0 -- confirm the price file covers every timestamp.
  val_usd = price_dt[dt] * val_btc;

  # Accumulate histogram:
  accum_histogram(kind,val_btc,val_usd);
  nused++;
  next;
}

// {
  # Any line that did not match the record pattern above:
  data_error(("invalid format"));
}

END {
  # An {exit} in BEGIN or in a rule still runs this END rule, so after an
  # error just propagate the status and do not write any results:
  if (abort >= 0) { exit(abort); }
  printf "read %d records \n", nread > "/dev/stderr";
  printf "found %d records in range %s .. %s\n", nused, iniDate, finDate > "/dev/stderr";
  printf "found %d dates in that range\n", ndays > "/dev/stderr";
  output_histogram();
}

function read_price_file(fname,tbl,  ntbl,nlin,lin,fld,nfld,dy,tm,dt,pr)
{
  # Reads prices from file {fname} and stores them in {tbl}, indexed by
  # "{DAY} {TIME}".  Blank lines and lines starting with "#" are skipped;
  # a trailing "#"-comment after the fourth field is ignored.
  # Aborts on malformed or repeated entries, on read errors, and if the
  # file has no lines at all.
  ntbl = 0;
  nlin = 0;
  while ((getline lin < fname) > 0) {
    nlin++;
    if (! match(lin, /^[ \011]*([\#]|$)/)) {
      nfld = split(lin, fld, " ");
      if ((nfld > 4) && (fld[5] ~ /^[\#]/)) { nfld = 4; }
      if (nfld != 4) { file_error(fname,nlin,("bad price file entry = \"" lin "\"")); }
      dy = check_day(fname,nlin,fld[1]);   # Day, "{yyyy}-{mm}-{dd}".
      tm = check_time(fname,nlin,fld[2]);  # Time of day, "{HH}:{MM}:{SS}" (UTC).
      dt = (dy " " tm);                    # Date and time.
      if (fld[3] != "|") { file_error(fname,nlin,("missing vbar in price file = \"" lin "\"")); }
      # NOTE(review): {check_tbl_amount} is not defined in this file; presumably
      # it is supplied by another source file -- confirm.
      pr = check_tbl_amount(fname,nlin,fld[4]);  # Price at that date and time.
      if (dt in tbl) { file_error(fname,nlin,("repeated date in price file = \"" lin "\"")); }
      tbl[dt] = pr;
      ntbl++;
    }
  }
  if ((ERRNO != "0") && (ERRNO != "")) { file_error(fname, nlin, ERRNO); }
  close (fname);
  if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  printf "loaded %6d prices\n", ntbl > "/dev/stderr"
}

function check_nat(fname,nlin,x)
{
  # Checks that {x} is a string of decimal digits; returns it
  # reformatted with "%d".  Aborts with a file error otherwise.
  if (x !~ /^[0-9]+$/) { file_error(fname,nlin,("invalid nat \"" x "\"")); }
  return sprintf("%d", x + 0);
}

function check_amount(fname,nlin,x)
{
  # Checks that {x} is an optionally signed decimal fraction with digits on
  # both sides of the point.  Returns it as a STRING: "00.0000000000" if the
  # value is zero, else formatted with explicit sign and 8 decimals.
  # Callers must add 0 before comparing the result numerically.
  if (x !~ /^[-+]?[0-9]+[.][0-9]+$/) { file_error(fname,nlin,("invalid amount \"" x "\"")); }
  x = x + 0.0;
  if (x == 0) {
    return "00.0000000000";
  } else {
    return sprintf("%+.8f", x + 0.0);
  }
}

function check_day(fname,nlin,x)
{
  # Checks that {x} has the form "20{yy}-{mm}-{dd}" with year 2000..2019,
  # month 01..12 and day 01..31; returns {x} unchanged.
  if (x !~ /^20[01][0-9]-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$/) {
    file_error(fname,nlin,("invalid day \"" x "\""));
  }
  return x;
}

function check_time(fname,nlin,x)
{
  # Checks that {x} has the form "{HH}:{MM}:{SS}" (seconds may be 60);
  # returns {x} unchanged.
  if (x !~ /^([01][0-9]|2[0-3])[:][0-5][0-9][:]([0-5][0-9]|60)$/) {
    file_error(fname,nlin,("invalid time of day \"" x "\""));
  }
  return x;
}

function init_bins(ib,  vmin,vmid,vmax,kind)
{
  # Initializes bins {nb+1..|ib|} and sets {nb} to {|ib|}.
  # Also initializes bins {-|ib|..-nb-1} with the symmetric values.
  # With {ib == 0}, initializes only the central bin; this must be
  # the first call (while {nb} is still -1).
  if (ib == 0) {
    if (nb != -1) { prog_error(("init bin 0 at wrong time")); }
    vmin_ib[ib] = 0.0;
    vmid_ib[ib] = 0.0;
    vmax_ib[ib] = 0.0;
    nops_ib[ib] = 0;
    tbtc_ib[ib] = 0.0;
    tusd_ib[ib] = 0.0;
    nb = 0;
  } else {
    ib = (ib < 0 ? -ib : ib);  # Discard sign of {ib}.
    if (ib <= nb) { prog_error(("init bin " ib " at wrong time")); }
    while (nb < ib) {
      nb++;
      vmin = compute_vmin(nb);
      vmax = compute_vmax(nb);
      vmid = sqrt(vmin*vmax);
      # The negative bin mirrors the positive one, with bounds swapped:
      vmin_ib[+nb] = +vmin;
      vmax_ib[+nb] = +vmax;
      vmin_ib[-nb] = -vmax;
      vmax_ib[-nb] = -vmin;
      for (kind = -1; kind <= +1; kind += 2) {
        vmid_ib[kind*nb] = kind*vmid;
        nops_ib[kind*nb] = 0;
        tbtc_ib[kind*nb] = 0.0;
        tusd_ib[kind*nb] = 0.0;
      }
    }
  }
}

function compute_vmin(ib,  kb,vmin)
{
  # Computes the {vmin} of bin {ib} (positive).
  # Uses global parameters {vmin_min,bins_per_10,val_eps}.
  if (ib == 1) {
    # Bin of small values starts at the fudge amount:
    return val_eps;
  } else {
    kb = ib - 2;  # Index among the regular bins, from 0.
    vmin = exp(log(vmin_min) + log(10)*kb/bins_per_10);
    if ((vmin >= 0.9999) && ((kb % bins_per_10) == 0)) {
      # Make sure it is an integer:
      vmin = int(vmin + 0.5);
    }
    return vmin;
  }
}

function compute_vmax(ib)
{
  # Computes the {vmax} of bin {ib} (positive): just below the
  # {vmin} of the next bin.
  # Uses global parameters {vmin_min,bins_per_10,val_eps}.
  if (ib == 0) {
    return 0.0;
  } else {
    return compute_vmin(ib+1) - 0.5*val_eps;
  }
}

function accum_histogram(kind,val_btc,val_usd,  val,ib)
{
  # Accumulates an operation of the given
  # {kind} and signed amounts {val_btc,val_usd}, in the appropriate bin.
  # Uses global parameters {vmin_min,bins_per_10,nb,val_eps,usdBins}, updates {nb}.

  # The "+ 0" forces {val} to be a number: {val_btc} may be a formatted
  # string, and awk would then compare it to the bin bounds as a string.
  val = (usdBins ? val_usd : val_btc) + 0;
  ib = compute_bin_index(kind, val);
  # Extend the bin tables if this bin does not exist yet:
  if ((ib < -nb) || (ib > nb)) { init_bins(ib); }
  # Paranoia: the chosen value must lie within the bin's range:
  if ((val < vmin_ib[ib] - 0.5*val_eps) || (val > vmax_ib[ib] + 0.5*val_eps)) {
    printf "  kind = %d amount = %+18.8f\n", kind, val_btc > "/dev/stderr";
    printf "  ib = %d nb = %d\n", ib, nb > "/dev/stderr";
    printf "  range = %+18.8f (%+18.8f) %+18.8f\n", vmin_ib[ib], vmid_ib[ib], vmax_ib[ib] > "/dev/stderr";
    prog_error(("bad bin index"));
  }
  nops_ib[ib] += 1;
  # Multiplying by {kind} makes the totals non-negative:
  tbtc_ib[ib] += kind*val_btc;
  tusd_ib[ib] += kind*val_usd;
}

function compute_bin_index(kind,val,  aval,kb,ib)
{
  # Computes the bin index for an operation of the given {kind} and signed amount {val}
  # (BTC or USD, as desired).  Uses global parameters {vmin_min,bins_per_10,val_eps}.
  if (kind == 0) {
    if (val+0 != 0) { prog_error(("non-zero shuffle amount")); }
    ib = 0;
  } else {
    aval = kind*val;  # Absolute value of {val}.
    if (aval <= 0) { prog_error(("non-positive op amount")); }
    if (aval < vmin_min) {
      # Bin of small nonzero values:
      ib = 1;
    } else {
      kb = int(bins_per_10*(log(aval + 0.5*val_eps) - log(vmin_min))/log(10));
      if (kb < 0) { prog_error(("negative {kb}")); }
      ib = kb + 2;
      # Correct for rounding errors:
      while ((ib >= 2) && (aval < compute_vmin(ib))) { ib--; }
      while (aval > compute_vmax(ib)) { ib++; }
    }
  }
  return kind*ib;
}

function output_histogram(  ib)
{
  # Writes to standard output one line per bin, from {-nb} to {+nb}.
  for (ib = -nb; ib <= +nb; ib++) {
    printf "%4d", ib;
    printf " %+18.8f %+18.8f %+18.8f", vmin_ib[ib], vmid_ib[ib], vmax_ib[ib];
    printf " %8d %+18.8f %+18.8f", nops_ib[ib], tbtc_ib[ib], tusd_ib[ib];
    printf "\n";
  }
}

function file_error(fname,nlin,msg)
{
  # Error in file "{fname}" line {nlin}: reports it and aborts with status 1.
  printf "%s:%s: ** %s\n", fname, nlin, msg > "/dev/stderr";
  abort = 1;
  exit(abort);
}

function data_error(msg)
{
  # Error in input data file: reports it with the current record and aborts with status 1.
  printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "  %s\n", $0 > "/dev/stderr";
  abort = 1;
  exit(abort);
}

function data_warning(msg)
{
  # Warning in input data file: reports it with the current record and continues.
  printf "%s:%s: !! %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "  %s\n", $0 > "/dev/stderr";
}

function arg_error(msg)
{
  # Error in command-line args: reports it and aborts with status 1.
  printf "** %s\n", msg > "/dev/stderr";
  abort = 1;
  exit(abort);
}

function prog_error(msg)
{
  # Internal inconsistency: reports it and aborts with status 1.
  printf "** PROGRAM ERROR: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit(abort);
}