#! /usr/bin/gawk -f
# Last edited on 2020-11-17 22:57:25 by jstolfi

# Reads a data files with volumes and prices at fixed intervals.
# Checks the consistency of the prices and recomputes the mean price.
# Writes out a file in the same format but with 5 decimals for the prices.
# 
# The user must define (with {export}) the environment variable {TZ="UTC"},
# load (with "-f") the library "useful_functions.gawk", and define with "-v"
# the variables
#
#   {dt_step} Spacing of intervals (in seconds).
#

BEGIN \
  { 
    if (ENVIRON["TZ"] != "UTC") { arg_error(("must set TZ to 'UTC'")); }
    if (dt_step == "") { arg_error(("must define {dt_step}")); }
    
    nints = 0;            # Number of data lines.
    
    # Precision of fields in input (note: sometimes the precision is greater than this):
    ulp_vbt = 0.01;   # Unit in the last place of input {vbt}
    ulp_vcr = 0.01;   # Unit in the last place of input {vcr}
    ulp_pav = 0.01;   # Unit in the last place of input average price {pav}
    ulp_phl = 0.01;   # Unit in the last place of high, low, open and close prices.
    
    # Precision of values on output (check output formats):
    ulp_vbt_out = 0.0001;   # Unit in the last place of output {vbt}
    ulp_vcr_out = 0.0001;   # Unit in the last place of output {vcr}
    ulp_pav_out = 0.00001;  # Unit in the last place of output average price {pav}
    ulp_phl_out = 0.00001;  # Unit in the last place of high, low, open and close prices
    
    # Minimum volume to keep the line
    min_vbt = 0.001; # Minimum BTC volume.
    min_vcr = 0.001; # Minimum currency volume.
    
    lost_vbt = 0; # Total BTC volume discarded.
    lost_vcr = 0; # Total currency volume discarded.
    
    # Checks for repeated date:
    split("", seen_dt); # Value is 1 if datetime {dt} was seen before.
    
    # Datetime range in file:
    dt_ini = "???";
    dt_fin = "???";
    
    odt = ""; # Previous datetime.

  }

// \
  { # Remove tabs:
    gsub(/[\011]/, " ", $0);
  }
  
# Preserve comments and blank lines:
/^[ ]*([#]|$)/ \
  { print; 
    next;
  }

// \
  { # Remove inline comments, spurious blanks
    gsub(/[#].*$/, "", $0);
    gsub(/^[ ]+/, "", $0); 
    gsub(/[ ]+$/, "", $0); 
    gsub(/[ ][ ]+/, " ", $0); 
  }
  
# Preserve table headers:
/[!]/ \
  { print; 
    next;
  }
    
# Data lines:
/^20/ \
  { 
    if (NF != 16) { data_error(("wrong field count = " NF "")); }
    for (j = 3; j <= NF; j = j + 2)
      { if ($(j) != "|") { data_error(("missing '|' in column " j "")); } }

    # Get the date and time fields:
    dy = usf_check_date(FILENAME,FNR,$1); # Day.
    tm = usf_check_time(FILENAME,FNR,$2); # Time of day.
    dt = (dy " " tm);
    
    # Date consistency checks:
    if (dt in seen_dt)  { data_error(("repeated datetime = \"" dt "\"")); }
    seen_dt[dt] = 1;
    if ((odt != "") && (! usf_datetimes_are_consecutive(odt,dt,dt_step)))
      { file_error(FILENAME,FNR, ("non-consecutive datetimess \"" odt "\" \"" dt "\"")); }
    odt = dt;

    # Get value fields:
    pop = usf_check_num(FILENAME,FNR, $4);  # Opening price.
    phi = usf_check_num(FILENAME,FNR, $6);  # High price.
    plo = usf_check_num(FILENAME,FNR, $8);  # Low price.
    pcl = usf_check_num(FILENAME,FNR, $10); # Closing price.
    vbt = usf_check_num(FILENAME,FNR, $12); # Trade volume in BTC.
    vcr = usf_check_num(FILENAME,FNR, $14); # Trade volume in ref currency.
    pav = usf_check_num(FILENAME,FNR, $16); # Raw mean price.
    
    # Adjust {vbt} and {vcr} to match {pav}.
    if (pav != 0)
      { 
        if ((vbt == 0) && (vcr == 0))
          { # There is no volume at all, clear the line:
            printf "%s:%s: !! no volume, clearin all prices", FILENAME, FNR > "/dev/stderr";
            printf " %.5f [%.5f _ %.5f] %.5f %.5f\n", pop, plo, phi, pcl, pav > "/dev/stderr";
            pop = 0;
            phi = 0;
            plo = 0;
            pcl = 0;
            pav = 0;
          }
        else
          { # Make sure that the other prices are non-zero:
            usf_check_non_null_value(FILENAME,FNR, pop, "opening price");
            usf_check_non_null_value(FILENAME,FNR, phi, "high price");
            usf_check_non_null_value(FILENAME,FNR, plo, "low price");
            usf_check_non_null_value(FILENAME,FNR, pcl, "closing price");

            # Check that opening and closing prices are strictly in the range {plo,phi}:
            usf_check_price_range(FILENAME,FNR, pop, plo,phi, 0.0, "opening price");
            usf_check_price_range(FILENAME,FNR, pcl, plo,phi, 0.0, "closing price");

            # The {pav} field often has smaller input precision than {phi,plo}
            # Check if average price is in the range, except for rounding:
            usf_check_price_range(FILENAME,FNR, pav, plo,phi, max_price(ulp_pav,ulp_phl), "nominal average price");

            # However, adjust {pav} to be in the range too:
            if ((pav < plo) || (pav > phi))
              { # Adjust {pav} to {plo,phi}: 
                printf "%s:%s: !! clipping nominal average price", FILENAME, FNR > "/dev/stderr";
                printf " %.5f to [%.5f _ %.5f]", pav, plo, phi > "/dev/stderr";
                # Compute the interval {[ploc _ phic]}where {pav} may have been before rounding:
                tol = 0.50001*ulp_pav;
                ploc = max_price(plo, pav - tol);
                phic = min_price(phi, pav + tol);
                # If that interval is not empty, take the midpoint:
                if (ploc <= phic) { pav = (ploc + phic)/2; }
                # In any case, clip to {[plo _ phi]}:
                pav = min_price(phi, max_price(plo, pav));
                printf " result is %.5f\n", pav > "/dev/stderr";
              }

            # ----------------------------------------------------------------------
            # ADJUSTMENT OF {vbt,vcr,pav}
            # Need to adjust {vbt,vcr,pav} because the
            # input precision was often insufficient:

            # Convert {vbt,vcr} to log scale distributions 
            vbt_log_avg = estimate_log_average(vbt, ulp_vbt);
            vbt_log_dev = estimate_log_deviation(vbt, ulp_vbt);

            vcr_log_avg = estimate_log_average(vcr, ulp_vcr);
            vcr_log_dev = estimate_log_deviation(vcr, ulp_vcr);

            pav_log_avg = estimate_log_average(pav, ulp_pav);
            pav_log_dev = estimate_log_deviation(pav, ulp_pav);

            # Let {VBT,VCR,PAV} be normal deviates with means {vbt_log_avg,vcr_log_avg,pav_log_avg}
            # and deviations {vbt_log_dev,vcr_log_dev,pav_log_dev}.  We need to find
            # {VBT*,VCR*,PAV*} such that {VCR* = VBT* + PAV*}. These triples have a 
            # Normal distribution on the plane os that equation so we need to find the 
            # maximum of that distribution.

            # Transform the scales of the coordinate axes so that {VBT,VCR,PAV}
            # become {U,V,W} with unit deviation:
            U_avg = vbt_log_avg/vbt_log_dev;
            V_avg = vcr_log_avg/vcr_log_dev;
            W_avg = pav_log_avg/pav_log_dev;
            
            # The equation {VCR = VBT + PAV} becomes {cU*U + cV*V + cW*W = 0}:
            cU = +vbt_log_dev;
            cV = -vcr_log_dev;
            cW = +pav_log_dev;

            # Now project {U_avg,V_avg,W_avg} onto that plane:
            dd = (cU*U_avg + cV*V_avg + cW*W_avg)/(cU*cU + cV*cV + cW*cW);
            U_sol = U_avg - dd*cU;
            V_sol = V_avg - dd*cV;
            W_sol = W_avg - dd*cW;
            
            # printf "  dd: (%.6f --> %.6f)\n", dd, cU*U_sol + cV*V_sol + cW*W_sol > "/dev/stderr";

            # Convert back to original linear scale:
            vbt_new = estimate_lin_average(U_sol*vbt_log_dev, ulp_vbt);
            vcr_new = estimate_lin_average(V_sol*vcr_log_dev, ulp_vcr);
            pav_new = estimate_lin_average(W_sol*pav_log_dev, ulp_pav);

            # printf "  new: (%.4f --> %.4f) (%.4f --> %.4f) (%.5f --> %.5f)", vbt, vbt_new, vcr, vcr_new, pav, pav_new > "/dev/stderr";
            # printf "  (%.5f --> %.5f)\n", pav - vcr/vbt, pav_new - vcr_new/vbt_new > "/dev/stderr";
            
            # Check that {pav_new} is in the interval {phi,plo} with some tolerance:
            usf_check_price_range(FILENAME,FNR, pav_new, plo,phi, 1.5*ulp_phl, "computed average price");

            # Check if the differences can be explained by rounding:
            vbt_max = (vcr_new + 0.50001*ulp_vcr)/pav_new;
            vbt_min = (vcr_new - 0.50001*ulp_vcr)/(pav_new + ulp_pav);
            ulp_vbt_xxx = max_price(2*ulp_vbt, vbt_max - vbt_min);
            usf_check_rounding(FILENAME,FNR, vbt_new, vbt, ulp_vbt_xxx, "BTC volume");
            usf_check_rounding(FILENAME,FNR, vcr_new, vcr, 2*ulp_vcr, "currency volume");
            usf_check_rounding(FILENAME,FNR, pav_new, pav, 2*ulp_pav, "average price");
            
            # Round to the proper number of digits:
            vbt_new =  usf_round_value(vbt_new, ulp_vbt_out);
            vcr_new =  usf_round_value(vcr_new, ulp_vcr_out);
            pav_new =  usf_round_value(pav_new, ulp_pav_out);

            # Accumulate the changes:
            lost_vbt += (vbt - vbt_new);
            lost_vcr += (vcr - vcr_new);

            # Update the values:
            vbt = vbt_new;
            vcr = vcr_new;
            pav = pav_new;
            
            # Round the other prices too:
            pop = usf_round_value(pop, ulp_phl_out);
            phi = usf_round_value(phi, ulp_phl_out);
            plo = usf_round_value(plo, ulp_phl_out);
            pcl = usf_round_value(pcl, ulp_phl_out);

            # Make sure that the range {plo,phi} includes the new average price:
            phi = max_price(max_price(max_price(phi,pav),pop),pcl);
            plo = min_price(min_price(min_price(plo,pav),pop),pcl);

            # ----------------------------------------------------------------------
          }
      }
      
    # Final consistency check:
    usf_check_prices( \
      FILENAME,FNR, pop,phi,plo,pcl,vbt,vcr,pav,\
      ulp_phl_out,ulp_vbt_out,ulp_vcr_out,ulp_pav_out \
    );

    # Print result (beware, output precision should match {ulp_vbt_out} etc.:
    printf "%s %s | %12.5f | %12.5f | %12.5f | %12.5f", dy, tm, pop, phi, plo, pcl;
    printf " | %16.4f | %16.4f | %12.5f", vbt, vcr, pav;
    printf "\n";

    # Update the date/time range:
    if (nints == 0) { dt_ini = dt; }
    dt_fin = dt;

    nints++;
    next;
  }
  
// \
  { 
    data_error(("invalid line format"));
  }

END \
  { 
    printf "found %d intervals, from %s to %s\n", nints, dt_ini, dt_fin > "/dev/stderr";
    if ((lost_vbt > 0) || (lost_vcr > 0)) 
      { 
        printf "discarded %.8f BTC and %.6f currency\n", lost_vbt, lost_vcr > "/dev/stderr";
      }
    exit(0);
  } 

function estimate_log_average(val,ulp)
  { 
    # The nominal expected value of {log(X)} knowing that
    # the printed value of {X} is {val} and the unit in the last digit is {ulp}.
    if (val < 0.5*ulp) { val = 0.5*ulp; }
    return log(val);
  }

function estimate_lin_average(val,ulp)
  { 
    # The inverse of {estimate_log_average(val,ulp)}
    # but does not return zero:
    return max_price(exp(val), 1.0e-8);
  }

function estimate_log_deviation(val,ulp)
  { 
    # The nominal standard deviation of {log(X)} 
    # the printed value of {X} is {val} and the unit in the last digit is {ulp}.
    if (val < 0.5*ulp) { val = 0.5*ulp; }
    return log(val + 0.5*ulp) - log(val);
  }
   
function min_price(x,y)
  { # Min of {x,y}, ignoring undefined:
    if (x+0 == 0)
      { return y; }
    else if (y+0 == 0)
      { return x; }
    else
      { return (x+0 < y+0 ? x : y); }
  }
          
function max_price(x,y)
  { return (x+0 > y+0 ? x : y); }