#! /usr/bin/gawk -f
# Last edited on 2015-02-24 14:59:38 by stolfilocal

# Reads a daily price/volume series file. Writes to {stdout} a file 
# with smoothed daily mean prices.
#
# The user must define (with {export}) the environment variable {TZ="UTC"}
# The user must load (with "-f") the library {merged_price_functions.gawk}, and 
# define (with "-v") the variables
#
#   {inFile} name of input series file.
#   {hrad}, the half-width of the smoothing window.
# 
# The input file {inFile} must contain daily price and volume data for some exchange 
# in some currency. Each input line must have the format
# "{DATE} {TIME} | {OPEN} | {HIGH} | {LOW} | {CLOSE} | {VBT} | {VCR} | {WTPRICE}"
# where {DATE} is the line's date (UTC), {TIME} must be "00:00:00", 
# {VBT} is the total btc volume traded in that day, and {VCR} the total 
# volume in the exchange currency.  The other fields are ignored, except for
# validation.  The {DATE}s must be consecutive days; days with 
# missing data must be present and have {VBT} and {VCR} both zero.
#
# Writes to standard output one line per day, with sequence number {i}, in the format
# "{DATE} {TIME} | {VBT[i]} | {VCR[i]} | {PMD[i]}", where
#
#    {DATE} is every day in the input file;
#    {TIME} is always "00:00:00";
#    {VBT[i]} and {VCR[i]} are the input daily volumes on the {DATE};
#    {PMD[i]} is a smoothed average of the price {VCR[j]/VBT[j]} in a window of several
#       days around the {DATE}.

BEGIN \
  { 
    # Validate the "-v" variables the user must supply:
    if (inFile == "") { arg_error(("must define {inFile}")); }

    if (hrad == "") { arg_error(("must define {hrad}")); }
    if (hrad !~ /^[0-9]+$/) { arg_error(("invalid {hrad}")); }

    pi = atan2(0, -1); # Machine-precision {pi} (was a truncated literal 3.1415926).

    ulp_vbt = 0.0001;  # Unit in last place of input BTC daily volume {vbt}.
    ulp_vcr = 0.0001;  # Unit in last place of currency daily volume {vcr}.
    ulp_pav = 0.00001; # Unit in the last place of input mean price {pav}.
    ulp_phl = 0.00001; # Unit in the last place of input {plo,phi}.

    # Series data tables:
    split("", date_ot);   # Set to "1" for dates that exist, undef otherwise, indexed by {[DATE]}.
    split("", vbt_ot);    # BTC volume, indexed with {[DATE]}.
    split("", vcr_ot);    # Currency volume, indexed with {[DATE]}.
    split("", pmd_ot);    # Average price in window around date, indexed with {[DATE]}.
    
    # Read the series and store in series data tables:
    read_daily_summary_file(\
      inFile, \
      date_ot,vbt_ot,vcr_ot \
    );

    # Sort lines by date.  Single-argument {asorti} replaces the contents of
    # {date_ot} with its former indices (the dates), sorted, indexed {1..ndays}:
    ndays = asorti(date_ot);
    compute_local_averages(inFile,ndays,hrad,date_ot,vbt_ot,vcr_ot,pmd_ot);
    write_smoothed_price(ndays,date_ot,vbt_ot,vcr_ot,pmd_ot);
    exit(0);
  }

function read_daily_summary_file \
  ( fname, \
    date_ot,vbt_ot,vcr_ot,  \
    nlin,lin,ndays,fld,nfld,date,time,pop,phi,plo,pcl,vbt,vcr,pav,j,odate \
  )
  {
    # Reads from file {fname} the trade summary data in 1 day intervals.
    # Sets {date_ot[dt]} to 1, and stores the BTC and currency volumes in {vbt_ot[dt],vcr_ot[dt]},
    # for each date {dt} present in the file.
    # Days with no data must have a line with the corresponding date and zero volumes.

    printf "reading file %s ...\n", fname > "/dev/stderr";
    ERRNO = "";
    
    # Read the file:
    nlin = 0;   # Number of lines read.
    ndays = 0;  # Number of non-blank, non-header, non-comment lines.
    odate = ""; # Date on previous data line.
    while((getline lin < fname) > 0) { 
      nlin++;
      # Remove tabs, inline comments, spurious blanks
      gsub(/[\011]/, " ", lin);
      gsub(/[\#].*$/, "", lin);
      gsub(/^[ ]+/, "", lin); 
      gsub(/[ ]+$/, "", lin); 
      gsub(/[ ][ ]+/, " ", lin); 
      if ((lin != "") && (! match(lin, /[!]/)))
        { nfld = split(lin, fld, " ");
          if (nfld != 16) 
            { file_error(fname, nlin, ("bad summary entry = \"" lin "\"")); }
          # Every odd field from 3 on must be the '|' separator.
          # Note: must use {nfld} here, not {NF} -- the line was read with
          # {getline} into a variable, so {NF} is not set from it (it would
          # be 0, silently skipping this validation).
          for (j = 3; j <= nfld; j = j + 2)
            { if (fld[j] != "|") { file_error(fname, nlin, ("missing '|' in column " j ", line = \"" lin "\"")); } }
          date = mpf_check_date(fname,nlin,fld[1]);
          if (date in date_ot) { file_error(fname, nlin, ("repeated date = \"" date "\"")); }
          time = fld[2];
          if (time != "00:00:00") { file_error(fname, nlin, ("invalid time = \"" time "\"")); }
          pop = mpf_check_num(fname, nlin, fld[4]);
          phi = mpf_check_num(fname, nlin, fld[6]);
          plo = mpf_check_num(fname, nlin, fld[8]);
          pcl = mpf_check_num(fname, nlin, fld[10]);
          vbt = mpf_check_num(fname, nlin, fld[12]);
          vcr = mpf_check_num(fname, nlin, fld[14]);
          pav = mpf_check_num(fname, nlin, fld[16]);
          if ((odate != "") && (! mpf_dates_are_consecutive(odate,date)))
            { file_error(fname,nlin, ("non-consecutive dates \"" odate "\" \"" date "\"")); }
          odate = date;
          
          # Consistency checks:
          usf_check_prices(fname,nlin, pop,plo,phi,pcl,vbt,vcr,pav, ulp_phl,ulp_vbt,ulp_vcr,ulp_pav);
          
          # Save in arrays:
          date_ot[date] = 1;
          vbt_ot[date] = vbt;
          vcr_ot[date] = vcr;
          ndays++;
        }
    }
    if ((ERRNO != "0") && (ERRNO != "")) { file_error(fname, nlin, ERRNO); }
    close (fname);
    if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
    printf "%6d input lines read\n", nlin > "/dev/stderr";
    printf "%6d data lines found\n", ndays > "/dev/stderr";
  }

function check_prices(fname,nlin,vbt,vcr,plo,phi,pav,  pavc,ploc,phic,dev)
  {
    # Checks consistency of average, low, high prices.
    # Uses global parameters {ulp_vbt,ulp_vcr,ulp_pav,ulp_phl}.
    # NOTE(review): this local function appears to be unused; the reader
    # calls the library routine {usf_check_prices} instead -- confirm.
    
    if ((vcr == 0) != (vbt == 0))
      { file_error(fname, nlin, ("inconsistent vbt = \"" vbt "\" vcr = \"" vcr "\"")); }
    if (vbt != 0)
      { if ((vbt < ulp_vbt) || (vcr < ulp_vcr))
          { file_error(fname, nlin, ("too small vbt = \"" vbt "\" or vcr = \"" vcr "\"")); }
        pavc = vcr/vbt; # Computed average price.
        ploc = (vcr - 0.50001*ulp_vcr)/(vbt + 0.50001*ulp_vbt); # Min average price assuming worst rounding.
        phic = (vcr + 0.50001*ulp_vcr)/(vbt - 0.50001*ulp_vbt); # Max average price assuming worst rounding.
        # Symmetric deviation test.  The original test was one-sided
        # ({pavc - pav > tol}), so a reported {pav} far above the computed
        # {pavc} would never trigger the warning nor the fatal range check:
        dev = (pavc > pav ? pavc - pav : pav - pavc);
        if (dev > 0.500001*ulp_pav)
          { printf "%s:%s: !! average price inconsistent", fname, nlin > "/dev/stderr";
            printf " is %.5f should be %.5f\n", pav, pavc > "/dev/stderr";
            if ((phic < pav - 0.500001*ulp_pav) || (ploc > pav + 0.500001*ulp_pav))
              { file_error(fname, nlin, ("excessive error")); }
          }
        if ((pavc < plo - 0.500001*ulp_phl) || (pavc > phi + 0.500001*ulp_phl))
          { printf "%s:%s: !! average price out of lo-hi range", fname, nlin > "/dev/stderr";
            printf " is %.5f should be in [%.5f _ %.5f]\n", pavc, plo, phi > "/dev/stderr";
            if ((phic < plo - 0.500001*ulp_phl) || (ploc > phi + 0.500001*ulp_phl)) { file_error(fname, nlin, ("excessive overflow")); }
          }
      }
   }
          
function compute_local_averages \
  ( fname,ndays,hrad,date_ot,vbt_ot,vcr_ot,pmd_ot, \
    kd,date,vbt,vcr,odate,pmd \
  )
  {
    # Assumes {date_ot[1..ndays]} are the selected input dates, in order.
    # Computes {pmd_ot[date]} as the average of prices in a window around {date}.
    # Sets {pmd_ot[date]} to zero if the price at the center of the window is missing.
    
    odate = ""; # Date on previous table line.
    for (kd = 1; kd <= ndays; kd++)
      { date = date_ot[kd];
        # Internal sanity check; the reader already validated date order.
        # (The original call passed {fname,nlin,...}, but {nlin} is not a
        # local here and {prog_error} takes a single message elsewhere.)
        if ((odate != "") && (! mpf_dates_are_consecutive(odate,date)))
          { prog_error(("non-consecutive dates \"" odate "\" \"" date "\" in table")); }
        pmd = compute_one_local_average(ndays,kd,hrad,date_ot,vbt_ot,vcr_ot);
        pmd_ot[date] = pmd;
        odate = date;
      }
  }
  
function compute_one_local_average \
  ( ndays,kd,hrad,date_ot,vbt_ot,vcr_ot, \
    rad,jd,id,date,w,vcr,vbt,pmd,wht \
  )
  {
    # Assumes {date_ot[1..ndays]} are the selected dates, in order.
    # Returns the smoothed mean price for day {date_ot[kd]}, obtained by a
    # weighted least-squares line fit over a soft window of {2*hrad+1} days
    # centered on that day (see {fit_and_eval}).  The weight of each day
    # combines a Hann-like taper with a volume-validity test; days with
    # negligible volume get weight zero.  Returns 0.0 when there is not
    # enough data in the window to fit the line.
    # Uses globals {pi,ulp_vbt,ulp_vcr}.
    
    # Build the window samples, indexed by offset {jd} in {-hrad..+hrad}:
    split("", pmd);  # Mean price per day in the window.
    split("", wht);  # Combined weight per day in the window.
    
    for (jd = -hrad; jd <= +hrad; jd++)
      { # Default: no usable sample at this offset.
        pmd[jd] = 0.0;
        wht[jd] = 0.0;
        id = kd + jd;
        if (! (id in date_ot)) { continue; }
        date = date_ot[id];
        if (date == "")
          { vcr = 0.0; vbt = 0.0; }
        else
          { vcr = vcr_ot[date]; vbt = vbt_ot[date]; }
        # Accept the sample only if both volumes are well above quantization:
        if ((vcr >= 3*ulp_vcr) && (vbt >= 3*ulp_vbt))
          { w = 0.5*(1 + cos(pi*jd/(hrad + 0.5))); # Hann-like taper weight.
            pmd[jd] = vcr/vbt;  # Daily mean price.
            wht[jd] = w;
          }
      }
    
    # Fit a degree-1 polynomial and evaluate it at the window center:
    return fit_and_eval(1,hrad,pmd,wht);
  }
          
function fit_and_eval \
  ( deg,hrad,pmd,wht, \
    j,w,p,A00,A01,A11,A0p,A1p,M00,M01,M11,c0,c1,det \
  )
  {
    # Fits a polynomial {P(j)} of degree {deg} to the data points {(j,log(pmd[j]))} with
    # weight {wht[j]}, then evaluates {exp(P(0))}.
    # Only {deg <= 1} is implemented.
    #
    # Returns 0.0 if there is not enough data to fit the polynomial
    # (total weight or normal-system determinant too small).
    #
    # Note: {det} was previously undeclared and leaked as a global; it is
    # now in the trailing local-variable list (an awk-convention local).
    
    if (deg > 1) { prog_error(("degree not implemented")); }
    
    # Accumulate the weighted normal equations for the basis {f0(j)=1, f1(j)=j}:
    A00 = 0; # {<f0|f0>} = SUM{wht[j]}
    A01 = 0; # {<f0|f1>} = SUM{wht[j]*j}
    A11 = 0; # {<f1|f1>} = SUM{wht[j]*j*j}
    A0p = 0; # {<f0|p>}  = SUM{wht[j]*log(pmd[j])}
    A1p = 0; # {<f1|p>}  = SUM{wht[j]*log(pmd[j])*j}
    
    for (j = -hrad; j <= hrad; j++)
      { w = wht[j];
        p = pmd[j];
        if (w > 0.0)
          { if (p <= 0.0) { prog_error(("non-positive price")); }
            p = log(p); # Fit is done in log-price space.
            A00 += w;
            A01 += w*j;
            A11 += w*j*j;
            A0p += w*p;
            A1p += w*p*j;
          }
      }
    
    # Not enough total weight to fit anything:
    if (A00 < 0.000001) { return 0.0; }

    # Solve the 2x2 normal system {A*c = b} by Cramer's rule for the
    # coefficients {c0,c1} of the fitted line {P(j) = c0 + c1*j}:
    det = A00*A11 - A01*A01;
    if ((det >= 0 ? det : -det) < 1.0e-6) { return 0.0; }
    M00 = A11; M01 = -A01; M11 = A00; # Adjoint of {A00,A01,A11}.
    c0 = (M00*A0p + M01*A1p)/det;
    c1 = (M01*A0p + M11*A1p)/det;
    
    # Evaluate {exp(P(j))} at the window center {j = 0}:
    return exp(c0);
  }
          
function write_smoothed_price \
  ( ndays,date_ot,vbt_ot,vcr_ot,pmd_ot, \
    kd,date,time,vbt,vcr,pmd \
  )
  {
    # Assumes {date_ot[1..ndays]} are the merged dates, in order.
    # Writes to {stdout} one line per day in the format
    # "{DATE} {TIME} | {VBT} | {VCR} | {PMD}" (see the file header).
    # (Removed the leftover statement {otst = tst;}, which referenced two
    # names defined nowhere and silently created globals; also dropped the
    # unused local {odate}.)

    # Print header:
    printf "# Created by {compute_smoothed_price.gawk}\n";
    printf "\n";

    for (kd = 1; kd <= ndays; kd++)
      { date = date_ot[kd];
        time = "00:00:00";
        printf "%s %s", date, time;
        vbt = vbt_ot[date];
        vcr = vcr_ot[date];
        pmd = pmd_ot[date];
        printf " | %.2f | %.2f | %.4f", vbt, vcr, pmd;
        printf "\n";
      }
    fflush("/dev/stdout");
  }