#! /usr/bin/gawk -f
# Last edited on 2015-03-04 23:24:14 by stolfilocal

# Reads a daily price/volume series file. Writes to {stdout} a file 
# with smoothed daily mean prices.
#
# The user must define (with {export}) the environment variable {TZ="UTC"}
# The user must load (with "-f") the library {useful_functions.gawk}, and 
# define (with "-v") the variables
#
#   {inFile} name of input series file.
#   {hrad}, the half-width of the smoothing window.
# 
# The input file {inFile} must contain daily price and volume data for some exchange 
# in some currency. Each input line must have the format
# "{DATE} {TIME} | {OPEN} | {HIGH} | {LOW} | {CLOSE} | {VBT} | {VCR} | {WTPRICE}"
# where {DATE} is the line's date (UTC), {TIME} must be "00:00:00", 
# {VBT} is the total btc volume traded in that day, and {VCR} the total 
# volume in the exchange currency.  The other fields are ignored, except for
# validation.  The {DATE}s must be consecutive days; days with 
# missing data must be present and have {VBT} and {VCR} both zero.
#
# Writes to standard output one line per day, numbered {i}, in the format
# "{DATE} {TIME} | {VBT[i]} | {VCR[i]} | {PMD[i]}", where
#
#    {DATE} is every day in the input file;
#    {TIME} is always "00:00:00";
#    {VBT[i]} and {VCR[i]} are the input daily volumes on the {DATE};
#    {PMD[i]} is a smoothed average of the price {VCR[j]/VBT[j]} in a window of several
#       days around the {DATE}.

BEGIN \
  { 
    # Validate the mandatory "-v" parameters:
    if (inFile == "") { arg_error(("must define {inFile}")); }

    if (hrad == "") { arg_error(("must define {hrad}")); }
    if (hrad !~ /^[0-9]+$/) { arg_error(("invalid {hrad}")); }

    pi = 3.1415926;

    # Precision (unit-in-last-place) of input and output values:
    ulp_vbt = 0.0001;  # Unit in last place of input BTC daily volume {vbt}.
    ulp_vcr = 0.0001;  # Unit in last place of currency daily volume {vcr}.
    ulp_pav = 0.00001; # Unit in the last place of input mean price {pav}.
    ulp_phl = 0.00001; # Unit in the last place of input {plo,phi}.

    # Series data tables indexed by {[DATE]}:
    split("", date_dy);   # Set to "1" for dates that exist, undef otherwise.
    split("", vbt_dy);    # BTC volume.
    split("", vcr_dy);    # Currency volume.
    split("", pav_dy);    # Input average price per day.
    
    # Read the series and store in series data tables:
    read_daily_summary_file(\
      inFile, \
      date_dy,vbt_dy,vcr_dy,pav_dy \
    );

    # Output series tables, indexed {1..ndays}:
    split("", date_id);   # Date.
    split("", vbt_id);    # BTC volume.
    split("", vcr_id);    # Currency volume.
    split("", pav_id);    # Input average price per day.
    split("", pmd_id);    # Smoothed average price in window around date.
    
    # Sort lines by date: 
    ndays = asorti(date_dy,date_id); # Now {date_id} has the existing dates, indexed {1..ndays}
    
    # Reindex input data by day number, {1..ndays}:
    for (id = 1; id <= ndays; id++)
      { dy = date_id[id];
        if (date_dy[dy] != 1) { prog_error(("inconsistent {date_dy,date_id}")); }
        vbt_id[id] = vbt_dy[dy];
        vcr_id[id] = vcr_dy[dy];
        pav_id[id] = pav_dy[dy];
      }
    
    # Discard data points with very small volumes (treat as missing, price 0):
    for (id = 1; id <= ndays; id++)
      { if ((vcr_id[id] < 3*ulp_vcr) || (vbt_id[id] < 3*ulp_vbt))
          { pav_id[id] = 0.0; }
      }
    
    # Smooth {pav_id[1..ndays]} into {pmd_id[1..ndays]} with half-window {hrad}:
    spf_smooth_series(1, ndays, pav_id, hrad, pmd_id);
    
    # Round smoothed values to the price precision {ulp_pav}.
    # (Was {for (id = idmin; id <= idmax; id++)}; {idmin,idmax} were never
    # defined, so the rounding loop did not cover {1..ndays}.)
    for (id = 1; id <= ndays; id++)
      { pmd = pmd_id[id];
        pmd = (pmd == 0 ? 0.0 : usf_round_value(pmd,ulp_pav));
        pmd_id[id] = pmd;
      }
      
    write_smoothed_price(ndays,date_id,vbt_id,vcr_id,pmd_id);
    exit(0);
  }
          
function write_smoothed_price \
  ( ndays,date_id,vbt_id,vcr_id,pmd_id, \
    id,dy,tm,vbt,vcr,pmd \
  )
  {
    # Writes to {stdout} one line per day in the format
    # "{DATE} {TIME} | {VBT} | {VCR} | {PMD}".
    # Assumes {date_id[1..ndays]} are the merged dates, in order,
    # and {vbt_id,vcr_id,pmd_id} are the volumes and smoothed prices,
    # indexed {1..ndays}.  A zero {pmd} means the smoothed price is
    # undefined for that day; a warning is printed to {stderr} but the
    # line is still written (with price 0).
    
    # Print header:
    printf "# Created by {compute_smoothed_price.gawk}\n";
    printf "\n";

    for (id = 1; id <= ndays; id++)
      { dy = date_id[id];
        tm = "00:00:00";
        printf "%s %s", dy, tm;
        vbt = vbt_id[id];
        vcr = vcr_id[id];
        pmd = pmd_id[id];
        if (pmd == 0) { printf "!! smoothed price undefined for %s\n", dy > "/dev/stderr"; }
        printf " | %.4f | %.4f | %.5f", vbt, vcr, pmd;
        printf "\n";
        # Removed dead statement {otst = tst;}: both names were undefined
        # globals, apparently left over from another script.
      }
    fflush("/dev/stdout");
  }

function read_daily_summary_file \
  ( fname, \
    date_dy,vbt_dy,vcr_dy,pav_dy,  \
    nlin,lin,ndays,fld,nfld,dy,tm,pop,phi,plo,pcl,vbt,vcr,pav,j,ody \
  )
  {
    # Reads from file {fname} the trade summary data in 1 day intervals.
    # Sets {date_dy[dt]} to 1, and stores the BTC and currency volumes in {vbt_dy[dt],vcr_dy[dt]},
    # and the daily mean price in {pav_dy[dt]},
    # for each date {dt} present in the file.
    # Days with no data must have a line with the corresponding date and zero volumes.
    # Aborts with an error message on any malformed, out-of-order, or repeated line.
    
    # !!! Should be a library function, reading all series fields !!!
    # !!! Should return all fields indexed by line number {id} !!!
    # !!! Should take the time step as parameter and allow it !!!
    
    printf "reading file %s ...\n", fname > "/dev/stderr";
    ERRNO = "";
    
    # Read the file:
    nlin = 0;   # Number of lines read.
    ndays = 0;  # Number of non-blank, non-header, non-comment lines.
    ody = ""; # Date on previous data line (for consecutiveness check).
    while((getline lin < fname) > 0) { 
      nlin++;
      # Remove tabs, inline comments, spurious blanks:
      gsub(/[\011]/, " ", lin);
      gsub(/[\#].*$/, "", lin);
      gsub(/^[ ]+/, "", lin); 
      gsub(/[ ]+$/, "", lin); 
      gsub(/[ ][ ]+/, " ", lin); 
      # Skip blank lines and lines containing '!' (header/garbage markers):
      if ((lin != "") && (! match(lin, /[!]/)))
        { nfld = split(lin, fld, " ");
          if (nfld != 16) 
            { file_error(fname, nlin, ("bad summary entry = \"" lin "\"")); }
          # Check the '|' separators in the odd columns 3,5,...,15.
          # (Was {j <= NF}: {getline lin < fname} does not set {NF},
          # so the separator check never ran; must use {nfld}.)
          for (j = 3; j <= nfld; j = j + 2)
            { if (fld[j] != "|") { file_error(fname, nlin, ("missing '|' in column " j ", line = \"" lin "\"")); } }
          dy = usf_check_date(fname,nlin,fld[1]);
          if (dy in date_dy) { file_error(fname, nlin, ("repeated date = \"" dy "\"")); }
          tm = fld[2];
          if (tm != "00:00:00") { file_error(fname, nlin, ("invalid time = \"" tm "\"")); }
          pop = usf_check_num(fname, nlin, fld[4]);   # Opening price (validation only).
          phi = usf_check_num(fname, nlin, fld[6]);   # High price (validation only).
          plo = usf_check_num(fname, nlin, fld[8]);   # Low price (validation only).
          pcl = usf_check_num(fname, nlin, fld[10]);  # Closing price (validation only).
          vbt = usf_check_num(fname, nlin, fld[12]);  # BTC volume.
          vcr = usf_check_num(fname, nlin, fld[14]);  # Currency volume.
          pav = usf_check_num(fname, nlin, fld[16]);  # Mean price.
          if ((ody != "") && (! usf_dates_are_consecutive(ody,dy)))
            { file_error(fname,nlin, ("non-consecutive dates \"" ody "\" \"" dy "\"")); }
          ody = dy;
          
          # Consistency checks:
          usf_check_prices(fname,nlin, pop,phi,plo,pcl,vbt,vcr,pav, ulp_phl,ulp_vbt,ulp_vcr,ulp_pav);

          # Save in arrays:
          date_dy[dy] = 1;
          vbt_dy[dy] = vbt;
          vcr_dy[dy] = vcr;
          pav_dy[dy] = pav;
          ndays++;
        }
    }
    if ((ERRNO != "0") && (ERRNO != "")) { file_error(fname, nlin, ERRNO); }
    close (fname);
    if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
    printf "%6d input lines read\n", nlin > "/dev/stderr";
    printf "%6d data lines found\n", ndays > "/dev/stderr";
  }