#! /usr/bin/gawk -f
# Last edited on 2019-10-28 04:59:24 by jstolfi

# Reads two data files with daily volumes and smoothed prices.
# Combines them into a single reference price file.
# 
# The user must define (with {export}) the environment variable {TZ="UTC"},
# load (with "-f") the libraries "useful_functions.gawk" and "index_file_functions.gawk",
# and define (with "-v") the program variables
#
#   {indexFile}, the name of the index file (that specifies the data files to combine).
#   {inDir}, the directory that contains the smoothed price files to combine.
# 
# The files to be combined are specified in the index file.
# The format of the index file is described in the file "index_file_functions.gawk", 
# function {ixf_read_index_file}.
#
# From each line of the index file the 
# program gets the nominal date range {INIDATE .. FINDATE}, the exchange tag
# {EXTAG}, the currency tag {CRTAG}, and the date range to use {RLODATE .. RHIDATE}.
#
# Those parameters specify the file "{inDir}/{INIDATE}--{FINDATE}-{EXTAG}-{CRTAG}-01d.txt"
# which must contain daily price and volume data for the exchange {EXTAG} in currency {CRTAG}
# ("USD", "CNY", etc.), between dates {INIDATE} and {FINDATE} inclusive. 
# The program will use only entries of the volume file whose date is between 
# {RLODATE} and {RHIDATE}, inclusive both.
# 
# The format of the input data files is the one used for original time series.
# Outputs a file in the same format with a reference USD price.
# The high and low prices are merged.  The open and close prices are blended.
# The BTC and currency volumes are blended, and the weighted price is computed
# from them.


BEGIN \
  {
    # Validate the environment and the user-supplied ("-v") variables:
    if (ENVIRON["TZ"] != "UTC")
      { arg_error(("must set TZ to 'UTC'")); }
    if (indexFile == "")
      { arg_error(("must define {indexFile}")); }
    if (inDir == "")
      { arg_error(("must define {inDir}")); }

    # Global constant read by {blend_weight}:
    pi = 3.1415926;

    # Prepare the global tables that the index file reader fills:
    ixf_initialize_index_tables();

    # Load the index file {indexFile}; on return the per-file tables
    # {inidate_fi[0..nfiles-1],.. color_fi[0..nfiles-1]} are filled:
    nfiles = ixf_read_index_file(indexFile, inidate_fi,findate_fi,extag_fi,crtag_fi,exname_fi,rate_fi,rlodate_fi,rhidate_fi,color_fi);

    # Exactly two exchanges are accepted, MGOX.USD first and BSTP.USD second:
    if (nfiles != 2)
      { file_error(indexFile, 0, "must blend exactly two files"); }
    if (! ((extag_fi[0] == "MGOX") && (crtag_fi[0] == "USD")))
      { file_error(indexFile, 0, "first exchange should be MGOX.USD"); }
    if (! ((extag_fi[1] == "BSTP") && (crtag_fi[1] == "USD")))
      { file_error(indexFile, 0, "second exchange should be BSTP.USD"); }

    # Timestamps delimiting the MGOX-to-BSTP transition (globals read by {write_ref_price_file}):
    sec_sw_ini = usf_date_and_time_to_timestamp("2012-05-01", "00:00:00");
    sec_sw_fin = usf_date_and_time_to_timestamp("2013-01-31", "00:00:00");

    # Count of distinct dates; set for real after the sort below.
    ndays = 0;

    # Per-day series tables.  {date_dy} is indexed by the date {dy};
    # all the others are indexed by [dy,kf] with {kf} in {0..nfiles-1}.
    delete date_dy;   # Set of dates seen in any input file.
    delete pop_dy;    # Opening price.
    delete phi_dy;    # High price.
    delete plo_dy;    # Low price.
    delete pcl_dy;    # Closing price.
    delete vbt_dy;    # BTC volume.
    delete vcr_dy;    # Currency volume.
    delete pav_dy;    # Average price.

    # The dates in sorted order, indexed {1..ndays}:
    delete date_kd;

    # Precision (unit in the last place) of input and output values:
    ulp_vbt = 0.0001;   # For the BTC volume {vbt}.
    ulp_vcr = 0.0001;   # For the currency volume {vcr}.
    ulp_pav = 0.00001;  # For the average price {pav}.
    ulp_phl = 0.00001;  # For the prices {pop,phi,plo,pcl}.

    # Read every file named by the index into the series tables,
    # keyed by the date {dy} and the file index {kf}:
    kf = 0;
    while (kf < nfiles)
      {
        inidate = inidate_fi[kf];
        findate = findate_fi[kf];
        extag = extag_fi[kf];
        crtag = crtag_fi[kf];
        rlodate = rlodate_fi[kf];
        rhidate = rhidate_fi[kf];
        read_daily_price_file(inDir,inidate,findate,kf,extag,crtag,rlodate,rhidate, date_dy,pop_dy,phi_dy,plo_dy,pcl_dy,vbt_dy,vcr_dy,pav_dy);
        kf++;
      }
    printf "done reading %d daily price files\n", nfiles > "/dev/stderr";

    # Collect the dates (the indices of {date_dy}) in sorted order into {date_kd[1..ndays]}:
    ndays = asorti(date_dy,date_kd);
    printf "%d days in reference file, from %s to %s\n", ndays, date_kd[1], date_kd[ndays] > "/dev/stderr";

    # Blend the two series and write the result to standard output:
    write_ref_price_file(nfiles,extag_fi,crtag_fi,rate_fi, ndays,date_kd,date_dy, pop_dy,phi_dy,plo_dy,pcl_dy,vbt_dy,vcr_dy,pav_dy);
    printf "done writing the reference file\n" > "/dev/stderr";
    exit(0);
  } 

function read_daily_price_file \
  ( inDir,inidate,findate,kf,extag,crtag,rlodate,rhidate, \
    date_dy,pop_dy,phi_dy,plo_dy,pcl_dy,vbt_dy,vcr_dy,pav_dy,  \
    fname,nlin,lin,ndays,nsave,fld,nfld,dy,tm,\
    pop,phi,plo,pcl,vbt,vcr,pav,j,ody \
  )
  {
    # Reads a file with price data, total BTC and currency volumes, in 1 day intervals.
    # The file name is "{inDir}/{inidate}--{findate}-{extag}-{crtag}-01d.txt".
    # Stores the data in {date_dy[dy]},
    # {pop_dy[dy,kf],phi_dy[dy,kf],plo_dy[dy,kf],pcl_dy[dy,kf]}
    # {vbt_dy[dy,kf],vcr_dy[dy,kf],pav_dy[dy,kf]},
    # for each date {dy} present in the file that lies in the range {rlodate..rhidate} inclusive.
    # Uses global parameters {ulp_vbt,ulp_vcr,ulp_pav,ulp_phl}.
    #
    # Parameters after {pav_dy} ({fname} .. {ody}) are local variables.
    #
    # Each data line must have exactly 16 blank-separated fields:
    # date, time, and then seven "| value" pairs for
    # {pop,phi,plo,pcl,vbt,vcr,pav}.  Blank lines, "#" comments, and
    # lines containing "!" (headers) are skipped.  Every data line is
    # validated, even when its date is outside {rlodate..rhidate}.
    # Errors are reported through {file_error} and {arg_error}.
    
    # Assemble the name of the input daily volume file:
    
    fname = ( inDir "/" inidate "--" findate "-" extag "-" crtag "-01d.txt" );
    printf "reading file %s ...\n", fname > "/dev/stderr";
    ERRNO = "";

    # Read the file:
    nlin = 0;   # Number of lines read.
    ndays = 0;  # Number of non-blank, non-header, non-comment lines.
    nsave = 0;  # Number of data lines saved in the output arrays.
    ody = ""; # Date on previous data line.
    while((getline lin < fname) > 0) { 
      nlin++;
      # Remove tabs, inline comments, spurious blanks
      gsub(/[\011]/, " ", lin);
      gsub(/[\#].*$/, "", lin);
      gsub(/^[ ]+/, "", lin); 
      gsub(/[ ]+$/, "", lin); 
      gsub(/[ ][ ]+/, " ", lin); 
      if ((lin != "") && (! match(lin, /[!]/)))
        { # Data line:
          nfld = split(lin, fld, " ");
          if (nfld != 16) { file_error(fname, nlin, ("wrong field count = \"" lin "\"")); }
          # Every odd field from 3 on must be a '|' separator.  The bound is
          # {nfld}, not {NF}: "getline lin < fname" does not set {NF}.
          for (j = 3; j <= nfld; j = j + 2)
            { if (fld[j] != "|") { file_error(fname, nlin, ("missing '|' in column " j ", line = \"" lin "\"")); } }
          # Get the input fields:
          dy = usf_check_date(fname,nlin,fld[1]);
          tm = fld[2];
          if (tm != "00:00:00") { file_error(fname, nlin, ("invalid time = \"" tm "\"")); }
          pop = usf_check_num(fname, nlin, fld[4]);
          phi = usf_check_num(fname, nlin, fld[6]);
          plo = usf_check_num(fname, nlin, fld[8]);
          pcl = usf_check_num(fname, nlin, fld[10]);
          vbt = usf_check_num(fname, nlin, fld[12]);
          vcr = usf_check_num(fname, nlin, fld[14]);
          pav = usf_check_num(fname, nlin, fld[16]);
          
          # Consistency checks:
          if ((dy,kf) in vbt_dy) 
            { file_error(fname, nlin, ("repeated date = \"" dy "\"")); }
          if ((ody != "") && (! usf_dates_are_consecutive(ody,dy)))
            { file_error(fname,nlin, ("non-consecutive dates \"" ody "\" \"" dy "\"")); }
          ody = dy;
          
          if (pav != 0)
            { # Adjust {vcr} to be consistent with {vbt,pav}:
              vcr = pav*vbt;
            }
          
          usf_check_prices(fname,nlin, pop,phi,plo,pcl,vbt,vcr,pav, ulp_phl,ulp_vbt,ulp_vcr,ulp_pav);
          
          # Dates are compared as strings.
          # NOTE(review): this assumes ISO "YYYY-MM-DD" dates, so string order is date order -- confirm.
          if ((dy >= rlodate) && (dy <= rhidate))
            { # Save in arrays:
              date_dy[dy] = 1;
              pop_dy[dy,kf] = pop;
              phi_dy[dy,kf] = phi;
              plo_dy[dy,kf] = plo;
              pcl_dy[dy,kf] = pcl;
              vbt_dy[dy,kf] = vbt;
              vcr_dy[dy,kf] = vcr;
              pav_dy[dy,kf] = pav;
              nsave++;
            }
          ndays++;
        }
    }
    # A read failure (including a missing file) ends the loop with {ERRNO} set:
    if ((ERRNO != "0") && (ERRNO != "")) { file_error(fname, nlin, ERRNO); }
    close (fname);
    if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
    printf "%6d lines read\n", nlin > "/dev/stderr"
    printf "%6d data lines found\n", ndays > "/dev/stderr"
    printf "%6d data lines used\n", nsave > "/dev/stderr"
  }
  
function write_ref_price_file\
  ( nfiles,extag_fi,crtag_fi,rate_fi,\
    ndays,date_kd,date_dy,\
    pop_dy,phi_dy,plo_dy,pcl_dy,vbt_dy,vcr_dy,pav_dy, \
    kd,kf,ody,dy,tm,sec,wt0,wt1,wtt,pop,phi,plo,pcl,vbt,vcr,rate \
  )
  {
    # Assumes {date_kd[1..ndays]} are the merged dates, in order.
    # Writes to standard output, for each date, the blended prices
    # {pop,phi,plo,pcl}, the blended volumes {vbt,vcr}, and the mean
    # price {pav} recomputed as {vcr/vbt}.  The range {plo..phi} is
    # widened if needed so that it contains {pav}.
    #
    # Parameters after {pav_dy} ({kd} .. {rate}) are local variables.
    #
    # Uses the global parameters {sec_sw_ini,sec_sw_fin} (blend interval)
    # and {ulp_vbt,ulp_vcr,ulp_pav,ulp_phl} (minimum output values).
    #
    # Side effect: entries of {pop_dy,phi_dy,plo_dy,pcl_dy,vcr_dy,pav_dy}
    # whose file has {rate_fi[kf] != 1} are divided by that rate in place.
    
    if (nfiles != 2) { prog_error(("must blend exactly two files")); }
    
    printf "# Created by {compute_ref_price.gawk}\n"
    printf "# Blend of "
    for (kf = 0; kf < nfiles; kf++) { printf " %s.%s", extag_fi[kf], crtag_fi[kf]; }
    printf "\n";
    printf "Timestamp ! Open ! High ! Low ! Close";
    printf " ! V.BTC ! V.USD ! WTPrice\n";
    printf "\n";
    ody = ""; # Previous date.
    for (kd = 1; kd <= ndays; kd++)
      { # Get date:
        dy = date_kd[kd];
        if (date_dy[dy] != 1) { prog_error(("inconsistent date tables")); }
        if ((ody != "") && (! usf_dates_are_consecutive(ody,dy)))
          { prog_error(("non-consecutive dates \"" ody "\" \"" dy "\"")); }
        tm = "00:00:00";
        
        # Apply the currency rate factors:
        for (kf = 0; kf < nfiles; kf++) 
          { rate = rate_fi[kf];
            if (rate != 1) 
              { pop_dy[dy,kf] /= rate;
                phi_dy[dy,kf] /= rate;
                plo_dy[dy,kf] /= rate;
                pcl_dy[dy,kf] /= rate;
                vcr_dy[dy,kf] /= rate;
                pav_dy[dy,kf] /= rate;
              }
          }
        
        # Compute blend weights of the two files (VERY SPECIFIC):
        sec = usf_date_and_time_to_timestamp(dy, "00:00:00");
        wt0 = blend_weight(sec, sec_sw_ini,sec_sw_fin);
        wt1 = 1 - wt0;
        
        # If either price data is missing, use the other one, unless its weight is zero:
        if (pav_dy[dy,0] <= 0.0) { wt0 = 0.0; }
        if (pav_dy[dy,1] <= 0.0) { wt1 = 0.0; }
        
        # Blend:
        wtt = wt0 + wt1;
        if (wtt <= 0.0)
          { # Reference price is undefined
            printf "!! reference price is undefined for %s\n", dy > "/dev/stderr";
            printf "   wt0 = %8.6f pav0 = %11.5f", wt0, pav_dy[dy,0] > "/dev/stderr";
            printf "   wt1 = %8.6f pav1 = %11.5f\n", wt1, pav_dy[dy,1] > "/dev/stderr";
            pop = 0;
            phi = 0;
            plo = 0;
            pcl = 0;
            vbt = 0;
            vcr = 0;
            pav = 0;
          }
        else
          { # Blend the ranges:
            phi = (wt0*phi_dy[dy,0] + wt1*phi_dy[dy,1])/wtt;
            if (phi < ulp_phl) { phi = ulp_phl; }
            
            plo = (wt0*plo_dy[dy,0] + wt1*plo_dy[dy,1])/wtt;
            if (plo < ulp_phl) { plo = ulp_phl; }
            
            # Blend the opening and closing prices, no better idea:
            pop = (wt0*pop_dy[dy,0] + wt1*pop_dy[dy,1])/wtt;
            if (pop < ulp_phl) { pop = ulp_phl; }
            
            pcl = (wt0*pcl_dy[dy,0] + wt1*pcl_dy[dy,1])/wtt;
            if (pcl < ulp_phl) { pcl = ulp_phl; }
            
            # Blend the volumes, clamping each to its own minimum:
            vbt = (wt0*vbt_dy[dy,0] + wt1*vbt_dy[dy,1])/wtt;
            if (vbt < ulp_vbt) { vbt = ulp_vbt; }
            
            vcr = (wt0*vcr_dy[dy,0] + wt1*vcr_dy[dy,1])/wtt;
            if (vcr < ulp_vcr) { vcr = ulp_vcr; }
            
            # Recompute the price from the volumes ({vbt} is nonzero after the clamp above):
            pav = vcr/vbt;
            if (pav < ulp_pav) { pav = ulp_pav; }

            # Make sure mean price is in interval:
            phi = max_price(phi,pav);
            plo = min_price(plo,pav);

            # Consistency of computed prices:
            usf_check_prices("BLEND",kd, pop,phi,plo,pcl,vbt,vcr,pav, ulp_phl,ulp_vbt,ulp_vcr,ulp_pav);
            
          }
        printf "%s %s | %12.5f | %12.5f | %12.5f | %12.5f", dy, tm, pop, phi, plo, pcl;
        printf " | %16.4f | %16.4f | %12.5f", vbt, vcr, pav;
        printf "\n";
        ody = dy;
      }
  }
   
function check_null_value(fname,nlin,val,name)
  {
    # Reports an error for line {nlin} of file {fname}, through {file_error},
    # unless the value {val} (described by {name}) compares equal to zero.
    if (val == 0.0) { return; }
    file_error(fname,nlin, (name " = \"" val "\" should be zero"));
  }
   
function check_non_null_value(fname,nlin,val,name)
  {
    # Reports an error for line {nlin} of file {fname}, through {file_error},
    # if the value {val} (described by {name}) compares equal to zero.
    if (val != 0.0) { return; }
    file_error(fname,nlin, (name " = \"" val "\" should not be zero"));
  }
          
function min_price(x,y)
  {
    # Smaller of the prices {x,y}, compared numerically.  A price that
    # is numerically zero counts as undefined, and the other one is
    # returned.  Ties yield {y}.
    if (x+0 == 0) { return y; }
    if (y+0 == 0) { return x; }
    if (x+0 < y+0) { return x; }
    return y;
  }
          
function max_price(x,y)
  {
    # Larger of the prices {x,y}, compared numerically.  Ties yield {y}.
    if (x+0 > y+0) { return x; }
    return y;
  }

function blend_weight(sec,sec_ini,sec_fin,  frac,weight)
  {
    # Blend weight for the time {sec}: 1 at or before {sec_ini}, 0 at or
    # after {sec_fin}, and a raised-cosine ramp {0.5*(1 + cos(pi*frac))}
    # in between, where {frac} is the fraction of the interval elapsed.
    # Uses the global constant {pi}.  {frac,weight} are local variables.
    if (sec <= sec_ini) { return 1.0; }
    if (sec >= sec_fin) { return 0.0; }
    frac = (sec - sec_ini)/(sec_fin - sec_ini);
    weight = 0.5*(1 + cos(pi*frac));
    return weight;
  }