#! /usr/bin/gawk -f
# Last edited on 2014-01-17 03:12:58 by stolfilocal

BEGIN \
  {
    # Reads a market data file, ignores headers and comments, converts 
    # the weighted mean prices to log10 scale {z[t]}, computes 
    # pairs of sucessive increments {d1 = z[i-1]-z[i-2]} and {d0 = z[i]-z[i-1]},
    # writes out a 2D histogram of the pairs {(d1,d0)}.
    
    abort = -1;
    
    debug = 0;
    
    if (dmax == "") { arg_error(("must define {dmax}")); }
    dmax += 0; # Max absolute difference between log10 of consecutive prices to expect in histogram.
    if (dmax <= 0) { arg_error(("bad {dmax} = " dmax)); }
    
    if (nh == "") { arg_error(("must define {nh}")); }
    nh += 0; # Number of histogram bins on each side of zero.
    if ((nh < 1) || (int(nh) != nh)) { arg_error(("bad {nh} = " nh)); }
    
    split("", dbreak); # {dbreak[0]} is zero, {dbreak[1..nh]} are the positive bin breaks.
    compute_breaks(nh, dmax, dbreak);
    
    # The histogram {hist} is an array of {2*nh} by {2*nh} bins, each index is in {0..2*nh-1}.
    # The first index {i1} corresponds to the older increment {d1}, 
    # The second index {i0}  corresponds to the newer increment {d0}.

    nrecs = 0;   # Number of data lines read.
    nzeros = 0;  # Number of data lines with zero price.

    nincrs = 0;  # Number of valid increments computed.  
    npairs = 0;  # Number of valid consecutive increment pairs considered.
    
    split("", hist);      
    clear_hist(nh, hist);
    hist_overflow = 0; # Total mass outside the histogram.
    
    actual_dmax = 0;   # Actual max absolute increment.

    z1 = "NONE"; z2 = "NONE";  # Last two values of {z[t]}.
    
    dsum = 0;   # Sum of valid increments. 
    dsum2 = 0;  # Sum of squares of valid increments.
    dsumpr = 0; # Sum of products of consecutive valid increment pairs.
  }

(abort >= 0) { exit(abort); }

# discard comments, blank lines, table headers:
/^[ ]*([#]|$)/ { next; }
/[!]/ { next; }

/^20[0-9][0-9]-/ \
  {
    nrecs++;
    if (NF != 16) { data_error(("invalid field count" NF)); }
    v = $(16);
    if (v+0 == 0) 
      { z0 = "NONE"; 
        z1 = "NONE";
        z2 = "NONE";
        nzeros++;
      }
    else
      { z0 = log(v)/log(10);
        if (z1 != "NONE")
          { d0 = z0 - z1;
            nincrs++;
            # Accumulate for increment mean, variance:
            dsum += d0;
            dsum2 += d0*d0;
            # Keep track of max increment:
            if (d0 > actual_dmax) { actual_dmax = d0; }
            if (-d0 > actual_dmax) { actual_dmax = -d0; }
            if (z2 != "NONE")
              { # Got a pair of consecutive increments:
                d1 = z1 - z2;
                if (debug > 0) { printf "%+9.5f %9.5f\n", d1, d0 > "/dev/stderr"; }
                # Accumulate for covariance:
                dsumpr += d1*d0;
                accum_hist(nh, dbreak, hist, d1, d0);
              }
          }
      }
    # Advance:
    z2 = z1;
    z1 = z0;
    next;
  }

// { data_error(("invalid line format")); }

END \
  {
    if (abort >= 0) { exit(abort); }
    printf "%d data lines read\n", nrecs > "/dev/stderr";
    printf "%d data lines with zero prices\n", nzeros > "/dev/stderr";
    printf "%d valid increments\n", nincrs > "/dev/stderr";
    printf "%d valid consecutive increment pairs\n", npairs > "/dev/stderr";
    printf "actual max absolute increment = %9.5f\n", actual_dmax > "/dev/stderr";
    
    davg = dsum/nincrs;
    dvar = dsum2/nincrs;
    ddev = sqrt(dvar);
    printf "increment average = %+11.6f variance = %11.8f deviation = %11.6f\n", davg, dvar, ddev > "/dev/stderr";
    
    dcov = dsumpr/npairs;
    dcorr = dcov/dvar;
    printf "increment pair covariance = %+11.8f correlation = %+11.6f\n", dcov, dcorr > "/dev/stderr";
    
    if (hist_overflow > 0.1)
      { printf "lost %.2f mass outside histogram\n", hist_overflow > "/dev/stderr"; }
    dump_hist(nh, hist, dbreak);
  }
  
function compute_breaks(nh,dmax,dbreak,  i)
  {
    for (i = 0; i <= nh; i++) { dbreak[i] = (i*dmax)/nh; }
  }
  
function clear_hist(nh,dbreak,hist,  i1,i0)
  { 
    # Assumes {hist} has been initialized as an array.
    # Bins are numbered from 0 to {2*nh-1} on each axis.
    for (i1 = 0; i1 < 2*nh; i1++)
      { for (i0 = 0; i0 < 2*nh; i0++)
          { hist[i1,i0] = 0; }
      }
  }

function accum_hist(nh,dbreak,hist,d1,d0,  x1,x0,i1,i0,j1,j0,f1,f0,e1,e0)
  {
    # Accumulate in the histogram the pair {d1,d0} (older and newer increment).
    # Also increments {hist_overflow,npairs}.
    
    # Map {d1,d0} to fractional low bin indices {x1,x0} in {[0 _ 2*nh]}
    x1 = increment_to_index(nh, dbreak, d1) - 0.5;
    x0 = increment_to_index(nh, dbreak, d0) - 0.5;

    # Compute indices {i1,i0,j1,j0} of relevant bins and their weights {e1,e0,f1,f0}:
    i1 = int(x1); j1 = i1 + 1; f1 = x1 - i1; e1 = 1 - f1;
    i0 = int(x0); j0 = i0 + 1; f0 = x0 - i0; e0 = 1 - f0;

    # Must add {e1*e0} to {hist[i1,i0]}, {e1*f0} to {hist[i1,j0]}, etc.
    # But beware of bins that do not exist:

    accum_hist_bin(nh,hist,i1,i0,e1*e0); 
    accum_hist_bin(nh,hist,i1,j0,e1*f0); 
    accum_hist_bin(nh,hist,j1,i0,f1*e0); 
    accum_hist_bin(nh,hist,j1,j0,f1*f0);
    
    npairs++;
  }
  
function increment_to_index(nh,dbreak,d,  da,i,xa)
  { 
    # Returns the fractional position of an increment {d} in the histogram.
    # The result is in the range {[0 _ 2*nh]} if {abs(d) <= dmax}, or a few 
    # units outside if {abs(d) > dmax}.
    
    # Namely, let {da} be the absolute value of {d}. 
    # if {da} is in the range {[dbreak[i] _ dbreak[i+1]]} for
    # some {i} in {0..nh-1}, maps it to a non-negative float {xa} in {[i _ i+1]} by linear interpolation.
    # If {da > dmax = dbreak[nh]}, maps it to {xa = nh + 3}.
    # Then returns {nh+xa} if {d} is non-negative, {nh-xa} if {d} is negative.

    da = (d < 0 ? -d : d);
    # !!! Should use binary search !!!
    i = 0;
    while ((i < nh) && (da > dbreak[i+1])) { i++; }
    if (i >= nh)
      { # The increment {d} is too big: 
        xa = nh + 3;
      }
    else
      { # Increment is within valid range, interpolate indices:
        xa = i + (da - dbreak[i])/(dbreak[i+1] - dbreak[i]);
      }
    return (d < 0 ? nh - xa : nh + xa);
  }

function accum_hist_bin(nh,hist,i1,i0,w)
  { # Adds the mass {w} to {hist[i1,i0]}; unless the indices are invalid,
    # in which case adds it to {hist_overflow}.
    if ((i0 < 0) || (i0 >= 2*nh) || (i1 < 0) || (i1 >= 2*nh))
      { hist_overflow += w; }
    else
      { hist[i1,i0] += w; }
  }
 
function dump_hist(nh,hist,dbreak,  i1,i0)
  {
    for (i1 = 0; i1 < 2*nh; i1++)
      { for (i0 = 0; i0 < 2*nh; i0++)
          { # Increment ranges associated with bin:
            lo1 = (i1 < nh ? -dbreak[nh-i1] : dbreak[i1-nh]);     # Low limit of {d1}. 
            hi1 = (i1 < nh ? -dbreak[nh-i1-1] : dbreak[i1-nh+1]); # High limit of {d1}. 
            lo0 = (i0 < nh ? -dbreak[nh-i0] : dbreak[i0-nh]);     # Low limit of {d0}. 
            hi0 = (i0 < nh ? -dbreak[nh-i0-1] : dbreak[i0-nh+1]); # High limit of {d0}. 
            printf "%4d %4d", i1, i0;
            printf "  %+9.5f %+9.5f", lo1, hi1;
            printf "  %+9.5f %+9.5f", lo0, hi0;
            printf "  %11.6f", hist[i1,i0];
            printf "\n";
          }
        printf "\n";
      }
  }

function data_error(msg)
  { printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr"; 
    printf "  Â«%sÂ»\n", $0 > "/dev/stderr"; 
    abort = 1;
    exit(abort);
  } 
          
function data_warning(msg)
  { printf "%s:%s: !! warning - %s\n", FILENAME, FNR, msg > "/dev/stderr"; 
    printf "  Â«%sÂ»\n", $0 > "/dev/stderr"; 
  } 
          
function arg_error(msg)
  { printf "** %s\n", msg > "/dev/stderr"; 
    abort = 1;
    exit(abort);
  }