#! /usr/bin/gawk -f
# Last edited on 2021-08-08 18:09:39 by stolfi

# Reads a file with a certain number {N >= 1} of fields per line.
# Computes a smoothed version of a specified column, with a rolling 
# Hann window of prescribed radius.  
#
# User must specify the column to be smoothed with "-v col={COLUMN}",
# and the window radius with "-v rad={RADIUS}". The smoothing window 
# will have width {2*RADIUS + 1}.
#
# Field values that are equal to the string "NAN", "NaN", or "nan" are assumed to be
# missing data and are omitted from the window-weighted average.
#
# Writes out the input data with the smoothed value as field {N+1}.
# If the total weight of the data values in the window that are not NONE 
# less than 40% of the total weight, the smoothed field is set to the
# string "nan".

# Removes #-comments and discards lines with no data.

BEGIN { 
  col = check_int_arg("col", col, 1, 10000);
  rad = check_int_arg("rad", rad, 0, 10000);

  nw = 2*rad + 1;  # Number of samples in full window.
  split("", data); # Actual samples in window are {data[nw-nd..nw-1]}.
  split("", line); # Corresponding lines are {line[nw-nd..nw-1]}.
  for (i = 0; i < nw; i++) { data[i] = "nan"; line[i] = ""; }
  nf = -1;         # Number of fields, or {-1}.
  pi = 3.14159;
}

# Remove #-commants and CRs:
// {
  gsub(/[#].*$/, "", $0);
  gsub(/[\015]/, "", $0);
}

# Discard blank lines:
/^ *$/ { next; }


// {
  if (nf == -1) {
    if (col > NF) { data_error("not enough fields col = " col "  NF = " NF); }
    nf = NF;
  } else {
    if (NF != nf) { data_error("inconsistents field counts " nf " " NF); }
  } 
  
  process_datum($(col) + 0, $0);
  next;
}

END {
  # process the last half-window:
  for (i = 0; i < rad; i++) { process_datum("nan", ""); }
}
  
function process_datum(d,lin,   i) {
  # Add the datum {d} and the line {lin} to the {data.line} lists.
  
  for (i = 0; i < nw-1; i++) { data[i] = data[i+1]; line[i] = line[i+1]; }
  data[nw-1] = d;
  line[nw-1] = lin;
  
  val = compute_smoothed();
  if (line[rad] != "") { printf "%s %s\n", line[rad], val; }
}

function compute_smoothed(   i,di,wi,sw,sdw) {
  # Computes the weighted average of {data[0..nw-1]} ignoring
  # entries which are the string "nan". If the non-"nan" entries
  # have less than 40% of the total weight, returns "nan".

  # Compute the smoothed value:
  sw = 0; sdw = 0;
  for (i = 0; i < nw; i++)
    { di = data[i];
      if ((di != "nan") && (di != "NaN") && (di != "NAN")) {
        wi = 0.5*(1 - cos(2*pi*(i+0.5)/nw));
        sdw += di*wi; sw += wi;
      }
    } 
  if (sw < 0.199999*nw) { return "nan"; } else { return sdw/sw; }
}
  

function check_int_arg(name, val, vmin, vmax) {
  if (val == "") { arg_error("must specify \"-v " name "={VALUE}\""); }
  if (! match(val, /^[-+]?[0-9]+$/)) { arg_error("invalid value \"" val "\" for " name); }
  val = val + 0
  if ((val < vmin) || (val > vmax)) { arg_error("value " val " for " name " out of range"); }
  return val;
}

function data_error(msg)
  {
    printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
    printf "  [%s]\n", $0 > "/dev/stderr";
    abort = 1; exit abort;
  }

function arg_error(msg)
  {
    printf "** %s\n", msg > "/dev/stderr";
    bort = 1; exit abort;
  }