#! /usr/bin/gawk -f # Last edited on 2021-08-08 18:09:39 by stolfi # Reads a file with a certain number {N >= 1} of fields per line. # Computes a smoothed version of a specified column, with a rolling # Hann window of prescribed radius. # # User must specify the column to be smoothed with "-v col={COLUMN}", # and the window radius with "-v rad={RADIUS}". The smoothing window # will have width {2*RADIUS + 1}. # # Field values that are equal to the string "NAN", "NaN", or "nan" are assumed to be # missing data and are omitted from the window-weighted average. # # Writes out the input data with the smoothed value as field {N+1}. # If the total weight of the data values in the window that are not NONE # less than 40% of the total weight, the smoothed field is set to the # string "nan". # Removes #-comments and discards lines with no data. BEGIN { col = check_int_arg("col", col, 1, 10000); rad = check_int_arg("rad", rad, 0, 10000); nw = 2*rad + 1; # Number of samples in full window. split("", data); # Actual samples in window are {data[nw-nd..nw-1]}. split("", line); # Corresponding lines are {line[nw-nd..nw-1]}. for (i = 0; i < nw; i++) { data[i] = "nan"; line[i] = ""; } nf = -1; # Number of fields, or {-1}. pi = 3.14159; } # Remove #-commants and CRs: // { gsub(/[#].*$/, "", $0); gsub(/[\015]/, "", $0); } # Discard blank lines: /^ *$/ { next; } // { if (nf == -1) { if (col > NF) { data_error("not enough fields col = " col " NF = " NF); } nf = NF; } else { if (NF != nf) { data_error("inconsistents field counts " nf " " NF); } } process_datum($(col) + 0, $0); next; } END { # process the last half-window: for (i = 0; i < rad; i++) { process_datum("nan", ""); } } function process_datum(d,lin, i) { # Add the datum {d} and the line {lin} to the {data.line} lists. for (i = 0; i < nw-1; i++) { data[i] = data[i+1]; line[i] = line[i+1]; } data[nw-1] = d; line[nw-1] = lin; val = compute_smoothed(); if (line[rad] != "") { printf "%s %s\n", line[rad], val; } } function compute_smoothed( i,di,wi,sw,sdw) { # Computes the weighted average of {data[0..nw-1]} ignoring # entries which are the string "nan". If the non-"nan" entries # have less than 40% of the total weight, returns "nan". # Compute the smoothed value: sw = 0; sdw = 0; for (i = 0; i < nw; i++) { di = data[i]; if ((di != "nan") && (di != "NaN") && (di != "NAN")) { wi = 0.5*(1 - cos(2*pi*(i+0.5)/nw)); sdw += di*wi; sw += wi; } } if (sw < 0.199999*nw) { return "nan"; } else { return sdw/sw; } } function check_int_arg(name, val, vmin, vmax) { if (val == "") { arg_error("must specify \"-v " name "={VALUE}\""); } if (! match(val, /^[-+]?[0-9]+$/)) { arg_error("invalid value \"" val "\" for " name); } val = val + 0 if ((val < vmin) || (val > vmax)) { arg_error("value " val " for " name " out of range"); } return val; } function data_error(msg) { printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr"; printf " [%s]\n", $0 > "/dev/stderr"; abort = 1; exit abort; } function arg_error(msg) { printf "** %s\n", msg > "/dev/stderr"; bort = 1; exit abort; }