#! /usr/bin/gawk -f
# Last edited on 2025-06-17 05:43:21 by stolfi

# Reads a list of records, each containg a numeric value {z} in some
# specified column.
#
# Outputs a text file with a histogram of those values Each line of the
# histogram will describe a bin of the histogram. It will have three
# fields "{VLO} {VMD} {VHI} {COUNT}" where {COUNT} is the number of input {z}
# values that are in the interval {(VLO _VHI)}, and {VMD} is {(VLO+VHI)/2}.
#
# The client must define (with "-v") the variable {col} as the index
# (from 1) of the data column of the input file to be considered. As in
# {gawk}, the column separator is a strings of one or more blanks and/or
# tabs, and blanks or tabs before the first column are ignored.
#
# The client may also define the variable {sync} (default 0) and must
# define exactly one of the variables {bins} and {step} to a positive
# numeric value. If {step} is defined, each bin will have that width,
# with {VLO=sync+k*step} and {VHI=sync+(k+1)*step} for some integer {k}
# (positive, zero, or negative). If {bins} is defined, the with {step}
# will be chosen internally to a roundish number such that the number of
# bins will be {bins} or a bit more than that.
#
# The user may also define the variables {emin}, {emax}, {vmin}, and
# {vmax}. The program will ignore ant {z} input values that are INSIDE
# the range {[emin _ emax]} or OUTSIDE the range {[vmin _ vmax]}.
#
# The user may also specify how to treat values that fall exactly at the
# boundary between two bins by setting the variable {bround}. If {bround} is
# {-1}, those values will be counted in the next lower bin (that is,
# each bin will count values in the half-open interval {(VLO _ VHI]}).
# If {bround} is {+1}, they will be counted in the next higher bin (so
# the bin intervals will be {[VLO _ VHI)}. If {bround} is 0 (default),
# each value will be counted as {0.5} in both bins.
#
# The script ignores '#'-comments, blank lines, and lines where column
# {col} is not a number (including lines where column {col} is "nan",
# "-inf", "+inf", or "inf".
#
# The user may also define the boolean variable {verbose} to 1 in order to 
# get diagnostics.

BEGIN {
  abort = -1;
  debug = 0;
  verbose += 0;
  inf = "+inf" + 0; # Gawk is such a crock.
  if (debug) { printf "inf = %24.16e\n", inf > "/dev/stderr" }
  if (verbose) { printf "verbose = %d\n", verbose > "/dev/stderr"; }
  col += 0; sync += 0; step += 0; bins += 0;
  if (verbose) { printf "col = %d  sync = %24.16e step = %24.16e bins = %d\n", col, sync, step, bins > "/dev/stderr"; }
  if (step < 0) { arg_error("invalid {step}  =" step); }
  if ((bins < 0) || (bins != int(bins))) { arg_error("invalid {bins}  =" bins); }
  if ((step == 0) == (bins == 0)) {
    arg_error("must specify exactly one numeric {step} or {bins}");
  }
  bround += 0;
  if (verbose) { printf "bround = %d\n", bround > "/dev/stderr"; }
  if ((bround != -1) && (bround != 0) && (bround != +1)) { arg_error("invalid {bround} = " bround);  }
  # Default {vmin,vmax} is everything:
  if (vmin == "") { vmin = -inf; }
  if (vmax == "") { vmax = +inf; }
  vmin += 0; vmax += 0;   
  # Default {emin,emax} is 
  if (emin == "") { emin = +inf; }
  if (emax == "") { emax = -inf; }
  emin += 0; emax += 0;  
  nlin = 0; # Non-blank lines read.
  nexc = 0; # Samples excluded by {[emin _ emax]}
  nout = 0; # Samples excluded by {[vmin _ vmax]}
  nnon = 0; # Samples excluded for being not numeric. 
  nbad = 0; # Samples excluded for being {nan} or {inf}. 
  ns = 0;   # Samples saved in first pass.
  split("", zs); # Sample saved in first pass are {zs[0..ns-1]}
  zmin = +inf; zmax = -inf; # Actual range of data values.
  if (debug) { printf "  range [%24.16e _ %24.16e]\n", zmin, zmax > "/dev/stderr"; }
  if (debug) { 
    check_lo_hi_bin_index(0.00,1.00,7.00);
    check_lo_hi_bin_index(3.00,0.25,0.75);
  }
}

(abort >= 0) { exit abort; }

// {
  # Convert tabs and funny spaces to spaces,just in case:
  gsub(/[ \011\015]/, " ", $0);
  # Erase '#'-comments:
  gsub(/[ ]*[#].*$/, "", $0);
}

# Discard blank lines:
/^ *$/ { next; }

# Gather samples from data lines:
// { 
  if ((col < 1) || (col > NF)) {
    data_error(("not enough fields in record: {col} = " col " {NF} = " NF));
  } 
  nlin++;
  z = $(col);
  if ((z ~ /^[-+]?[Nn][Aa][Nn]$/) || (z ~ /^[-+]?[Ii][Nn][Ff]$/)) { nbad++; next; }
  if (z !~ /^[-+]?[0-9]*([0-9]([.][0-9]*|)|[.][0-9]+)(|[Ee][-+]?[0-9]+)$/) { nnon++; next; }
  z += 0;
  if ((vmin < vmax) && ((z < vmin) || (z > vmax))) { nout++; next; }
  if ((emin < emax) && (z >= emin) && (z <= emax)) { nexc++; next; }
  if (z < zmin) { zmin = z; }
  if (z > zmax) { zmax = z; }
  if (debug) { printf "  line %d  z = «%s» [%24.16e _ %24.16e]\n", FNR, z, zmin, zmax > "/dev/stderr"; }
  zs[ns] = z; ns++;
  maxbins = 10001; # Max number of bins allowed.
  next;
}

END { 
  if (abort >= 0) { exit abort; }
  if (verbose) {
    printf "%d non-blank input lines\n", nlin > "/dev/stderr";
    if (nbad > 0) { printf "%d values were {nan} or {inf}\n", nbad > "/dev/stderr"; }
    if (nnon > 0) { printf "%d values were not numeric\n", nnon > "/dev/stderr"; }
    if (nout > 0) { printf "%d values outside the valid range\n", nout > "/dev/stderr"; }
    if (nexc > 0) { printf "%d values inside the exclusion range\n", nexc > "/dev/stderr"; }
    printf "%d values considered\n", ns > "/dev/stderr";
    printf "range of accepted values = [%24.16e _ %24.16e]\n", zmin, zmax > "/dev/stderr";
  }
  if (ns == 0) {
    kmin = 0; kmax = 0;
  } else { 
    if (zmin == zmax) { 
      # Widen range by fiat: 
      zeps = 1.0e-5 * sqrt(1 + zmin*zmin)
      zmin = zmin - zeps; zmax = zmax + zeps;
      if (verbose) { printf "null range widened to = [%24.16e _ %24.16e]\n", zmin, zmax > "/dev/stderr"; }
    }
    if (step == 0) { step = choose_step(zmin, zmax, sync, bins); }
    if (verbose) { printf "using step = %24.16e\n", step > "/dev/stderr"; }

    write_histogram(zs, ns, zmin, zmax);
    write_stats(zs, ns);
  }
}

function choose_step(zmin,zmax,sync,bins,  scale,step) {
  # Choose a good {step}:
  if (verbose) { printf "computing {step} for {bins} = %d\n", bins > "/dev/stderr"; }
  if (verbose) { printf "  observed sample range [%24.16e _ %24.16e]\n", zmin, zmax > "/dev/stderr"; }
  step = (zmax - zmin)/bins;
  scale = 1;
  while (step*scale < 1.0) { scale = scale*10; }
  while (step*scale >= 10.0) { scale = max/10; }
  if (verbose) { printf "  {scale} = %24.16e {step*scale} = %12.8f\n", scale, step*scale > "/dev/stderr"; }
  if (step*scale < 1.0) 
    { step = 1.0/scale; }
  else if (step*scale < 2.0) 
    { step = 2.0/scale; }
  else if (step*scale < 2.5) 
    { step = 2.5/scale; }
  else if (step*scale < 5.0) 
    { step = 5.0/scale; }
  else if (step*scale < 10.0) 
    { step = 10.0/scale; }
  else
    { prog_error("duh (1)?"); }
  return step;
}

function lo_bin_index(zi,sync,step,bround,  b,k) { 
  b = (zi - sync)/step;
  k = int(b);
  while (k > b) { k--; }
  # Now {k} is {floor(b)}.
  if ((bround <= 0) && (k == b)) { k--; }
  return k;
}

function hi_bin_index(zi,sync,step,bround,  b,k) { 
  b = (zi - sync)/step;
  k = int(b);
  while (k < b) { k++; }
  # Now {k} is {ceil(b)}.
  if ((k != b) || ((bround < 0) && (k == b))) { k--; }
  return k;
}
    
function write_histogram(zs,ns,zmin,zmax, kmin,kmax,i,klo,khi,ct,k,lo,md,hi) {
  
  # Compute bin index range {kmin..kmax}:
  kmin = lo_bin_index(zmin, sync, step, bround);
  kmax = hi_bin_index(zmax, sync, step, bround);
  if (verbose) { printf "bin index range = %d .. %d\n", kmin, kmax > "/dev/stderr"; }
  if (kmax - kmin + 1 > maxbins) { data_error("too many bins"); }
  if (kmin > kmax) { kmin = 0; kmax = 0; }

  # Collect counts:
  split("", ct);
  for (i = 0; i < ns; i++) {
    klo = lo_bin_index(zs[i], sync, step, bround); 
    khi = hi_bin_index(zs[i], sync, step, bround); 
    if ((klo < kmin) || (khi > kmax)) {
      prog_error(("klo = " klo " khi = " khi " expected in " kmin ".." kmax));
    }
    if (! (klo in ct)) { ct[klo] = 0; }
    if (! (khi in ct)) { ct[khi] = 0; }
    if (klo == khi) { 
      ct[klo]++;
    } else {
      if ((khi - klo) != 1) {
        prog_error(("z = " zs[i] " klo = " klo " khi = " khi " not 1 apart"));
      }
      ct[klo] += 0.5;
      ct[khi] += 0.5;
    }
  }
  
  # Write counts:
  for (k = kmin-1;  k <= kmax+1; k++) { 
    lo = sync + k*step; hi = sync + (k+1)*step; md = (lo + hi)/2;
    printf " %24.16e %24.16e %24.16e %18.1f\n", lo, md, hi, ct[k]; 
  }
}

function write_stats(zs,ns, i,zi,zmin,zmax,sum_z,avg,sum_dz2,dz,dev) {
    
  # Compute min, max, average:
  zmin = +inf;
  zmax = -inf;
  sum_z = 0;
  for (i = 0; i < ns; i++) {
    zi = zs[i];
    if (zi < zmin) { zmin = zi; }
    if (zi > zmax) { zmax = zi; }
    sum_z += zi;
  }
  avg = (ns < 1 ? 0 : sum_z/ns );
  
  # Compute deviation:
  sum_dz2 = 0;
  for (i = 0; i < ns; i++) { dz = zs[i] - avg; sum_dz2 += dz*dz; }
  dev = (ns < 2 ? 0 : sqrt(sum_dz2/(ns-1)) );
  
  printf "stats for the counted values:\n" > "/dev/stderr";
  printf "ns = %d  min = %24.16e  max = %24.16e", ns, zmin, zmax > "/dev/stderr";
  printf "  avg = %24.16e  dev = %24.16e\n", avg, dev > "/dev/stderr";
}
  
function check_lo_hi_bin_index(sy,st,ztest, br,dz,z,khi,klo,zhi,zlo) {
  { # Check {lo_bin_index,hi_bin_index}:
    printf "checking {lo_bin_index,hi_bin_index} ...\n" > "/dev/stderr"
    printf "sy = %24.16e  st = %24.16e\n", sync, step  > "/dev/stderr"
    for (br = -1; br <= +1; br++) 
      { for (dz = -1; dz <= +1; dz++)
          { z = ztest + dz*0.00001; 
            klo = lo_bin_index(z, sy, st, br)
            khi = hi_bin_index(z, sy, st, br)
            printf "z = %8.5f  bround = %2d", z, br > "/dev/stderr";
            printf "  lo..hi =  %+4d..%+4d", klo, khi > "/dev/stderr";
            zlo = sy + klo*st; zhi = sy + (khi+1)*st
            printf "  =  [%24.16e _ %24.16e]\n", zlo, zhi > "/dev/stderr";
          }
        printf "\n" > "/dev/stderr"; 
      }
  }
}

function arg_error(msg)
  { printf "** %s\n", msg > "/dev/stderr"; abort = 1; exit abort; }

function data_error(msg)
  { printf "** line %d: %s\n", FNR, msg > "/dev/stderr"; 
    printf "  [[%s]]\n", $0 > "/dev/stderr"; 
    abort = 1; exit abort; }

function prog_error(msg)
  { printf "** line %d: prog error - %s\n", FNR, msg > "/dev/stderr"; 
    abort = 1; exit abort;
  }