#! /usr/bin/gawk -f # Last edited on 2025-06-17 05:43:21 by stolfi # Reads a list of records, each containg a numeric value {z} in some # specified column. # # Outputs a text file with a histogram of those values Each line of the # histogram will describe a bin of the histogram. It will have three # fields "{VLO} {VMD} {VHI} {COUNT}" where {COUNT} is the number of input {z} # values that are in the interval {(VLO _VHI)}, and {VMD} is {(VLO+VHI)/2}. # # The client must define (with "-v") the variable {col} as the index # (from 1) of the data column of the input file to be considered. As in # {gawk}, the column separator is a strings of one or more blanks and/or # tabs, and blanks or tabs before the first column are ignored. # # The client may also define the variable {sync} (default 0) and must # define exactly one of the variables {bins} and {step} to a positive # numeric value. If {step} is defined, each bin will have that width, # with {VLO=sync+k*step} and {VHI=sync+(k+1)*step} for some integer {k} # (positive, zero, or negative). If {bins} is defined, the with {step} # will be chosen internally to a roundish number such that the number of # bins will be {bins} or a bit more than that. # # The user may also define the variables {emin}, {emax}, {vmin}, and # {vmax}. The program will ignore ant {z} input values that are INSIDE # the range {[emin _ emax]} or OUTSIDE the range {[vmin _ vmax]}. # # The user may also specify how to treat values that fall exactly at the # boundary between two bins by setting the variable {bround}. If {bround} is # {-1}, those values will be counted in the next lower bin (that is, # each bin will count values in the half-open interval {(VLO _ VHI]}). # If {bround} is {+1}, they will be counted in the next higher bin (so # the bin intervals will be {[VLO _ VHI)}. If {bround} is 0 (default), # each value will be counted as {0.5} in both bins. # # The script ignores '#'-comments, blank lines, and lines where column # {col} is not a number (including lines where column {col} is "nan", # "-inf", "+inf", or "inf". # # The user may also define the boolean variable {verbose} to 1 in order to # get diagnostics. BEGIN { abort = -1; debug = 0; verbose += 0; inf = "+inf" + 0; # Gawk is such a crock. if (debug) { printf "inf = %24.16e\n", inf > "/dev/stderr" } if (verbose) { printf "verbose = %d\n", verbose > "/dev/stderr"; } col += 0; sync += 0; step += 0; bins += 0; if (verbose) { printf "col = %d sync = %24.16e step = %24.16e bins = %d\n", col, sync, step, bins > "/dev/stderr"; } if (step < 0) { arg_error("invalid {step} =" step); } if ((bins < 0) || (bins != int(bins))) { arg_error("invalid {bins} =" bins); } if ((step == 0) == (bins == 0)) { arg_error("must specify exactly one numeric {step} or {bins}"); } bround += 0; if (verbose) { printf "bround = %d\n", bround > "/dev/stderr"; } if ((bround != -1) && (bround != 0) && (bround != +1)) { arg_error("invalid {bround} = " bround); } # Default {vmin,vmax} is everything: if (vmin == "") { vmin = -inf; } if (vmax == "") { vmax = +inf; } vmin += 0; vmax += 0; # Default {emin,emax} is if (emin == "") { emin = +inf; } if (emax == "") { emax = -inf; } emin += 0; emax += 0; nlin = 0; # Non-blank lines read. nexc = 0; # Samples excluded by {[emin _ emax]} nout = 0; # Samples excluded by {[vmin _ vmax]} nnon = 0; # Samples excluded for being not numeric. nbad = 0; # Samples excluded for being {nan} or {inf}. ns = 0; # Samples saved in first pass. split("", zs); # Sample saved in first pass are {zs[0..ns-1]} zmin = +inf; zmax = -inf; # Actual range of data values. if (debug) { printf " range [%24.16e _ %24.16e]\n", zmin, zmax > "/dev/stderr"; } if (debug) { check_lo_hi_bin_index(0.00,1.00,7.00); check_lo_hi_bin_index(3.00,0.25,0.75); } } (abort >= 0) { exit abort; } // { # Convert tabs and funny spaces to spaces,just in case: gsub(/[ \011\015]/, " ", $0); # Erase '#'-comments: gsub(/[ ]*[#].*$/, "", $0); } # Discard blank lines: /^ *$/ { next; } # Gather samples from data lines: // { if ((col < 1) || (col > NF)) { data_error(("not enough fields in record: {col} = " col " {NF} = " NF)); } nlin++; z = $(col); if ((z ~ /^[-+]?[Nn][Aa][Nn]$/) || (z ~ /^[-+]?[Ii][Nn][Ff]$/)) { nbad++; next; } if (z !~ /^[-+]?[0-9]*([0-9]([.][0-9]*|)|[.][0-9]+)(|[Ee][-+]?[0-9]+)$/) { nnon++; next; } z += 0; if ((vmin < vmax) && ((z < vmin) || (z > vmax))) { nout++; next; } if ((emin < emax) && (z >= emin) && (z <= emax)) { nexc++; next; } if (z < zmin) { zmin = z; } if (z > zmax) { zmax = z; } if (debug) { printf " line %d z = «%s» [%24.16e _ %24.16e]\n", FNR, z, zmin, zmax > "/dev/stderr"; } zs[ns] = z; ns++; maxbins = 10001; # Max number of bins allowed. next; } END { if (abort >= 0) { exit abort; } if (verbose) { printf "%d non-blank input lines\n", nlin > "/dev/stderr"; if (nbad > 0) { printf "%d values were {nan} or {inf}\n", nbad > "/dev/stderr"; } if (nnon > 0) { printf "%d values were not numeric\n", nnon > "/dev/stderr"; } if (nout > 0) { printf "%d values outside the valid range\n", nout > "/dev/stderr"; } if (nexc > 0) { printf "%d values inside the exclusion range\n", nexc > "/dev/stderr"; } printf "%d values considered\n", ns > "/dev/stderr"; printf "range of accepted values = [%24.16e _ %24.16e]\n", zmin, zmax > "/dev/stderr"; } if (ns == 0) { kmin = 0; kmax = 0; } else { if (zmin == zmax) { # Widen range by fiat: zeps = 1.0e-5 * sqrt(1 + zmin*zmin) zmin = zmin - zeps; zmax = zmax + zeps; if (verbose) { printf "null range widened to = [%24.16e _ %24.16e]\n", zmin, zmax > "/dev/stderr"; } } if (step == 0) { step = choose_step(zmin, zmax, sync, bins); } if (verbose) { printf "using step = %24.16e\n", step > "/dev/stderr"; } write_histogram(zs, ns, zmin, zmax); write_stats(zs, ns); } } function choose_step(zmin,zmax,sync,bins, scale,step) { # Choose a good {step}: if (verbose) { printf "computing {step} for {bins} = %d\n", bins > "/dev/stderr"; } if (verbose) { printf " observed sample range [%24.16e _ %24.16e]\n", zmin, zmax > "/dev/stderr"; } step = (zmax - zmin)/bins; scale = 1; while (step*scale < 1.0) { scale = scale*10; } while (step*scale >= 10.0) { scale = max/10; } if (verbose) { printf " {scale} = %24.16e {step*scale} = %12.8f\n", scale, step*scale > "/dev/stderr"; } if (step*scale < 1.0) { step = 1.0/scale; } else if (step*scale < 2.0) { step = 2.0/scale; } else if (step*scale < 2.5) { step = 2.5/scale; } else if (step*scale < 5.0) { step = 5.0/scale; } else if (step*scale < 10.0) { step = 10.0/scale; } else { prog_error("duh (1)?"); } return step; } function lo_bin_index(zi,sync,step,bround, b,k) { b = (zi - sync)/step; k = int(b); while (k > b) { k--; } # Now {k} is {floor(b)}. if ((bround <= 0) && (k == b)) { k--; } return k; } function hi_bin_index(zi,sync,step,bround, b,k) { b = (zi - sync)/step; k = int(b); while (k < b) { k++; } # Now {k} is {ceil(b)}. if ((k != b) || ((bround < 0) && (k == b))) { k--; } return k; } function write_histogram(zs,ns,zmin,zmax, kmin,kmax,i,klo,khi,ct,k,lo,md,hi) { # Compute bin index range {kmin..kmax}: kmin = lo_bin_index(zmin, sync, step, bround); kmax = hi_bin_index(zmax, sync, step, bround); if (verbose) { printf "bin index range = %d .. %d\n", kmin, kmax > "/dev/stderr"; } if (kmax - kmin + 1 > maxbins) { data_error("too many bins"); } if (kmin > kmax) { kmin = 0; kmax = 0; } # Collect counts: split("", ct); for (i = 0; i < ns; i++) { klo = lo_bin_index(zs[i], sync, step, bround); khi = hi_bin_index(zs[i], sync, step, bround); if ((klo < kmin) || (khi > kmax)) { prog_error(("klo = " klo " khi = " khi " expected in " kmin ".." kmax)); } if (! (klo in ct)) { ct[klo] = 0; } if (! (khi in ct)) { ct[khi] = 0; } if (klo == khi) { ct[klo]++; } else { if ((khi - klo) != 1) { prog_error(("z = " zs[i] " klo = " klo " khi = " khi " not 1 apart")); } ct[klo] += 0.5; ct[khi] += 0.5; } } # Write counts: for (k = kmin-1; k <= kmax+1; k++) { lo = sync + k*step; hi = sync + (k+1)*step; md = (lo + hi)/2; printf " %24.16e %24.16e %24.16e %18.1f\n", lo, md, hi, ct[k]; } } function write_stats(zs,ns, i,zi,zmin,zmax,sum_z,avg,sum_dz2,dz,dev) { # Compute min, max, average: zmin = +inf; zmax = -inf; sum_z = 0; for (i = 0; i < ns; i++) { zi = zs[i]; if (zi < zmin) { zmin = zi; } if (zi > zmax) { zmax = zi; } sum_z += zi; } avg = (ns < 1 ? 0 : sum_z/ns ); # Compute deviation: sum_dz2 = 0; for (i = 0; i < ns; i++) { dz = zs[i] - avg; sum_dz2 += dz*dz; } dev = (ns < 2 ? 0 : sqrt(sum_dz2/(ns-1)) ); printf "stats for the counted values:\n" > "/dev/stderr"; printf "ns = %d min = %24.16e max = %24.16e", ns, zmin, zmax > "/dev/stderr"; printf " avg = %24.16e dev = %24.16e\n", avg, dev > "/dev/stderr"; } function check_lo_hi_bin_index(sy,st,ztest, br,dz,z,khi,klo,zhi,zlo) { { # Check {lo_bin_index,hi_bin_index}: printf "checking {lo_bin_index,hi_bin_index} ...\n" > "/dev/stderr" printf "sy = %24.16e st = %24.16e\n", sync, step > "/dev/stderr" for (br = -1; br <= +1; br++) { for (dz = -1; dz <= +1; dz++) { z = ztest + dz*0.00001; klo = lo_bin_index(z, sy, st, br) khi = hi_bin_index(z, sy, st, br) printf "z = %8.5f bround = %2d", z, br > "/dev/stderr"; printf " lo..hi = %+4d..%+4d", klo, khi > "/dev/stderr"; zlo = sy + klo*st; zhi = sy + (khi+1)*st printf " = [%24.16e _ %24.16e]\n", zlo, zhi > "/dev/stderr"; } printf "\n" > "/dev/stderr"; } } } function arg_error(msg) { printf "** %s\n", msg > "/dev/stderr"; abort = 1; exit abort; } function data_error(msg) { printf "** line %d: %s\n", FNR, msg > "/dev/stderr"; printf " [[%s]]\n", $0 > "/dev/stderr"; abort = 1; exit abort; } function prog_error(msg) { printf "** line %d: prog error - %s\n", FNR, msg > "/dev/stderr"; abort = 1; exit abort; }