#! /usr/bin/gawk -f
# Last edited on 2012-10-03 18:23:24 by stolfilocal

BEGIN {
  # Reads from standard input a ".dat" file generated by {extract-votes.gawk},
  # possibly condensed with {condense-dat-file.gawk} into one record for each
  # {grouping} unit.
  #
  # IMPORTANT: the input must be sorted by the {spanning} unit!
  #
  # Splits the file records into groups with the same {spanning} unit.
  # Outputs one record for each such group with the median, low percentile,
  # and high percentile within the group for the fraction of votes on two
  # specified candidates.
  #
  # Parameters (define with "-v"):
  #
  #   {grouping}  Accounting unit of each input record ("urn", "zlv" or "zol").
  #   {spanning}  Larger accounting unit that defines the groups ("zon", "zol" or "loc").
  #   {cix1}      Relative position of the selected candidate 1 in the input file.
  #   {cix2}      Relative position of the selected candidate 2 in the input file.
  #   {pcttype}   Kind of candidate percentage desired ("pce", "pcv", or "pcp").
  #   {level}     Percentile level for spread computation (a fraction in [0,1]).
  #   {minvotes}  Minimum number of cast votes in each record.
  #   {minrecs}   Minimum number of input records in a group.
  #   {ofnum}     Office number code (1=President, etc.).
  #
  # The candidate indices {cix1,cix2} are 0 for BLANK, 1 for NULL, then 2,3,...
  # for the candidates which were selected for inclusion in the ".dat" file.
  #
  # The input ".dat" file should have one line for each {grouping} unit,
  # with the results of a specific election round in a specific state and
  # for a specific office.
  #
  # Records with less than {minvotes} votes cast are discarded.
  #
  # The output has one line for every {spanning} unit that has at least
  # {minrecs} (and at least one) non-discarded input records, with fields:
  #
  #   {stccd}    Two-letter state code.
  #   {zon}      Zone number (3 digit, zero-padded).
  #   {loc}      Locality code (5 digit, zero-padded).
  #   {gseq}     Sequential group index (6 digit, zero-padded).
  #   {avgvot1}  Median vote fraction on {cix1}.
  #   {lovot1}   Low percentile of vote fraction on {cix1}.
  #   {hivot1}   High percentile of vote fraction on {cix1}.
  #   {avgvot2}  Median vote fraction on {cix2}.
  #   {lovot2}   Low percentile of vote fraction on {cix2}.
  #   {hivot2}   High percentile of vote fraction on {cix2}.

  # Exit status requested by an error routine; negative means "no error".
  abort = -1;

  PROG_NAME = "compute-group-spreads.gawk";

  # Maximum number of real candidates (including BLANK and NULL):
  max_cands = 100;

  # Maximum value for {cix}:
  max_cix = max_cands - 1;

  # Check parameters and make sure that they are numeric:
  if (grouping !~ /^(urn|zlv|zol)$/) { arg_error("missing or invalid {grouping} = \"" grouping "\""); }
  if (spanning !~ /^(zon|zol|loc)$/) { arg_error("missing or invalid {spanning} = \"" spanning "\""); }

  if (! is_num(cix1,0,max_cix)) { arg_error("missing or invalid {cix1} = \"" cix1 "\""); }
  cix1 += 0;

  if (! is_num(cix2,0,max_cix)) { arg_error("missing or invalid {cix2} = \"" cix2 "\""); }
  cix2 += 0;

  if (! is_frac(level,0.0,1.0)) { arg_error("missing or invalid {level} = \"" level "\""); }
  level += 0;

  # The message used to name the parameter "{pcttyle}"; report the real name.
  if (pcttype !~ /^(pcp|pcv|pce)$/) { arg_error("missing or invalid {pcttype} = \"" pcttype "\""); }

  if (! is_num(minvotes,0,99999999)) { arg_error("missing or invalid {minvotes} = \"" minvotes "\""); }
  minvotes += 0;

  if (! is_num(minrecs,0,99999999)) { arg_error("missing or invalid {minrecs} = \"" minrecs "\""); }
  minrecs += 0;

  if (! is_num(ofnum,0,8)) { arg_error("missing or invalid {ofnum} = \"" ofnum "\""); }
  ofnum += 0;

  # Current group data:
  ostcd = "";  # State code, or "" if not defined yet.
  ozon = "";   # Zone number, or "" if not defined yet.
  oloc = "";   # Locality code, or "" if not defined yet.
  oid = "";    # Group identity "{ozon}.{oloc}", or "" if not defined yet.
  ogseq = 0;   # Output record ({spanning} group) sequence number.

  # Global counters:
  nrecs_read = 0;       # Number of input data records read.
  nrecs_small = 0;      # Number of input records that were discarded for having too few votes cast.
  ngroups_written = 0;  # Number of output records ({spanning} units) written.
  ngroups_small = 0;    # Number of {spanning} units discarded for being too small.

  # Number of fields expected in ".dat" file (set from first record):
  num_fields = -1;

  # Voting ratio per candidate:
  split("", vot);  # {vot[1..2]} is the fraction of votes for candidates 1 and 2 over selected denominator.

  # Print comments:
  printf "# File created by %s\n", PROG_NAME;
  printf "# cix1=%d\n", cix1;
  printf "# cix2=%d\n", cix2;
  printf "# level=%8.6f\n", level;
  printf "# pcttype=%s\n", pcttype;
  printf "# minvotes=%s\n", minvotes;
  printf "# minrecs=%s\n", minrecs;
  printf "\n";
}

# Stop processing input as soon as any error routine has set {abort}:
(abort >= 0) { exit abort; }

# Main rule: validates one input record, computes the two vote fractions,
# and accumulates them into the current {spanning} group.
// {
  nrecs_read++;

  # We should have at least 2 cands besides BLANK and NULL:
  if ((NF < 9 + 4) || (NF > 9 + max_cands)) { data_error(("wrong number of fields " NF)); }

  # Check constancy of the number of fields:
  if ((FNR > 1) && (NF != num_fields)) { data_error(("wrong number of fields " NF " expected " num_fields)); }
  num_fields = NF;

  # Grab general fields:
  est = $1;
  zon = $2;
  sec = $3;
  lov = $4;
  loc = $5;
  car = $6;
  ele = $7;   # Number of registered voters.
  aus = $8;   # Number of absentees.
  pre = $9;   # Number of votes cast ( { == ele-aus} ).
  bra = $10;  # Number of BLANK votes.
  nul = $11;  # Number of NULL votes.

  # Check consistency of state code field:
  if ((FNR > 1) && (est != ostcd)) { data_error(("inconsistent {est} field = " est " previous " ostcd)); }
  ostcd = est;

  # Check consistency of vote totals:
  if (! is_num(ele,0,999999999)) { data_error(("invalid {ele} = \"" ele "\"")); }
  if (! is_num(aus,0,ele)) { data_error(("invalid {aus} = \"" aus "\"")); }
  if (! is_num(pre,0,ele)) { data_error(("invalid {pre} = \"" pre "\"")); }
  if (pre + aus != ele) { data_error(("wrong vote counts ele = " ele " aus = " aus " pre = " pre)); }

  # Choose denominator for voting percentage:
  if (pcttype == "pcp") {
    den = pre;
  } else if (pcttype == "pcv") {
    den = pre - bra - nul;
  } else if (pcttype == "pce") {
    den = ele;
  } else {
    prog_error(("{pcttype} = " pcttype));
  }

  # Hack to avoid divide by zero:
  if (den == 0) { den = 1.0; }

  # Compute candidate vote percentages {vot[1..2]}:
  for (k = 1; k <= 2; k++) {
    # Compute candidate's vote count {cvt}:
    cix = (k == 1 ? cix1 : cix2);
    fix = 10 + cix;  # Field index of candidate {cix}; BLANK is field 10.
    if (fix > NF) { data_error(("candidate index " cix " too large, max " (NF - 10))); }
    cvt = $(fix);

    # Consistency of count. Note that people vote twice for Senator,
    # so BRANCO and NULO may be twice the number of present voters:
    exv = (((ofnum == 5) && (cix <= 1)) ? 2*pre : pre);
    if (! is_num(cvt,0,exv)) { data_error(("invalid vote count \"" cvt "\" max is " exv)); }

    # Compute candidate's voting percentage {vot[k]}
    vot[k] = (cvt + 0.0)/den;
  }

  # Dump current {spanning} group if changed.  The part of the identity
  # that is not relevant for the chosen {spanning} unit is forced to 0:
  ezon = (spanning == "loc" ? 0 : zon);
  eloc = (spanning == "zon" ? 0 : loc);
  eid = sprintf("%03d.%05d", ezon, eloc);
  if (eid != oid) {
    if (oid != "") {
      # Fixed-width identities, so string comparison gives the sort order:
      if (eid < oid) { data_error(("input is incorrectly sorted " oid " " eid)); }
      dump_group_data();
    }
    clear_group_data(ezon,eloc,eid);
  }

  # Discard record if too few votes cast:
  if (pre < minvotes) {
    printf "record %03d %04d %05d has only %4d votes cast, discarded\n", zon, sec, loc, pre > "/dev/stderr";
    nrecs_small++;
    next;
  }

  # Accumulate current record:
  save_group_data(vot);
  next;
}

END {
  if (abort >= 0) { exit abort; }

  # Dump current {spanning} group if any:
  if (oid != "") { dump_group_data(); }

  printf "%8d data records (\"%s\"s) read\n", nrecs_read, grouping > "/dev/stderr";
  printf "%8d \"%s\"s with too few votes\n", nrecs_small, grouping > "/dev/stderr";
  printf "%8d \"%s\"s with too few data records\n", ngroups_small, spanning > "/dev/stderr";
  printf "%8d output records (\"%s\"s) written\n", ngroups_written, spanning > "/dev/stderr";
}

function clear_group_data(nzon,nloc,nid) {
  # Starts a new group with identity {nid = "{nzon}.{nloc}"}: clears the
  # per-group tables, bumps the group sequence number {ogseq}, and records
  # the new group's zone, locality and identity in {ozon,oloc,oid}.
  nrecs = 0;            # Number of sections in group.
  split("", vot_tb1);   # Values of {vot[1]} in group, indexed {1..nrecs}.
  split("", vot_tb2);   # Values of {vot[2]} in group, indexed {1..nrecs}.
  ogseq++;
  ozon = nzon;
  oloc = nloc;
  oid = nid;
}

function save_group_data(vot) {
  # Saves the vote fractions {vot[1..2]} of another data record of the
  # current {spanning} unit.
  #
  # The values are rounded to 6 decimals and stored as NUMBERS (note the
  # "+ 0"), so that {asort} orders them numerically.  Storing the formatted
  # strings would make {asort} compare them as text, which mis-orders
  # fractions that are negative or not less than 10.
  nrecs++;
  vot_tb1[nrecs] = sprintf("%8.6f", vot[1]) + 0;
  vot_tb2[nrecs] = sprintf("%8.6f", vot[2]) + 0;
}

function dump_group_data(   n) {
  # Writes to standard output the record for the current group: state,
  # zone, locality, sequence number, then median, low and high percentiles
  # of the saved fractions for each of the two candidates.
  #
  # The group is discarded (and counted in {ngroups_small}) if it has fewer
  # than {minrecs} valid records.  A group with no valid records at all is
  # also discarded, even when {minrecs} is 0, since it has no percentiles.
  if ((nrecs < minrecs) || (nrecs == 0)) {
    printf "group %s.%s has only %4d valid records, discarded\n", ozon, oloc, nrecs > "/dev/stderr";
    ngroups_small++;
    return;
  }

  printf "%06d = %03d.%05d\n", ogseq, ozon, oloc > "/dev/stderr";
  printf "%s %s %s %06d", ostcd, ozon, oloc, ogseq;
  printf "%06d = %03d.%05d %4d records considered\n", ogseq, ozon, oloc, nrecs > "/dev/stderr";

  # {asort} sorts the table in place and returns its element count:
  n = asort(vot_tb1);
  if (n != nrecs) { prog_error(("{n vot_tb1} != {nrecs}")); }
  printf " %s %s %s", perc(0.5,vot_tb1,nrecs), perc(level,vot_tb1,nrecs), perc(1-level,vot_tb1,nrecs);

  n = asort(vot_tb2);
  if (n != nrecs) { prog_error(("{n vot_tb2} != {nrecs}")); }
  printf " %s %s %s", perc(0.5,vot_tb2,nrecs), perc(level,vot_tb2,nrecs), perc(1-level,vot_tb2,nrecs);

  printf "\n";
  ngroups_written++;
}

function perc(lev,tb,n,   i,r,v) {
  # Given a sorted table of values {tb[1..n]}, with {n >= 1}, returns
  # the element with a fractionary position {lev}, formatted as "%8.6f".
  # Namely if {lev=0} returns {tb[1]}, if {lev=1} returns {tb[n]},
  # and if {lev} is intermediate, interpolates linearly between
  # the two elements that bracket the position {1 + lev*(n-1)}.
  if (lev <= 0.0) {
    v = tb[1];
  } else if (lev >= 1.0) {
    v = tb[n];
  } else if (n == 1) {
    v = tb[1];
  } else {
    # Get bracketing elements {tb[i],tb[i+1]}:
    i = 1 + int(lev*(n-1));
    if (i < 1) {
      v = tb[1];
    } else if (i >= n) {
      # Guards against roundoff pushing {i} past the last bracket.
      v = tb[n];
    } else {
      # Interpolate with the fractional part {r} of the position
      # (NOT with {lev} itself, which is the position in the whole table):
      r = lev*(n-1) - (i-1);
      v = (1-r)*tb[i] + r*tb[i+1];
    }
  }
  return sprintf("%8.6f", v);
}

function is_num(x,lo,hi) {
  # Returns 1 if {x} is a string of decimal digits whose value
  # lies in {lo..hi}, else returns 0.
  if (x !~ /^[0-9]+$/) { return 0; }
  if (x+0 < lo) { return 0; }
  if (x+0 > hi) { return 0; }
  return 1;
}

function is_frac(x,lo,hi) {
  # Returns 1 if {x} is an unsigned decimal number (digits, optionally
  # followed by "." and more digits) whose value lies in {[lo _ hi]},
  # else returns 0.
  if (x !~ /^[0-9]+([.][0-9]+|)$/) { return 0; }
  if (x+0 < lo) { return 0; }
  if (x+0 > hi) { return 0; }
  return 1;
}

function data_error(msg) {
  # Reports a fatal error in the current input record and exits with status 1.
  printf "%s:%d: ** [cgs] %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " %s\n", $0 > "/dev/stderr";
  abort = 1;
  exit(abort);
}

function data_warning(msg) {
  # Reports a non-fatal problem in the current input record.
  printf "%s:%d: !! [cgs] %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " %s\n", $0 > "/dev/stderr";
}

function arg_error(msg) {
  # Reports a fatal error in the command line parameters and exits with status 1.
  printf "** [cgs] %s\n", msg > "/dev/stderr";
  abort = 1;
  exit(abort);
}

function prog_error(msg) {
  # Reports an internal inconsistency of the program and exits with status 1.
  printf "** [cgs] PROGRAM ERROR - %s\n", msg > "/dev/stderr";
  abort = 1;
  exit(abort);
}