#! /usr/bin/gawk -f
# Last edited on 2012-10-03 18:23:24 by stolfilocal

BEGIN \
  { 
    # Reads from standard input a ".dat" file generated by {extract-votes.gawk},
    # possibly condensed with {condense-dat-file.gawk} into one record for each 
    # {grouping} unit.  
    
    # IMPORTANT: the input must be sorted by the {spanning} unit!
    
    # Splits the file records into groups with the same {spanning} unit.
    # Outputs one record for each such group with the median, low percentile, and high percentile
    # within the group for the fraction of votes on two specified candidates.

    # Parameters (define with "-v"):
    #   {grouping}   Accounting unit of each input record ("urn" or "zlv"; 
    #                the check below also accepts "zol").
    #   {spanning}   Larger accounting unit that defines the groups ("zon", "zol" or "loc").
    #   {cix1}       Relative position of the selected candidate 1 in the input file.
    #   {cix2}       Relative position of the selected candidate 2 in the input file.
    #   {pcttype}    Kind of candidate percentage desired ("pce", "pcv", or "pcp").
    #   {level}      Percentile level for spread computation (a fraction in 0..1).
    #   {minvotes}   Minimum number of cast votes in each record.
    #   {minrecs}    Minimum number of input records in a group.
    #   {ofnum}      Office number code (1=President, etc.).

    # The candidate indices {cix1,cix2} are 0 for BLANK, 1 for NULL, then 2,3,... for the 
    # candidates which were selected for inclusion in the ".dat" file.
    
    # The input ".dat" file should have one line for each {grouping} unit, 
    # with the results of a specific election round in a specific state and for a specific office.
    # Records with less than {minvotes} votes cast are discarded.

    # The output has one line for every {spanning} unit that has at least {minrecs} non-discarded 
    # input records, with fields:
    #   {stccd}     Two-letter state code.
    #   {zon}       Zone number (3 digit, zero-padded).
    #   {loc}       Locality code (5 digit, zero-padded).
    #   {gseq}      Sequential group index (6 digit, zero-padded).
    #   {avgvot1}   Median vote fraction on {cix1}.
    #   {lovot1}    Low percentile of vote fraction on {cix1}.
    #   {hivot1}    High percentile of vote fraction on {cix1}.
    #   {avgvot2}   Median vote fraction on {cix2}.
    #   {lovot2}    Low percentile of vote fraction on {cix2}.
    #   {hivot2}    High percentile of vote fraction on {cix2}.
  
    # Exit status requested by an error routine, or -1 while no error has occurred:
    abort = -1;
    PROG_NAME = "compute-group-spreads.gawk";

    # Maximum number of real candidates (including BLANK and NULL):
    max_cands = 100;

    # Maximum value for {cix}:
    max_cix = max_cands - 1;

    # Check parameters, and force the numeric ones to numeric type.
    # Each {arg_error} call exits, so the statement after it runs only for valid values:
    if (grouping !~ /^(urn|zlv|zol)$/)  { arg_error("missing or invalid {grouping} = \"" grouping "\""); }
    if (spanning !~ /^(zon|zol|loc)$/)  { arg_error("missing or invalid {spanning} = \"" spanning "\""); }
    if (! is_num(cix1,0,max_cix))       { arg_error("missing or invalid {cix1} = \"" cix1 "\""); } cix1 += 0;
    if (! is_num(cix2,0,max_cix))       { arg_error("missing or invalid {cix2} = \"" cix2 "\""); } cix2 += 0;
    if (! is_frac(level,0.0,1.0))       { arg_error("missing or invalid {level} = \"" level "\""); } level += 0;
    if (pcttype !~ /^(pcp|pcv|pce)$/)   { arg_error("missing or invalid {pcttype} = \"" pcttype "\""); }
    if (! is_num(minvotes,0,99999999))  { arg_error("missing or invalid {minvotes} = \"" minvotes "\""); } minvotes += 0;
    if (! is_num(minrecs,0,99999999))   { arg_error("missing or invalid {minrecs} = \"" minrecs "\""); } minrecs += 0;
    if (! is_num(ofnum,0,8))            { arg_error("missing or invalid {ofnum} = \"" ofnum "\""); } ofnum += 0;
    
    # Current group data:
    ostcd = ""; # State code, or "" if not defined yet.
    ozon = "";  # Zone number, or "" if not defined yet.
    oloc = "";  # Locality code, or "" if not defined yet.
    oid = "";   # Group identity "{ozon}.{oloc}", or "" if not defined yet.
    ogseq = 0;  # Output record ({spanning} group) sequence number.
    
    # Global counters:
    nrecs_read = 0;      # Number of input data records read.
    nrecs_small = 0;     # Number of input records that were discarded for having too few votes cast.
    ngroups_written = 0; # Number of output records ({spanning} units) written.
    ngroups_small = 0;   # Number of {spanning} units discarded for being too small.
    
    # Number of fields expected in ".dat" file (set from first record):
    num_fields = -1;

    # Voting ratio per candidate:
    split("", vot);   # {vot[1..2]} is the fraction of votes for candidates 1 and 2 over selected denominator.
    
    # Print comments:
    printf "# File created by %s\n", PROG_NAME;
    printf "# cix1=%d\n", cix1;
    printf "# cix2=%d\n", cix2;
    printf "# level=%8.6f\n", level;
    printf "# pcttype=%s\n", pcttype;
    printf "# minvotes=%s\n", minvotes;
    printf "# minrecs=%s\n", minrecs;
    printf "\n";
  }

# Stop consuming input as soon as {abort} holds a non-negative exit status:
abort >= 0 { exit abort; }

// \
  {
    # Handles one input record: validates it, computes the two vote fractions
    # {vot[1..2]}, dumps the previous group when the {spanning} unit changes,
    # and adds the record to the current group unless too few votes were cast.
    nrecs_read++;
    
    # Besides the 9 general fields we need BLANK, NULL, and at least 2 candidates:
    if (! ((NF >= 9 + 4) && (NF <= 9 + max_cands)))
      { data_error(("wrong number of fields " NF)); }

    # Every record after the first must have as many fields as the previous one:
    if ((FNR > 1) && (NF != num_fields))
      { data_error(("wrong number of fields " NF " expected " num_fields)); }
    num_fields = NF;
    
    # Extract the general fields ({lov} and {car} are not used in this rule):
    est = $1;  # State code.
    zon = $2;  # Zone number.
    sec = $3; 
    lov = $4; 
    loc = $5;  # Locality code.
    car = $6; 
    ele = $7;  # Number of registered voters.
    aus = $8;  # Number of absentees.
    pre = $9;  # Number of votes cast ( { == ele-aus} ).
    bra = $10; # Number of BLANK votes.
    nul = $11; # Number of NULL votes.

    # The state code must be the same in all records:
    if ((FNR > 1) && (est != ostcd))
      { data_error(("inconsistent {est} field = " est " previous " ostcd)); } 
    ostcd = est;

    # The voter totals must be valid counts and must add up:
    if (! is_num(ele,0,999999999)) { data_error(("invalid {ele} = \"" ele "\"")); }
    if (! is_num(aus,0,ele)) { data_error(("invalid {aus} = \"" aus "\"")); }
    if (! is_num(pre,0,ele)) { data_error(("invalid {pre} = \"" pre "\"")); }
    if (pre + aus != ele) 
      { data_error(("wrong vote counts  ele = " ele "  aus = " aus "  pre = " pre)); }
    
    # Select the denominator of the vote fractions according to {pcttype}:
    if (pcttype == "pce")
      { den = ele; }
    else if (pcttype == "pcp")
      { den = pre; }
    else if (pcttype == "pcv")
      { den = pre - bra - nul; }
    else
      { prog_error(("{pcttype} = " pcttype)); }

    # Hack to avoid divide by zero:
    if (den == 0) { den = 1.0; }
    
    # Compute the vote fractions {vot[1..2]} of the two selected candidates:
    for (k = 1; k <= 2; k++)
      { # Locate the input field {fix} that holds the vote count of candidate {k}:
        if (k == 1) { cix = cix1; } else { cix = cix2; }
        fix = 10 + cix;
        if (fix > NF) { data_error(("candidate index " cix " too large, max " (NF - 10))); } 
        cvt = $(fix);
        # Upper bound {exv} for the count. Note that people vote twice for Senator,
        # so BRANCO and NULO may be twice the number of present voters: 
        exv = pre;
        if ((ofnum == 5) && (cix <= 1)) { exv = 2*pre; }
        if (! is_num(cvt,0,exv)) { data_error(("invalid vote count \"" cvt "\" max is " exv)); }
        # Fraction of the chosen denominator that went to this candidate:
        vot[k] = (cvt + 0.0)/den;
      }
    
    # Identity of the {spanning} unit of this record; the component that 
    # is not part of the {spanning} key is replaced by zero:
    ezon = zon; if (spanning == "loc") { ezon = 0; }
    eloc = loc; if (spanning == "zon") { eloc = 0; }
    eid = sprintf("%03d.%05d", ezon, eloc);

    # On a change of unit, write out the finished group (if any) and start a new one:
    if ((oid != "") && (eid != oid))
      { if (eid < oid) { data_error(("input is incorrectly sorted " oid " " eid)); }
        dump_group_data();
      }
    if (eid != oid) { clear_group_data(ezon,eloc,eid); }
    
    # Keep the record only if enough votes were cast:
    if (pre < minvotes)
      { printf("record %03d %04d %05d has only %4d votes cast, discarded\n", zon, sec, loc, pre) > "/dev/stderr";
        nrecs_small++;
      }
    else
      { save_group_data(vot); }
    
    next;
  }

END \
  {
    # Final actions: flushes the last group and reports the counters on stderr.
    # Does nothing but exit when an error routine has already set {abort}.
    if (abort >= 0) { exit abort; }
    
    # The last {spanning} group, if any, has not been written yet:
    if (oid != "") { dump_group_data(); }
    
    printf("%8d data records (\"%s\"s) read\n", nrecs_read, grouping) > "/dev/stderr";
    printf("%8d \"%s\"s with too few votes\n", nrecs_small, grouping) > "/dev/stderr";
    printf("%8d \"%s\"s with too few data records\n", ngroups_small, spanning) > "/dev/stderr";
    printf("%8d output records (\"%s\"s) written\n", ngroups_written, spanning) > "/dev/stderr";
  }
   
function clear_group_data(nzon,nloc,nid )
  {
    # Starts a new, empty group: advances the group sequence number {ogseq}, 
    # records the zone {nzon}, locality {nloc} and identity {nid} of the group
    # in {ozon,oloc,oid}, and discards all per-record data of the previous group.
    ogseq++;
    oid = nid;
    ozon = nzon;
    oloc = nloc;

    nrecs = 0;      # Number of records saved in the group.
    delete vot_tb1; # Values of {vot[1]} in group, indexed {1..nrecs}.
    delete vot_tb2; # Values of {vot[2]} in group, indexed {1..nrecs}.
  }
 
function save_group_data(vot)
  {
    # Appends the vote fractions {vot[1]} and {vot[2]} of one more data record
    # to the tables {vot_tb1,vot_tb2} of the current group, incrementing {nrecs}.
    # The values are stored as fixed-format strings so that sorting works.
    vot_tb1[++nrecs] = sprintf("%8.6f", vot[1]);
    vot_tb2[nrecs] = sprintf("%8.6f", vot[2]);
  }
  
function dump_group_data(  n)
  {
    # Writes the output record for the current group: state code, zone, locality
    # and group sequence number, followed by the median and the {level} and 
    # {1-level} percentiles of each of the tables {vot_tb1} and {vot_tb2}.
    # A group with fewer than {minrecs} saved records is only counted in
    # {ngroups_small} and reported on stderr.  Sorts both tables in place.
    if (nrecs < minrecs)
      { printf("group %s.%s has only %4d valid records, discarded\n", ozon, oloc, nrecs) > "/dev/stderr";
        ngroups_small++;
        return;
      }
    
    # Progress messages go to stderr; the record itself goes to stdout:
    printf("%06d = %03d.%05d\n", ogseq, ozon, oloc) > "/dev/stderr";
    printf("%s %s %s %06d", ostcd, ozon, oloc, ogseq);
    printf("%06d = %03d.%05d  %4d records considered\n", ogseq, ozon, oloc, nrecs) > "/dev/stderr";
    
    # Statistics for candidate 1:
    n = asort(vot_tb1);
    if (n != nrecs) { prog_error(("{n vot_tb1} != {nrecs}")); }
    printf(" %s %s %s", perc(0.5,vot_tb1,nrecs), perc(level,vot_tb1,nrecs), perc(1-level,vot_tb1,nrecs));
    
    # Statistics for candidate 2:
    n = asort(vot_tb2);
    if (n != nrecs) { prog_error(("{n vot_tb2} != {nrecs}")); }
    printf(" %s %s %s", perc(0.5,vot_tb2,nrecs), perc(level,vot_tb2,nrecs), perc(1-level,vot_tb2,nrecs));
    
    printf("\n");
    
    ngroups_written++;
  }
  
function perc(lev,tb,n,  i,r,v)
  { 
    # Given a sorted table of values {tb[1..n]}, returns
    # the element with a fractionary position {lev}.
    # Namely if {lev=0} returns {tb[1]}, 
    # if {lev=1} returns {tb[n]}, and if {lev}
    # is intermediate, interpolates linearly between
    # the two elements that bracket the 
    # position {lev} between {tb[1]} and {tb[n]}.
    #
    # Interpolated results are formatted with "%8.6f"; the 
    # results for the end cases are table entries returned as they are.
    
    if (lev <= 0.0)
      { return tb[1]; }
    else if (lev >= 1.0)
      { return tb[n]; }
    else if (n == 1)
      { return tb[1]; }
    else
      { # Get bracketing elements {tb[i]} and {tb[i+1]}; the target
        # lies at the real-valued index {1 + lev*(n-1)}:
        i = 1 + int(lev*(n-1)); 
        if (i < 1) { return tb[1]; }
        if (i >= n) { return tb[n]; }
        
        # Interpolate. The weight must be the fractional offset {r} of the
        # target past {tb[i]}, not {lev} itself; otherwise, for example,
        # the median of an odd-sized table would not be its middle element:
        r = lev*(n-1) - (i-1);
        v = (1-r)*tb[i] + r*tb[i+1];
        return sprintf("%8.6f",v);
      }
  }

function is_num(x,lo,hi)
  {
    # Returns 1 if {x} consists only of decimal digits (at least one)
    # and its numeric value lies in {lo..hi} inclusive; returns 0 otherwise.
    return ((x ~ /^[0-9]+$/) && (x+0 >= lo) && (x+0 <= hi) ? 1 : 0);
  }
  
function is_frac(x,lo,hi)
  {
    # Returns 1 if {x} is an unsigned decimal number (digits, optionally 
    # followed by a period and more digits) whose value lies in {lo..hi} 
    # inclusive; returns 0 otherwise.
    return ((x ~ /^[0-9]+([.][0-9]+|)$/) && (x+0 >= lo) && (x+0 <= hi) ? 1 : 0);
  }

function data_error(msg)
  {
    # Fatal error in the input data: prints {msg} on stderr, tagged with the
    # current file name and line number, then the offending record, and 
    # exits with status 1 (also left in {abort} for the {END} rule).
    abort = 1;
    printf("%s:%d: ** [cgs] %s\n", FILENAME, FNR, msg) > "/dev/stderr";
    printf("  %s\n", $0) > "/dev/stderr";
    exit abort;
  }

function data_warning(msg)
  {
    # Non-fatal problem in the input data: prints {msg} on stderr, tagged 
    # with the current file name and line number, then the current record.
    printf("%s:%d: !! [cgs] %s\n", FILENAME, FNR, msg) > "/dev/stderr";
    printf("  %s\n", $0) > "/dev/stderr";
  }

function arg_error(msg)
  {
    # Fatal error in a command line parameter: prints {msg} on stderr and
    # exits with status 1 (also left in {abort} for the {END} rule).
    abort = 1;
    printf("** [cgs] %s\n", msg) > "/dev/stderr";
    exit abort;
  }

function prog_error(msg)
  {
    # Internal inconsistency in the program: prints {msg} on stderr and
    # exits with status 1 (also left in {abort} for the {END} rule).
    abort = 1;
    printf("** [cgs] PROGRAM ERROR - %s\n", msg) > "/dev/stderr";
    exit abort;
  }