#! /usr/bin/gawk -f
# Last edited on 2009-12-12 00:04:29 by stolfi
# Reads the file with raw but reformatted editor counts of wikipedia
# at arbitrary dates, in the format
#   "{TIME} {YEAR} {MONTH} {DAY} {TOTEDS} {NEWEDS} {ACTEDS5} {ACTEDS100}"
# Outputs the same data in the same format, but resampled at equal intervals
# (every {sper} days).
# Assumes that it was executed with "-f lib_date_time.gawk -f lib_functions.gawk"
# (those libraries provide {data_error,data_warning,time_from_date,
# date_from_time,date_time_init}).

BEGIN {
  abort = -1;
  yz = 2001;        # Year zero of Wikipedia.
  ott = -1;         # Time (days) for last input entry.
  split("", mp);    # {mp[0] = mp000, mp[1] = mp005, mp[2] = mp100}.
  split("", omp);   # {omp[k]} is the last input value of {mp[k]}.
  split("", ymp);   # {ymp[k]} is the last output value of {mp[k]}.
  split("", zmp);   # {zmp[k]} is the next output value of {mp[k]}.
  split("", smp);   # {smp[k]} is the amount to subtract from {mp[k]} after scale adjustment.
  nf = 3;           # Number of {mp} counts.
  for (k = 0; k < nf; k++) { mp[k] = omp[k] = ymp[k] = zmp[k] = smp[k] = 0; }
  sper = 28;        # Sampling period (days).
  xtt = sper-1;     # Time of next output entry.
  maxgap = 2*sper;  # Warn about interpolations over more than these many days.
  nout = 0;         # Output data line count.
  # Scaling correction to apply after each day:
  bcorr[ 0] = 1.0000;
  # Parameters of articles to exclude from the count:
  # {tosub[tt,k]} is the extra amount to subtract from {mp[k]} from day {tt} onwards.
  split("", tosub);
  # Some articles:
  tosub[ 655,0] = 0; tosub[ 655,1] = 0; tosub[ 655,2] = 0;
  tosub_tot = 0;    # Total articles subtracted so far (reported in END).
  # Current scaling factor to compensate undercounting by {mpacIII}:
  scale = 1.000;
  date_time_init();
  # date_time_sanity_check(yz)
}

# Propagate a pending abort request from the library error handlers:
(abort >= 0) { exit abort; }

# Save the raw line for error reporting:
// { lin = $0; }

# Pass comment and blank lines through unchanged:
/^ *([\#]|$)/ { print; next; }

# Data lines: "{TIME} {YEAR} {MONTH} {DAY} {TOTEDS} {NEWEDS} {ACTEDS5} {ACTEDS100}".
/^[ 0-9]*$/ {
  if (NF != 8) { data_error(("bad line format"), lin); }
  tt = 0+$1; yr = 0+$2; mo = 0+$3; da = 0+$4;
  mp000 = 0+$5; dp000 = 0+$6; mp005 = 0+$7; mp100 = 0+$8;
  check_entry(yz, tt, yr, mo, da, mp000, dp000, mp005, mp100, omp[0]);
  mp[0] = mp000; mp[1] = mp005; mp[2] = mp100;
  process_entry(yz, tt, yr, mo, da, mp);
  next;
}

# Anything else is malformed:
// { data_error(("unrecognized line format"), lin); next; }

END {
  fflush();
  printf "subtracted %d articles overall\n", tosub_tot > "/dev/stderr";
}

# Validates one input entry; aborts via {data_error} on hard errors,
# complains via {data_warning} on suspicious but tolerable data.
# {omp000} is the previous total-editor count, used to validate {dp000}.
function check_entry(yz,tt,yr,mo,da,mp000,dp000,mp005,mp100,omp000,  r,xmp000)
{
  if ((tt < 0) || (tt > 36500)) { data_error(("bad time [" tt "]"), lin); }
  if (tt != time_from_date(yz, yr, mo, da)) { data_error(("date error [" tt "]"), lin); }
  if ((yr < yz) || (yr > 2099)) { data_error(("bad year [" yr "]"), lin); }
  if ((mo < 1) || (mo > 12)) { data_error(("bad month [" mo "]"), lin); }
  if ((da < 1) || (da > 31)) { data_error(("bad day [" da "]"), lin); }
  if ((mp000 < 0) || (mp000 > 999999999)) { data_error(("bad tot editors [" mp000 "]"), lin); }
  if ((mp005 < 0) || (mp005 > 999999999)) { data_error(("bad light editors [" mp005 "]"), lin); }
  if (mp005 > mp000)
    { data_warning(("more light editors than editors [" mp000 "] [" mp005 "]"), lin); }
  if ((mp100 < 0) || (mp100 > 999999999)) { data_error(("bad regular editors [" mp100 "]"), lin); }
  if (mp100 > mp005)
    { data_warning(("more regular editors than light editors [" mp005 "] [" mp100 "]"), lin); }
  # New-editor count must be the delta of total editors since the last entry.
  # (Was reporting {mp100} here by mistake; report the offending {dp000}.)
  if (dp000 != mp000 - omp000) { data_error(("bad new editors [" dp000 "]"), lin); }
}

# Consumes one validated entry {tt,yr,mo,da,mp[]}; emits zero or more
# resampled output lines (one per multiple-of-{sper} boundary crossed),
# geometrically interpolating the counts between the previous entry and
# this one.  Assumes global vars {ott,xtt,omp[],smp[],ymp[],zmp[],scale,nf}.
function process_entry(yz,tt,yr,mo,da,mp,  mtt,r,k,zdp000,xdt)
{
  # Check chrono order:
  if (tt <= ott) { data_error(("seq error [" ott "] [" tt "]"), lin); }
  adjust_raw_counts(yz, tt, yr, mo, da, mp);
  if (xtt <= tt)
    { # Get sampling time {mtt} closest to the center of the interval:
      # NOTE(review): {mtt} is validated but not used below — presumably a
      # leftover from an earlier sampling scheme; confirm before removing.
      mtt = (tt + ott)/2;
      mtt = (int((mtt + 1)/sper) - 1)*sper + (sper - 1);
      if (mtt < xtt) { mtt = xtt; }
      if (mtt > tt)
        { data_error(("midsample bug [" ott "] [" xtt "] [" mtt "] [" tt "]"), lin); }
      # Generate interpolated data:
      while (xtt <= tt)
        { if (xtt < ott)
            { data_error(("sampling bug [" ott "] [" xtt "] [" tt "]"), lin); }
          # Interpolate geometrically the editors {xmp000} at time {xtt}:
          r = (xtt - ott + 0.0)/(tt - ott + 0.0);
          for (k = 0; k < nf; k++)
            { zmp[k] = (omp[k]+0.5)*exp(r*log((mp[k]+0.5)/(omp[k]+0.5))) - 0.5;
              zmp[k] = int(zmp[k] + 0.5);
            }
          # Compute recent growth rate {xdp} in new editors per sampling period:
          zdp000 = zmp[0] - ymp[0];
          # Flag gaps longer than {maxgap} days:
          if (tt - ott > maxgap)
            { data_warning(("excessively long gap [" ott "] [" tt "]"), lin); }
          # Convert time of interpolated sample back to year, month, day:
          xdt = date_from_time(yz, xtt);
          # Print an explanatory comment before the first output line:
          if (nout == 0) { printf "# Interpolated by compute-wp-edor-gr-rate.gawk\n\n"; }
          printf "%6d %s %10d %10d %10d %10d\n", xtt, xdt, zmp[0], zdp000, zmp[1], zmp[2];
          for (k = 0; k < nf; k++) { ymp[k] = zmp[k]; }
          xtt += sper; nout++;
        }
    }
  ott = tt;
  for (k = 0; k < nf; k++) { omp[k] = mp[k]; }
}

# Applies the per-day corrections accumulated since the previous entry:
# gathers {tosub} amounts into {smp[]}, updates {scale} from {bcorr},
# rescales {mp[]}, and subtracts {smp[]}.  Also accumulates {tosub_tot}
# for the END summary.
function adjust_raw_counts(yz,tt,yr,mo,da,mp,  k,ti,sc)
{
  # Check chrono order (defensive; also checked by the caller):
  if (tt <= ott) { data_error(("seq error [" ott "] [" tt "]"), lin); }
  # Gather all {tosub[ti],bcorr[ti]} entries since last entry to present:
  ti = ott+1;
  while (ti <= tt)
    { for (k = 0; k < nf; k++)
        { if ((ti,k) in tosub)
            { smp[k] += tosub[ti,k];
              tosub_tot += tosub[ti,k];  # Was never accumulated; END always printed 0.
            }
        }
      if (ti in bcorr) { scale = 1.0/bcorr[ti]; }
      ti++;
    }
  # Apply scaling correction:
  if (scale != 1.0)
    { for (k = 0; k < nf; k++) { mp[k] = int(mp[k]*scale + 0.5); } }
  # Subtract what is to be subtracted:
  for (k = 0; k < nf; k++) { mp[k] = mp[k] - smp[k]; }
}