#! /usr/bin/gawk -f
# Last edited on 2009-12-12 00:04:29 by stolfi
# Reads the file with raw but reformatted editor counts of wikipedia
# at arbitrary dates, in the format
#   "{TIME} {YEAR} {MONTH} {DAY} {TOTEDS} {NEWEDS} {ACTEDS5} {ACTEDS100}"
# Outputs the same data in the same format, but resampled at equal intervals
# (every {sper} days).
# Assumes that it was executed with "-f lib_date_time.gawk -f lib_functions.gawk"
# (those libraries provide {data_error,data_warning,time_from_date,
# date_from_time,date_time_init}).

BEGIN {
  abort = -1;
  yz = 2001;        # Year zero of Wikipedia.
  ott = -1;         # Time (days) for last input entry.
  split("", mp);    # {mp[0] = mp000, mp[1] = mp005, mp[2] = mp100}.
  split("", omp);   # {omp[k]} is the last input value of {mp[k]}.
  split("", ymp);   # {ymp[k]} is the last output value of {mp[k]}.
  split("", zmp);   # {zmp[k]} is the next output value of {mp[k]}.
  split("", smp);   # {smp[k]} is the amount to subtract from {mp[k]} after scale adjustment.
  nf = 3;           # Number of {mp} counts.
  for (k = 0; k < nf; k++) { mp[k] = omp[k] = ymp[k] = zmp[k] = smp[k] = 0; }
  sper = 28;        # Sampling period (days).
  xtt = sper-1;     # Time of next output entry.
  maxgap = 2*sper;  # Warn about interpolations over more than these many days.
  nout = 0;         # Output data line count.
  # Scaling correction to apply after each day:
  bcorr[ 0] = 1.0000;
  # Parameters of articles to exclude from the count:
  # {tosub[tt,k]} is the extra amount to subtract from {mp[k]} from day {tt} onwards.
  split("", tosub);
  # Some articles:
  tosub[ 655,0] = 0; tosub[ 655,1] = 0; tosub[ 655,2] = 0;
  tosub_tot = 0;    # Total articles subtracted so far (reported in END).
  # Current scaling factor to compensate undercounting by {mpacIII}:
  scale = 1.000;
  date_time_init();
  # date_time_sanity_check(yz)
}

# Propagate a pending abort request from the library error handlers:
(abort >= 0) { exit abort; }

# Save the raw line for error reporting:
// { lin = $0; }

# Pass comment and blank lines through unchanged:
/^ *([\#]|$)/ { print; next; }

# Data lines: "{TIME} {YEAR} {MONTH} {DAY} {TOTEDS} {NEWEDS} {ACTEDS5} {ACTEDS100}".
/^[ 0-9]*$/ {
  if (NF != 8) { data_error(("bad line format"), lin); }
  tt = 0+$1; yr = 0+$2; mo = 0+$3; da = 0+$4;
  mp000 = 0+$5; dp000 = 0+$6; mp005 = 0+$7; mp100 = 0+$8;
  check_entry(yz, tt, yr, mo, da, mp000, dp000, mp005, mp100, omp[0]);
  mp[0] = mp000; mp[1] = mp005; mp[2] = mp100;
  process_entry(yz, tt, yr, mo, da, mp);
  next;
}

# Anything else is malformed:
// { data_error(("unrecognized line format"), lin); next; }

END {
  fflush();
  printf "subtracted %d articles overall\n", tosub_tot > "/dev/stderr";
}

# Validates one input entry; aborts via {data_error} on hard errors,
# complains via {data_warning} on suspicious but tolerable data.
# {omp000} is the previous total-editor count, used to validate {dp000}.
function check_entry(yz,tt,yr,mo,da,mp000,dp000,mp005,mp100,omp000,  r,xmp000)
{
  if ((tt < 0) || (tt > 36500)) { data_error(("bad time [" tt "]"), lin); }
  if (tt != time_from_date(yz, yr, mo, da)) { data_error(("date error [" tt "]"), lin); }
  if ((yr < yz) || (yr > 2099)) { data_error(("bad year [" yr "]"), lin); }
  if ((mo < 1) || (mo > 12)) { data_error(("bad month [" mo "]"), lin); }
  if ((da < 1) || (da > 31)) { data_error(("bad day [" da "]"), lin); }
  if ((mp000 < 0) || (mp000 > 999999999)) { data_error(("bad tot editors [" mp000 "]"), lin); }
  if ((mp005 < 0) || (mp005 > 999999999)) { data_error(("bad light editors [" mp005 "]"), lin); }
  if (mp005 > mp000)
    { data_warning(("more light editors than editors [" mp000 "] [" mp005 "]"), lin); }
  if ((mp100 < 0) || (mp100 > 999999999)) { data_error(("bad regular editors [" mp100 "]"), lin); }
  if (mp100 > mp005)
    { data_warning(("more regular editors than light editors [" mp005 "] [" mp100 "]"), lin); }
  # New-editor count must be the delta of total editors since the last entry.
  # (Was reporting {mp100} here by mistake; report the offending {dp000}.)
  if (dp000 != mp000 - omp000) { data_error(("bad new editors [" dp000 "]"), lin); }
}

# Consumes one validated entry {tt,yr,mo,da,mp[]}; emits zero or more
# resampled output lines (one per multiple-of-{sper} boundary crossed),
# geometrically interpolating the counts between the previous entry and
# this one.  Assumes global vars {ott,xtt,omp[],smp[],ymp[],zmp[],scale,nf}.
function process_entry(yz,tt,yr,mo,da,mp,  mtt,r,k,zdp000,xdt)
{
  # Check chrono order:
  if (tt <= ott) { data_error(("seq error [" ott "] [" tt "]"), lin); }
  adjust_raw_counts(yz, tt, yr, mo, da, mp);
  if (xtt <= tt)
    { # Get sampling time {mtt} closest to the center of the interval:
      # NOTE(review): {mtt} is validated but not used below — presumably a
      # leftover from an earlier sampling scheme; confirm before removing.
      mtt = (tt + ott)/2;
      mtt = (int((mtt + 1)/sper) - 1)*sper + (sper - 1);
      if (mtt < xtt) { mtt = xtt; }
      if (mtt > tt)
        { data_error(("midsample bug [" ott "] [" xtt "] [" mtt "] [" tt "]"), lin); }
      # Generate interpolated data:
      while (xtt <= tt)
        { if (xtt < ott)
            { data_error(("sampling bug [" ott "] [" xtt "] [" tt "]"), lin); }
          # Interpolate geometrically the editors {xmp000} at time {xtt}:
          r = (xtt - ott + 0.0)/(tt - ott + 0.0);
          for (k = 0; k < nf; k++)
            { zmp[k] = (omp[k]+0.5)*exp(r*log((mp[k]+0.5)/(omp[k]+0.5))) - 0.5;
              zmp[k] = int(zmp[k] + 0.5);
            }
          # Compute recent growth rate {xdp} in new editors per sampling period:
          zdp000 = zmp[0] - ymp[0];
          # Flag gaps longer than {maxgap} days:
          if (tt - ott > maxgap)
            { data_warning(("excessively long gap [" ott "] [" tt "]"), lin); }
          # Convert time of interpolated sample back to year, month, day:
          xdt = date_from_time(yz, xtt);
          # Print an explanatory comment before the first output line:
          if (nout == 0) { printf "# Interpolated by compute-wp-edor-gr-rate.gawk\n\n"; }
          printf "%6d %s %10d %10d %10d %10d\n", xtt, xdt, zmp[0], zdp000, zmp[1], zmp[2];
          for (k = 0; k < nf; k++) { ymp[k] = zmp[k]; }
          xtt += sper; nout++;
        }
    }
  ott = tt;
  for (k = 0; k < nf; k++) { omp[k] = mp[k]; }
}

# Applies the per-day corrections accumulated since the previous entry:
# gathers {tosub} amounts into {smp[]}, updates {scale} from {bcorr},
# rescales {mp[]}, and subtracts {smp[]}.  Also accumulates {tosub_tot}
# for the END summary.
function adjust_raw_counts(yz,tt,yr,mo,da,mp,  k,ti,sc)
{
  # Check chrono order (defensive; also checked by the caller):
  if (tt <= ott) { data_error(("seq error [" ott "] [" tt "]"), lin); }
  # Gather all {tosub[ti],bcorr[ti]} entries since last entry to present:
  ti = ott+1;
  while (ti <= tt)
    { for (k = 0; k < nf; k++)
        { if ((ti,k) in tosub)
            { smp[k] += tosub[ti,k];
              tosub_tot += tosub[ti,k];  # Was never accumulated; END always printed 0.
            }
        }
      if (ti in bcorr) { scale = 1.0/bcorr[ti]; }
      ti++;
    }
  # Apply scaling correction:
  if (scale != 1.0)
    { for (k = 0; k < nf; k++) { mp[k] = int(mp[k]*scale + 0.5); } }
  # Subtract what is to be subtracted:
  for (k = 0; k < nf; k++) { mp[k] = mp[k] - smp[k]; }
}