#! /usr/bin/gawk -f
# Last edited on 2009-12-12 00:01:03 by stolfi

# Reads a file with the raw (but reformatted) size of Wikipedia
# at arbitrary dates, in the format
#   "{TIME} {YEAR} {MONTH} {DAY} {SZ} {SZOK}"
# Outputs the same data in the format
#   "{TIME} {YEAR} {MONTH} {DAY} {SZ} {SU} {DZ} {DU}"
# where {TIME} is spaced a fixed number {sper} of days apart,
# starting with {sper-1}, {SZ} is the size interpolated at {TIME},
# and {DZ} is the increment in {SZ} during the last {sper} days.
# The fields {SU} and {DU} are reliability flags (1 ok, 0 dubious, 9 ignore).

# Assumes that it was executed with "-f lib_date_time.gawk -f lib_functions.gawk"

BEGIN {
  abort = -1;
  yz = 2001;       # Year zero of Wikipedia.
  ott = -1;        # Time (days) for last OK input entry.
  osz = 0;         # Size as of last OK input entry, adjusted.
  oxsz = 0;        # Size field of last output entry.
  oxsu = 1;        # Status of size field of last output entry.
  sper = 28;       # Sampling period.
  xtt = sper-1;    # Time of next output entry.
  maxgap = 2*sper; # Distrust interpolations over more than this many days.
  nout = 0;        # Output data line count.

  # Parameters of articles to exclude from the count:
  split("", tosub); # {tosub[tt]} is the amount to subtract from day {tt} onwards.
  # Rambot articles:
  tosub[ 655] =  350 ;
  tosub[ 656] = 6690 ;
  tosub[ 657] = 5194 ;
  tosub[ 658] =  316 ;
  tosub[ 659] = 7217 ;
  tosub[ 660] = 2903 ;
  tosub[ 661] = 3880 ;
  tosub[ 662] = 6100 ;
  tosub[ 663] =  350 ;

  tosub_tot = 0; # Cumulative {tosub} values up to present entry.

  # Counter adjustments:
  bcorr[   0] = 1.0000; # B1
  bcorr[ 683] = 0.9940; # B4
  bcorr[ 705] = 0.9721; # B5
  bcorr[ 718] = 0.9700; # B5.5
  bcorr[ 722] = 0.9670; # B5.6
  bcorr[ 730] = 0.9600; # B5.7
  bcorr[ 740] = 0.9595; # B5.8
  bcorr[ 750] = 0.9590; # B5.9
  bcorr[ 760] = 0.9585; # B6
  bcorr[ 790] = 0.9575; # B6.1
  bcorr[ 800] = 0.9542; # B6.2
  bcorr[ 810] = 0.9500; # B6.3
  bcorr[ 818] = 0.9480; # B6.4
  bcorr[ 825] = 0.9440; # B6.5
  bcorr[ 833] = 0.9422; # B7
  bcorr[ 865] = 0.9300; # B7.5
  bcorr[ 873] = 0.9855; # B8
  bcorr[ 913] = 0.9983; # B9
  bcorr[ 930] = 1.0000; # B9

  # Current scaling factor to compensate undercounting by {mpacIII}:
  scale = 1.000;

  date_time_init()
  # date_time_sanity_check(yz)
}

(abort >= 0) { exit abort; }

// {
  # Save line for error reporting:
  lin = $0;
}

/^ *([\#]|$)/ {
  print; next;
}

/^[ 0-9]*$/ {
  if (NF != 6) { data_error(("bad line format"), lin); }
  tt = 0+$1; yr = 0+$2; mo = 0+$3; da = 0+$4; sz = 0+$5; su = 0+$6;
  check_entry(yz, tt, yr, mo, da, sz, su);
  if (su != 1) { next; }
  sz = adjust_raw_size(yz, tt, yr, mo, da, sz);
  process_entry(yz, tt, yr, mo, da, sz);
  next
}

// { data_error(("unrecognized line format"), lin); next; }

END {
  fflush();
  printf "subtracted %d articles overall\n", tosub_tot > "/dev/stderr";
}

function check_entry(yz,tt,yr,mo,da,sz,su,  r,xsz)
{
  if ((tt < 0) || (tt > 36500)) { data_error(("bad time [" tt "]"), lin); }
  if (tt != time_from_date(yz, yr, mo, da)) { data_error(("date error [" tt "]"), lin); }
  if ((yr < yz) || (yr > 2099)) { data_error(("bad year [" yr "]"), lin); }
  if ((mo < 1) || (mo > 12)) { data_error(("bad month [" mo "]"), lin); }
  if ((da < 1) || (da > 31)) { data_error(("bad day [" da "]"), lin); }
  if ((sz < 0) || (sz > 999999999)) { data_error(("bad size [" sz "]"), lin); }
  if ((su != 0) && (su != 1)) { data_error(("bad status [" su "]"), lin); }
}
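
# Note on the interpolation in {process_entry} below (an illustrative sketch,
# not part of the computation): sizes are interpolated geometrically, that is,
# linearly in log(size+1).  With the made-up values {osz} = 999 at {ott} = 0
# and {sz} = 3999 at {tt} = 56, the sample at {xtt} = 28 would get
#   r   = (28 - 0)/(56 - 0) = 0.5
#   xsz = 1000*exp(0.5*log(4000/1000)) - 1 = 1000*2 - 1 = 1999
# i.e. the geometric mean of the two surrounding (size+1) values, minus 1.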
function process_entry(yz,tt,yr,mo,da,sz,  r,xsz,xsu,xdz,xdu,xdt,mtt)
{
  # Check chronological order:
  if (tt <= ott) { data_error(("seq error [" ott "] [" tt "]"), lin); }
  if (xtt <= tt)
    {
      # Get the sampling time {mtt} closest to the center of the interval:
      mtt = (tt + ott)/2;
      mtt = (int((mtt + 1)/sper) - 1)*sper + (sper - 1);
      if (mtt < xtt) { mtt = xtt; }
      if (mtt > tt) { data_error(("midsample bug [" ott "] [" xtt "] [" mtt "] [" tt "]"), lin); }
      # Generate interpolated data:
      while (xtt <= tt)
        {
          if (xtt < ott) { data_error(("sampling bug [" ott "] [" xtt "] [" tt "]"), lin); }
          # Interpolate geometrically the size {xsz} at time {xtt}:
          r = (xtt - ott + 0.0)/(tt - ott + 0.0);
          xsz = (osz+1)*exp(r*log((sz+1)/(osz+1))) - 1;
          # The interpolated value is reliable iff the gap is at most {maxgap} days:
          xsu = (tt - ott > maxgap ? 0 : 1);
          # Compute the recent growth rate {xdz} in articles per sampling period:
          xdz = xsz - oxsz;
          # The growth rate is reliable iff the gap is at most {maxgap} days:
          xdu = (tt - ott > maxgap ? 0 : 1);
          # ... and if the previous size was reliable:
          if (oxsu != 1) { xdu = 0; }
          # ... and we are as close as possible to the gap's midpoint:
          if (xtt != mtt) { xdu = 9; }
          # Convert the time of the interpolated sample back to year, month, day:
          xdt = date_from_time(yz, xtt);
          # Print an explanatory comment before the first data line:
          if (nout == 0) { printf "# Interpolated by compute-wp-size-gr-rate.gawk\n\n"; }
          printf "%6d %s %10d %d %10d %d\n", xtt, xdt, xsz, xsu, xdz, xdu;
          oxsz = xsz; oxsu = xsu;
          xtt += sper; nout++;
        }
    }
  ott = tt; osz = sz;
}

function adjust_raw_size(yz,tt,yr,mo,da,sz,  ti,sc)
{
  # Check chronological order:
  if (tt <= ott) { data_error(("seq error [" ott "] [" tt "]"), lin); }
  # Gather all {tosub[ti],bcorr[ti]} entries since the last entry up to the present one:
  ti = ott+1;
  while (ti <= tt)
    {
      if (ti in tosub) { tosub_tot += tosub[ti]; }
      if (ti in bcorr) { scale = 1.0/bcorr[ti]; }
      ti++;
    }
  # Apply the {mpacIII} correction:
  if (scale != 1.0) { sz = int(sz*scale + 0.5); }
  # Subtract what is to be subtracted:
  sz = sz - tosub_tot;
  return sz;
}
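
# Usage sketch (the data file names here are placeholders; the script name is
# the one printed in the output header, and the two library files are those
# named in the header comment above):
#
#   gawk -f lib_date_time.gawk -f lib_functions.gawk \
#        -f compute-wp-size-gr-rate.gawk raw-sizes.txt > sampled-sizes.txt
#
# The helpers {date_time_init}, {time_from_date}, {date_from_time}, and
# {data_error} are not defined in this file; they are assumed to come from
# those library files.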