#! /usr/bin/gawk -f # Last edited on 2009-12-12 00:02:59 by stolfi # Options: "-v phase={NUM}" "-v seasonal={BOOL}" # Reads the file with size of wikipedia in the format # "{TIME} {YEAR} {MONTH} {DAY} {SZ} {SU} {DZ} {DU}" # Outputs the same with four additional fields # "{TIME} {YEAR} {MONTH} {DAY} {SZ} {SU} {SP} {SE} {DZ} {DU} {DP} {DE}" # "{SP} {DP}" predicted size and increase # "{SE} {DE}" errors in those predictions # If {seasonal} is set, includes the seasonal factor. # If {phase} is nonzero, shows only that phase. # Must be called with "-f lib_date_time.gawk -f lib_functions.gawk -f model_params.gawk" BEGIN{ abort = -1; if (phase == "") { phase = 0; } if ((phase != 0) && (phase != 1) && (phase != 2)) { arg_error(("bad phase [" phase "]")); } if (seasonal == "") { seasonal = 0; } if ((seasonal != 0) && (seasonal != 1)) { arg_error(("bad seasonal [" seasonal "]")); } yz = 2001; # Year zero of Wikipedia pi = 3.1415926; # Euler's number. sp = 0; # Predicted size. sper = 28; # Sampling period assumed in input file (days). ntt = sper-1; # Next expected sample time {tt} (days). # Initialize the date conversion tables: date_time_init(); # Set the model parameters: set_model_params(); } (abort >= 0) { exit abort; } //{ # Save line for error reporting: lin = $0; } /^ *([\#]|$)/ { print; next; } /^ *[0-9]/ { if (NF != 8) { data_error(("bad line format"), lin); } tt = 0+$1; yr = 0+$2; mo = 0+$3; da = 0+$4; sz = 0+$5; su = 0+$6; dz = 0+$7; du = 0+$8; check_entry(yz, tt, yr, mo, da, sz, su, dz, du); # Check for uniform sampling every 28 days: if ((tt % sper) != (sper-1)) { data_error(("sample time is not " sper-1 " mod " sper " [" tt "]"), lin); } if (tt != ntt) { data_error(("unexpected sample time [" ntt "] [" tt "]"), lin); } ntt += sper; # Predict monthly increase {dp} for mid-period {tt}, and accumulate in {sp}: dp = pred_increase(tt-(0.5*sper)); sp += dp; # Compute prediction errors and write out: se = sp - sz; de = dp - dz; printf "%6d %4d %02d %02d %10d %d %10d %10d %8d %d %8d %8d\n", \ tt, yr, mo, da, sz, su, sp, se, dz, du, dp, de; next; } // { data_error(("bad line format"), lin); } END { # Estimate total articles: st = sp - Kph2/Rph2*exp(Rph2*(tt - Sph2))/sper; printf "current time = %d\n", tt > "/dev/stderr"; printf "current size = %10d\n", sp > "/dev/stderr"; printf "predicted total size = %10d articles\n", st > "/dev/stderr"; } function check_entry(yz,tt,yr,mo,da,sz,su,dz,du, xtt) { if ((tt < 0) || (tt > 36500)) { data_error(("bad time [" tt "]"), lin); } xtt = time_from_date(yz, yr, mo, da); if (tt != xtt) { data_error(("date error [" tt "] [" xtt "]"), lin); } if ((yr < yz) || (yr > 2099)) { data_error(("bad year [" yr "]"), lin); } if ((mo < 1) || (mo > 12)) { data_error(("bad month [" mo "]"), lin); } if ((da < 1) || (da > 31)) { data_error(("bad day [" da "]"), lin); } if ((sz < 0) || (sz > 999999999)) { data_error(("bad size [" sz "]"), lin); } if ((su != 0) && (su != 1)) { data_error(("bad size status [" su "]"), lin); } if ((dz < -10) || (dz > 999999999)) { data_error(("bad growth [" dz "]"), lin); } if ((du != 0) && (du != 1) && (du != 9)) { data_error(("bad growth status [" du "]"), lin); } } function pred_increase(tc, f1,f2) { # Predicts growth (articles per lunar mont) for date {tc}. pri = 0; if ((phase == 0) || (phase == 1)) { f1 = (seasonal ? model_season(tc, Aph1, Tph1, Dph1) : 1.0); pri += f1 * model_expo(tc, bph1, eph1, Sph1, Kph1, Rph1); } if ((phase == 0) || (phase == 2)) { f2 = (seasonal ? model_season(tc, Aph2, Tph2, Dph2) : 1.0); pri += f2 * model_expo(tc, bph2, eph2, Sph2, Kph2, Rph2); } return pri; } function model_season(tc,A,T,D) { # Models a seasonal factor with amplitude {A}, freq {F} in {day^-1}, delay {D}. return exp(A*cos(2*pi*(tc/T - D))); } function model_expo(tc,b,e,m,K,R, wf,mo,ex) { # Models an exponential functon of {tc} # with multiplier {K} rate {R} that has value {K} when {tc == m}. # Softly windowed between days {b} and {e} with {± wblur} days. wf = window_factor(tc, b, e, wblur); if (wf == 0) { return 0; } ex = K*exp(R*(tc - m)); return wf*ex; }