#! /usr/bin/gawk -f
# Last edited on 2009-12-12 00:05:12 by stolfi

# Converts the table of month-end Wikipedia sizes (as copied from the
# Wikipedia page on "Modeling Wikipedia's growth") into a clean
# column-oriented format.
#
# Input lines recognized:
#   * blank lines and lines starting with "#" are copied through unchanged;
#   * data lines of the form "{DAY}/{MONTH}/{YEAR}, {SZ}" where {DAY} is
#     28, 29, 30 or 31 (format used by [[User:HenkvD]]).
#   Anything else is reported through data_error().
#
# Output lines have the format
#   "{TIME} {YEAR} {MONTH} {DAY} {SZ} {SU}"
# where {TIME} is elapsed days since Jan 1, 2001 (as returned by
# time_from_date()), and {SU} is 0 for a dubious size, 1 for an OK one.
#
# Must be run with "-f lib_date_time.gawk -f lib_functions.gawk";
# date_time_init(), time_from_date() and data_error() are not defined in
# this file and are expected to come from those libraries.

BEGIN {
  abort = -1;   # Exit status requested so far; negative means "keep going".
  yz = 2001;    # Year zero of Wikipedia.
  ott = -1;     # Sequential day of the last entry written out.
  date_time_init()
  # date_time_sanity_check(yz)
}

# Stop as soon as an exit status has been requested.
abort >= 0 {
  exit abort;
}

# Keep the raw record: the data rule below rewrites $0, and error
# messages should show the line as it was read.
{
  lin = $0;
}

# Comment and blank lines pass through untouched.
/^ *([\#]|$)/ {
  print $0;
  next;
}

# Data format used by [[User:HenkvD]]: "DD/MM/YYYY, SIZE".
/^(28|29|30|31)[\/][01][0-9][\/]20[0-9][0-9][,][ \011]*[0-9]+[ ]*$/ {
  # Turn the "/" and "," separators into blanks so that the record
  # re-splits into the four fields day, month, year, size.
  gsub(/[,\/ ]+/, " ");
  if (NF != 4) {
    data_error(("bad line format"), lin);
  }

  # Adding zero forces numeric values (drops leading zeros).
  da = $1 + 0;
  mo = $2 + 0;
  yr = $3 + 0;
  sz = $4 + 0;
  check_entry(yz, yr, mo, da, sz);

  # This input format carries no comment field {cm}.
  cm = "";
  process_raw_entry(yz, yr, mo, da, sz, cm);
  next;
}

# Any record that reaches this point matched none of the formats above.
{
  data_error(("unrecognized line format"), lin);
  next;
}

END {
  fflush();
}

function check_entry(yz,yr,mo,da,sz)
{
  # Range-checks the date fields of one entry, reporting each
  # violation through data_error() together with the raw line {lin}.
  # The year must lie in {yz}..2099.  The size {sz} is accepted as a
  # parameter but is not examined here.

  if (da < 1 || da > 31) {
    data_error(("bad day [" da "]"), lin);
  }
  if (mo < 1 || mo > 12) {
    data_error(("bad month [" mo "]"), lin);
  }
  if (yr < yz || yr > 2099) {
    data_error(("bad year [" yr "]"), lin);
  }
}

function process_raw_entry(yz,yr,mo,da,sz,cm,   su)
{
  # Writes one cleaned-up entry to standard output and records its
  # time in the global {ott}.  {su} is a local; {tt} is left global.
  # The comment {cm} is accepted but not used.

  # Time {tt} of this entry, as a day count relative to year {yz}.
  tt = time_from_date(yz, yr, mo, da);

  # Entries must come in strictly increasing time order.
  if (tt <= ott) {
    data_error(("seq error [" ott "] [" tt "]"), lin);
  }

  # Status {su}: a size that is an exact multiple of 1000 looks like
  # a rounded figure and is flagged as dubious (0); otherwise OK (1).
  su = (((sz % 1000) == 0) ? 0 : 1);

  # Output the clean entry.
  printf "%6d %04d %02d %02d %10d %2d\n", tt, yr, mo, da, sz, su;
  ott = tt;
}