#! /usr/bin/gawk -f
# Last edited on 2009-12-12 00:03:29 by stolfi

# Reads the file with the size of Wikipedia at irregular dates,
# as obtained from the page [[Wikipedia:Size of Wikipedia]].
#
# Input data lines look like
#   "{YEAR}-{MONTH}-{DAY}, {SZ}, {COMMENT}"
# Blank lines and lines starting with "#" are copied to the output unchanged.
#
# Outputs the same data in the format
#   "{TIME} {YEAR} {MONTH} {DAY} {SZ} {SU}"
# where {TIME} is elapsed days since Jan 1, 2001
# and {SU} is 0 for dubious, 1 for OK.
#
# Assumes that it was executed with
#   "-f lib_date_time.gawk -f lib_functions.gawk"
# The functions {date_time_init}, {time_from_date} and {data_error} are not
# defined in this file; presumably they come from those libraries.

BEGIN {
  abort = -1;   # Exit status requested so far; negative means "keep going".
  yz = 2001;    # Year zero of Wikipedia.
  ott = -1;     # Sequential day for last input entry.
  date_time_init()
  # date_time_sanity_check(yr)
}

# Stop as soon as an abort status has been set.
# NOTE(review): {abort} is never assigned after BEGIN in this file;
# presumably {data_error} sets it -- confirm against lib_functions.gawk.
(abort >= 0) { exit abort; }

// {
  # Save line for error reporting (the data rule below rewrites $0):
  lin = $0;
}

# Comment lines and blank lines are passed through untouched.
/^ *([\#]|$)/ { print; next; }

/^20[0-9][0-9][-][01][0-9][-][0-9][0-9][,][ \011]*[0-9]+[,]/ {
  # Data format used by [[Wikipedia:Size of Wikipedia]].
  # Turn the "," and "-" separators into blanks so that the date parts
  # and the size become the awk fields $1..$4:
  gsub(/[,-]+/, " ", $0);
  if (NF < 4) { data_error(("bad line format"), lin); }
  yr = 0+$1; mo = 0+$2; da = 0+$3; sz = 0+$4;
  check_entry(yz, yr, mo, da, sz);

  # Get the comments field {cm} from the saved, unmodified line.
  # The separators accept blanks and TABs, like the rule pattern above,
  # so that the date/size prefix is stripped whenever the rule matched:
  cm = lin;
  sub(/^[-0-9]+,[ \011]*[0-9]+,[ \011]*/, "", cm);
  sub(/[ \011]*$/, "", cm);

  process_raw_entry(yz, yr, mo, da, sz, cm);
  next
}

# Anything else is an error.
// { data_error(("unrecognized line format"), lin); next; }

END {
  fflush();
}

function check_entry(yz,yr,mo,da,sz)
{
  # Range-checks the date fields of one entry, reporting each violation
  # through {data_error} together with the global saved line {lin}.
  #   {yz}  earliest acceptable year;
  #   {yr}, {mo}, {da}  year, month and day of the entry;
  #   {sz}  size of the entry (accepted but not checked here).
  # The day is only checked against 1..31, whatever the month.

  if ((da < 1) || (da > 31)) { data_error(("bad day [" da "]"), lin); }
  if ((mo < 1) || (mo > 12)) { data_error(("bad month [" mo "]"), lin); }
  if ((yr < yz) || (yr > 2099)) { data_error(("bad year [" yr "]"), lin); }
}

function process_raw_entry(yz,yr,mo,da,sz,cm,   su)
{
  # Writes one cleaned-up entry to standard output.
  #   {yz}  year zero, passed on to {time_from_date};
  #   {yr}, {mo}, {da}  date of the entry;
  #   {sz}  size of the entry;
  #   {cm}  comment text of the entry, used to decide the status;
  #   {su}  local variable: status, 1 for OK, 0 for dubious.
  # Reads and updates the global {ott} (time of the previous entry) and
  # reads the global {lin} for error messages.  The time {tt} is not
  # declared local, so it is left behind as a global too.

  # Compute time {tt} as day count since day 0.
  tt = time_from_date(yz, yr, mo, da);

  # Check chrono order: times must be strictly increasing.
  if (tt <= ott) { data_error(("seq error [" ott "] [" tt "]"), lin); }

  # Decide the status {su}.
  su = 1;
  # In this file, sizes that end in round '000' are milestones, not suspicious.
  # Sizes marked "estimate" or "commacnt" seem to be less reliable.
  # There is an erroneous entry marked "RAMBOT causing Error in counter";
  # the pattern accepts either case of the "E" so that this entry is caught.
  # NOTE(review): an earlier version of this comment said that "lowerbound"
  # and "approx" entries should be trusted, yet "approx" is in the pattern
  # and therefore marks the entry as dubious.  The behavior is kept as is;
  # confirm which one was intended.
  if (cm ~ /estimate|commacnt|approx|[Ee]rror in counter/) { su = 0; }

  # Output clean entry:
  printf "%6d %04d %02d %02d %10d %2d\n", tt, yr, mo, da, sz, su;
  ott = tt;
}