#! /usr/bin/gawk -f
# Last edited on 2009-12-12 00:04:56 by stolfi

# Reads the file with monthly statistics on Wikipedia
# editors collected by Erik Zachte
# [[http://stats.wikimedia.org/EN/TablesWikipediaEN.htm]].
# Outputs the same data in the format
#   "{TIME} {YEAR} {MONTH} {DAY} {TOTEDS} {NEWEDS} {ACTEDS5} {ACTEDS100}"
# where {TIME} is elapsed days since Jan 1, 2001.

# Assumes that it was executed with "-f lib_date_time.gawk -f lib_functions.gawk"
# The helpers {date_time_init}, {time_from_date}, {date_from_time},
# {data_error} and the table {month_name_to_num} are not defined in this
# file; presumably they come from those two libraries.

# Global state shared by the rules and functions below:
#   {abort}  exit status requested so far; negative means "no abort".
#   {yz}     year zero used as the origin of the day count.
#   {ott}    day count of the last entry written, for the ordering check.
#   {lin}    copy of the current input line, for error reporting.

BEGIN {
  abort = -1;
  yz = 2001; # Year zero of Wikipedia
  ott = -1;  # Sequential day for last input entry.
  date_time_init()
  # date_time_sanity_check(yr)
}

# Stop as soon as an abort status is pending.
# NOTE(review): nothing in this file sets {abort} to a non-negative value;
# presumably {data_error} does so -- confirm against lib_functions.gawk.
(abort >= 0) { exit abort; }

// {
  # Save line for error reporting:
  lin = $0;
}

# Comment lines and blank lines are copied to the output unchanged.
/^ *([\#]|$)/ { print; next; }

# Data line: month name, four-digit year 20xx, and four unsigned integers.
/^[ ]*[A-Z][a-z][a-z][ ]+20[0-9][0-9][ ]+[0-9]+[ ]+[0-9]+[ ]+[0-9]+[ ]+[0-9]+[ ]*$/ {
  # Data format used by [[http://stats.wikimedia.org/EN/TablesWikipediaEN.htm]]
  if (NF < 6) { data_error(("bad line format"), lin); }
  # An unknown month name yields an empty {mo}, which {check_entry} rejects.
  mo = month_name_to_num[$1];
  # Adding zero forces numeric interpretation of the fields.
  yr = 0+$2;
  mp000 = 0+$3;
  dp000 = 0+$4;
  mp005 = 0+$5;
  mp100 = 0+$6;
  check_entry(yz, yr, mo, mp000, dp000, mp005, mp100);
  process_raw_entry(yz, yr, mo, mp000, dp000, mp005, mp100);
  next
}

# Anything that reached this point matched none of the formats above.
// { data_error(("unrecognized line format"), lin); next; }

END {
  fflush();
}

function check_entry(yz,yr,mo,mp000,dp000,mp005,mp100)
{
  # Validates one parsed entry, reporting each problem through {data_error}
  # together with the global {lin}.  Checks that the month {mo} is in 1..12,
  # that the year {yr} is in {yz}..2099, and that {mp005} is not smaller
  # than {mp100}.  The parameters {mp000} and {dp000} are accepted but not
  # checked.  Returns nothing.
  if ((mo < 1) || (mo > 12)) { data_error(("bad month [" mo "]"), lin); }
  if ((yr < yz) || (yr > 2099)) { data_error(("bad year [" yr "]"), lin); }
  if (mp005 < mp100) { data_error(("bad month pops [" mp005 "] [" mp100 "]"), lin); }
}

function process_raw_entry(yz,yr,mo,mp000,dp000,mp005,mp100,   tt,dt)
{
  # Writes one cleaned entry to standard output and records its day count
  # in the global {ott}.  Reports a sequence error through {data_error} if
  # the entry is not strictly later than the previous one.
  #
  # {yz} is the year zero, {yr} and {mo} are the year and month number of
  # the entry, and {mp000}, {dp000}, {mp005}, {mp100} are the four counts,
  # printed in that order.  {tt} and {dt} are local work variables; callers
  # pass only the first seven arguments.

  # Compute time {tt} for end-of-month as day count since day 0.
  tt = time_from_date(yz, yr, mo+1, 0); # To get the last day of month {mo}.
  # NOTE(review): {dt} is presumably the "{YEAR} {MONTH} {DAY}" text of the
  # output format -- confirm against lib_date_time.gawk.
  dt = date_from_time(yz,tt);
  # Check chrono order
  if (tt <= ott) { data_error(("seq error [" ott "] [" tt "]"), lin); }
  # Output clean entry:
  printf "%6d %s %10d %10d %10d %10d\n", tt, dt, mp000, dp000, mp005, mp100;
  ott = tt;
}