#! /usr/bin/gawk -f
# Last edited on 2008-05-01 10:29:06 by stolfi

# Reads a list of "From " lines extracted by {grep} from UNIX mail folders,
# in the format
#
#   "{FOLDER}:From {SENDER} {WWW} {mmm} {dd} {HH}:{MM}:{SS} {yyyy} {ZZZZZ}"
#
# where {FOLDER} is the folder name, {WWW} is the day of the week, and
# {ZZZZZ} is the time zone offset, a sign and 4 digits (hours and minutes).
# The "{FOLDER}:" prefix is optional; when absent the folder is written
# as "FOLDER".  The time zone {ZZZZZ} is optional (defaults to {defzone} below).
#
# Lines that do not begin with "From " or "{FOLDER}:From " are copied to the
# output unchanged.  Matching lines are written out in the format
#
#   "{yyyy}-{mm}-{dd} {HH}:{MM}:{SS} {ZZZZZ} {FOLDER} {SENDER}"
#
# with {FOLDER} left-justified and padded to 15 columns.  A matching line
# whose fields fail validation is reported on stderr (and counted as an
# error) but is still written in the reformatted layout.
#
# NOTE(review): an earlier version of this header showed the output lines
# prefixed with "©© ", and the script warns about input lines that already
# start with "©©", but the output {printf} below emits no such prefix.
# Confirm which one is intended before relying on the marker.

BEGIN {
  abort = -1;  # When non-negative, stops all processing.

  # Build the month tables {name_from_num} and {num_from_name}:
  nmonths = split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", name_from_num);
  if (nmonths != 12) { prog_error("duh? (1)"); }
  split("", num_from_name);
  for (i = 1; i <= nmonths; i++)
    { mna = name_from_num[i];
      mnu = sprintf("%02d", i);
      num_from_name[mna] = mnu;
    }

  defzone = "-0300";  # Default time zone (Brazil East standard time)

  # Initialize counters:
  nfrom = 0;  # Counts "From " lines that were processed.
  netc = 0;   # Counts non-"From " lines that were processed.
  nerr = 0;   # Counts format errors.
}

# Stop reading input as soon as an abort status has been set.
(abort >= 0) { exit(abort); }

# Input that already starts with "©©" is suspicious; warn but keep going.
/^[©][©]/ {
  data_warning("warning - input line begins with \"©©\"");
}

# A "From " line, with or without the "{FOLDER}:" prefix added by {grep}.
/^([-A-Za-z0-9_.]+[:]|)From / {
  if ((NF != 7) && (NF != 8))
    { data_error("wrong num of fields = " NF ""); }

  fd = $1; sn = $2; dw = $3; mo = $4; dy = $5; hr = $6; yr = $7;
  zn = (NF == 8 ? $8 : defzone);

  # Cleanup folder name {fd}:
  if (fd !~ /^([-A-Za-z0-9_.]+[:]|)From$/)
    { data_error("bad folder/From \"" fd "\""); }
  if (fd == "From")
    { fd = "FOLDER"; }
  else
    { gsub(/[:]From$/, "", fd); }

  # Check day of week {dw}:
  if (dw !~ /^(Mon|Tue|Wed|Thu|Fri|Sat|Sun)$/)
    { data_error("bad day-of-week \"" dw "\""); }

  # Check month name {mo} and convert to numeric:
  if (! (mo in num_from_name))
    { data_error(("bad month name \"" mo "\"")); }
  mo = num_from_name[mo];

  # Check day of month {dy} (1..31, optional leading zero) and
  # reformat as 2-digit:
  if (dy !~ /^(0?[1-9]|[12][0-9]|3[01])$/)
    { data_error("bad day-of-month \"" dy "\""); }
  dy = sprintf("%02d", dy);

  # Check hour format, provide leading "0":
  if (hr !~ /^[0-9]?[0-9]:[0-9][0-9]:[0-9][0-9]$/)
    { data_error("bad hour \"" hr "\""); }
  if (hr ~ /^[0-9][:]/) { hr = ("0" hr); }

  # Check year (only 1960 through 2029 are accepted):
  if (yr !~ /^(19[6-9][0-9]|20[0-2][0-9])$/)
    { data_error("bad year \"" yr "\""); }

  # Check time zone offset:
  if (zn !~ /^[-+][0-9][0-9][0-9][0-9]$/)
    { data_error("bad zone \"" zn "\""); }

  # Write it:
  printf "%s-%s-%s %s %s %-15s %s\n", yr, mo, dy, hr, zn, fd, sn;
  nfrom++;
  next;
}

# Any other line is copied through unchanged.
{ print; netc++; next; }

END {
  if (abort >= 0) { exit(abort); }
  printf "%s:%d: %d headers, %d non-headers, %d errors\n",
    FILENAME, FNR, nfrom, netc, nerr > "/dev/stderr";
}

function data_warning(msg)
{
  # Prints {msg} and then the current input line to stderr, each
  # prefixed with the current file name and line number.
  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "%s:%d: «%s»\n", FILENAME, FNR, $0 > "/dev/stderr";
}

function data_error(msg)
{
  # Reports {msg} like {data_warning} and counts it as a format error.
  # After more than 200 errors, sets {abort} and exits with status 1.
  data_warning(msg);
  nerr++;
  if (nerr > 200)
    { data_warning("too many errors");
      abort = 1;
      exit(abort);
    }
}

function prog_error(msg)
{
  # Reports an internal error to stderr, sets {abort} and exits with
  # status 1.
  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit(abort);
}