#! /usr/bin/gawk -f
# Last edited on 2012-05-23 19:05:12 by stolfi

# To be included with -f by gawk programs.

BEGIN {
  pud_init_month_offset();
  # NOTE(review): this marker line is written to standard output every time
  # the library is loaded.  It is kept byte-for-byte because callers may
  # depend on it; confirm whether it can be dropped.
  printf "**BUGGY - FIX**\n";
}

function pud_init_month_offset() {
  # Fills the global table {pud_month_offset[1..13]} with the number of
  # days elapsed between Jan 1 and the first day of each month in a
  # non-leap year (entry 13 is the length of the whole year).
  #
  # The table is built on demand, and not only in the {BEGIN} rule above,
  # because {BEGIN} rules run in program order: a caller whose own {BEGIN}
  # rule precedes this file would otherwise see an empty table.

  if (13 in pud_month_offset) { return; }
  split("0 31 59 90 120 151 181 212 243 273 304 334 365", pud_month_offset, " ");
}

# DATE PARSING AND FORMATTING

function pud_parse_date(dt,deftz,df,   nf,f,fits,yr,mo,dy,tm,hr,mi,sc,tz) {
  # Parses a date {dt} in Unix format and saves its fields into {df[1..6]}.
  # The argument {df} must be declared as array by the caller.
  # Expects one of the following formats
  #   "{DAY} +{MONTH} +{YEAR} +{HOUR}:?{MIN}[:?{SEC}] +[{TZOFF}]"
  #   "{MONTH} +{DAY} +{HOUR}:?{MIN}[:?{SEC}] +[{TZOFF}] {YEAR}"
  #   "{YEAR}( +|-){MONTH}( +|-){DAY}( +|-){HOUR}:?{MIN}[:?{SEC}][ *{TZOFF}]"
  # The argument {dt} may optionally include a leading day of the week,
  # and/or a timezone comment code in parens e.g. "(GMT)", "(PST)",
  # anywhere in the string.
  # Tabs, commas and slashes are treated as blanks.
  # The colons in the time part are optional; without colons the hour
  # and minute must have exactly two digits each.
  # In the first format the time may be omitted altogether, in which
  # case "00:00:00" is assumed.
  #
  # LIMITS
  #
  # The {YEAR} must be a four-digit number in {1900 .. 2099}.
  #
  # The {MONTH} may be a one- or two-digit number in {1 .. 12}, or
  # an alphabetic month name (as accepted by {pud_month_number}).
  #
  # The {DAY} must be a one- or two-digit number in {1 .. 31}.
  # It need not be within the month's day range, so "30 Feb" and "31 Apr"
  # are OK.
  #
  # The {HOUR} must be a one- or two-digit number in {0 .. 24}.
  # An hour of "24" is folded into hour "00" of the following day.
  #
  # The minute {MIN} must be a one- or two-digit number in {0 .. 59}.
  #
  # The second {SEC}, if present, must be a one- or two-digit number
  # in {0 .. 60}.  Note that "60" is valid only in some time standards
  # and only in a few specific instants (such as the leap seconds of
  # UTC); but the procedure does not check these constraints.  If {SEC}
  # is missing, the procedure assumes {SEC = "00"}.
  #
  # If the timezone offset {TZOFF} is missing and {deftz} is not
  # empty, assumes {TZOFF = deftz}.  If {TZOFF} is missing and {deftz}
  # is empty, assumes {TZOFF = "+0000"}.  In any case, {TZOFF} must
  # have the format "{TZSIGN}{HOFF}{MOFF}", where {TZSIGN} (mandatory)
  # is "+" or "-", {HOFF} is a two-digit hour offset in {00 .. 12}, and
  # {MOFF} is a two-digit minute offset in {00 .. 59}.  The signed values
  # "{TZSIGN}{HOFF}" and "{TZSIGN}{MOFF}" are subtracted from {HOUR}
  # and {MIN}, respectively.  The {DAY}, {HOUR} and {MIN} values are
  # then adjusted so that {HOUR} remains between 0 and 23 and {MIN} is
  # between 0 and 59.  Note that {DAY} may become zero or negative, or
  # may exceed 31, as a result of this adjustment.
  #
  # On success, the fields {df[1..6]} will be set to {YEAR MONTH DAY
  # HOUR MIN SEC}.  The {MONTH}, {DAY}, {HOUR}, {MIN} and {SEC} fields
  # will be zero-padded to two digits; but because of the timezone
  # adjustment, the {DAY} may have a minus sign.
  #
  # In case of error (including an unrecognized month name), the
  # procedure sets {df[1..6]} to {"9999" "99" "99" "99" "99" "99"}.

  # Start from the error value, so that every early return reports failure:
  df[1] = "9999"; df[2] = "99"; df[3] = "99";
  df[4] = "99";   df[5] = "99"; df[6] = "99";

  # Default default timezone:
  if (deftz == "") { deftz = "+0000"; }

  # Drop timezone comments, then turn tabs, commas and slashes into blanks.
  # This must happen BEFORE runs of blanks are collapsed; otherwise an
  # input like "23 May, 2012" would be left with a double blank and fail
  # the single-blank patterns below.
  gsub(/[(][A-Z][A-Z]+[)]/, " ", dt);
  gsub(/[\011,\/]/, " ", dt);
  dt = pud_remove_day_of_week(dt);
  gsub(/  +/, " ", dt);
  sub(/^ /, "", dt);
  sub(/ $/, "", dt);

  # The blank-separated formats have 3 to 5 fields.  The hyphenated
  # format may have fewer (e.g. "2012-05-23 19:05:12" has two), so the
  # field count is only enforced for the first two formats.
  nf = split(dt, f, " ");
  fits = ((nf >= 3) && (nf <= 5));

  if (fits && (dt ~ /^(0?[1-9]|[12][0-9]|30|31) (0?[1-9]|1[0-2]|[a-zA-Z][a-zA-Z]+) (19|20)[0-9][0-9]( |$)/)) {
    # "{DAY} {MONTH} {YEAR} [{TIME} [{TZOFF}]]"
    dy = f[1];
    mo = pud_month_number(f[2]);
    yr = f[3];
    tm = ((nf >= 4) ? f[4] : "00:00:00");
    tz = ((nf >= 5) ? f[5] : deftz);
  } else if (fits && (dt ~ /^(0?[1-9]|1[0-2]|[a-zA-Z][a-zA-Z]+) (0?[1-9]|[12][0-9]|30|31) .* (19|20)[0-9][0-9]$/)) {
    # "{MONTH} {DAY} {TIME} [{TZOFF}] {YEAR}"
    mo = pud_month_number(f[1]);
    dy = f[2];
    yr = f[nf];
    tm = ((nf >= 4) ? f[3] : "00:00:00");
    tz = ((nf >= 5) ? f[4] : deftz);
  } else if (dt ~ /^(19|20)[0-9][0-9][- ](0?[1-9]|1[0-2]|[a-zA-Z][a-zA-Z]+)[- ](0?[1-9]|[12][0-9]|30|31)[- ]/) {
    # "{YEAR}-{MONTH}-{DAY}-{TIME}[{TZOFF}]", with "-" or blank as separator.
    # Peel the fields off the front of {dt} one at a time:
    yr = substr(dt, 1, 4);
    dt = substr(dt, 6);
    # The month may be alphabetic here too, as allowed by the pattern above:
    if (! match(dt, /^[0-9A-Za-z]+[- ]/)) { return; }
    mo = pud_month_number(substr(dt, 1, RLENGTH-1));
    dt = substr(dt, RLENGTH+1);
    if (! match(dt, /^[0-9]+[- ]/)) { return; }
    dy = substr(dt, 1, RLENGTH-1);
    dt = substr(dt, RLENGTH+1);
    # Whatever remains is the time, optionally followed by the timezone:
    if (match(dt, /[-+][0-9][0-9][0-9][0-9]$/)) {
      tm = substr(dt, 1, RSTART-1);
      tz = substr(dt, RSTART);
    } else {
      tm = dt;
      tz = deftz;
    }
    gsub(/^[ ]+/, "", tm);
    gsub(/[ ]+$/, "", tm);
  } else {
    return;
  }

  # {pud_month_number} returns "00" for an unrecognized month name:
  if (mo == "00") { return; }

  # Split the time into hour, minute, second:
  if (tm ~ /^(0?[0-9]|1[0-9]|2[0-4])[:](0?[0-9]|[1-5][0-9])([:](0?[0-9]|[1-5][0-9]|60))?$/) {
    nf = split(tm, f, ":");
    if ((nf < 2) || (nf > 3)) { return; }
    hr = f[1];
    mi = f[2];
    sc = ((nf >= 3) ? f[3] : "00");
  } else if (tm ~ /^([01][0-9]|2[0-4])([0-5][0-9])([0-5][0-9]|60)?$/) {
    # Colon-less form "HHMM" or "HHMMSS":
    hr = substr(tm, 1, 2);
    mi = substr(tm, 3, 2);
    sc = ((length(tm) > 4) ? substr(tm, 5, 2) : "00");
  } else {
    return;
  }

  if (tz !~ /^[-+](0[0-9]|1[0-2])([0-5][0-9])$/) { return; }

  # Convert to GMT.  The sign of {tz} applies to both the hour and the
  # minute offsets.  The normalization is done even for "+0000", so that
  # an input hour of "24" is folded into the next day and the result
  # always satisfies {0 <= HOUR <= 23}.
  mi = mi - (substr(tz, 1, 1) substr(tz, 4, 2));
  hr = hr - substr(tz, 1, 3);
  while (mi < 0)   { hr = hr - 1; mi = mi + 60; }
  while (mi >= 60) { hr = hr + 1; mi = mi - 60; }
  while (hr < 0)   { dy = dy - 1; hr = hr + 24; }
  while (hr >= 24) { dy = dy + 1; hr = hr - 24; }

  # Format and return:
  df[1] = sprintf("%04d", yr);
  df[2] = sprintf("%02d", mo);
  df[3] = sprintf("%02d", dy);
  df[4] = sprintf("%02d", hr);
  df[5] = sprintf("%02d", mi);
  df[6] = sprintf("%02d", sc);
}

function pud_remove_day_of_week(dt) {
  # Removes a leading alphabetic day-of-week name from {dt}, if present,
  # together with the following comma, if present, and any blanks around
  # them; the removed text is replaced by a single blank.  Returns the
  # resulting string.
  #
  # The name is any string that begins with a three-letter cap-initial
  # day-of-week abbreviation followed by any number of lowercase letters
  # or hyphens.  Currently only the English and Portuguese day-of-week
  # abbreviations are recognized.  (The middle letter of "S.b" is a
  # wildcard, so that it matches the name with or without the accent.)

  sub(/^ *(Mon|Tue|Wed|Thu|Fri|Sat|Sun|Seg|Ter|Qua|Qui|Sex|S.b|Dom)[-a-z]*[,]? */, " ", dt);
  return dt;
}

function pud_month_number(mo) {
  # If {mo} is a valid month name, returns the corresponding two-digit month
  # number in {"01" .. "12"}.  If {mo} is already a one- or two-digit number
  # in {1 .. 12}, merely pads it to two digits.  In all other cases
  # returns "00".
  #
  # If the name is alphabetic, the procedure considers only its
  # first three letters (which must be all lowercase, all uppercase,
  # or initial-cap); the rest of the name may be any string of
  # upper or lower-case letters.  Currently only the
  # English names ("Feb", "Dec", etc.) and the Portuguese names ("Fev", "Dez")
  # are recognized.

  # At most one of these fires, since each replaces the whole string by digits:
  sub(/^(jan|Jan|JAN)[A-Za-z]*$/, "01", mo);
  sub(/^(fev|Fev|FEV|feb|Feb|FEB)[A-Za-z]*$/, "02", mo);
  sub(/^(mar|Mar|MAR)[A-Za-z]*$/, "03", mo);
  sub(/^(abr|Abr|ABR|apr|Apr|APR)[A-Za-z]*$/, "04", mo);
  sub(/^(may|May|MAY|mai|Mai|MAI)[A-Za-z]*$/, "05", mo);
  sub(/^(jun|Jun|JUN)[A-Za-z]*$/, "06", mo);
  sub(/^(jul|Jul|JUL)[A-Za-z]*$/, "07", mo);
  sub(/^(ago|Ago|AGO|aug|Aug|AUG)[A-Za-z]*$/, "08", mo);
  sub(/^(set|Set|SET|sep|Sep|SEP)[A-Za-z]*$/, "09", mo);
  sub(/^(out|Out|OUT|oct|Oct|OCT)[A-Za-z]*$/, "10", mo);
  sub(/^(nov|Nov|NOV)[A-Za-z]*$/, "11", mo);
  sub(/^(dez|Dez|DEZ|dec|Dec|DEC)[A-Za-z]*$/, "12", mo);

  # Pad a single digit, then reject anything that is not "01" .. "12":
  if (mo ~ /^[1-9]$/) { mo = ("0" mo); }
  if (mo !~ /^(0[1-9]|1[0-2])$/) { return "00"; }
  return mo;
}

function pud_format_date(df,dtsep,mdsep,tmsep) {
  # Formats a date {df[1..6]} as
  #   "{YEAR}/{MONTH}/{DAY}-{HOUR}:{MIN}:{SEC}",
  # where "/", "-" and ":" are replaced by {dtsep,mdsep,tmsep},
  # respectively.
  #
  # This procedure makes no assumption about the fields
  # {df[1..6]}; they are copied into the result as they are.

  return sprintf("%s%s%s%s%s%s%s%s%s%s%s",
                 df[1], dtsep, df[2], dtsep, df[3],
                 mdsep,
                 df[4], tmsep, df[5], tmsep, df[6]);
}

# DATE/TIMESTAMP CONVERSION

function pud_timestamp_from_date(df,   yr,mo,dy,hr,mi,sc,bug,epdy,ts) {
  # Converts a GMT date {df[1..6]} into a number of seconds elapsed since the
  # standard UNIX epoch (Jan 1, 1970 00:00:00).
  #
  # The year {yr = df[1]} must be a number in 1901 .. 2099.
  #
  # The month {mo = df[2]} must be an integer in {1 .. 12}.
  #
  # The day {dy = df[3]} must be a number.  It may be zero or negative,
  # and does not need be within the month's valid day range.  Thus the
  # following dates are equivalent:
  #
  #   "31 Feb 1979" = "03 Mar 1979"
  #   "31 Feb 1980" = "02 Mar 1980",
  #   "00 Mar 1980" = "29 Feb 1980"
  #   "-1 Jan 1970" = "30 Dec 1969"
  #   "35 Dec 1990" = "04 Jan 1991"
  #
  # The hour {hr = df[4]} must be a number in {0 .. 23}.
  #
  # The minute {mi = df[5]} must be a number in {0 .. 59}.
  #
  # The second {sc = df[6]} must be a number in {0 .. 59}.
  #
  # This procedure does not use UTC, but rather a simpler TAI-like
  # date standard, where all days have precisely 24*60*60 seconds.  In
  # particular, it ignores the UTC leap seconds, and does not allow
  # "60" as a value for the seconds parameter {sc}.
  #
  # Thus, the number of seconds returned for an input date in the
  # range 1972-2012 will be up to 25 seconds less than the result one
  # would obtain if the same input were interpreted as an UTC date.
  # For dates after 2012 the difference is unpredictable, since the
  # UTC leap seconds are known only six months in advance.
  #
  # In any case, the resulting second count must be between -2177452800
  # (1901-01-01 00:00:00) and 4102444799 (2099-12-31 23:59:59) inclusive.
  # (The current code does not account for the fact that years divisible
  # by 100 but not by 400 are *not* leap years.)
  #
  # Arguments that are not numeric are converted to zero silently,
  # but the procedure returns 9999999999 (ten nines) if any
  # argument or the computed result is outside the corresponding
  # range stated above.

  bug = 9999999999;

  # Make sure the month table exists even if our {BEGIN} has not run yet:
  pud_init_month_offset();

  # Grab the fields, ensure they are numeric:
  yr = df[1]+0; mo = df[2]+0; dy = df[3]+0;
  hr = df[4]+0; mi = df[5]+0; sc = df[6]+0;

  # Check the fields:
  if ((yr < 1901) || (yr > 2099)) { return bug; }
  # A fractional month would index {pud_month_offset} with a bogus key:
  if ((mo < 1) || (mo > 12) || (mo != int(mo))) { return bug; }
  if ((hr < 0) || (hr > 23)) { return bug; }
  if ((mi < 0) || (mi > 59)) { return bug; }
  if ((sc < 0) || (sc > 59)) { return bug; }

  # Computes the number {epdy} of days from the epoch to 00:00 January 1 of {yr}:
  # All years in 1901..2099 that are divisible by 4 (including 2000) are leap years.
  # The constant 17 is the number of leap days between 1901 and 1970.
  epdy = 365*(yr - 1970) + int((yr - 1901)/4) - 17;

  # Adds the number of days from 00:00 January 1 of {yr} to
  # 00:00 of month {mo} of that year:
  epdy = epdy + pud_month_offset[mo];
  if (((yr % 4) == 0) && (mo > 2)) { epdy = epdy + 1; }

  # Add days from beginning of month to date:
  epdy = epdy + (dy-1);

  # Day numbers of 1901-01-01 and 2099-12-31, respectively:
  if ((epdy < -25202) || (epdy > 47481)) { return bug; }

  # Convert days to seconds and add the time,
  # assuming days of exactly {24*60*60} secs:
  ts = ((epdy*24 + hr)*60 + mi)*60 + sc;

  return ts;
}

function pud_date_from_timestamp(ts,tz,df,   yr,mo,dy,hr,mi,sc,q,mlen) {
  # Converts a timestamp {ts} (seconds since the UNIX standard epoch)
  # to separate year, month, day, hour, minute and second values,
  # which are returned in {df[1..6]}, in that order.
  # The argument {df} must be declared as array by the caller.
  #
  # If {tz} is not empty, it must have the format "{TZSIGN}{HOFF}{MOFF}",
  # where {TZSIGN} (mandatory) is "+" or "-", {HOFF} is a two-digit hour
  # offset, and {MOFF} is a two-digit minute offset (both 00 to 99).
  # The values "{TZSIGN}{HOFF}" times 3600 and "{TZSIGN}{MOFF}" times 60
  # are added to the timestamp before conversion.
  #
  # This procedure does not use UTC but rather a TAI-like system,
  # where all days have precisely 24*60*60 seconds.  In particular, it
  # ignores the UTC leap seconds, so the result returned for an input
  # timestamp in 1972-2012 will be up to 25 seconds greater than the result
  # one would obtain if the same timestamp were split into an UTC date.
  # For dates after 2012 the error may be larger, depending on
  # the rotation of the Earth and the whim of international bodies.
  # Unlike UTC, this procedure never generates "60" as the seconds field.
  #
  # The timestamp (after timezone addition)
  # must be between -2177452800 (1901/01/01-00:00:00)
  # and 4102444799 (2099/12/31-23:59:59) inclusive.
  #
  # Returns {"9999" "99" "99" "99" "99" "99"} if the input {ts}
  # is not an integer, if {tz} is malformed, or if the adjusted
  # timestamp is outside this range.

  # Start from the error value, so that every early return reports failure:
  df[1] = "9999"; df[2] = "99"; df[3] = "99";
  df[4] = "99";   df[5] = "99"; df[6] = "99";

  # Make sure the month table exists even if our {BEGIN} has not run yet:
  pud_init_month_offset();

  if (ts !~ /^[-+]?[0-9]+$/) { return; }
  # Force a numeric value, so that a {ts} passed as a string is
  # compared numerically (not lexicographically) by the range check:
  ts = ts + 0;

  if (tz != "") {
    if (tz !~ /^[-+][0-9][0-9][0-9][0-9]$/) { return; }
    # The sign of {tz} applies to both the hour and the minute offsets:
    ts = ts + substr(tz, 1, 3)*3600 + (substr(tz, 1, 1) substr(tz, 4, 2))*60;
  }

  if ((ts < -2177452800) || (ts > 4102444799)) { return; }

  # Change to 1901/01/01-00:00:00 as the epoch, so that {ts} is non-negative:
  ts = ts + 2177452800;

  # Assumes days are always 24*60*60 seconds long:
  sc = ts % 60; ts = int((ts - sc)/60);
  mi = ts % 60; ts = int((ts - mi)/60);
  hr = ts % 24; ts = int((ts - hr)/24);

  # Now {ts} is whole days from the 1901 epoch to the day in question, exclusive.

  # Account for whole leap cycles (4-year periods ending with leap year)
  # from the 1901 epoch to current date, exclusive:
  q = int(ts/1461);
  ts = ts - q*1461;
  yr = 1901 + q*4;

  # Account for whole years elapsed within leap cycle.  The last day of
  # the cycle (Dec 31 of the leap year) would give {q = 4}; it belongs
  # to the fourth year, not to a fifth one:
  q = int(ts/365);
  if (q == 4) { q = 3; }
  ts = ts - q*365;
  yr = yr + q;

  # Now {ts} is days from Jan/01 of {yr} to current date, exclusive.
  mo = 1;
  while (1) {
    # Get number of days {mlen} of month {mo}:
    mlen = pud_month_offset[mo+1] - pud_month_offset[mo];
    if (((yr % 4) == 0) && (mo == 2)) { mlen = mlen + 1; }
    if (ts < mlen) { break; }
    mo = mo + 1;
    ts = ts - mlen;
  }
  dy = ts + 1;

  # Format and return:
  df[1] = sprintf("%04d", yr);
  df[2] = sprintf("%02d", mo);
  df[3] = sprintf("%02d", dy);
  df[4] = sprintf("%02d", hr);
  df[5] = sprintf("%02d", mi);
  df[6] = sprintf("%02d", sc);
}