#! /usr/bin/gawk -f
# Last edited on 2012-05-23 19:05:12 by stolfi

# To be included with -f by gawk programs.

BEGIN {
  # Global table {pud_month_offset[1..13]}: element {[m]} is the number of
  # days between Jan 1 and the first day of month {m} in a non-leap year;
  # element {[13]} is the length of a whole non-leap year (365), so that
  # {pud_month_offset[m+1] - pud_month_offset[m]} is the length of month {m}.
  split("0 31 59 90 120 151 181 212 243 273 304 334 365", pud_month_offset, " ");
  # NOTE(review): this banner is written to standard output at startup of
  # every program that includes this library; presumably a reminder left
  # by the author that the library has known bugs -- confirm before removing.
  printf "**BUGGY - FIX**\n";
}

# DATE PARSING AND FORMATTING

function pud_parse_date(dt,deftz,df,  nf,f,yr,mo,dy,tm,hr,mi,sc,tz)
{
  # Parses a date {dt} in Unix format and saves its fields into {df[1..6]}.
  # The argument {df} must be declared as array by the caller.
  # Expects one of the following formats
  #  "{DAY} +{MONTH} +{YEAR} +{HOUR}:?{MIN}[:?{SEC}] +[{TZOFF}]"
  #  "{MONTH} +{DAY} +{HOUR}:?{MIN}[:?{SEC}] +[{TZOFF}] {YEAR}"
  #  "{YEAR}( +|-){MONTH}( +|-){DAY}( +|-){HOUR}:?{MIN}[:{SEC}][ *{TZOFF}]"
  # The argument {dt} may optionally begin with the day of the week
  # (see {pud_remove_day_of_week}), and may contain timezone comment
  # codes in parentheses, e.g. "(GMT)" or "(PST)", which are discarded.
  # Tabs, commas and slashes are treated as blanks.
  # The colons in the time part are optional, but must be either all
  # present or all absent.
  # 
  # LIMITS
  # 
  # The {YEAR} must be a four-digit number in {1900 .. 2099}.
  # 
  # The {MONTH} may be a one- or two-digit number in {1 .. 12}, or 
  # an alphabetic month name (as accepted by {pud_month_number}).
  #
  # The {DAY} must be a one- or two-digit number in {1 .. 31}. 
  # It need not be within the month's day range, so "30 Feb" and "31 Apr" 
  # are OK.
  # 
  # The {HOUR} must be a one- or two-digit number in {0 .. 23}.
  # (The patterns below also tolerate "24", which is passed through
  # unchanged unless the timezone adjustment normalizes it.)
  # 
  # The minute {MIN} must be a one- or two-digit number in {0 .. 59}.
  # 
  # The second {SEC}, if present, must be a one- or two-digit number
  # in {0 .. 60}. Note that "60" is valid only in some time standards
  # and only in a few specific instants (such as the leap seconds of
  # UTC); but the procedure does not check these constraints. If {SEC}
  # is missing, the procedure assumes {SEC = "00"}.
  # 
  # If the time is given without colons, {HOUR} and {MIN} (and {SEC},
  # if present) must have exactly two digits each.
  # 
  # If the timezone offset {TZOFF} is missing and {deftz} is not
  # empty, assumes {TZOFF = deftz}. If {TZOFF} is missing and {deftz}
  # is empty, assumes {TZOFF = "+0000"}. In any case, {TZOFF} must
  # have the format "{TZSIGN}{HOFF}{MOFF}", where {TZSIGN} (mandatory)
  # is "+" or "-", {HOFF} is two-digit hour offset in {00 .. 12}, and {MOFF} is a
  # two-digit minute offset in {00 .. 59}. The signed values 
  # "{TZSIGN}{HOFF}" and "{TZSIGN}{MOFF}" are subtracted from {HOUR}
  # and {MIN}, respectively. The {DAY}, {HOUR} and {MIN} values are
  # then adjusted so that {HOUR} remains between 0 and 23 and {MIN} is
  # between 0 and 59. Note that {DAY} may become zero or negative, or
  # may exceed 31, as a result of this adjustment.
  # 
  # On success, the fields {df[1..6]} will be set to {YEAR MONTH DAY
  # HOUR MIN SEC}. The {MONTH}, {DAY}, {HOUR}, {MIN} and {SEC} fields
  # will be zero-padded to two digits; but because of the timezone
  # adjustment, the {DAY} may have a minus sign. 
  # 
  # In case of error (unrecognized format, unknown month name, bad
  # time or bad timezone offset), the procedure sets {df[1..6]} to 
  # {"9999" "99" "99" "99" "99" "99"}.
  
  # Initialize {df} with the error result, so that any early return reports failure:
  df[1] = "9999"; df[2] = "99"; df[3] = "99";
  df[4] = "99"; df[5] = "99"; df[6] = "99";

  # Default default timezone:
  if (deftz == "") { deftz = "+0000"; }

  # Discard timezone comments and the leading day of the week:
  gsub(/[(][A-Z][A-Z]+[)]/, " ", dt);
  dt = pud_remove_day_of_week(dt);

  # Turn tabs, commas and slashes into blanks BEFORE collapsing blanks,
  # so that the single-blank patterns below see exactly one blank
  # between fields (e.g. for "23 May 2012, 19:05"):
  gsub(/[,\/\011]/, " ", dt);
  gsub(/ +/, " ", dt);
  sub(/^ /, "", dt);
  sub(/ $/, "", dt);

  nf = split(dt,f," ");
  if (dt ~ /^(0?[1-9]|[12][0-9]|30|31) (0?[1-9]|1[0-2]|[a-zA-Z][a-zA-Z]+) (19|20)[0-9][0-9]( |$)/)
    { # Format "{DAY} {MONTH} {YEAR} [{TIME} [{TZOFF}]]":
      if ((nf < 3) || (nf > 5)) { return; }
      dy = f[1]; mo = pud_month_number(f[2]); yr = f[3];
      if (nf >= 4) { tm = f[4]; } else { tm = "00:00:00"; }
      if (nf >= 5) { tz = f[5]; } else { tz = deftz; }
    }
  else if (dt ~ /^(0?[1-9]|1[0-2]|[a-zA-Z][a-zA-Z]+) (0?[1-9]|[12][0-9]|30|31) .* (19|20)[0-9][0-9]$/)
    { # Format "{MONTH} {DAY} {TIME} [{TZOFF}] {YEAR}":
      if ((nf < 3) || (nf > 5)) { return; }
      mo = pud_month_number(f[1]); dy = f[2]; yr = f[nf];
      if (nf >= 4) { tm = f[3]; } else { tm = "00:00:00"; }
      if (nf >= 5) { tz = f[4]; } else { tz = deftz; }
    }
  else if (dt ~ /^(19|20)[0-9][0-9][- ](0?[1-9]|1[0-2]|[a-zA-Z][a-zA-Z]+)[- ](0?[1-9]|[12][0-9]|30|31)[- ]/)
    { # Format "{YEAR}-{MONTH}-{DAY} {TIME}[{TZOFF}]". The blank-separated
      # field count is meaningless here (the separators may be hyphens),
      # so the string is consumed piece by piece instead:
      yr = substr(dt,1,4); dt = substr(dt,6);
      if (! (match(dt, /^([0-9]+|[a-zA-Z]+)[- ]/))) { return; }
      mo = pud_month_number(substr(dt,1,RLENGTH-1)); dt = substr(dt,RLENGTH+1);
      if (! (match(dt, /^[0-9]+[- ]/))) { return; }
      dy = substr(dt,1,RLENGTH-1); dt = substr(dt,RLENGTH+1);
      # A trailing signed four-digit group is the timezone offset:
      if (match(dt, /[-+][0-9][0-9][0-9][0-9]$/))
        { tm = substr(dt,1,RSTART-1); 
          tz = substr(dt,RSTART);
        }
      else
        { tm = dt; tz = deftz; }
      gsub(/^[ ]+/, "", tm);
      gsub(/[ ]+$/, "", tm);
    }
  else
    { return; }

  # {pud_month_number} returns "00" for names it does not recognize:
  if (mo == "00") { return; }

  # Split the time into {hr,mi,sc}:
  if (tm ~ /^(0?[0-9]|1[0-9]|2[0-4])[:](0?[0-9]|[1-5][0-9])([:](0?[0-9]|[1-5][0-9]|60))?$/) 
    { # Colon-separated time, seconds optional:
      nf = split(tm,f,":");
      if ((nf < 2) || (nf > 3)) { return; }
      hr = f[1]; mi = f[2];
      if (nf >= 3) { sc = f[3]; } else { sc = "00"; }
    }
  else if (tm ~ /^([01][0-9]|2[0-4])([0-5][0-9])([0-5][0-9]|60)?$/)
    { # Compact time "{HH}{MM}" or "{HH}{MM}{SS}":
      hr = substr(tm,1,2); 
      mi = substr(tm,3,2); 
      if (length(tm) > 4) { sc = substr(tm,5,2); } else { sc = "00"; }
    }
  else
    { return; }

  # Validate the timezone offset and convert the local time to GMT:
  if (tz !~ /^[-+](0[0-9]|1[0-2])([0-5][0-9])$/) { return; }
  if (tz !~ /^[-+]0000$/) 
    { # The sign is glued to each part so that both are subtracted as signed numbers:
      mi = mi - (substr(tz,1,1) substr(tz,4,2));
      hr = hr - substr(tz,1,3);
      while (mi < 0) { hr = hr - 1; mi = mi + 60; }
      while (mi >= 60) { hr = hr + 1; mi = mi - 60; }
      while (hr < 0) { dy = dy - 1; hr = hr + 24; }
      while (hr >= 24) { dy = dy + 1; hr = hr - 24; }
    }
  
  # Format and return:
  df[1] = sprintf("%04d",yr); df[2] = sprintf("%02d",mo); df[3] = sprintf("%02d",dy);
  df[4] = sprintf("%02d",hr); df[5] = sprintf("%02d",mi); df[6] = sprintf("%02d",sc);
}
  
function pud_remove_day_of_week(dt)
{ 
  # Returns {dt} with a leading alphabetic day-of-week name, if present,
  # and the comma that follows it, if present, replaced by a single blank.
  # Any blanks around the name are absorbed too.  If {dt} does not begin
  # with a day-of-week name, it is returned unchanged.
  # 
  # The name is any string that begins with a three-letter day-of-week
  # abbreviation (all lowercase or cap-initial) followed by any number
  # of lowercase letters or hyphens, e.g. "Mon", "monday", "Seg",
  # "segunda-feira".  Currently only the English and Portuguese
  # day-of-week abbreviations are recognized.  The "." in "S.b"/"s.b"
  # lets the pattern accept "Sab" as well as an accented "a".

  # The pattern is anchored at the start, so at most one substitution occurs:
  sub(/^ *(Mon|Tue|Wed|Thu|Fri|Sat|Sun|Seg|Ter|Qua|Qui|Sex|S.b|Dom|mon|tue|wed|thu|fri|sat|sun|seg|ter|qua|qui|sex|s.b|dom)[-a-z]*[,]? */, " ",dt);
  return dt;
}
  
function pud_month_number(mo)
{
  # Maps a month designation {mo} to a two-digit month number string
  # in {"01" .. "12"}, or to "00" if {mo} is not recognized.
  # 
  # A numeric {mo} must be a one- or two-digit number in {1 .. 12};
  # a single digit is padded with a leading zero.
  # 
  # An alphabetic {mo} is identified by its first three letters only,
  # which must be all lowercase, all uppercase, or initial-cap; the
  # rest of the name may be any string of upper- or lower-case
  # letters.  Currently only the English names ("Feb", "Dec", etc.)
  # and the Portuguese names ("Fev", "Dez", etc.) are recognized.
  
  # Alphabetic names, one guard per month:
  if (mo ~ /^(jan|Jan|JAN)[A-Za-z]*$/) { return "01"; }
  if (mo ~ /^(fev|Fev|FEV|feb|Feb|FEB)[A-Za-z]*$/) { return "02"; }
  if (mo ~ /^(mar|Mar|MAR)[A-Za-z]*$/) { return "03"; }
  if (mo ~ /^(abr|Abr|ABR|apr|Apr|APR)[A-Za-z]*$/) { return "04"; }
  if (mo ~ /^(may|May|MAY|mai|Mai|MAI)[A-Za-z]*$/) { return "05"; }
  if (mo ~ /^(jun|Jun|JUN)[A-Za-z]*$/) { return "06"; }
  if (mo ~ /^(jul|Jul|JUL)[A-Za-z]*$/) { return "07"; }
  if (mo ~ /^(ago|Ago|AGO|aug|Aug|AUG)[A-Za-z]*$/) { return "08"; }
  if (mo ~ /^(set|Set|SET|sep|Sep|SEP)[A-Za-z]*$/) { return "09"; }
  if (mo ~ /^(out|Out|OUT|oct|Oct|OCT)[A-Za-z]*$/) { return "10"; }
  if (mo ~ /^(nov|Nov|NOV)[A-Za-z]*$/) { return "11"; }
  if (mo ~ /^(dez|Dez|DEZ|dec|Dec|DEC)[A-Za-z]*$/) { return "12"; }

  # Numeric months: pad a lone digit, pass a valid two-digit value through:
  if (mo ~ /^[1-9]$/) { return ("0" mo); }
  if (mo ~ /^(0[1-9]|1[0-2])$/) { return mo; }

  # Anything else is not a month:
  return "00";
}

function pud_format_date(df,dtsep,mdsep,tmsep,   datepart,timepart)
{
  # Returns the date {df[1..6]} formatted as the string
  # "{YEAR}/{MONTH}/{DAY}-{HOUR}:{MIN}:{SEC}", except that the
  # separators "/", "-" and ":" are replaced by {dtsep}, {mdsep}
  # and {tmsep}, respectively.
  # 
  # The fields {df[1..6]} are inserted as they are; the procedure
  # makes no assumption about their contents.
  
  # Year, month and day joined by {dtsep}:
  datepart = sprintf("%s%s%s%s%s", df[1], dtsep, df[2], dtsep, df[3]);
  # Hour, minute and second joined by {tmsep}:
  timepart = sprintf("%s%s%s%s%s", df[4], tmsep, df[5], tmsep, df[6]);
  return sprintf("%s%s%s", datepart, mdsep, timepart);
}

# DATE/TIMESTAMP CONVERSION

function pud_timestamp_from_date(df,   yr,mo,dy,hr,mi,sc,bug,epdy,ts)
{
  # Converts a GMT date {df[1..6]} into a number of seconds elapsed since the
  # standard UNIX epoch (Jan 1, 1970 00:00:00).
  # 
  # The year {yr = df[1]} must be a number in 1901 .. 2099.
  #
  # The month {mo = df[2]} must be a number in {1 .. 12}.
  # 
  # The day {dy = df[3]} must be a number. It may be zero or negative,
  # and does not need be within the month's valid day range. Thus the
  # following dates are equivalent:
  # 
  #   "31 Feb 1979" = "03 Mar 1979"
  #   "31 Feb 1980" = "02 Mar 1980",
  #   "00 Mar 1980" = "29 Feb 1980"
  #   "-1 Jan 1970" = "30 Dec 1969"
  #   "35 Dec 1990" = "04 Jan 1991"
  #
  # The hour {hr = df[4]} must be a number in {0 .. 24}. Hour 24 is
  # taken to mean hour 0 of the following day, so "24:00:00" of one
  # day is the same instant as "00:00:00" of the next.
  # 
  # The minute {mi = df[5]} must be a number in {0 .. 59}.
  # 
  # The second {sc = df[6]} must be a number in {0 .. 59}.
  # 
  # This procedure does not use UTC, but rather a simpler TAI-like
  # date standard, where all days have precisely 24*60*60 seconds. In
  # particular, it ignores the UTC leap seconds, and does not allow
  # "60" as a value for the seconds parameter {sc}. 
  # 
  # Thus, the number of seconds returned for an input date in the
  # range 1972-2012 will be up to 25 seconds less than the result one
  # would obtain if the same input were interpreted as an UTC date.
  # For dates after 2012 the difference is unpredictable, since the
  # UTC leap seconds are known only six months in advance.
  # 
  # In any case, the resulting second count must be between -2177452800
  # (1901-01-01 00:00:00) and 4102444799 (2099-12-31 23:59:59) inclusive.
  # (The code does not account for the fact that years divisible
  # by 100 but not by 400 are *not* leap years; there is no such
  # year in the accepted range.)
  # 
  # Arguments that are not numeric are converted to zero silently,
  # but the procedure returns 9999999999 (ten nines) if any
  # argument or the computed result is outside the corresponding
  # range stated above.
  
  bug = 9999999999;
  
  # Grab the fields, ensure they are numeric: 
  yr = df[1]+0; mo = df[2]+0; dy = df[3]+0;
  hr = df[4]+0; mi = df[5]+0; sc = df[6]+0;

  # Check the fields ({dy} is deliberately unconstrained):
  if ((yr < 1901) || (yr > 2099)) { return bug; }
  if ((mo < 1) || (mo > 12)) { return bug; }
  if ((hr < 0) || (hr > 24)) { return bug; }
  if ((mi < 0) || (mi > 59)) { return bug; }
  if ((sc < 0) || (sc > 59)) { return bug; }
   
  # Computes the number {epdy} of days from the epoch to 00:00 January 1 of {yr}:
  # All years in 1901..2099 that are divisible by 4 (including 2000) are leap years.
  # The term {int((yr - 1901)/4)} counts the leap days since 1901, and 17
  # is the number of them that fall before 1970.
  epdy = 365*(yr - 1970) + int((yr - 1901)/4) - 17;
  
  # Adds the number of days from 00:00 January 1 of {yr} to
  # 00:00 of month {mo} of that year:
  epdy = epdy + pud_month_offset[mo];
  if (((yr % 4) == 0) && (mo > 2)) { epdy = epdy + 1; }
  
  # Add days from beginning of month to date:
  epdy = epdy + (dy-1);

  # Convert days to seconds and add the time, 
  # assuming days of exactly {24*60*60} secs:
  ts = ((epdy*24 + hr)*60 + mi)*60 + sc;

  # Reject results outside 1901-01-01 00:00:00 .. 2099-12-31 23:59:59.
  # The test is on {ts} rather than on {epdy} so that an out-of-range
  # {dy} or an hour of 24 cannot push the result past either limit:
  if ((ts < -2177452800) || (ts > 4102444799)) { return bug; }
  return ts;  
}

function pud_date_from_timestamp(ts,tz,df,  yr,mo,dy,hr,mi,sc,q,mlen)
{
  # Converts a timestamp {ts} (seconds since the UNIX standard epoch) 
  # to separate year, month, day, hour, minute and second values,
  # which are returned in {df[1..6]}, in that order.
  # The argument {df} must be declared as array by the caller. 
  # 
  # If {tz} is not empty, it must have the format "{TZSIGN}{HOFF}{MOFF}",
  # where {TZSIGN} (mandatory) is "+" or "-", {HOFF} is two-digit hour offset, and {MOFF} is a
  # two-digit minute offset (both 00 to 99). The values
  # "{TZSIGN}{HOFF}" times 3600 and "{TZSIGN}{MOFF}" times 60 are added to the
  # timestamp before conversion. 
  # 
  # This procedure does not use UTC but rather a TAI-like system,
  # where all days have precisely 24*60*60 seconds. In particular, it 
  # ignores the UTC leap seconds, so the result returned for an input
  # timestamp in 1972-2012 will be up to 25 seconds greater than the result
  # one would obtain if the same timestamp were split into an UTC date.
  # For dates after 2012 the error may be larger, depending on  
  # the rotation of the Earth and the whim of international bodies.
  # Unlike UTC, this procedure never generates "60" as the seconds field.
  # 
  # The timestamp (after timezone addition) 
  # must be between -2177452800 (1901/01/01-00:00:00)
  # and 4102444799 (2099/12/31-23:59:59) inclusive.
  # 
  # Returns {"9999" "99" "99" "99" "99" "99"} if the input {ts} 
  # is not an optionally signed string of decimal digits, if {tz} is
  # not empty and is malformed, or if the adjusted timestamp is
  # outside the range above.

  # Initialize {df} with the error result, so that any early return reports failure:
  df[1] = "9999"; df[2] = "99"; df[3] = "99";
  df[4] = "99"; df[5] = "99"; df[6] = "99";

  if (ts !~ /^[-+]?[0-9]+$/) { return; }
  
  if (tz != "")
    { # Any sign followed by four digits is a valid offset:
      if (tz !~ /^[-+][0-9][0-9][0-9][0-9]$/) { return; }
      # The sign is glued to each part so that both are added as signed numbers:
      ts = ts + substr(tz,1,3)*3600 + (substr(tz,1,1) substr(tz,4,2))*60;
    }
  
  if ((ts < -2177452800) || (ts > 4102444799)) { return; }
    
  # Change to 1901/01/01-00:00:00 as the epoch, so that {ts} is
  # non-negative and the remainders below are non-negative too:
  ts = ts + 2177452800;
  
  # Assumes days are always 24*60*60 seconds long:
  sc = ts % 60; ts = int((ts - sc)/60);
  mi = ts % 60; ts = int((ts - mi)/60);
  hr = ts % 24; ts = int((ts - hr)/24);
  
  # Now {ts} is whole days from the 1901 epoch to the day in question, exclusive.
  # Account for whole leap cycles (4-year periods ending with leap year)
  # from the 1901 epoch to current date, exclusive:
  q = int(ts/1461); 
  ts = ts - q*1461; yr = 1901 + q*4;
  # Account for whole years elapsed within leap cycle. The last day of
  # the cycle (Dec 31 of the leap year) would give {q == 4}, but it
  # still belongs to the fourth year:
  q = int(ts/365); if (q == 4) { q = 3; }
  ts = ts - q*365; yr = yr + q;
  # Now {ts} is days from Jan/01 of {yr} to current date, exclusive.
  mo = 1;
  while (1) 
    { # Get number of days {mlen} of month {mo}:
      mlen = pud_month_offset[mo+1] - pud_month_offset[mo];
      if (((yr % 4) == 0) && (mo == 2)) { mlen = mlen + 1; }
      if (ts < mlen) { break; }
      mo = mo + 1;
      ts = ts - mlen;
    }
  dy = ts + 1;
  
  # Format and return:
  df[1] = sprintf("%04d",yr); df[2] = sprintf("%02d",mo); df[3] = sprintf("%02d",dy);
  df[4] = sprintf("%02d",hr); df[5] = sprintf("%02d",mi); df[6] = sprintf("%02d",sc);
}