#! /bin/gawk -f
# Last edited on 2009-02-14 16:07:27 by stolfi

BEGIN {
  # {abort} is negative while no error has been detected; the error
  # reporting functions set it to the exit status before calling {exit},
  # and the main rule and the END rule test it so that processing stops.
  abort = -1;
  USAGE = ( \
    "format-table \\\n" \
    "  [ -v colsep={SEP} ] [ -v outsep={SEP} ] \\\n" \
    "  < INFILE \\\n" \
    "  > OUTFILE" \
  );
  
  # OBSOLETE -- see {txtable-format}.
  
  # Reads a text file {INFILE} and formats the tables contained therein.
  # A /table/ consists of zero or more consecutive /table lines/, all
  # with the same number of fields.
  # 
  # A table line can be an /info line/ or a /rule line/. An info
  # line is a line that contains one or more occurrences of the {SEP}
  # string or regular expression (which defaults to '|'). A rule
  # line is a line that contains only spaces (or tabs) or characters
  # in '[-=+]', with at least one '+'. The fields are the
  # substrings of the line delimited by the occurrences of {SEP}
  # or '+', respectively.
  # NOTE(review): the rule-line pattern used by the main rules of this
  # script accepts only '-', '+', spaces and tabs; '=' is not matched.
  # 
  # The raw length of the first field of the table defines the
  # minimum width of the first column.  Otherwise, the
  # leading and trailing blanks in each field are stripped off.
  # That done, each field in a rule line must be either empty or
  # a string of one or more '-'s.
  #
  # On output, each field in the table gets extended
  # so that all fields in the same table column have the same 
  # width:
  #   
  #   * In an info line, each field gets padded with exactly one blank on
  #     the right and one blank on the left. Then further blanks are
  #     added to the left or to the right, depending on whether the
  #     column is assumed to be numeric or alphabetic.
  #   
  #   * In a rule line, each field gets extended with blanks if
  #     it is blank or empty, otherwise it is replicated and trimmed
  #     to fill the whole column width.
  # 
  # A column is assumed to be numeric or alphabetic if the last
  # definite info field in it is numeric or alphabetic, respectively.
  # An info field is considered numeric iff it contains a digit not
  # followed by a letter. An info field is considered to be alphabetic
  # if it contains a letter not followed by a digit. Otherwise the
  # field is indefinite (neither numeric nor alphabetic). If the
  # column contains no definite field, it is assumed to be alphabetic.
  # 
  # On output, the fields of info lines are separated by the {outsep}
  # string (which defaults to {colsep} if {colsep} is a single character,
  # else to "|").  The fields of rule lines are separated with '+'s,
  # padded with '-'s to the same length as {outsep}.
  
  # Set {insep} to {colsep}, or to the default if not specified:
  insep = colsep;
  if (insep == "") { insep = "[|]"; }
  # '+' is reserved as the field delimiter of rule lines:
  if (insep ~ /[+]/) { arg_error(("\"insep\" should not contain '+'")); }

  # Set {outsep} to the default if not specified:
  if (outsep == "") { outsep = (length(colsep) == 1 ? colsep : "|"); }
  if (outsep == "") { arg_error(("\"outsep\" must be nonempty")); }
  
  # Pad the output rule separator {outrulesep} to match {outsep}.
  # Each pass adds one '-' on each side of the '+', so the string may
  # end up one character too long; the {substr} below trims the excess.
  outrulesep = "+";
  while (length(outrulesep) < length(outsep)) { outrulesep = ("-" outrulesep "-"); }
  outrulesep = substr(outrulesep, 1, length(outsep));

  # Set the field separator for $0 to {insep}:
  FS = insep;
  
  # Regular expressions matching numeric and alphabetic fields.
  # A field is numeric if no letter follows its last digit, and
  # alphabetic if no digit follows its last letter:
  letters = "a-zA-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞàáâãäåæçèéêëìíîïðñòóôõößøùúûüýþÿ";
  numeric_re = ("[0-9][^" letters "]*$");
  alphabetic_re = ("[" letters "][^0-9]*$");

  # Start with an empty table:
  clear_table(); 
}

# An error was flagged earlier: stop reading input.
abort >= 0 { exit abort }

# Info line: contains at least one match of {insep}; split it there.
match($0, insep) {
  save_table_line(0, $0, insep);
  next
}

# Rule line: only '-', '+' and blanks, with at least one '+'; split at '+'.
$0 ~ /^[- \011]*[+][-+ \011]*$/ {
  save_table_line(1,  $0, "[+]");
  next
}

# Any other line ends the current table: flush the table, then
# copy the line to the output unchanged.
{
  output_table();
  clear_table();
  print $0;
  next
}

# At end of input, flush the pending table unless an error occurred.
END {
  if (abort < 0)
    { output_table(); }
  else
    { exit abort; }
}
  
function clear_table()
{
  # Resets all the global state describing the current table, leaving
  # an empty table. The current input line $0 is not touched.

  # Scalar counters:
  ntlins = 0;  # Number of lines saved in the current table.
  ncols = 0;   # Number of columns in the current table.
  minwd1 = 0;  # Minimum width of the first column.

  # Per-table arrays, all emptied:
  delete tbl;  # {tbl[i,j]} is field {j} in {1..ncols} of line {i} in {0..ntlins-1}.
  delete typ;  # {typ[i]} is the type of line {i} (0 = info, 1 = rule).
  delete cwd;  # {cwd[j]} is the minimum width of column {j} in {1..ncols}.
  delete ckd;  # {ckd[j]} is the kind of column {j} ("N" = numeric, "A" = alpha).
}

function save_table_line(t,lin,sep,  nf,fld,j,fj,wdj)
{ 
  # Appends the line {lin} to the current table as a line of type {t}
  # (0 = info line, 1 = rule line).
  #
  # Splits {lin} at separator {sep} into fields {fld[1..nf]}.
  # If the current table is not empty but the 
  # number of fields {nf} is different from {ncols}, 
  # prints the current table and clears it. 
  # 
  # In any case, if the table is empty, sets {ncols} to {nf} and
  # initializes the column widths; the first column starts with the
  # raw (unstripped) length of the first field, the others with zero.
  # 
  # Then strips blanks around each field {fld[1..nf]} and appends them
  # as a new line of the table {tbl}, with type {t}. Info fields get one
  # blank of padding on each side and may update the column kind {ckd};
  # rule fields get every remaining character replaced by '-'.

  # Split the line given by the caller (not necessarily $0):
  nf = split(lin, fld, sep);
  # The callers only pass lines that contain the separator:
  if (nf <= 1) { prog_error(("qua?")); }
  # A change in the field count starts a new table:
  if (nf != ncols) 
    { output_table(); clear_table(); }
  
  if (ntlins == 0)
    { # Initialize {ncols} and {cwd[1..ncols]} :
      ncols = nf;
      cwd[1] = length(fld[1]);
      for (j = 2; j <= ncols; j++) { cwd[j] = 0; }
    }
  else
    { # Check {ncols} (internal consistency check only): 
      if (ncols != nf) { prog_error(("duh?")); }
    }
  for (j = 1; j <= ncols; j++) 
    { # Strip leading and trailing blanks from field {fld[j]}: 
      fj = fld[j];
      gsub(/^[ \011]+/, "", fj);
      gsub(/[ \011]+$/, "", fj);
      if (t == 0)
        { # Info field, add minimum padding: 
          fj = (" " fj " ");
          # Update the column kind; an indefinite field leaves it as is,
          # so the last definite field of the column decides:
          if (is_numeric(fj)) { ckd[j] = "N"; }
          if (is_alphabetic(fj)) { ckd[j] = "A"; }
        }
      else
        { # Rule field, make sure that it has only '-'s:
          gsub(/./, "-", fj);
        }
      # Save field in table: 
      tbl[ntlins,j] = fj;
      # Update the column width:
      wdj = length(fj);
      if (wdj > cwd[j]) { cwd[j] = wdj; }
    }

  # Save type and increment line counts:
  typ[ntlins] = t;
  ntlins++;
}
  
function output_table(   row,col,cell,width,is_rule)
{
  # Writes all saved lines of the current table to standard output,
  # with every field extended to the width of its column.
  # The current line $0 is left untouched. Does nothing if the
  # table has no lines.

  if (ntlins == 0) { return; }

  for (row = 0; row < ntlins; row++)
    { is_rule = (typ[row] != 0);
      for (col = 1; col <= ncols; col++)
        { # Fields after the first are preceded by a separator:
          if (col > 1)
            { printf "%s", (is_rule ? outrulesep : outsep); }

          cell = tbl[row,col];
          width = cwd[col];

          if (is_rule)
            { if (cell == "")
                { # Empty rule field: fill the column with blanks.
                  cell = sprintf("%*s", width, cell);
                }
              else
                { # Non-empty rule field: keep doubling it until it
                  # covers the column, then cut it to the exact width.
                  while (length(cell) < width) { cell = (cell cell); }
                  cell = substr(cell, 1, width);
                }
            }
          else if ((col in ckd) && (ckd[col] != "A"))
            { if (ckd[col] == "N")
                { # Numeric column: blanks go on the left.
                  cell = sprintf("%*s", width, cell);
                }
              else
                { prog_error(("was?")); }
            }
          else
            { # Alphabetic or undetermined column: blanks go on the right.
              cell = sprintf("%-*s", width, cell);
            }

          printf "%s", cell;
        }
      printf "\n";
    }
}

function is_numeric(x)
{
  # Returns 1 if {x} matches the regular expression {numeric_re},
  # and 0 otherwise.
  if (match(x, numeric_re)) { return 1; }
  return 0;
}

function is_alphabetic(x)
{
  # Returns 1 if {x} matches the regular expression {alphabetic_re},
  # and 0 otherwise.
  if (match(x, alphabetic_re)) { return 1; }
  return 0;
}

function arg_error(msg)
{
  # Reports a command line argument error: writes {msg} and the
  # usage text {USAGE} to the standard error stream, then sets
  # {abort} to 1 and exits with that status.
  printf("%s:%d: ** %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  printf("usage:\n  %s\n", USAGE) > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{ # Reports an error in the input data: writes {msg} and then the
  # offending input line $0, both prefixed with the file name and
  # line number, to the standard error stream. Then sets {abort}
  # to 1 and exits with that status.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  # The format has three conversions, so it needs all three arguments:
  printf "%s:%d: «%s»\n", FILENAME, FNR, $0 > "/dev/stderr";
  abort = 1; exit abort;
}
  
function prog_error(msg)
{
  # Reports an internal (program logic) error: writes {msg}, prefixed
  # with the current file name and line number, to the standard error
  # stream, then sets {abort} to 1 and exits with that status.
  printf("%s:%d: ** %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}