#! /usr/bin/gawk -f
# Last edited on 2002-03-05 03:30:53 by stolfi

BEGIN {
  # Input: lines with exactly 13 whitespace-separated fields,
  #
  #   SEC USEQ FNUM UNIT LINE TRAN FPOS RPOS PFRST PLAST WORD HEAD TAG
  #   1   2    3    4    5    6    7    8    9     10    11   12   13
  #
  # The file is assumed to be sorted by HEAD, then some fields,
  # then USEQ (FNUM and UNIT), NLIN (field 5), TRAN.  All entries that
  # share a HEAD are printed together in compact form, suppressing
  # fields that repeat from the previous entry.
  #
  # Options (set with `-v'):
  #   title      page title and heading text;
  #   showWords  if true, the WORD is printed too, else locations only;
  #   showWeak   if false, weak matches (TAG == 0) are omitted.

  # Exit status recorded by data_error(); negative means "none recorded".
  abort = -1;

  # Command-line synopsis (not printed anywhere in this file).
  usage = ( \
    "cat INFILE \\\n" \
    "  | format-soc \\\n" \
    "      -v title=STRING \\\n" \
    "      -v showWords=BOOL \\\n" \
    "      -v showWeak=BOOL \\\n" \
    "  > OUTFILE " \
  );

  # Defaults for options that were not given on the command line.
  if (title == "")
    { title = "Occurrences of selected words"; }
  if (showWords == "")
    { showWords = 1; }
  if (showWeak == "")
    { showWeak = 1; }

  # Layout of the entry list under each head.
  out_line_width = 56;   # Not counting indentation
  out_line_indent = 2;

  output_html_header(title);
}

# Stop processing input as soon as an exit status has been recorded in `abort'.
abort >= 0 { exit abort }

# Main rule: handles every record that has exactly 13 fields.
NF == 13 {
  # Unpack the record.  USEQ, FPOS, RPOS, PFRST and PLAST are unpacked
  # for completeness but are not used when formatting.
  sec   = $1;
  useq  = $2;
  fnum  = $3;
  unit  = $4;
  nlin  = $5;
  tran  = $6;
  fpos  = $7;
  rpos  = $8;
  pfrst = $9;
  plast = $10;
  word  = $11;
  head  = $12;
  tag   = $13;

  # Weak matches (tag 0) are dropped unless `showWeak' is set.
  if ((! showWeak) && (tag == 0)) { next; }

  # A new HEAD ends the previous group and forgets all "previous" values,
  # so that the first entry of the group is printed in full.
  if (head != ohead) {
    if (ohead != "") { finish_head(); }
    start_head(head);
    ohead = head;
    osec = otag = oword = ofnum = ounit = onlin = otran = "";
  }

  # Separator between this entry and the preceding one in the same group.
  if (otran != "") { print_comma(); }

  # Switch the font colour when the tag changes.
  if (tag != otag) {
    if (otag != "") { close_font(); }
    open_font(tag_color(tag));
    otag = tag;
  }

  # Print the word only when requested and only when it changes.
  if (showWords && (word != oword)) {
    print_word(word);
    oword = word;
  }

  # Print each location component only when it differs from the previous
  # entry.  A change at one level clears the lower levels so they are
  # printed again.  The print_* helpers inspect ounit/onlin/otran BEFORE
  # they are updated here, so the statement order below matters.
  if (sec != osec) {
    print_sec(sec);
    osec = sec;
    ofnum = ounit = onlin = otran = "";
  }
  if (fnum != ofnum) {
    print_fnum(fnum);
    ofnum = fnum;
    ounit = onlin = otran = "";
  }
  if (unit != ounit) {
    print_unit(unit);
    ounit = unit;
    onlin = otran = "";
  }
  if (nlin != onlin) {
    print_nlin(nlin);
    onlin = nlin;
    otran = "";
  }

  # The transcriber code is always printed.
  print_tran(tran);
  otran = tran;
  next;
}

END {
  # With a recorded error status, exit with it and skip the trailer;
  # otherwise close the last open group and finish the HTML page.
  if (abort < 0) {
    if (ohead != "") { finish_head(); }
    output_html_trailer();
  } else {
    exit abort;
  }
}

function output_html_header(title)
{
  # Writes the opening part of the HTML page: the document head with
  # TITLE, the body tag with its colours, an H1 heading repeating TITLE,
  # and the opening of the bold preformatted region used for the entries.
  printf "<html>\n";
  printf "<head>\n";
  printf "<title>%s</title>\n", title;
  printf "</head>\n";
  printf "<body bgcolor=\"#000000\" text=\"#aaaaaa\">\n";
  printf "\n";
  printf "<h1>%s</h1>\n", title;
  printf "\n";
  printf "<pre><b>\n";
}

function start_head(head,   n,i,esc)
{
  # Prints the banner line that opens the group of entries for HEAD:
  # "=== HEAD " padded with "=" to 60 visible columns, in its own colour,
  # followed by a fresh indented line for the entries.
  #
  # HEAD is data text, so the HTML metacharacters "&", "<" and ">" are
  # escaped before printing ("&" first, so the entities just produced
  # are not escaped again).  "\\&" in a gsub replacement is a literal "&".
  esc = head;
  gsub(/&/, "\\&amp;", esc);
  gsub(/</, "\\&lt;", esc);
  gsub(/>/, "\\&gt;", esc);
  printf "<font color=\"#22aaff\">=== %s ", esc;
  # The bar length is computed from the raw text, since an entity
  # displays as a single character.  5 = length of "=== " plus the
  # blank after HEAD.  If HEAD is too long, no bar is printed.
  n = 60 - 5 - length(head);
  for (i = 1; i <= n; i++) { printf "="; }
  printf "</font>\n";
  newline();
}

function tag_color(tag)
{
  # Returns the six-digit RGB hex colour (without "#") used for entries
  # with the given TAG.  Tag 0 is the one the main rule treats as a
  # weak match.
  if (tag == 0) { return "00aa00"; }
  if (tag == 1) { return "ff7700"; }
  if (tag == 2) { return "ffbb00"; }
  # Any other tag used to fall off the end and return "", which made
  # open_font() emit the invalid attribute color="#".  Fall back to the
  # page's default text colour instead.
  return "aaaaaa";
}

function open_font(clr)
{
  # Emits an opening <font> tag; CLR is an RGB hex colour without the "#".
  printf("<font color=\"#%s\">", clr);
}

function print_sec(sec)
{
  # Prints the section name SEC followed by a blank, wrapped in a
  # <font> element coloured according to sec_color(SEC).
  open_font(sec_color(sec));
  print_string(sprintf("%s ", sec));
  close_font();
}

function sec_color(sec)
{
  # Returns the RGB hex colour (without "#") used to print section SEC.
  # Unrecognised sections get a common default colour.
  if (sec == "her") { return "00cc00"; }
  if (sec == "str") { return "ff00ff"; }
  if (sec == "ast") { return "00ddff"; }
  if (sec == "bio") { return "ffaa66"; }
  if (sec == "unk") { return "aaaaaa"; }
  if (sec == "zod") { return "ffff00"; }
  return "88ccff";
}

function print_fnum(fnum)
{
  # Prints the FNUM field as is; it never takes a prefix.
  print_string(fnum);
}

function print_unit(unit,   sep)
{
  # Prints the UNIT field.  It gets a "." prefix when the global `ounit'
  # is empty (as it is after the caller has cleared it); otherwise it is
  # printed bare.
  sep = (ounit == "") ? "." : "";
  print_string(sep unit);
}

function print_nlin(nlin,   sep)
{
  # Prints the NLIN field.  It gets a "." prefix when the global `onlin'
  # is empty (as it is after the caller has cleared it); otherwise it is
  # printed bare.
  sep = (onlin == "") ? "." : "";
  print_string(sep nlin);
}

function print_tran(tran,   sep)
{
  # Prints the TRAN field.  It gets a ":" prefix when the global `otran'
  # is empty (as it is after the caller has cleared it); otherwise it is
  # printed bare.
  sep = (otran == "") ? ":" : "";
  print_string(sep tran);
}

function print_word(word)
{
  # Prints WORD enclosed in parentheses.
  print_string(sprintf("(%s)", word));
}

function close_font()
{
  # Emits the closing tag that matches open_font().
  printf("</font>");
}

function print_comma()
{
  # Prints the separator between two entries.  The comma always stays on
  # the current line; it is followed by a blank if one more column is
  # available, and by a line break otherwise.
  printf ",";
  cur_line_width += 1;
  if (cur_line_width + 1 <= out_line_width) {
    printf " ";
    cur_line_width += 1;
  } else {
    newline();
  }
}

function print_string(str,   n,esc)
{
  # Prints the data text STR on the current output line, first breaking
  # the line if STR would not fit within `out_line_width' columns.  No
  # break is made on an empty line, so an over-long STR is still printed.
  #
  # The width is measured on the raw text, while the text actually
  # written has the HTML metacharacters "&", "<" and ">" replaced by
  # entities, so that data cannot corrupt the page markup.
  n = length(str);
  if ((cur_line_width > 0) && (cur_line_width + n > out_line_width))
    { newline(); }
  # "&" must be escaped first, so the entities produced below are not
  # escaped again.  "\\&" in a gsub replacement is a literal "&".
  esc = str;
  gsub(/&/, "\\&amp;", esc);
  gsub(/</, "\\&lt;", esc);
  gsub(/>/, "\\&gt;", esc);
  printf "%s", esc;
  cur_line_width += n;
}

function newline()
{
  # Starts a new output line, indented by `out_line_indent' blanks,
  # and resets the running column count.
  cur_line_width = 0;
  printf("\n%*s", out_line_indent, "");
}

function finish_head()
{
  # Ends the current group of entries: closes the tag-colour font if one
  # is open (`otag' non-empty), then ends the line and leaves a blank line.
  if (otag != "") {
    close_font();
  }
  printf("\n\n");
}

function output_html_trailer(title)
{
  # Closes the elements opened by output_html_header().
  # The TITLE parameter is not used.
  printf "</b></pre>\n";
  printf "</body>\n";
  printf "</html>\n";
}

# Any non-empty record not consumed by an earlier rule is malformed.
$0 ~ /./ { data_error("bad line type"); }

function data_error(msg)
{
  # Reports MSG on the standard error stream, prefixed with the current
  # input line number, records exit status 1 in `abort', and exits.
  abort = 1;
  printf("*** line %d: %s\n", FNR, msg) > "/dev/stderr";
  exit abort;
}