#! /usr/bin/gawk -f
# Last edited on 2003-10-29 23:41:27 by stolfi

BEGIN {
  
  # This script reads a list of paper citations, obtained from the ISI
  # WebOfScience or from the NEC CiteSeer, and brings it closer to the
  # Bibtex format.
  # 
  # The input file must be a sequence of bibliography entries. Each
  # entry must have been broken up into at least three major chunks:
  # author names, title, and publication data, each on a line by
  # itself.
  # 
  # Each entry starts with a line "@TYPE{KEY," where TYPE is any
  # Bibtex entry type, or "isiitem", or "necitem"; and KEY is a
  # bibtex-like citation key, or "??". The final comma is optional.
  # Each entry ends with a line containing a single brace "}".
  # 
  # Within an entry there may be one or more field lines, of the form
  # " FIELD = {VALUE},", where FIELD is an alpha string (field name)
  # and VALUE is anything.  The final comma is optional.  
  # 
  # Each field FIELD may be any Bibtex field name, or one of several
  # special field names starting with "isi" or "nec" for information
  # still in the ISI or NEC format. The main fields in this class are
  # 
  #   (isi|nec)author =   {Author list in the ISI/NEC format}
  #   (isi|nec)where =    {Journal, volume, date, pages, etc.}
  #   (isi|nec)title =    {Raw title, possibly with wrong capitalization.}
  #
  # Between the entries there may be lines of the form "! FIELD = {VALUE}".
  # Each of these lines is converted into a comment, and all subsequent 
  # entries will be provided with a new field "FIELD = {VALUE}". A line
  # "! FIELD" (with empty value) cancels the previous "! FIELD" directive.
  # 
  # The output should be closer to the Bibtex format, except that
  # there are no commas between the entry fields.
  
  # Exit status requested by an error.  A negative value means that no
  # abort is pending; the rule right after this block and the END rule
  # exit only when it is non-negative.
  abort = -1;
  
  # Fields to be added to each entry:
  split("", addfield);  # Indexed by active field names
}

# Once an abort status has been recorded (non-negative), stop reading input.
abort >= 0 {
  exit abort;
}

/^[ ]*([#%]|$)/ {
  # Comment or blank line: copy it to the output.  A leading "#" comment
  # marker is turned into the Bibtex-style "%".  Only the marker itself is
  # rewritten; any "#" inside the comment text is left alone.
  lin = $0;
  if (match(lin, /^[ ]*[#]/))
    { # The match starts at column 1 and its last character is the "#":
      lin = (substr(lin,1,RLENGTH-1) "%" substr(lin,RLENGTH+1));
    }
  print lin;
  next;
}

/^[@][a-zA-Z]+[ ]*[{]/ {
  # Entry header line "@TYPE{...": copied to the output unchanged.
  print $0;
  next;
}

/^[!][ ]*[a-z]+[ ]*[=][ ]*[{].*[}][, ]*$/ {
  # Directive "! FIELD = {VALUE}": from now on, the end-of-entry rule
  # appends "FIELD = {VALUE}" to every entry.  The directive itself is
  # echoed as a "%" comment.
  lin = $0;
  if (! match(lin, /^[!][ ]*[a-z]+[ ]*[=]/)) { data_error(("prog error")); }

  # Split at the "=" sign, which is the last character of the match:
  fname = substr(lin,RSTART,RLENGTH-1);
  fval = substr(lin,RSTART+RLENGTH);

  # Reduce the name to the bare field name.  The "=" is already gone, so
  # only the leading "!" and the surrounding blanks remain to be removed;
  # the blanks before the "=" must go too, otherwise a later "! FIELD"
  # directive would not find the key in addfield.
  gsub(/^[!][ ]*/, "", fname);
  gsub(/[ ]+$/, "", fname);

  # Strip the enclosing braces (and optional final comma) from the value:
  gsub(/^[ ]*[{][ ]*/, "", fval);
  gsub(/[ ]*[}][, ]*$/, "", fval);

  addfield[fname] = fval;
  data_warning(("start adding \"" fname " = {" fval "}\"..."));
  print ("% " $0); 
  next;
}

/^[!][ ]*[a-z]+[ ]*$/ {
  # Directive "! FIELD": stop appending FIELD to the entries that follow.
  # The directive itself is echoed as a "%" comment.
  fname = $0;
  sub(/^[!][ ]*/, "", fname);
  sub(/[ ]*$/, "", fname);
  if (! (fname in addfield))
    { data_warning(("field \"" fname "\" is not currently being added")); }
  else
    { data_warning(("stop adding \"" fname " = {" addfield[fname] "}\"."));
      delete addfield[fname];
    }
  print ("% " $0); 
  next;
}

/^[ ]*[a-z]+[ ]*[=][ ]*{.*}[, ]*$/ {
  # Any "FIELD = {VALUE}" line: drop the trailing blanks and commas from
  # the record, then fall through so that the rules below see the
  # cleaned-up line.
  sub(/[ ,]+$/, "");
}

/^[ ]*isiauthor[ ]*[=][ ]*{/ {
  # ISI-format author list: rewrite it as a Bibtex "author" field.
  # A comma left in the result means that some name was not split.
  lin = convert_isi_authors($0);
  if (index(lin, ",") > 0)
    { data_warning(("commas not removed from authors")); }
  print lin;
  next;
}

/^[ ]*necauthor[ ]*[=][ ]*{/ {
  # NEC-format author list: attempt to rewrite it as a Bibtex "author"
  # field.  A comma left in the result means that some name was not split.
  lin = convert_nec_authors($0);
  if (index(lin, ",") > 0)
    { data_warning(("commas not removed from authors")); }
  print lin;
  next;
}

# WAS: /isiwhere[ ]*[=][ ]*{.* +[-A-Z0-9()]*[0-9][-0-9A-Z()]*[ ]*[:]/

/^[ ]*isiwhere[ ]*[=][ ]*{/ {
  # ISI publication data: split into journal, volume, issue number and
  # page/date fields.  If the "isiwhere" field is still present in the
  # result, the splitter could not parse it.
  lin = split_isi_where($0);
  if (lin ~ /isiwhere[ ]*[=]/)
    {
      data_warning(("split_isi_where failed"));
    }
  print lin;
  next;
}

/^[ ]*necwhere[ ]*[=][ ]*{/ {
  # NEC publication data: split into whatever fields can be recognized.
  # If a "necwhere" field is still present in the result, part of the
  # text could not be parsed.
  lin = split_nec_where($0);
  if (lin ~ /necwhere[ ]*[=]/)
    {
      data_warning(("split_nec_where failed\n   lin = \"" lin "\""));
    }
  print lin;
  next;
}
 
/^[ ]*necvolpagedate *[=]/ {
  # Volume (possibly with number), page range and date in NEC format.
  # Not parsed yet: the line is copied to the output unchanged.
  print $0;
  next;
}

/^[ ]*isijournal[ ]*[=]/ {
  # ISI journal abbreviation: replace it with the full journal name
  # when the abbreviation is a known one.
  print fix_isi_journal_name($0);
  next;
}
 
/^[ ]*isipagedate *[=]/ {
  # ISI page range and date: split into pages, day, month and year.
  # If the "isipagedate" field is still present in the result, the
  # splitter could not parse it.
  lin = split_isi_pagedate($0);
  if (lin ~ /isipagedate[ ]*[=]/)
    {
      data_warning(("split_isi_pagedate failed"));
    }
  print lin;
  next;
}

/^[ ]*type[ ]*[=][ ]*[a-z]+[ ]*$/ {
  # "type = WORD" with an unbraced value: accepted and copied as is.
  print $0;
  next;
}

/^[ ]*month[ ]*[=][ ]*[jfmasond][a-z][a-z]([-][-][jfmasond][a-z][a-z]|)[ ]*$/ {
  # "month = mmm" or "month = mmm--mmm" with an unbraced three-letter
  # month name (a predefined string): accepted and copied as is.
  print $0;
  next;
}

# Braced fields that need no further processing here, with or without an
# "isi"/"nec" prefix: the usual Bibtex fields, plus the fields that are
# still partially in ISI or NEC format.  All are copied as they are.
/^[ ]*(isi|nec|)(author|title|journal|day|year|volume|number|pages)[ ]*[=][ ]*[{].*[}][ ]*$/ ||
/^[ ]*(isi|nec|)(publisher|note|comment|abstract|booktitle|series|school)[ ]*[=][ ]*[{].*[}][ ]*$/ ||
/^[ ]*(isi|nec|)(institution|howpublished|edition|editor|cites|citations|url)[ ]*[=][ ]*[{].*[}][ ]*$/ ||
/^[ ]*(isi|nec|)(where|pagedate|instdate|volpagedate|key|ctxurl|docurl|misc)[ ]*[=][ ]*[{].*[}][ ]*$/ {
  print $0;
  next;
}

/^[ ]*[}][ ]*$/ {
  # Closing brace of an entry.  Before echoing it, emit every field that
  # is currently registered through a "! FIELD = {VALUE}" directive.
  for (fname in addfield)
    {
      printf("  %s = {%s}\n", fname, addfield[fname]);
    }
  print $0;
  next;
}
 
# Catch-all: any line that reaches this point matched none of the
# rules above, and is reported as an error.
{
  data_error(("unrecognized line format"));
  print $0;
  next;
}

END {
  # Propagate a pending abort status, if any, as the exit status.
  if (abort >= 0)
    {
      exit abort;
    }
}

function convert_isi_authors(lin,  one,out)
{
  # Turns an "isiauthor = {A, B, ...}" line into a Bibtex line
  # "  author = {A' and B' and ...}", where each comma-separated name
  # is rewritten by convert_one_isi_author.
  
  # Keep only the text between the braces:
  sub(/^[ ]*isiauthor[ ]*[=][ ]*{[ ]*/, "", lin);
  sub(/[ ]*}[ ]*$/, "", lin);
  
  # "et al." becomes the unknown-author marker:
  gsub(/[ ]*et +al[.]?[ ]*/, "??", lin);
  
  # Peel off one name per comma; whatever remains is the last name:
  out = "";
  while (match(lin, /[ ]*[,][ ]*/))
    { one = substr(lin,1,RSTART-1);
      lin = substr(lin,RSTART+RLENGTH);
      out = ( out convert_one_isi_author(one) " and " );
    }
  return ( "  author = {" out convert_one_isi_author(lin) "}" );
}

function convert_one_isi_author(aui, lst,inis,tmp,pre,suf,aub,part,n,i)
{
  # Converts one author name from the ISI form "LASTNAME INITIALS"
  # (e.g. "STOLFI J") to the Bibtex form "J. Stolfi".
  #
  # Returns "??" for a name consisting only of "?"; returns the name
  # unchanged (with a warning) when it is a single word or does not end
  # with a blank followed by upper-case initials.  Changes made to the
  # last name are reported through data_warning.
  
  # Remove extraneous spaces:
  gsub(/^[ ]*/, "", aui); 
  gsub(/[ ]*$/, "", aui); 
  gsub(/[ ][ ][ ]*/, " ", aui); 
  
  if (match(aui, /^[?]+$/))
    { return "??"; }
  else if (match(aui, /^[A-Za-z?]+$/))
    { data_warning(("single-word author name \"" aui "\""));
      return aui;
    }
  else if (match(aui, /[ ][A-Z]+$/))
    { 
      # Separate last name from initials:
      lst = substr(aui,1,RSTART-1);
      inis = substr(aui,RSTART);

      # Remove surrounding spaces:
      gsub(/^[ ]*/, "", lst); gsub(/[ ]*$/, "", lst); 
      gsub(/^[ ]*/, "", inis); gsub(/[ ]*$/, "", inis);
      
      # Try to fix last name capitalization:
      if (match(lst, /^[A-Z][-'A-Z]+$/))
        { 
          # Last name is all caps, try to fix capitalization:
          # Watch out for some special patterns:
          if (match(lst, /^[M][C][A-Z][-A-Za-z]+$/))
            { tmp = ("Mc" substr(lst,3,1) tolower(substr(lst,4))); }
          else if (match(lst, /^[O]['][A-Z][-A-Za-z]+$/))
            { tmp = ("O'" substr(lst,3,1) tolower(substr(lst,4))); }
          else if (match(lst, /^[D]['][A-Z][-A-Za-z]+$/))
            { tmp = ("D'" substr(lst,3,1) tolower(substr(lst,4))); }
          else
            { tmp = ""; }

          if (tmp != "")
            { data_warning(("last name \"" lst "\" changed to \"" tmp "\""));
              lst = tmp;
            } 
          else
            { # Capitalize each hyphen-separated part on its own, so that
              # "SMITH-JONES" becomes "Smith-Jones":
              n = split(lst, part, "-");
              lst = "";
              for (i = 1; i <= n; i++)
                { lst = (lst (i > 1 ? "-" : "") \
                    substr(part[i],1,1) tolower(substr(part[i],2)));
                }
            }
        }
      else if (match(lst, /^([Dd][aeiou']|[Vv][ao]n([ ]*[Dd]e[rn]|))[ ]*[A-Z][a-z]/))
        { 
          # Name with detachable prefix, normalize spacing.
          # The match ends with the first two letters of the main part:
          pre = substr(lst,1,RLENGTH-2);
          suf = substr(lst,RLENGTH-1);
          if (match(pre, /^[Vv][ao]n[Dd]e[rn]$/)) 
            { pre = (substr(pre,1,3) " " substr(pre,4,3)); }
          tmp = (pre " " suf);
          gsub(/[ ][ ][ ]*/, " ", tmp); 
          gsub(/['][ ][ ]*/, "'", tmp); 
          if (! match(suf, /^[A-Z][a-z]+$/))
            { data_warning(("strange last name \"" lst "\"")); }
          else if (tmp != lst)
            { data_warning(("last name \"" lst "\" changed to \"" tmp "\""));
              lst = tmp;
            } 
        } 
      else if (match(lst, /^([O])[ ]*[A-Z][a-z]/))
        { 
          # Irish last name without the "'" ("OBrien" or "O Brien").
          # Drop the blanks that may follow the "O", so that both
          # spellings are turned into "O'Brien":
          suf = substr(lst,2);
          gsub(/^[ ]+/, "", suf);
          tmp = ("O'" suf);
          if (! match(suf, /^[A-Z][a-z]+$/))
            { data_warning(("strange Irish name \"" lst "\"")); }
          else if (tmp != lst)
            { data_warning(("last name \"" lst "\" changed to \"" tmp "\""));
              lst = tmp;
            } 
        } 
      else if (match(lst, /^(La|Mc|O[']|)[A-Z][a-z]+([-][A-Z][a-z]+|)$/))
        { # OK. 
        }
      else
        { data_warning(("strange last name \"" lst "\"")); } 

      # Insert periods after each initial: 
      inis = gensub(/([A-Z])/, "\\1. ", "g", inis);
      
      # Put back initials before last name:
      aub = (inis " " lst);

      # Remove extraneous spaces:
      gsub(/^[ ]*/, "", aub); 
      gsub(/[ ]*$/, "", aub); 
      gsub(/[ ][ ][ ]*/, " ", aub); 

      return aub;
    }
  else
    { data_warning(("author name \"" aui "\" garbled"));
      return aui; 
    }
}

function convert_nec_authors(lin,  one,conv,out,inverted)
{
  # Turns a "necauthor = {...}" line into a Bibtex line
  # "  author = {A and B and ...}".  The names in the list may be
  # written either as "Stolfi, J." or as "J. Stolfi"; each name found
  # is normalized by convert_one_nec_author.
  
  # Keep only the text between the braces:
  sub(/^[ ]*necauthor[ ]*[=][ ]*{[ ]*/, "", lin);
  sub(/[ ]*}[ ]*$/, "", lin);
  
  # "et al." becomes the unknown-author marker:
  gsub(/[ ]*et +al[.]?[ ]*/, "??", lin);
  
  # Guess the format: assume "Stolfi, J." when a comma follows an
  # " and ", or when the whole list contains exactly one comma.
  inverted = ((lin ~ / and +.*[,]/) || (lin ~ /^[^,]*[,][^,]*$/));
  
  # Use ";" instead of " and ", drop a comma right before a ";", and
  # terminate the list with a ";" to simplify the parsing below:
  gsub(/ and /, ";", lin);
  gsub(/[,][ ]*[;]/, ";", lin);
  lin = ( lin ";" );
  
  # Take one author at a time from the front of the list:
  out = "";
  while (lin != "")
    { one = "";

      # "Guibas, L.J., Stolfi, J., ...": last name, comma, initials,
      # then a comma or semicolon.
      if (inverted && match(lin, /^[ ]*([A-Za-z][-'A-Za-zçãéá ]*[,][ ]*[A-Z][A-Z. ]*[.][,;])[ ]*/))
        { one = substr(lin,RSTART,RLENGTH);
          lin = substr(lin,RSTART+RLENGTH);
          # Drop the delimiter that ends the name: 
          gsub(/[ ]*[,;]+[ ]*$/, "", one);
          # Move the initials in front of the last name:
          if (! match(one, /[,]/)) { data_error(("duh?")); }
          one = ( substr(one,RSTART+1) " " substr(one,1,RSTART-1) );
        }

      # "L.J. Guibas, J. Stolfi, ...": everything up to the next comma
      # or semicolon.
      if ((one == "") && match(lin, /^[ ]*([A-Za-z][-.'A-Za-zçãéá ]*[,;])[ ]*/))
        { one = substr(lin,RSTART,RLENGTH);
          lin = substr(lin,RSTART+RLENGTH);
          # Drop the delimiter that ends the name: 
          gsub(/[ ]*[,;]+[ ]*$/, "", one);
        }

      # Neither form applies: take the rest of the list as one name.
      if (one == "")
        { data_warning(("cannot parse name \"" lin "\""));
          one = lin; 
          lin = "";
          # Drop the delimiter that ends the name: 
          gsub(/[ ]*[,;]+[ ]*$/, "", one);
        }
      
      # Append the converted name to the result:
      conv = convert_one_nec_author(one);
      out = ( out ( out == "" ? "" : " and ") conv );
    }
  return ( "  author = {" out "}" );
}

function convert_one_nec_author(aui,  head,tail)
{
  # Normalizes one author name taken from a NEC author list: puts a
  # blank after every period and moves any initials found at the end of
  # the name to its front.  Returns "??" for a name made only of "?",
  # and a single-word name unchanged (with a warning).
  
  # A blank after each period, then tidy up the spacing:
  gsub(/[.]/, ". ", aui);
  sub(/^[ ]+/, "", aui); 
  sub(/[ ]+$/, "", aui); 
  gsub(/[ ][ ]+/, " ", aui); 
  
  if (aui ~ /^[?]+$/)
    { return "??"; }
  if (aui ~ /^[A-Za-z?]+$/)
    { data_warning(("single-word author name \"" aui "\""));
      return aui;
    }
    
  # The "@@" marks the original start of the name; once an initial has
  # been taken from right after it, every word has been rotated and the
  # loop must stop.
  aui = ( "@@ " aui );
  
  while (match(aui, /[A-Z][.][ ]*$/))
    { 
      # Cut off the trailing initial and put it in front:
      head = substr(aui,1,RSTART-1);
      tail = substr(aui,RSTART);
      sub(/^[ ]+/, "", head); sub(/[ ]+$/, "", head); 
      sub(/^[ ]+/, "", tail); sub(/[ ]+$/, "", tail);
      aui = (tail " " head);
      
      if (head ~ /[@]$/) { break; }
    }

  # Remove the marker and tidy up the spacing again:
  gsub(/[ ]*[@]+[ ]*/, " ", aui);
  sub(/^[ ]+/, "", aui); sub(/[ ]+$/, "", aui);
  
  return aui;
}
 
function split_isi_where(lin,  fld)
{
  # Splits an "isiwhere = {JOURNAL VOL (NUM): REST}" line into the
  # fields "isijournal", "volume", "number" (when present) and
  # "isipagedate", one per line.  When the text has neither the
  # "VOL (NUM):" nor the "VOL:" form, it is returned as an "isiwhere"
  # field again; an empty text yields an empty result.
  
  # Keep only the text between the braces:
  sub(/^[ ]*isiwhere[ ]*[=][ ]*{[ ]*/, "", lin);
  sub(/[ ]*}[ ]*$/, "", lin);
  
  split("", fld);
  
  # Journal, volume, issue number in parentheses, colon, rest:
  if (match(lin, \
      /^([^{}]*) +([A-Z]*[0-9]+[A-Z]*) +[(]([-0-9]+)[)][ ]*[:][ ]*([^{}]*)$/, \
      fld))
    { return ( \
        "  isijournal = {" fld[1] "}\n" \
        "  volume = {" fld[2] "}\n" \
        "  number = {" fld[3] "}\n" \
        "  isipagedate = {" fld[4] "}" \
      );
    }
  
  # Journal, volume, colon, rest:
  if (match(lin, \
      /^([^{}]*) +([A-Z]*[0-9]+[A-Z]*)[ ]*[:][ ]*([^{}]*)$/, \
      fld))
    { return ( \
        "  isijournal = {" fld[1] "}\n" \
        "  volume = {" fld[2] "}\n" \
        "  isipagedate = {" fld[3] "}" \
      );
    }
    
  # Not recognized: hand the text back as it came.
  if (lin == "") { return ""; }
  return ( "  isiwhere = {" lin "}" );
}
 
function split_nec_where(lin,  res,fld,sep)
{
  # Splits a "necwhere = {...}" line into as many fields as can be
  # recognized ("url", "year", "month", "volume", "number", "pages",
  # "type", "necinstdate", "necjournal", "necvolpagedate"), one per line.
  #
  # Each piece that is recognized is removed from the text and replaced
  # by a marker ("@YEAR@", "@MTH@", "@VOL@", "@PGS@"), so that the pieces
  # around it stay separated.  Any text that is still unparsed at the
  # end is returned in a "necwhere" field.
  
  # Strip initial "[ ]*necwhere = {" and final "}": 
  gsub(/^[ ]*necwhere[ ]*[=][ ]*{[ ]*/, "", lin);
  gsub(/[ ]*}[ ]*$/, "", lin);
  
  # Prepare for parsing:
  split("", fld); res = ""; sep = "";
  
  # Yank out any URLs (before fiddling with puncts!):
  while (match(lin, \
      /^([^{}]*)(http[:]|)([\/]*www[.][^ ,{}]*)[ ]*([^{}]*)$/, \
      fld))
    { res = ( res sep "  url = {{\\url{" fld[2] fld[3] "}}}" );
      sep = "\n";
      lin = ( fld[1] ". " fld[4] );
    }
    
  # Normalize punctuation:
  lin = remove_extra_punctuation(lin);
  
  # Yank out the year in parenthesis:
  if (match(lin, \
      /^([^{}]*)[(]([12][09][0-36-9][0-9])[)][ ]*([^{}]*)$/, \
      fld))
    { res = ( res sep "  year = {" fld[2] "}" );
      sep = "\n";
      lin = ( fld[1] ", @YEAR@, " fld[3] );
      lin = remove_extra_punctuation(lin);
    }
  
  # Yank out the month (in full):
  if (match(lin, \
      /^([^{}]*)[ ,;:.]([Jj]anuary|[Ff]ebruary|[Mm]arch|[Aa]pril|[Mm]ay|[Jj]une|[Jj]uly|[Aa]ugust|[Ss]eptember|[Oo]ctober|[Nn]ovember|[Dd]ecember)[ ]*[ ,;:.]([^{}]*)$/, \
      fld))
    { res = ( res sep "  month = {" fld[2] "}" );
      sep = "\n";
      lin = ( fld[1] ", @MTH@, " fld[3] );
      lin = remove_extra_punctuation(lin);
    }
  
  # Yank out the month (abbreviated; "May" is covered by the list above):
  if (match(lin, \
      /^([^{}]*)[ ,;:.]([Jj]an|[Ff]eb|[Mm]ar|[Aa]pr|[Jj]un|[Jj]ul|[Aa]ug|[Ss]ep[t]*|[Oo]ct|[Nn]ov|[Dd]ec)[ ]*[ ,;:.]([^{}]*)$/, \
      fld))
    { res = ( res sep "  month = {" fld[2] "}" );
      sep = "\n";
      lin = ( fld[1] ", @MTH@, " fld[3] );
      lin = remove_extra_punctuation(lin);
    }
  
  # Yank out year at end of field:
  if (match(lin, \
      /^([^{}]*)[ ,;:.][ ]*([12][09][0-36-9][0-9])[ ,;.]*$/, \
      fld))
    { res = ( res sep "  year = {" fld[2] "}" );
      sep = "\n";
      lin = ( fld[1] ", @YEAR@, " );
      lin = remove_extra_punctuation(lin);
    }
  
  # Yank out volume number and issue number:
  if (match(lin, \
      /^([^{}]*)[ ]+([0-9]+)[ ]*[(]([0-9]+)[)][ ]*([^{}]*)$/, \
      fld))
    { res = ( res sep \
        "  volume = {" fld[2] "}\n" \
        "  number = {" fld[3] "}" \
      );
      sep = "\n";
      lin = ( fld[1] ", @VOL@, " fld[4] );
      lin = remove_extra_punctuation(lin);
    }
  
  # Try to recognize technical reports (TR numbers look like page ranges).
  # First reduce the various spellings to "Technical Report",
  # "Research Report" or "Report", dropping a "No."/"N."/"Num."/"#"
  # that may precede the report number:
  gsub(/ *[Rr]elat[']*[ óo]rio[ ]*[Tt][']*[ ée]c[h]*nico */, " Technical Report ", lin);
  gsub(/ *[Tt]ech(nical|[.]*)[ ]*[Rr]ep(ort|t|[.]*) */, " Technical Report ", lin);
  gsub(/ *[Rr]es(earch|[.]*)[ ]*[Rr]ep(ort|t|[.]*) */, " Research Report ", lin);
  gsub(/ *Report[ ]+([Nn]o|[Nn]|[Nn]um|[#])[.]* */, " Report ", lin);
  if (match(lin, \
      /^[ ]*(Technical[ ]*|Research[ ]*|)Report[ ]+([-A-Z]*[-0-9]+)[ ]*([^{}]*)$/, \
      fld))
    { res = ( res sep \
        "  type = techreport\n" \
        "  number = {" fld[2] "}\n" \
        "  necinstdate = {" fld[3] "}" \
      );
      sep = "\n";
      lin = "";
    }
  
  # Yank out page range:
  if (match(lin, \
      /^([^{}]*)(pages|pp[.]*) *([0-9]+)[-]+([0-9]+)[ ]*([^{}]*)$/, \
      fld))
    { res = ( res sep "  pages = {" fld[3] "--" fld[4] "}" );
      sep = "\n";
      lin = ( fld[1] ", @PGS@ , " fld[5] );
      lin = remove_extra_punctuation(lin);
    }
  else if (match(lin, \
      /^([^{}]*)[ ]+([0-9]+)[-][-]([0-9]+)[ ]*([^{}]*)$/, \
      fld))
    { res = ( res sep "  pages = {" fld[2] "--" fld[3] "}" );
      sep = "\n";
      lin = ( fld[1] ", @PGS@ " fld[4] );
      lin = remove_extra_punctuation(lin);
    }
  else if (match(lin, \
      /^([^{}]*)[ ]+([0-9]+)[-]([0-9]+)[ ]*$/, \
      fld))
    { res = ( res sep "  pages = {" fld[2] "--" fld[3] "}" );
      sep = "\n";
      lin = ( fld[1] ", @PGS@ " );
      lin = remove_extra_punctuation(lin);
    }
  
  # No internal delimiters, assume it is a journal/proceedings.
  # The pattern also matches the empty string, so skip this step when
  # nothing is left (e.g. after a technical report took all the text);
  # otherwise an empty "necjournal" field would be produced.
  if ((lin != "") && match(lin, \
      /^[ .,;:]*([^.,;:(){}]*)[ .,;:]*$/, \
      fld))
    { res = ( res sep "  necjournal = {" fld[1] "}" );
      sep = "\n";
      lin = "";
    }
  
  # First phrase delimited by comma or semicolon is probably journal name:
  if (match(lin, \
      /^([^,;{}]*[a-zA-Z][.() ]*)[,;][ ]*([^{}]*)$/, \
      fld))
    { res = ( res sep \
        "  necjournal = {" fld[1] "}\n" \
        "  necvolpagedate = {" fld[2] "}" \
      );
      sep = "\n";
      lin = "";
    }
    
  # Remove markers of removed fields:
  gsub(/[@][A-Z]+[@]/, " ", lin);
  lin = remove_extra_punctuation(lin);
    
  # Give up: 
  if (lin != "") 
    { res = ( res sep "  necwhere = {" lin "}" ); sep = "\n"; lin = ""; }

  return res;
}

function remove_extra_punctuation(lin)
{
  # Returns a copy of the string "lin" in which runs of blanks and
  # punctuation marks (",", ";", ":", ".") have been reduced to a
  # canonical form.  The substitutions below depend on each other and
  # must be applied in this order.

  # Remove multiple punctuation, leave only the strongest:
  # However, leave a comma after a period:

  # A run holding two or more periods becomes a single ". ":
  gsub(/[ ,;:]*[.][ ,;:.]*[.]/, ". ", lin); 
  # A period followed, within the run, by a colon, semicolon or comma:
  gsub(/[ ,;:]*[.][ ,;:]*[:][ ,;]*/, ".: ", lin); 
  gsub(/[ ,;:]*[.][ ,;]*[;][ ,]*/, ".; ", lin); 
  gsub(/[ ,;:]*[.][ ,]*[,][ ]*/, "., ", lin);
  # Any period: drop the punctuation before it, leave one blank after it:
  gsub(/[ ,;:]*[.][ ]*/, ". ", lin); 
  
  # Runs without a period, by decreasing strength: colon, semicolon, comma:
  gsub(/[ ,;]*[:][ ,;:]*/, ": ", lin); 
  gsub(/[ ,]*[;][ ,;]*/, "; ", lin); 
  gsub(/[ ]*[,][ ,]*/, ", ", lin);
  
  # Remove puncts after bol, open brace, parenthesis:
  # (hyphens and blanks are removed there too)
  lin = gensub(/(^|[{(])[- ,;:.]+/, "\\1", "g", lin); 
  
  # Remove puncts (except ".") before close brace, parenthesis, eol:
  # (hyphens and blanks are removed there too)
  lin = gensub(/[- ,;:]+([)}]|$)/, "\\1", "g", lin); 
  
  return lin;
}

function fix_isi_journal_name(lin, tmp,hit)
{
  # Rewrites an "isijournal = {ABBREV}" line for the journal
  # abbreviations that are known:
  #
  #   * the two "Lecture Notes" series are replaced by three lines with
  #     the fields "booktitle" (unknown, "??"), "series" and "publisher";
  #
  #   * the other known abbreviations are replaced by the full journal
  #     name, and the field is renamed from "isijournal" to "journal".
  #
  # A line with an unknown abbreviation is returned unchanged.

  if (match(lin,/{LECT NOTES ARTIF INT}/))
    { # Things in LNAI are usually proceedings
      lin = \
        ( "  booktitle = {??}\n" \
          "  series = {Lecture Notes in Artificial Intelligence}\n" \
          "  publisher = {Springer}" \
        ); 
    }
  else if (match(lin,/{LECT NOTES COMPUT SC}/))
    { # Things in LNCS are usually proceedings
      lin = \
        ( "  booktitle = {??}\n" \
          "  series = {Lecture Notes in Computer Science}\n" \
          "  publisher = {Springer}" \
        ); 
    }
  else
    { tmp = lin;
    
      # Map journal names.  The number of substitutions made is counted
      # in "hit", rather than comparing the result with the input, so
      # that a name which maps to itself (such as "BIT") is recognized
      # as known too.
      hit = 0;

      hit += sub(/{ACM COMPUT SURV}/, "{ACM Computing Surveys}",  tmp);
      hit += sub(/{ACM SIGPLAN NOTICES}/, "{ACM SIGPLAN Notices}",  tmp);
      hit += sub(/{ACM T GRAPHIC}/, "{ACM Transactions on Graphics}",  tmp);
      hit += sub(/{ACM T MATH SOFTWARE}/, "{ACM Transactions on Mathematical Software}",  tmp);
      hit += sub(/{ACTA INFORM}/, "{Acta Informatica}",  tmp);
      hit += sub(/{ALGORITHMICA}/, "{Algorithmica}",  tmp);
      hit += sub(/{ANN OPER RES}/, "{Annals of Operations Research}",  tmp);
      hit += sub(/{ARS COMBINATORIA}/, "{Ars Combinatoria}",  tmp);
      hit += sub(/{ARTIF INTELL}/, "{Artificial Intelligence}",  tmp);
      hit += sub(/{ASTRON ASTROPHYS}/, "{Astronomy and Astrophysics}",  tmp);
      hit += sub(/{BIT}/, "{BIT}",  tmp);
      hit += sub(/{COMP GEOM-THEOR APPL}/, "{Computational Geometry - Theory and Applications}",  tmp);
      hit += sub(/{COMPUT AIDED DESIGN}/, "{Computer Aided Design}",  tmp);
      hit += sub(/{COMPUT AIDED GEOM D}/, "{Computer Aided Geometric Design}",  tmp);
      hit += sub(/{COMPUT GRAPH FORUM}/, "{Computer Graphics Forum}",  tmp);
      # In a sub() replacement "\\&" stands for a plain "&"; it takes
      # "\\\\\\&" to produce the "\&" that Bibtex/LaTeX needs:
      hit += sub(/{COMPUT GRAPH}/, "{Computers {\\\\\\&} Graphics}",  tmp);
      hit += sub(/{COMPUT VIS IMAGE UND}/, "{Computer Vision and Image Understanding}",  tmp);
      hit += sub(/{COMPUT VISION GRAPH}/, "{Computer Vision and Graphics}",  tmp);
      hit += sub(/{DISCRETE COMPUT GEOM}/, "{Discrete and Computational Geometry}",  tmp);
      hit += sub(/{IEEE COMPUT GRAPH}/, "{IEEE Computer Graphics}",  tmp);
      hit += sub(/{IEEE T CIRCUITS-I}/, "{IEEE Trans. on Circuits - I}",  tmp);
      hit += sub(/{IEEE T COMPUT AID D}/, "{IEEE Trans. on Computer Aided Design}",  tmp);
      hit += sub(/{IEEE T COMPUT}/, "{IEEE Trans. on Computers}",  tmp);
      hit += sub(/{IEEE T EDUC}/, "{IEEE Trans. on Education}",  tmp);
      hit += sub(/{IEEE T NEURAL NETWOR}/, "{IEEE Trans. on Neural Networks}",  tmp);
      hit += sub(/{IEEE T PATTERN ANAL}/, "{IEEE Trans. on Pattern Analysis and Machine Intelligence}",  tmp);
      hit += sub(/{INFORM PROCESS LETT}/, "{Information Processing Letters}",  tmp);
      hit += sub(/{INT J COMPUT GEOM AP}/, "{Int. J. of Computational Geometry and Applications}",  tmp);
      hit += sub(/{INT J ROBOT RES}/, "{Int. J. of Robot Research}",  tmp);
      hit += sub(/{J ACM}/, "{J. of the ACM}",  tmp);
      hit += sub(/{J ALGORITHM}/, "{J. of Algorithms}",  tmp);
      hit += sub(/{J APPROX THEORY}/, "{J. of Approximation Theory}",  tmp);
      hit += sub(/{J COMPUT PHYS}/, "{J. of Computational Physics}",  tmp);
      hit += sub(/{J GRAPH THEOR}/, "{J. of Graph Theory}",  tmp);
      hit += sub(/{J SYMB COMPUT}/, "{J. of Symbolic Computation}",  tmp);
      hit += sub(/{KYBERNETES}/, "{Kybernetes}",  tmp);
      hit += sub(/{Neural Networks}/, "{Neural Networks}",  tmp);
      hit += sub(/{NUCL INSTRUM METH A}/, "{Nuclear Instrumentation Methods - A}",  tmp);
      hit += sub(/{OPER RES}/, "{Operations Research}",  tmp);
      hit += sub(/{P IEEE}/, "{Proceedings of the IEEE}",  tmp);
      hit += sub(/{PATTERN RECOGN LETT}/, "{Pattern Recognition Letters}",  tmp);
      hit += sub(/{PATTERN RECOGN}/, "{Pattern Recognition}",  tmp);
      hit += sub(/{SIAM J COMPUT}/, "{SIAM J. on Computing}",  tmp);
      hit += sub(/{SIAM J NUMER ANAL}/, "{SIAM J. on Numerical Analysis}",  tmp);
      hit += sub(/{THEOR COMPUT SCI}/, "{Theoretical Computer Science}",  tmp);
      hit += sub(/{VISUAL COMPUT}/, "{Visual Computer}",  tmp);
      
      # To be checked:
      
      # ? sub(/{ACM T PROGR LANG SYS}/, "{ACM Transactions on Programming Languages and Systems}",  tmp);
      # ? sub(/{ACTA APPL MATH}/, "{}",  tmp);
      # ? sub(/{ADV ENG SOFTW}/, "{Advances in Engineering Software}",  tmp);
      # ? sub(/{ADV IMAG ELECT PHYS}/, "{Advances in Imaging and Electron Physics}",  tmp);
      # ? sub(/{ADV MATH}/, "{Advances in Mathematics}",  tmp);
      # ? sub(/{ANNU REV COMPUT SCI}/, "{Annual Review of Computer Science}",  tmp);
      # ? sub(/{APPL MATH LETT}/, "{Applied Mathematics Letters}",  tmp);
      # ? sub(/{CELL PROLIFERAT}/, "{Cell Proliferation}",  tmp);
      # ? sub(/{CEREB CORTEX}/, "{Cerebral Cortex}",  tmp);
      # ? sub(/{COMMUN APPL NUMER M}/, "{Communications in Applied Numerical Methods}",  tmp);
      # ? sub(/{COMPUT BIOMED RES}/, "{Computers and Biomedical Research}",  tmp);
      # ? sub(/{COMPUT GEOSCI-UK}/, "{Computational Geosciences - UK}",  tmp);
      # ? sub(/{COMPUT GEOSCI}/, "{Computational Geosciences}",  tmp);
      # ? sub(/{COMPUT GRAPH-UK}/, "{Computers and Graphics - UK}",  tmp);
      # ? sub(/{COMPUT GRAPH}/, "{Computer Graphics}",  tmp);
      # ? sub(/{COMPUT IND}/, "{}",  tmp);
      # ? sub(/{COMPUT MATH APPL}/, "{Computational Mathematics and Applications}",  tmp);
      # ? sub(/{COMPUT NETWORKS ISDN}/, "{Computer Networks ISDN}",  tmp);
      # ? sub(/{COMPUT STRUCT}/, "{Computer Structures}",  tmp);
      # ? sub(/{COMPUTING}/, "{Computing}",  tmp);
      # ? sub(/{CR ACAD SCI I-MATH}/, "{}",  tmp);
      # ? sub(/{CVGIP-GRAPH MODEL IM}/, "{CVGIP - Graph Models in Image Processing}",  tmp);
      # ? sub(/{DISCRETE APPL MATH}/, "{Discrete Applied Mathematics}",  tmp);
      # ? sub(/{DISCRETE COMPUT GEOM}/, "{Discrete and Computational Geometry}",  tmp);
      # ? sub(/{ECOL MODEL}/, "{Ecological Modeling}",  tmp);
      # ? sub(/{ELECTROPHORESIS}/, "{Electrophoresis}",  tmp);
      # ? sub(/{FRACTALS}/, "{Fractals}",  tmp);
      # ? sub(/{FUNCT ANAL APPL+}/, "{Functional Analysis and Applications}",  tmp);
      # ? sub(/{FUZZY SET SYST}/, "{Fuzzy Set Theory}",  tmp);
      # ? sub(/{GRAPH MODEL IM PROC}/, "{Graph Models in Image Processing}",  tmp);
      # ? sub(/{GRAPH MODELS}/, "{Graph Models}",  tmp);
      # ? sub(/{IEEE T CIRC SYST VID}/, "{IEEE Trans. on Circuits and Systems - Video}",  tmp);
      # ? sub(/{IEEE T DIELECT EL IN}/, "{IEEE Trans. on Dielectrics and Electrical Insulation}",  tmp);
      # ? sub(/{IEEE T GEOSCI REMOTE}/, "{IEEE Trans. on Geoscience and Remote Sensing}",  tmp);
      # ? sub(/{IEEE T PATTERN ANAL}/, "{IEEE Trans. on Pattern Analysis and Machine Intelligence}",  tmp);
      # ? sub(/{IEEE T ROBOTIC AUTOM}/, "{IEEE Trans. on Robotics and Automation}",  tmp);
      # ? sub(/{IEEE T VIS COMPUT GR}/, "{IEEE Trans. on Visualization and Computer Graphics}",  tmp);
      # ? sub(/{IEICE T FUND ELECTR}/, "{IEICE Transactions on Fundamental Electronics}",  tmp);
      # ? sub(/{IEICE T INF SYST}/, "{IEICE Transactions on Information Systems}",  tmp);
      # ? sub(/{IETE TECH REV}/, "{IETE Technical Review}",  tmp);
      # ? sub(/{INT J BIFURCAT CHAOS}/, "{Int. J. on Bifurcation and Chaos}",  tmp);
      # ? sub(/{INT J COMPUT VISION}/, "{Int. J. of Computer Vision}",  tmp);
      # ? sub(/{INT J GEOGR INF SCI}/, "{Int. J. on Geographical Information Science}",  tmp);
      # ? sub(/{INT J NUMER METH ENG}/, "{Int. J. of Numerical Methods in Engineering}",  tmp);
      # ? sub(/{INT J ROBUST NONLIN}/, "{Int. J. of Robust Nonlinear Systems}",  tmp);
      # ? sub(/{ISPRS J PHOTOGRAMM}/, "{ISPRS J. of Photogrammetry}",  tmp);
      # ? sub(/{J CHEM INF COMP SCI}/, "{J. of Chemical Information and Computation Science}",  tmp);
      # ? sub(/{J COMPUT SYST SCI}/, "{J. of Computer and Systems Science}",  tmp);
      # ? sub(/{J EXP THEOR ARTIF IN}/, "{J. of Experimental and Theoretical Artificial Intelligence}",  tmp);
      # ? sub(/{J FUNCT PROGRAM}/, "{J. of Functional Programming}",  tmp);
      # ? sub(/{J GUID CONTROL DYNAM}/, "{J. of Guidance and Control Dynamics}",  tmp);
      # ? sub(/{J MATER PROCESS TECH}/, "{J. of Material Processing Techniques}",  tmp);
      # ? sub(/{J NEUROSCI}/, "{J. of Neuroscience}",  tmp);
      # ? sub(/{J SURV ENG-ASCE}/, "{}",  tmp);
      # ? sub(/{J VIS COMMUN IMAGE R}/, "{J. of Visual Communication and Imaging Research}",  tmp);
      # ? sub(/{J VISUAL COMP ANIMAT}/, "{J. of Visualization and Computing Animation}",  tmp);
      # ? sub(/{J WUHAN UNIV TECHNOL}/, "{J. of the Wuhan Univ. of Technology}",  tmp);
      # ? sub(/{MACH VISION APPL}/, "{Machine Vision and Applications}",  tmp);
      # ? sub(/{MATER STRUCT}/, "{Material Structure}",  tmp);
      # ? sub(/{MATH GEOL}/, "{Mathematical Geology}",  tmp);
      # ? sub(/{NETWORKS}/, "{Networks}",  tmp);
      # ? sub(/{NEURAL NETWORKS}/, "{Neural Networks}",  tmp);
      # ? sub(/{P I MECH ENG B-J ENG}/, "{Proc. of the Inst. of Mechanical Engineers B - J. Engineering}",  tmp);
      # ? sub(/{PHILOS MAG B}/, "{Philosophical Magazine - B}",  tmp);
      # ? sub(/{PHILOS T ROY SOC B}/, "{Philosophical Trans. of the  Royal Society - B}",  tmp);
      # ? sub(/{PROG NAT SCI}/, "{Progress in Natural Sciences}",  tmp);
      # ? sub(/{Physica D}/, "{Physica - D}",  tmp);
      # ? sub(/{RAIRO-AUTOM PROD INF}/, "{RAIRO - Automation et Prod. Informatique}",  tmp);
      # ? sub(/{RAIRO-INF THEOR APPL}/, "{RAIRO - Informatique Theorique et Appliquée}",  tmp);
      # ? sub(/{RAPID PROTOTYPING J}/, "{Rapid Prototyping J.}",  tmp);
      # ? sub(/{REAL-TIME IMAGING}/, "{Real-Time Imaging}",  tmp);
      # ? sub(/{SIAM J SCI STAT COMP}/, "{SIAM J. on Scientific Statistics and Computing}",  tmp);
      # ? sub(/{SIGNAL PROCESS-IMAGE}/, "{Signal Processing - Image}",  tmp);
      # ? sub(/{STAT COMPUT}/, "{Statistical Computing}",  tmp);
      # ? sub(/{VISION RES}/, "{Vision Research}",  tmp);

      if (hit > 0)
        { # Known journal: the field is now in Bibtex form.
          sub(/ isijournal[ ]*[=][ ]*/, " journal = ", tmp);
          lin = tmp;
        }
    }
  return lin;
}

function split_isi_pagedate(lin, imo,res,fld,sep)
{
  # Splits an "isipagedate = {...}" line into the fields "day", "month",
  # "year" and "pages" (those that are present), one per line.
  #
  # The text may be "PAGES MONTH DAY YEAR", "PAGES MONTH YEAR",
  # "PAGES MONTH-MONTH YEAR", "PAGES YEAR", or any of these without the
  # PAGES part, where PAGES is a range "FIRST-LAST" and MONTH is a
  # three-letter upper-case name.  The month is written without braces,
  # in lower case (a range as "mmm--mmm").  When the text has none of
  # these forms, it is returned as an "isipagedate" field again.
  
  # Strip initial "[ ]*isipagedate = {" and final "}": 
  gsub(/^[ ]*isipagedate[ ]*[=][ ]*{[ ]*/, "", lin);
  gsub(/[ ]*}[ ]*$/, "", lin);
  
  # Fix ISI notation for unknown final page:
  lin = gensub(/[-][&][ ]/, "-?? ", "g", lin);
  
  # Prepare for parsing:
  split("", fld); res = ""; sep = "";
  
  # Variant with pages, month, day, and year:
  if (match(lin, \
      /^([A-Z]?[0-9?]+)[-]([A-Z]?[0-9?]+) +([JFMASOND][A-Z][A-Z]) +([0-3]?[0-9]) +([12][089][0-9][0-9])$/, \
      fld))
    { res = ( res sep \
        "  day = {" fld[4] "}\n" \
        "  month = " convert_isi_month(fld[3]) "\n" \
        "  year = {" fld[5] "}\n" \
        "  pages = {" fld[1] "--" fld[2] "}" \
      );
      sep = "\n";
      return res;
    }
  
  # Variant with pages, month, year:
  if (match(lin, \
      /^([A-Z]?[0-9?]+)[-]([A-Z]?[0-9?]+) +([JFMASOND][A-Z][A-Z]) +([12][089][0-9][0-9])$/, \
      fld))
    { res = ( res sep \
        "  month = " convert_isi_month(fld[3]) "\n" \
        "  year = {" fld[4] "}\n" \
        "  pages = {" fld[1] "--" fld[2] "}" \
      );
      sep = "\n";
      return res;
    }
  
  # Variant with pages, month range, year:
  if (match(lin, \
      /^([A-Z]?[0-9?]+)[-]([A-Z]?[0-9?]+) +([JFMASOND][A-Z][A-Z][-][JFMASOND][A-Z][A-Z]) +([12][089][0-9][0-9])$/, \
      fld))
    { gsub(/[-]/, "--", fld[3]);
      res = ( res sep \
        "  month = " convert_isi_month(fld[3]) "\n" \
        "  year = {" fld[4] "}\n" \
        "  pages = {" fld[1] "--" fld[2] "}" \
      );
      sep = "\n";
      return res;
    }
  
  # Variant with pages and year:
  if (match(lin, \
      /^([A-Z]?[0-9?]+)[-]([A-Z]?[0-9?]+) +([12][089][0-9][0-9])$/, \
      fld))
    { res = ( res sep \
        "  year = {" fld[3] "}\n" \
        "  pages = {" fld[1] "--" fld[2] "}" \
      );
      sep = "\n";
      return res;
    }
  
  # Variant with month, day, and year:
  if (match(lin, \
      /^([JFMASOND][A-Z][A-Z]) +([0-3]?[0-9]) +([12][089][0-9][0-9])$/, \
      fld))
    { res = ( res sep \
        "  day = {" fld[2] "}\n" \
        "  month = " convert_isi_month(fld[1]) "\n" \
        "  year = {" fld[3] "}" \
      );
      sep = "\n";
      return res;
    }
  
  # Variant with month and year:
  if (match(lin, \
      /^([JFMASOND][A-Z][A-Z]) +([12][089][0-9][0-9])$/, \
      fld))
    { res = ( res sep \
        "  month = " convert_isi_month(fld[1]) "\n" \
        "  year = {" fld[2] "}" \
      );
      sep = "\n";
      return res;
    }
  
  # Variant with month range and year.  The month is written without
  # braces, like in the other variants, so that the rule for unbraced
  # "month = mmm--mmm" lines accepts it:
  if (match(lin, \
      /^([JFMASOND][A-Z][A-Z][-][JFMASOND][A-Z][A-Z]) +([12][089][0-9][0-9])$/, \
      fld))
    { gsub(/[-]/, "--", fld[1]);
      res = ( res sep \
        "  month = " convert_isi_month(fld[1]) "\n" \
        "  year = {" fld[2] "}" \
      );
      sep = "\n";
      return res;
    }
  
  # Variant with year only:
  if (match(lin, \
      /^([12][089][0-9][0-9])$/, \
      fld))
    { res = ( res sep \
        "  year = {" fld[1] "}" \
      );
      sep = "\n";
      return res;
    }
  
  # Give up:
  if (lin != "")
    { res = ( res sep "  isipagedate = {" lin "}" ); sep = "\n"; lin = ""; }

  return res;
}

function convert_isi_month(m)
{ 
  # Returns the ISI month name (or month range) "m" in lower case,
  # which is the form written to the "month" field.
  m = tolower(m);
  return m;
}

function data_warning(msg)
{
  # Writes the warning "msg" to the standard error output, tagged with
  # the current input file name and line number, followed by the
  # current input record.  Processing continues.
  printf "%s:%d: ++ Warning: %s\n   $0 = \"%s\"\n", \
    FILENAME, FNR, msg, $0 > "/dev/stderr";
}

function data_error(msg)
{
  # Writes the error "msg" to the standard error output, tagged with the
  # current input file name and line number, followed by the current
  # input record; then terminates the script with a failure status.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "   $0 = \"%s\"\n", $0 > "/dev/stderr";

  # Record a non-negative status: negative values of "abort" mean that
  # no abort is pending, and the END rule re-issues the exit only when
  # abort >= 0.
  abort = 1;
  exit abort;
}