#! /usr/bin/gawk -f 
# Last edited on 1999-01-06 06:45:18 by stolfi

# Translates a pattern occurrence index file from one encoding to another.
# Usage: $0 MAPFILE < INFILE > OUTFILE
#
# The INFILE should have records of the form
#
#   <LOCATION> LOCPOS GLOBPOS PAT OBS
#
# meaning that pattern PAT occurred in line <LOCATION> 
# displaced LOCPOS characters from the beginning of the line,
# and GLOBPOS characters from the beginning of the whole text.
# The OBS is an optional comment about this occurrence.
#
# The MAPFILE must contain entries in the format
#   OLDPAT NEWPAT NEWOBS
# meaning that any occurrence of pattern OLDPAT listed in INFILE
# should be replaced by an occurrence of NEWPAT.
# If the NEWOBS field is present, it replaces the OBS field from
# INFILE. 
#
# There may be multiple lines with the same OLDPAT; one output 
# line will be written for each.

# Reports a fatal error and terminates the script.
#   msg: text of the message; it is written to stderr prefixed with the
#        current input record number (NR, which is 0 while in BEGIN).
# Sets the global flag `abort` and exits with a non-zero status so that
# callers (shell scripts, makefiles) can detect the failure.
function error(msg)
{ 
  printf "line %d: %s\n", NR, msg > "/dev/stderr"
  abort = 1
  # A bare `exit` would end the program with status 0, hiding the failure.
  exit 1
}

# Startup: checks the command line and loads MAPFILE into three tables:
#   nalts[OLDPAT]  = number of map entries seen for OLDPAT
#   mpat[OLDPAT,k] = NEWPAT of the k-th entry for OLDPAT (k = 1..nalts)
#   mobs[OLDPAT,k] = NEWOBS of the k-th entry, present only if the map
#                    line had a third field
# Afterwards ARGC is reset to 1 so that MAPFILE is not also read as
# input; the occurrence records are then taken from stdin.
BEGIN {
  abort = 0
  usage = (ARGV[0] " MAPFILE < OCCFILE")
  if (ARGC != 2) { error(("usage: " usage)); }
  mfile = ARGV[1];
  split("", mpat)
  split("", mobs)
  split("", nalts)
  # Keep the getline result: >0 means a line was read, 0 is end of file,
  # <0 is an open/read failure (in which case gawk sets ERRNO).
  while ((status = (getline e < mfile)) > 0) 
    { # Blank lines carry no mapping; skip them instead of registering
      # an entry for the empty pattern.
      if (split(e, fld) == 0) { continue }
      opat = fld[1]
      npat = fld[2]
      if (opat in nalts) { nalts[opat]++ } else { nalts[opat] = 1 }
      k = nalts[opat]
      mpat[opat,k] = npat
      if (3 in fld) { mobs[opat,k] = fld[3] }
    }
  # Test the getline status rather than the initial value of ERRNO,
  # which is not "0" in current gawk versions.
  if (status < 0) { error((mfile ": " ERRNO)); }
  close (mfile);
  ARGC = 1
}

# Main rule, applied to every non-empty input record
#   <LOCATION> LOCPOS GLOBPOS PAT [OBS]
# Writes one output record per map entry for PAT, with PAT replaced by
# the entry's NEWPAT.  The fifth output field is the entry's NEWOBS if
# it has one, otherwise the record's own fifth field if present,
# otherwise it is omitted.  Only field 5 of the input is carried over as
# OBS.  A PAT with no map entry is a fatal error.
/./ {
  if (abort) exit 1;
  loc = $1
  lpos = $2
  gpos = $3
  opat = $4
  # error() already prefixes the message with the record number.
  if (!(opat in nalts))
    { error(("unmatched pattern \"" opat "\"")) }
  # Capture the record's own OBS once, and never assign to $5: doing so
  # would let the NEWOBS of one alternative leak into later alternatives
  # that have no NEWOBS of their own.
  hasobs = (NF >= 5)
  oobs = (hasobs ? $5 : "")
  n = nalts[opat];
  for (k=1;k<=n;k++)
    { if ((opat,k) in mobs)
        { print loc, lpos, gpos, mpat[opat,k], mobs[opat,k] }
      else if (hasobs)
        { print loc, lpos, gpos, mpat[opat,k], oobs }
      else
        { print loc, lpos, gpos, mpat[opat,k] }
    }
  next
}