#! /bin/csh -f
# Last edited on 2001-04-08 22:47:10 by stolfi

# Usage synopsis, echoed when the script is called with the wrong
# number of arguments.
set usage = "$0 ID"

# Reads a test corpus "ID/test.exp" and writes a corpus-specific Prolog lexicon
# "ID/test.dpg", with all categories for all words contained in "ID/test.exp".

# For the time being, this script assumes that all contractions have
# been expanded, and that non-intrinsic capital letters (e. g. in
# sentence-initial common words) have been reduced to lowercase.

# Exactly one argument is required: the corpus directory ID.
if ( $#argv != 1 ) then
  echo "usage: ${usage}"
  exit 1
endif

set ID = "$1"
shift

# Look for tools in /n/gnu/bin, /bin and /usr/bin/ before the
# directories of the inherited search path.
set path = ( /n/gnu/bin /bin /usr/bin/ $path )

# Root of the project tree (master lexicons and conversion tools).
set pdir = "/home/staff/stolfi/projects/port-syntax/"

# Common prefix for scratch files, made distinct by the shell's PID.
set tmp = "/tmp/$$-mp"

# Discard the Prolog lexicon left over from a previous run, if any:

/bin/rm -f ${ID}/test.dpg

# Extract the sorted list of distinct words occurring in the corpus.
# Blanks and the listed punctuation characters are all turned into
# newlines, so that each word ends up on a line of its own; the empty
# lines created along the way are then discarded.
#
# The replacement set is written as "[\012*]" so that tr itself pads it
# to the length of the first set; what happens when the second set is
# simply shorter than the first differs between tr implementations.
# "grep -E" stands in for the obsolescent "egrep".

cat ${ID}/test.exp \
  | tr ' .,:;\!?()[]"/$&@' '[\012*]' \
  | grep -E '.' \
  | sort \
  | uniq \
  > ${ID}/test.wds
  
# Select the entries of the master lexicon that pertain to the corpus.
# Every corpus word W is turned into the fixed-string pattern "(W,",
# and each lexicon line containing at least one such pattern is kept.

set graca = "scc-g"

sed -e 's/^/(/' -e 's/$/,/' ${ID}/test.wds > ${tmp}.pat

grep -F -f ${tmp}.pat ${pdir}/dics/${graca}.dic > ${ID}/test.dic
      
# List the distinct words covered by the corpus-specific lexicon, that
# is, by the selected entries together with "extra.dic" (created empty
# here when it is not readable).  From each entry, everything from the
# first comma onwards is dropped, and then a leading run of lowercase
# letters and digits followed by "(" is dropped as well.

if ( ! -r ${ID}/extra.dic ) touch ${ID}/extra.dic

cat ${ID}/test.dic ${ID}/extra.dic \
  | sed -e 's/,.*$//' -e 's/^[a-z0-9]*(//' \
  | sort \
  | uniq \
  > ${ID}/found.wds

# If any corpus words are missing from the corpus-specific lexicon,
# prints them and stops.  "comm -23" keeps the lines that occur only in
# the first (sorted) file, i.e. the corpus words with no entry.

comm -23 ${ID}/test.wds ${ID}/found.wds \
  > ${ID}/missing.wds
  
if ( ! ( -z ${ID}/missing.wds ) ) then
  echo "** missing words -- add them to ${ID}/extra.dic:"
  # Show each missing word W as "(W,W)".
  cat ${ID}/missing.wds | sed -e 's/\(.*\)/(\1,\1)/'
  # This early exit bypasses the cleanup at the end of the script, so
  # the scratch files must be removed here to avoid leaving them in /tmp.
  /bin/rm -f ${tmp}.*
  exit 1
endif

# Feed the corpus-specific lexicon (the selected entries followed by
# the extra ones) through the dic-to-prolog tool to obtain the Prolog
# lexicon, then delete the scratch files.

cat ${ID}/test.dic ${ID}/extra.dic | ${pdir}/tools/dic-to-prolog > ${ID}/test.dpg

/bin/rm -f ${tmp}.*