#! /bin/csh -f
# Last edited on 2001-04-08 22:47:10 by stolfi

set usage = "$0 ID"

# Reads a test corpus ID/test.exp and writes a corpus-specific Prolog
# lexicon ID/test.dpg, with all categories for all words contained in
# ID/test.exp.
#
# Files in directory ID:
#   test.exp     (read)    the corpus text.
#   extra.dic    (read)    extra lexicon entries; created empty if absent.
#   test.wds     (written) sorted list of the distinct corpus words.
#   test.dic     (written) main-lexicon lines that match those words.
#   found.wds    (written) words covered by test.dic plus extra.dic.
#   missing.wds  (written) corpus words not covered; empty on success.
#   test.dpg     (written) the Prolog lexicon; produced only on success.
#
# Exits with status 1 on a usage error, when the corpus cannot be read,
# or when some corpus word has no lexicon entry.
#
# For the time being, this script assumes that all contractions have
# been expanded, and that non-intrinsic capital letters (e. g. in
# sentence-initial common words) have been reduced to lowercase.

if ( $#argv != 1 ) then
  echo "usage: ${usage}"; exit 1
endif

set ID = "$1"; shift

set path = ( /n/gnu/bin /bin /usr/bin/ $path )

set pdir = "/home/staff/stolfi/projects/port-syntax/"

# Prefix for scratch files; only the .pat file is ever created.
set tmp = "/tmp/$$-mp"

# Remove any existing prolog lexicon, so that a failed run never
# leaves a stale one behind:
/bin/rm -f ${ID}/test.dpg

# Stop early if the corpus is unreadable; otherwise the steps below
# would quietly build a lexicon from extra.dic alone.
if ( ! ( -r ${ID}/test.exp ) ) then
  echo "** cannot read ${ID}/test.exp"
  exit 1
endif

# Extract the list of all words occurring in the corpus.
# Blanks and punctuation become newlines; empty lines are dropped.
cat ${ID}/test.exp \
  | tr ' .,:;\!?()[]"/$&@' '\012' \
  | egrep '.' \
  | sort \
  | uniq \
  > ${ID}/test.wds

# Extracts the corresponding entries from the lexicon.
# Each word W becomes the fixed-string pattern (W, which is then
# searched for in the main dictionary file.
set graca = "scc-g"
cat ${ID}/test.wds \
  | sed -e 's/^/(/' -e 's/$/,/' \
  > ${tmp}.pat
cat ${pdir}/dics/${graca}.dic \
  | grep -F -f ${tmp}.pat \
  > ${ID}/test.dic

# The pattern file is not needed past this point.  Deleting it here,
# by explicit name, also covers the early exit below and avoids a
# csh glob that could fail with a no-match error.
/bin/rm -f ${tmp}.pat

# Extracts the text words that are present in the specific lexicon.
# From each entry, drops everything from the first comma onwards and
# the leading run of [a-z0-9] characters up to the open parenthesis.
if ( ! ( -r ${ID}/extra.dic ) ) touch ${ID}/extra.dic
cat ${ID}/test.dic ${ID}/extra.dic \
  | sed -e 's/[,].*$//' -e 's/^[a-z0-9]*[(]//' \
  | sort | uniq \
  > ${ID}/found.wds

# If any corpus words are missing from the corpus-specific lexicon,
# prints them and stops.  Each missing word W is shown as (W,W).
comm -23 ${ID}/test.wds ${ID}/found.wds \
  > ${ID}/missing.wds
if ( ! ( -z ${ID}/missing.wds ) ) then
  echo "** missing words -- add them to ${ID}/extra.dic:"
  cat ${ID}/missing.wds | sed -e 's/\(.*\)/(\1,\1)/'
  exit 1
endif

# Converts the corpus-specific lexicon to prolog format:
cat ${ID}/test.dic ${ID}/extra.dic \
  | ${pdir}/tools/dic-to-prolog \
  > ${ID}/test.dpg