#! /bin/bash
# Last edited on 2024-03-30 18:31:36 by stolfi

# Splits a manually-checked wordlist into "good", "bad", and "dunno" parts.
#
# Usage: $0 <infile> <opref>
#
# The <infile> must contain a list of words, each preceded by one of
# the checkmarks "+ " (good), "- " (bad), or "? " (dunno).
#
# This script writes the files <opref>.checked-good,
# <opref>.checked-bad, and <opref>.checked-dunno, containing
# the indicated subset of <infile>, without the checkmarks. It also
# prints out the number of lines of <infile> that don't have
# checkmarks.
#
# Other files written:
#   <opref>.checked       union of the three files above (sorted, unique).
#   <opref>.checked-bugs  non-empty lines of <infile> that do not start with
#                         a checkmark; kept only if there are any.
#   <opref>.invented,     only when <opref>.unseen exists: the results of
#   <opref>.unchecked     "bool 1-2" and "bool 2-1" on <opref>.checked and
#                         <opref>.unseen (presumably the two set differences;
#                         "bool" is an external tool not defined here).
#                         <opref>.unchecked is kept only if non-empty.
#
# All counts and warnings go to standard error.
# Exit status: 1 on bad usage, unreadable <infile>, or if no temporary
# file can be created.

# Deliberately no "set -e": grep exits non-zero when it selects no lines,
# which is the normal case for the unchecked-lines scan below.
set -u

# Prints the entries of file $2 whose lines start with the checkmark
# character $1, with the checkmark and any spaces after it removed,
# sorted and without duplicates.
extract_marked() {
  local mark="$1"
  local src="$2"
  # A bracket expression makes "-", "+" and "?" literal in any regex dialect.
  sed -n -e "s/^[${mark}] *//p" "${src}" | sort | uniq
}

if [[ $# -ne 2 ]]; then
  echo "** usage: split-check-list <infile> <opref>" 1>&2
  exit 1
fi

infile="$1"; shift
opref="$1"; shift

# "-" is handed to sort unchanged, which reads standard input for it.
if [[ "${infile}" != "-" && ! -r "${infile}" ]]; then
  echo "** cannot read input file ${infile}" 1>&2
  exit 1
fi

# Unpredictable scratch file, removed on every exit path.
tmpfile="$(mktemp "${TMPDIR:-/tmp}/split-check-list.XXXXXX")" || {
  echo "** cannot create temporary file" 1>&2
  exit 1
}
trap 'rm -f -- "${tmpfile}"' EXIT

bugfile="${opref}.checked-bugs"

# Sorted, de-duplicated copy of the input, checkmarks still attached.
sort "${infile}" | uniq > "${tmpfile}"

extract_marked '-' "${tmpfile}" > "${opref}.checked-bad"
extract_marked '?' "${tmpfile}" > "${opref}.checked-dunno"
extract_marked '+' "${tmpfile}" > "${opref}.checked-good"

# Non-empty lines whose first character is not a checkmark.
grep -e '^[^-+?]' "${tmpfile}" > "${bugfile}"
if [[ -s "${bugfile}" ]]; then
  echo "!! ${infile} contains $(wc -l < "${bugfile}") unchecked lines." 1>&2
else
  rm -f -- "${bugfile}"
fi

wc "${opref}.checked-bad" 1>&2
wc "${opref}.checked-dunno" 1>&2
wc "${opref}.checked-good" 1>&2

cat "${opref}.checked-bad" "${opref}.checked-dunno" "${opref}.checked-good" \
  | sort | uniq > "${opref}.checked"
wc "${opref}.checked" 1>&2

# Optional cross-check against a reference list <opref>.unseen.
if [[ -e "${opref}.unseen" ]]; then
  bool 1-2 "${opref}.checked" "${opref}.unseen" > "${opref}.invented"
  wc "${opref}.invented" 1>&2

  bool 2-1 "${opref}.checked" "${opref}.unseen" > "${opref}.unchecked"
  if [[ -s "${opref}.unchecked" ]]; then
    echo 'warning:' "${opref}.unseen" 'not contained in' "${opref}.checked" 1>&2
    wc "${opref}.unchecked" 1>&2
    echo ' ' 1>&2
  else
    rm -f -- "${opref}.unchecked"
  fi
fi

echo ' ' 1>&2

# Comparing output files: each pair among good/bad/dunno is reported once,
# so a word that was given two different checkmarks shows up here.
for f1 in good bad; do
  for f2 in bad dunno; do
    if [[ "${f1}" != "${f2}" ]]; then
      echo "=== words in both ${opref}.checked-${f1} ${opref}.checked-${f2}:" 1>&2
      bool 1.2 "${opref}.checked-${f1}" "${opref}.checked-${f2}" 1>&2
    fi
  done
done