#! /bin/bash
# Last edited on 2024-03-30 18:31:36 by stolfi

# Splits a manually-checked wordlist into "good", "bad", and "dunno" parts.
#
# Usage: $0 <infile> <outprefix>
# The <infile> must contain a list of words, each preceded by one of 
#   the checkmarks "+ " (good),  "- " (bad), or "? " (dunno).
#
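# This script writes the files <outprefix>.checked-good,
#   <outprefix>.checked-bad, and <outprefix>.checked-dunno, containing
#   the indicated subset of <infile>, sorted, without duplicates and
#   without the checkmarks.  It also writes <outprefix>.checked, the
#   sorted union of those three files.
#
# Non-empty lines of <infile> that don't start with a checkmark are
#   saved in <outprefix>.checked-bugs, and their count is printed to
#   stderr.  That file is removed if it turns out to be empty.
#
# If <outprefix>.unseen exists, it is compared with <outprefix>.checked
#   using the external "bool" command, producing <outprefix>.invented
#   and <outprefix>.unchecked (the latter is removed if empty).
#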

# Command-line parsing: exactly two arguments are required, the input
# wordlist and the prefix used for every output file name.
if [[ $# -ne 2 ]]; then
  # A usage error is a diagnostic, so it goes to stderr like the other
  # messages of this script, keeping stdout clean for callers.
  echo "** usage: split-check-list <infile> <outprefix>" 1>&2
  exit 1
fi

infile="$1"; shift
opref="$1"; shift

# Split the checked list by checkmark.
#
# Reads ${infile}; writes ${opref}.checked-bad, ${opref}.checked-dunno
# and ${opref}.checked-good (sorted, duplicates removed, checkmark and
# the blanks after it stripped).  Non-empty lines that start with
# anything other than "-", "+" or "?" go to ${bugfile}, which is
# removed again when it turns out to be empty.

# Private scratch file.  mktemp avoids the predictable /tmp/<pid> name,
# and the EXIT trap removes the file on every exit path.
tmpfile="$(mktemp "${TMPDIR:-/tmp}/split-check-list.XXXXXX")" || {
  echo "** cannot create temporary file" 1>&2
  exit 1
}
trap 'rm -f -- "${tmpfile}"' EXIT
bugfile="${opref}.checked-bugs"

# extract_marked MARK OUTFILE
#   Writes to OUTFILE the lines of ${tmpfile} that start with the single
#   character MARK, minus that character and the blanks following it,
#   sorted and without duplicates.  MARK is placed inside a bracket
#   expression so that "-", "?" and "+" are taken literally by both
#   grep and sed, whatever regex flavour they use.
extract_marked() {
  local mark="$1" outfile="$2"
  grep -e "^[${mark}]" -- "${tmpfile}" \
    | sed -e "s/^[${mark}] *//" \
    | sort | uniq > "${outfile}"
}

sort -- "${infile}" | uniq > "${tmpfile}"
extract_marked '-' "${opref}.checked-bad"
extract_marked '?' "${opref}.checked-dunno"
extract_marked '+' "${opref}.checked-good"

# Lines whose first character is not a checkmark (empty lines are ignored).
grep -e '^[^-+?]' -- "${tmpfile}" > "${bugfile}"

if [[ -s "${bugfile}" ]]; then
  echo "!! ${infile} contains $(wc -l < "${bugfile}") unchecked lines." 1>&2
else
  # Nothing to report: do not leave an empty bug file behind.
  rm -f -- "${bugfile}"
fi

# Report the size of each partial list on stderr.  Names are quoted and
# preceded by "--" so that prefixes containing blanks, glob characters
# or a leading "-" are handled as plain file names.
for part in bad dunno good; do
  wc -- "${opref}.checked-${part}" 1>&2
done

# Merge the three partial lists into ${opref}.checked, sorted and
# without duplicates, and report its size too.
cat -- "${opref}".checked-{bad,dunno,good} | sort | uniq > "${opref}.checked"
wc -- "${opref}.checked" 1>&2

# Optional cross-check against ${opref}.unseen, performed only when that
# file exists.
# NOTE(review): "bool" is an external command not defined in this file;
# "1-2" and "2-1" presumably select the lines found only in the first
# or only in the second file, respectively -- confirm against its docs.
if [[ -e "${opref}.unseen" ]]; then
  bool 1-2 "${opref}.checked" "${opref}.unseen" > "${opref}.invented"
  wc -- "${opref}.invented" 1>&2

  bool 2-1 "${opref}.checked" "${opref}.unseen" > "${opref}.unchecked"
  if [[ -s "${opref}.unchecked" ]]; then
    # The warning goes to stderr, like every other diagnostic here.
    echo 'warning:' "${opref}.unseen" 'not contained in' "${opref}.checked" 1>&2
    wc -- "${opref}.unchecked" 1>&2
    echo ' ' 1>&2
  else
    # Empty result: nothing to warn about, so drop the file.
    rm -f -- "${opref}.unchecked"
  fi
fi

echo ' ' 1>&2

# Comparing output files:
# Each distinct pair of the lists written by this script (good/bad,
# good/dunno, bad/dunno) is passed to "bool 1.2" and the result is shown
# on stderr.  Both operands must carry the ".checked-" infix: files
# named ${opref}.bad or ${opref}.dunno are not produced by this script.
# NOTE(review): "bool 1.2" presumably lists the lines common to both
# files (external tool) -- confirm against its docs.
for f1 in good bad; do
  for f2 in bad dunno; do
    if [[ "${f1}" != "${f2}" ]]; then
      echo "=== words in both ${opref}.checked-${f1} ${opref}.checked-${f2}:" 1>&2
      bool 1.2 "${opref}.checked-${f1}" "${opref}.checked-${f2}" 1>&2
    fi
  done
done


