#! /bin/csh -f 
# Last edited on 2025-05-01 18:43:26 by stolfi

# Usage summary; csh substitutes the script name for the leading variable.
# The variable is only recorded here and is not referenced again in this script.
set usage = "$0 < WORDS.frq > TBPATS.frq"

# Tabulates core-mantles according to pattern and number of benches.
# Input is read from standard input, one entry per line, with the word in
# field 3 and (presumably) its occurrence count in field 1 -- TODO confirm
# against the producer of the WORDS.frq file.
# The final table is written to standard output.

# Prefix for the intermediate file (prefix plus .pats), which is created
# relative to the current directory and left in place when the script ends.
set tmp = ".cm"

# First pass: reduce each input word (field 3) to a pattern and tabulate.
#
# For every non-empty line of standard input the gawk program:
#   - rewrites ch, sh and se as C;
#   - rewrites runs of e pairwise (ee becomes E, an odd e is kept in front)
#     and then turns every E into C as well;
#   - rewrites c or i, then one of k t p f, then h, as K;
#     any remaining k t p f becomes k;
#   - deletes a, o and y;
#   - strips the leading and trailing runs of characters other than
#     C E K c e h k;
#   - substitutes a single dot for an empty result;
#   - skips the line if anything outside C E K e k a o y and dot remains,
#     otherwise prints field 1 followed by the pattern.
# The order of the substitutions matters; each one works on the result of
# the previous ones.
#
# The pairs are then piped through the external tools combine-counts and
# compute-freqs (presumably merging equal patterns and adding a frequency
# column -- TODO confirm), sorted by field 1 in decreasing numeric order and
# then by field 3, and stored in the intermediate .pats file.
gawk \
    ' /./ { \
        pat = $3; \
        gsub(/[cs]h/, "C", pat); \
        gsub(/se/, "C", pat); \
        gsub(/eeeee/, "eEE", pat); \
        gsub(/eeee/, "EE", pat); \
        gsub(/eee/, "eE", pat); \
        gsub(/ee/, "E", pat); \
        gsub(/E/, "C", pat); \
        gsub(/[ci][ktpf]h/, "K", pat); \
        gsub(/[ktpf]/, "k", pat); \
        gsub(/[aoy]/, "", pat); \
        gsub(/^[^CEKcehk]*[^CEKcehkaoy]/, "", pat); \
        gsub(/[^CEKcehk]*$/, "", pat); \
        if (pat == "") { pat = "."; } \
        if (pat ~ /[^CEKekaoy.]/) { next; } \
        print $1, pat; \
      } ' \
  | combine-counts \
  | compute-freqs \
  | sort -b -k1nr -k3 \
  > ${tmp}.pats

# Second pass: read the intermediate .pats file and prefix each non-empty
# line with three zero-padded counters computed from field 3 (the pattern):
#   1. the number of k and K characters;
#   2. the number of C and E characters;
#   3. the total length of the pattern (characters other than K are
#      replaced by @, not deleted, before the length is taken).
# NOTE(review): the third counter is kept in a variable named ckh, which
# suggests it may have been meant to count K only; confirm whether the @
# replacement is intentional before changing it.
#
# The result goes to standard output, sorted by the three counters, then by
# field 4 (the first field of the original line, presumably the count) in
# decreasing numeric order, then by field 6 (the pattern).
#
# Every sort key is limited to a single field (-kN,N).  An open-ended -k1
# key extends to the end of the line, so it decides every comparison by
# itself and the numeric key on field 4 is never consulted.
cat ${tmp}.pats \
  | gawk \
      ' /./{ \
          w=$3; gsub(/[^CE]/,"",w); CE=length(w); \
          w=$3; gsub(/[^kK]/,"",w);  k=length(w); \
          w=$3; gsub(/[^K]/,"@",w); ckh=length(w); \
          printf "%02d %02d %02d %s\n", k,CE,ckh, $0; \
        } ' \
  | sort -b -k1,1 -k2,2 -k3,3 -k4,4nr -k6,6

# /bin/rm ${tmp}.pats
