#! /bin/bash -eu
# Last edited on 2026-01-15 14:14:50 by stolfi

# Reads a list of triples "{COUNT} {FREQ} {PATTERN}" where
# {PATTERN} is a word parsed into elements delimited by '{}',
# with one or more '!' inserted into it.

# Discards blank lines, #-comments, and entries with no '!'.
# Also discards lines of '~' and lines with 'TOTAL'.

# Split entries with multiple '!':
cat \
  | sed \
      -e 's/^[ ][ ]*//g' \
      -e 's/[ ][ ]*$//g' \
      -e '/^[#]/d' \
      -e '/TOTAL/d' \
      -e '/^[~]+$/d' \
  | gawk \
      ' /[!]/{
          if (NF != 3) { data_error("bad format") }
          ct = $1; fr = $2; we = $3;
          
          # Do not count @ih @ith etc as TWO errors:
          we = gensub(/[!]{i}[!]{h}/, "!{i}{h}","g", we)
          we = gensub(/[!]{i}({[ktpfwz]})[!]{h}/, "!{i}\\1{h}","g", we)
          we = gensub(/[!]{i}({[ktpfwz]})[!]{h}[!]*{h}/, "!{i}\\1{h}{h}","g", we)
          
          # Do not count @i+X as multiple errors:
          we = gensub(/[!]{i}[!]*{i}[!]*{i}[!]*{i}[!]*/, "!{i}{i}{i}{i}","g", we)
          we = gensub(/[!]{i}[!]*{i}[!]*{i}[!]*/, "!{i}{i}{i}","g", we)
          we = gensub(/[!]{i}[!]*{i}[!]*/, "!{i}{i}","g", we)
          we = gensub(/[!]{i}[!]*/, "!{i}","g", we)
          
          while (match(we, /[!][^!]*[!]/)) {
            we1 = substr(we, 1, RSTART-1)
            we2 = substr(we, RSTART+1, RLENGTH-2) 
            we3 = substr(we, RSTART+RLENGTH)
            we3n = we3; gsub(/[!]/, "", we3n)
            wea = (we1 "!" we2 we3n)
            printf "%12.6f %s\n", ct, wea
            we = ( we1 we2 "!" we3 )
          }
          printf "%12.6f %s\n", ct, we
        }
      ' \
  > .bad-split
  
# Replace words with '!' by patterns with 'ż':
#
# Each line of ".bad-split" is "{COUNT} {WORD}" where {WORD} has one '!'.
# The sed script below rewrites the word as a short pattern in which 'ż'
# stands where the '!' was and '*' stands for discarded text.  Rules are
# tried top to bottom; once a rule fires the '!' is gone, so at most one
# of the '!'-rules applies to a line.  Result goes to ".bad-pats-u".
cat .bad-split \
  | sed -e '
      # "!" before {c}: keep {c} and up to two elements after it.
      s:[ ][^ ]*[!]{c}{\([^ {}]*\)}{\([^ {}]*\)}.*$: *żc\1\2*:g
      s:[ ][^ ]*[!]{c}{\([^ {}]*\)}$: *żc\1:g
      s:[ ][^ ]*[!]{c}$: *żc:g

      # "!" before {e} in a word starting with {q}{o}, {q}, {o}, or {e}.
      s:[ ]{q}{o}[!]{e}.*$: żqoe*:g
      s:[ ]{q}[!]{e}.*$: żqe*:g
      s:[ ]{o}[!]{e}.*$: żoe*:g
      s:[ ][!]{e}.*$: że*:g

      # "!" before {i}{X}{h} with X one of tkpfwz, or before {i}{h}.
      s:[ ][^ ]*[!]{i}{\([tkpfwz]\)}{h}.*$: *żi\1h*:g
      s:[ ][^ ]*[!]{i}{h}.*$: *żih*:g

      # "!" before a run of one to four {i} followed by a
      # one-character element other than n, m, r (longest run first).
      s:[ ][^ ]*[!]{i}{i}{i}{i}{\([^nmr]\)}.*$: *żiiii\1*:g
      s:[ ][^ ]*[!]{i}{i}{i}{\([^nmr]\)}.*$: *żiii\1*:g
      s:[ ][^ ]*[!]{i}{i}{\([^nmr]\)}.*$: *żii\1*:g
      s:[ ][^ ]*[!]{i}{\([^nmr]\)}.*$: *żi\1*:g

      # Any other "!": keep at most two elements of left context.
      s:[ ][^ ]*\({[^ {}]*}\)\({[^ {}]*}\)[!]: *\1\2ż:g
      s:[ ]\({[^ {}]*}\)[!]: \1ż:g
      s:[ ][!]: ż:g

      # Keep only five characters after the mark; "*" replaces the rest.
      s:[ż]\(.....\).*$:ż\1*:g
    ' \
  > .bad-pats-u
  
# Tally the patterns in ".bad-pats-u".  "combine_counts.gawk" and
# "compute_freqs.gawk" are external helpers not shown here; presumably
# the first merges the counts of identical patterns and the second adds
# frequencies and a 'TOTAL' line -- TODO confirm.  In between, "sort"
# orders the lines by decreasing numeric value of the first field,
# ignoring leading blanks.
cat .bad-pats-u \
  | combine_counts.gawk \
  | sort -b -k1,1nr \
  | compute_freqs.gawk -v outputTotal='TOTAL' \