#! /bin/bash -eu
# Last edited on 2026-01-15 14:14:50 by stolfi

# Reads a list of triples "{COUNT} {FREQ} {PATTERN}" where
# {PATTERN} is a word parsed into elements delimited by '{}',
# with one or more '!' inserted into it.

# Discards blank lines, #-comments, and entries with no '!'.
# Also discards lines of '~' and lines with 'TOTAL'.

# Split entries with multiple '!':
cat \
  | sed \
      -e 's/^[ ][ ]*//g' \
      -e 's/[ ][ ]*$//g' \
      -e '/^[#]/d' \
      -e '/TOTAL/d' \
      -e '/^[~]+$/d' \
  | gawk \
      ' /[!]/{
          if (NF != 3) { data_error("bad format") }
          ct = $1; fr = $2; we = $3;
          
          # Do not count @ih @ith etc as TWO errors:
          we = gensub(/[!]{i}[!]{h}/, "!{i}{h}","g", we)
          we = gensub(/[!]{i}({[ktpfwz]})[!]{h}/, "!{i}\\1{h}","g", we)
          we = gensub(/[!]{i}({[ktpfwz]})[!]{h}[!]*{h}/, "!{i}\\1{h}{h}","g", we)
          
          # Do not count @i+X as multiple errors:
          we = gensub(/[!]{i}[!]*{i}[!]*{i}[!]*{i}[!]*/, "!{i}{i}{i}{i}","g", we)
          we = gensub(/[!]{i}[!]*{i}[!]*{i}[!]*/, "!{i}{i}{i}","g", we)
          we = gensub(/[!]{i}[!]*{i}[!]*/, "!{i}{i}","g", we)
          we = gensub(/[!]{i}[!]*/, "!{i}","g", we)
          
          while (match(we, /[!][^!]*[!]/)) {
            we1 = substr(we, 1, RSTART-1)
            we2 = substr(we, RSTART+1, RLENGTH-2) 
            we3 = substr(we, RSTART+RLENGTH)
            we3n = we3; gsub(/[!]/, "", we3n)
            wea = (we1 "!" we2 we3n)
            printf "%12.6f %s\n", ct, wea
            we = ( we1 we2 "!" we3 )
          }
          printf "%12.6f %s\n", ct, we
        }
      ' \
  > .bad-split
  
# Replace words with '!' by patterns with '¿':
#
# Reads ".bad-split" (lines "{COUNT} {WORD}", one '!' per word) and
# writes ".bad-pats-u".  The rules are tried in order; the first one
# that matches removes the '!', so at most one of the context rules
# fires per line.  In the results, '*' stands for discarded text and
# '¿' marks where the '!' was.
#
# The '¿' is written as a plain literal rather than as the bracket
# expression '[¿]': in a single-byte locale (e.g. LC_ALL=C) a bracket
# expression would match just one byte of its UTF-8 encoding and the
# final clipping rule would garble the output.
sed '
    # "!" before {c}: keep "c" and up to two elements that follow it.
    s:[ ][^ ]*[!]{c}{\([^ {}]*\)}{\([^ {}]*\)}.*$: *¿c\1\2*:g
    s:[ ][^ ]*[!]{c}{\([^ {}]*\)}$: *¿c\1:g
    s:[ ][^ ]*[!]{c}$: *¿c:g

    # Word that starts with {q}{o}!{e} or {q}!{e}.
    s:[ ]{q}{o}[!]{e}.*$: ¿qoe*:g
    s:[ ]{q}[!]{e}.*$: ¿qe*:g

    # Word that starts with {o}!{e} or !{e}.
    s:[ ]{o}[!]{e}.*$: ¿oe*:g
    s:[ ][!]{e}.*$: ¿e*:g

    # "!" before {i}, an optional element from t k p f w z, and {h}.
    s:[ ][^ ]*[!]{i}{\([tkpfwz]\)}{h}.*$: *¿i\1h*:g
    s:[ ][^ ]*[!]{i}{h}.*$: *¿ih*:g

    # "!" before a run of 4, 3, 2 or 1 {i} followed by a one-character
    # element other than n, m, r.  Longest run first.
    s:[ ][^ ]*[!]{i}{i}{i}{i}{\([^nmr]\)}.*$: *¿iiii\1*:g
    s:[ ][^ ]*[!]{i}{i}{i}{\([^nmr]\)}.*$: *¿iii\1*:g
    s:[ ][^ ]*[!]{i}{i}{\([^nmr]\)}.*$: *¿ii\1*:g
    s:[ ][^ ]*[!]{i}{\([^nmr]\)}.*$: *¿i\1*:g

    # Any other "!": keep at most the two elements in front of it and
    # leave the text after it in place for the clipping rule below.
    s:[ ][^ ]*\({[^ {}]*}\)\({[^ {}]*}\)[!]: *\1\2¿:g
    s:[ ]\({[^ {}]*}\)[!]: \1¿:g
    s:[ ][!]: ¿:g

    # If five or more characters follow the mark, keep the first five
    # and replace the rest of the line by "*".
    s:¿\(.....\).*$:¿\1*:g
  ' < .bad-split > .bad-pats-u
  
# Final stage: feed ".bad-pats-u" through combine_counts.gawk, sort the
# result by decreasing numeric value of the first field, and pass it to
# compute_freqs.gawk with outputTotal set to "TOTAL".  The result goes
# to standard output.  (Both gawk scripts are external to this file;
# presumably they merge equal patterns and append frequencies -- confirm
# against their sources.)
combine_counts.gawk < .bad-pats-u \
  | sort -b -k 1,1nr \
  | compute_freqs.gawk -v outputTotal=TOTAL
