# Last edited on 2025-09-17 03:42:53 by stolfi # 083 Generating pseudo-English by Thorsten and Timm This note uses the Thorsten and Timm "self-copying" gibberish generation algorithm with an English seed text, to "prove" that every English text is a hoax. ln -s ../.. work ln -s work/make_zipf_plots.sh ln -s work/langbank ln -s work/compute_freqs.gawk ln -s work/format_words_filled.sh See the Makefile. IMPROVING THE MUTATE PROCEDURE To improve the Mutate procedure, we will collect digraphs and trigraphs from an English text, then use rules xz -> xyz (insert) xyz -> xz (delete) xyz -> xwz (replace) with suitable probabilities. cat langbank/engl/wow/main.wds \ | gawk \ ' /^[a][ ]/{ w = tolower($2) gsub(/[~]/, "-", w) w = ("~" w "~"); n = length(w); for (i = 0; i < n; i++) { if (i+1 < n) { print substr(w, i+1, 2); } if (i+2 < n) { print substr(w, i+1, 3); } } }' \ | sort | uniq -c \ | sort -k2,2 \ > in/bitrigrams.wct cat langbank/engl/wow/main.wds \ | gawk \ ' /^[a][ ]/{ n = length($2); print n; }' \ | sort | uniq -c \ | sort -k2,2n \ > in/lengths.wct tandt.py 200 200 10000 10000 1 > out/test.txt With the new Mutate procedure, the best match with the English FxR plot was obtained with Q = 0.320