#! /usr/bin/gawk -f
# Last edited on 2010-04-13 21:43:00 by stolfi

# Reformats a chat log read from the input:
#
#   * Comment lines (starting with "#") and blank lines are copied unchanged.
#   * Date header lines are turned into comments.
#   * Entry lines of the form "[TIME] NAME: TEXT" are grouped by speaker:
#     a header "TIME NAME:" is printed whenever the speaker changes, and
#     each entry's text is printed below it.
#   * Lines that do not start with "[" are taken to be continuations of the
#     previous entry and are joined to it with a single blank.
#   * At the end, the number of output lines per speaker is printed as comments.
#
# Requires gawk (uses {gensub}).

BEGIN {
    nspkr = 0;          # Number of speakers.
    split("", spkr);    # {spkr[0..nspkr-1]} are the speaker names.

    # Counts lines of each speaker.  Also {nlin[""]} is lost lines.
    split("", nlin);    # {nlin[sp]} is the number of output lines for speaker {sp}.

    split("", nick);    # {nick[sp]} is the nickname of speaker {sp}.
    nick["Jorge Stolfi"] = "JS";
    nick["Rafael F. V. Saracchini"] = "RFVS";

    cur_spkr = "";      # Current speaker.
    buf = "";           # Last conversation line of {cur_spkr}.
}

/^[ ]*([#]|$)/ {
    # Comment or blank lines, just keep.
    flush_buf();
    print;
    # Reset speaker:
    cur_spkr = "";
    next;
}

/^[MTWFS].*[ ]20[0-9][0-9][ ]*$/ {
    # Date header, turn into comment:
    flush_buf();
    printf "# %s\n", $0;
    # Reset speaker:
    cur_spkr = "";
    next;
}

/^\[/ {
    # Looks like a chat entry line:
    lin = $0;

    # Get the time field:
    if (! match(lin, /^\[[A-Z0-9:. ]+\][ ]/)) {
        # Note: {cur_spkr} is left unchanged on this path.
        data_warning("bad time format");
        print;
        next;
    }
    hr = substr(lin, RSTART, RLENGTH-1);    # Excludes the blank after the "]".
    lin = substr(lin, RSTART+RLENGTH);

    if (match(lin, /^[ ]*[*][ ]/)) {
        # File transfer line:
        prefix = "* ";
        lin = substr(lin, RSTART+RLENGTH);
        # Provide a ": " for proper name parsing.  Only the first match is
        # replaced; the count is given as the number 1 because {gensub}
        # accepts only "g"/"G" or a number as its third argument.
        lin = gensub(/[ ]*(arquivo[s]?|file[s]?)/, ": \\1", 1, lin);
    } else {
        prefix = "";
    }

    # Get the speaker name:
    if (! match(lin, /^[A-Za-z0-9. ]+[:]([ ]|$)/)) {
        # Note: {cur_spkr} is left unchanged on this path.
        data_warning("missing speaker name");
        print;
        next;
    }
    sp = substr(lin, RSTART, RLENGTH);
    gsub(/[: ]+$/, "", sp);                 # Strip the trailing ":" and blanks.
    lin = substr(lin, RSTART+RLENGTH);

    # Flush any pending input:
    flush_buf();

    if (sp != cur_spkr) {
        # Print new-speaker header:
        printf "\n";
        printf "%s %s:\n", hr, ((sp in nick) ? nick[sp] : sp);
        printf "\n";
        # Update current speaker
        if (! (sp in nlin)) {
            # Speaker is new.
            spkr[nspkr] = sp;
            nspkr++;
            nlin[sp] = 0;
        }
        cur_spkr = sp;
    }

    # Save rest of line in buffer:
    buf = (prefix lin);
    next;
}

/^[^\[]/ {
    # probably continuation line split by accident:
    if (cur_spkr == "") {
        data_warning("spurious continuation line");
        print;
        next;
    }
    if (buf == "") {
        buf = $0;
    } else {
        buf = (buf " " $0);
    }
    next;
}

END {
    flush_buf();
    printf "\n";
    printf "# Line counts:\n";
    for (i = 0; i < nspkr; i++) {
        printf "# %6d %s\n", nlin[spkr[i]], spkr[i];
    }
    # The "+ 0" forces a numeric comparison even if {nlin[""]} was never set.
    if (nlin[""] + 0 > 0) {
        printf "# %6d Missing speaker name\n", nlin[""];
    }
}

function flush_buf() {
    # Prints the {buf} line, if not empty, counting it for {cur_spkr};
    # then resets {buf} to the empty string.
    if (buf != "") {
        printf " %s\n", buf;
        nlin[cur_spkr]++;
    }
    buf = "";
}

function data_warning(msg) {
    # Flushes any pending line, then prints a comment
    # with a warning message {msg}:
    flush_buf();
    printf "# !! %s\n", msg;
}