#! /usr/bin/gawk -f
# Last edited on 2005-02-02 22:53:26 by stolfi

BEGIN {
  abort = -1;  # Becomes non-negative (the exit status) after a fatal error.
  usage = ( ARGV[0] " \\\n" \
    " [<] INFILE > OUTFILE" \
  );

  # Reads a file broken into sections by "@@ {TAG}" lines.
  # Concatenates all sections with the same tag. Deletes duplicate
  # lines within each section. Writes out the result.
  #
  # The sections are written out in the order of their first
  # occurrence. The lines of each section are written out in the
  # original order.
  #
  # Trailing blanks are removed on input. An input line that contains
  # only the characters ' ', '-', '=', '+', '|' (in particular, a blank
  # line) is considered "duplicate" iff it is equal to the previous line
  # kept in the same section. Otherwise a line is considered "duplicate"
  # iff it is equal to any previous line of that section. Non-blank
  # duplicate lines and section headers are printed to {stderr}.
  #
  # On a malformed input (a header line without exactly one tag, or
  # text before the first header) a message is printed to {stderr} and
  # the program exits with status 1 without writing any output.

  ns = 0;           # Number of sections; they are numbered in {0..ns-1}.
  split("", snum);  # {snum[tag]} is the index of the section with the given {tag}.
  split("", stag);  # {stag[i]} is the tag of the section number {i}.
  split("", nlin);  # {nlin[i]} is the number of lines in section {i}.
  split("", line);  # {line[i,j]} is line {j} of section {i}, both from 0.
  split("", seen);  # {(i,lin) in seen} iff non-ruler line {lin} was kept in section {i}.

  nh = 0;   # Number of header lines read.
  nr = 0;   # Number of non-header lines read.
  nt = 0;   # Number of total text lines kept.
  is = -1;  # Current section, or -1 if no section header yet.
}

# Section header line: "@@ {TAG}".
/^[@][@]/ {
  nh++;
  if (NF != 2) { data_error("bad header line"); }
  tag = $2;
  if (tag in snum) {
    # Old section: resume appending to it.
    is = snum[tag];
  } else {
    # New section:
    is = ns; ns++;
    nlin[is] = 0;
    snum[tag] = is;
    stag[is] = tag;
  }
  printf "%s:%d: @@ %s [=%d]\n", FILENAME, FNR, tag, is > "/dev/stderr";
  next;
}

# Any other line: a text line of the current section.
{
  nr++;
  if (is < 0) { data_error("no section header"); }

  # Concatenating with "" forces a string value, so that the equality
  # tests below are never performed numerically on number-like lines.
  lin = $0 "";
  gsub(/[ ]+$/, "", lin);

  # The hyphen must come last in the bracket expression; written as
  # " -=" it would denote the whole character range from ' ' to '='.
  ruler = (lin ~ /^[ =+|-]*$/);

  nl = nlin[is];
  if (ruler) {
    # Blank or ruler-like line: compare only with the previous kept line.
    dup = ((nl > 0) && (lin == line[is,nl-1]));
  } else {
    # Ordinary line: compare with every line already kept in the section.
    dup = ((is,lin) in seen);
  }

  if (dup) {
    # Discard this line, reporting it unless it is empty:
    if (lin != "") { printf "%s:%d: %s\n", FILENAME, FNR, lin > "/dev/stderr"; }
    next;
  }

  # Append to section:
  line[is,nl] = lin;
  if (! ruler) { seen[is,lin] = 1; }
  nlin[is]++;
  nt++;
  next;
}

END {
  # {exit} inside a rule still runs the END rule; do not emit partial
  # output or statistics after a fatal error.
  if (abort >= 0) { exit abort; }

  printf "%d header lines read\n", nh > "/dev/stderr";
  printf "%d text lines read\n", nr > "/dev/stderr";
  printf "%d distinct sections\n", ns > "/dev/stderr";
  printf "%d text lines kept\n", nt > "/dev/stderr";

  for (is = 0; is < ns; is++) {
    printf "@@ %s\n", stag[is];
    nl = nlin[is];
    for (j = 0; j < nl; j++) { print line[is,j]; }
  }
}

function data_error(msg)
{
  # Reports {msg} with the current file name and line number to
  # {stderr}, records the failure in {abort}, and terminates input
  # processing with exit status 1.
  printf "%s:%d: *** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}