#! /usr/bin/gawk -f
# Last edited on 2003-10-12 12:29:48 by stolfi

# Cleans up a list of entries copied from the "Nature" contents alert.
#
# Input: entries separated by blank lines.  Within an entry the lines are
# classified, in this order of precedence, as
#   - URL:     starts with "http:"
#   - AUTHORS: consists only of capital letters, blanks and - ' . , &
#   - TITLE:   anything else
# and must appear in the order TITLE, AUTHORS, URL.
#
# Output: a single line per entry, TITLE @ AUTHORS @ URL
# where the AUTHORS have been shortened to "FIRST AUTHOR ..." and the URL
# has "${NAT}" instead of "http://www.nature.com".
#
# Exit status is 1 (with a message on stderr) if a malformed entry is found.
#
# NOTE(review): the non-ASCII letters in the author character classes look
# like accented Latin capitals that went through a wrong charset
# conversion -- confirm against real input before changing them.

BEGIN {
  abort = -1;   # Exit status requested by error(); negative means "no error".
  usage = ( ARGV[0] " < INFILE > OUTFILE" );   # Not referenced elsewhere in this script.
  begin_entry();
}

# Defensive guard: stop consuming input once an error has been flagged.
(abort >= 0) { exit abort; }

# Strip trailing blanks and tabs from every line, so that whitespace-only
# lines count as entry separators.
{ gsub(/[ \011]+$/, "", $0); }

# A blank line closes the current entry and opens a new one.
/^$/ {
  finish_entry();
  begin_entry();
  next;
}

# URL line: must follow the authors, and only one is allowed per entry.
/^http:/ {
  if (n == 0) { error("missing author"); }
  if (n == 2) { error("duplicate URL"); }
  n = 2;
  append_to_field(n, $0);
  next;
}

# Author(s) line: may continue over several lines, but not after the URL.
/^[-A-ZΑΙΣΘά'.,\& ]+$/ {
  if (n == 2) { error("garbled entry"); }
  n = 1;
  append_to_field(n, $0);
  next;
}

# Any other non-empty line is title text; it must precede authors and URL.
/./ {
  if (n == 2) { error("garbled entry"); }
  if (n == 1) { error("garbled entry"); }
  n = 0;
  append_to_field(n, $0);
  next;
}

END {
  # "exit" inside a rule or function still runs END; honor the error status.
  if (abort >= 0) { exit abort; }
  finish_entry();
}

# Reports {msg} with the current input line number on stderr (so that it
# does not pollute the cleaned-up output) and terminates with status 1.
function error(msg)
{
  printf "*** line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

# Appends {text} to field[k], using a single blank as separator between
# successive lines.  No separator is added before the first line, so that
# fields never start with a blank (the author-shortening pattern in
# finish_entry is anchored at the first character).
function append_to_field(k, text)
{
  if ((k in field) && (field[k] != ""))
    { field[k] = (field[k] " " text); }
  else
    { field[k] = text; }
}

# Writes the current entry as "TITLE @ AUTHORS @ URL".
# Entries with no content at all (e.g. produced by consecutive blank lines
# or by a blank line at end of file) are silently skipped.
function finish_entry(   tit,aut,url)
{
  tit = field[0];
  aut = field[1];
  url = field[2];
  if ((tit == "") && (aut == "") && (url == "")) { return; }
  # Keep only the first author when a "," or "&" follows it.
  aut = gensub(/^([A-ZΑΙΣΘά][A-ZΑΙΣΘά' ]*[A-ZΑΙΣΘά])[ ]*[,\&].*$/, "\\1 ...", "g", aut);
  # Abbreviate the site prefix; dots are bracketed so they match literally.
  gsub(/http[:][\/][\/]www[.]nature[.]com/, "${NAT}", url);
  printf "%s @ %s @ %s\n", tit, aut, url;
}

# Resets the per-entry state: clears all fields and expects a title next.
# {n} is the index of the field being collected: 0 title, 1 authors, 2 URL.
function begin_entry()
{
  split("", field);
  n = 0;
}