#! /usr/bin/gawk -f
# Last edited on 2013-03-02 02:04:47 by stolfilocal

# Reads a list of entries extracted from the Wikimapia contribs panel.
# Writes them in a table format.
#
# Input has fields "{{TAG}={VALUE}}" separated by "|"; that is, each
# field is a literal "{", an upper-case tag, "=", a value, and a literal "}".
# First field is "@@".
#
# On the first malformed line the script prints a diagnostic to
# "/dev/stderr" and exits with status 1.

BEGIN {
  # {abort} is negative while everything is OK; otherwise it is the exit status.
  abort = -1;
  FS = "|";

  # Tags that may be present in input:
  ntags_in = split("OBJ LANG TYPE LREV PREV LON1 LAT1 LON2 LAT2 ZOOM FLAG USER NAME", tags_in, " ");

  # Tags that should be written to table:
  ntags_ot = split("DIR OBJ LANG TYPE LREV PREV LON1 LAT1 LON2 LAT2 ZOOM FLAG USER NAME", tags_ot, " ");

  # Tags indicator vector:
  split("", is_tag); # {is_tag[tag]} is 1 iff {tag} is a valid input field tag.
  for (i = 1; i <= ntags_in; i++) { is_tag[tags_in[i]] = 1; }

  # Output the table header
  output_header(ntags_ot, tags_ot);
  output_dashes(ntags_ot);
}

# Stop reading input as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Data line: parse all fields, normalize them, and write one table row.
/^[@][@]/ {
  split("", tag_val); # {tag_val[tag]} is the value of field {tag}.
  if ($1 != "@@") { data_error(("invalid field 1 = «" $1 "»")); }

  for (i = 2; i <= NF; i++) {
    # Get next field:
    fd = $(i);

    # Normalize spaces (note: this also affects spaces inside the value):
    gsub(/^[ ]+/, "", fd);
    gsub(/[ ]+$/, "", fd);
    gsub(/[ ]*[=][ ]*/, "=", fd);
    gsub(/[ ][ ]+/, " ", fd);

    # Check syntax:
    if (! match(fd, /^[{][A-Z0-9]+[=].*[}]$/)) {
      data_error(("invalid field " i " = «" fd "»"));
    }

    # Remove braces:
    fd = substr(fd, 2, length(fd) - 2);

    # Split at the first "=" into {tag,val}.  The syntax check above
    # guarantees that an "=" is present, so a failure here is a program bug.
    if (! match(fd, /[=]/)) { prog_error("duh?"); }
    tag = substr(fd, 1, RSTART - 1);
    val = substr(fd, RSTART + 1);

    # Perform some cleanups and regularizations:
    if (tag == "OBJ") {
      val = clean_id(val);
    } else if (tag == "NAME") {
      val = clean_name(val);
    }

    # Check for repeats (a repeated tag is accepted only with the same value):
    if (tag in tag_val) {
      if (val != tag_val[tag]) {
        data_error(("inconsistent field " i ": «" tag "=" val "» prev val = «" tag_val[tag] "»"));
      }
    } else if (! (tag in is_tag)) {
      data_error(("invalid tag in field " i ": «" tag "=" val "»"));
    } else {
      tag_val[tag] = val;
    }
  }

  # Remove possible duplicate fields:
  if (tag_val["PREV"] == tag_val["OBJ"]) { tag_val["PREV"] = ""; }
  if (tag_val["LREV"] == tag_val["OBJ"]) { tag_val["LREV"] = ""; }
  if (tag_val["LON2"] == tag_val["LON1"]) { tag_val["LON2"] = ""; }
  if (tag_val["LAT2"] == tag_val["LAT1"]) { tag_val["LAT2"] = ""; }

  # Blank out specific values and fill in a missing zoom.
  # NOTE(review): "1654246" is presumably the default/own user id -- confirm.
  if (tag_val["FLAG"] == "0") { tag_val["FLAG"] = ""; }
  if (tag_val["USER"] == "1654246") { tag_val["USER"] = ""; }
  if (tag_val["ZOOM"] == "") { tag_val["ZOOM"] = "14"; }

  # Split "OBJ" tag into "DIR" and "OBJ" proper (at the first "/"):
  if (match(tag_val["OBJ"], /[\/]/)) {
    tag_val["DIR"] = substr(tag_val["OBJ"], 1, RSTART - 1);
    tag_val["OBJ"] = substr(tag_val["OBJ"], RSTART + 1);
  }

  # Output entry:
  for (i = 1; i <= ntags_ot; i++) {
    tag = tags_ot[i];
    printf "%s |", tag_val[tag];
  }
  printf "\n";
  next;
}

# Any line not handled by the rule above is an error.
// { data_error(("invalid line format")); }

END {
  # {exit} inside a rule still runs END; do not write the closing rule of
  # the table after an error, so that a truncated table is not mistaken
  # for a complete one.
  if (abort >= 0) { exit abort; }
  output_dashes(ntags_ot);
}

function output_header(nt,t,   j) {
  # Writes the table header line with the {nt} column names {t[1..nt]}.
  for (j = 1; j <= nt; j++) { printf " %s !", t[j]; }
  printf "\n";
}

function output_dashes(nt,   j) {
  # Writes a horizontal rule line for a table with {nt} columns.
  for (j = 1; j <= nt; j++) { printf "---+"; }
  printf "\n";
}

function clean_id(id) {
  # Strips leading quotes/parentheses/slashes and a trailing optional
  # "/xx" (two lower-case letters) plus quotes/parentheses/slashes from {id}.
  # The result must be a decimal number, optionally prefixed by
  # "street/", "ferry/" or "river/"; otherwise aborts with a data error.
  gsub(/^["(\/]*/, "", id);
  gsub(/([\/][a-z][a-z])?[")\/]*$/, "", id);
  if (id !~ /^(street[\/]|ferry[\/]|river[\/]|)[0-9]+$/) {
    data_error(("invalid id [" id "]"));
  }
  return id;
}

function clean_name(na) {
  # Turns the object name {na} into a dash-separated, percent-encoded token.
  # Strips enclosing quotes/braces/blanks and a trailing "(xx)" suffix,
  # decodes "&amp;", maps blanks and punctuation to "-", squeezes and trims
  # dashes, and percent-encodes the rest with {utf8_to_url}.
  gsub(/^["{ ]*/, "", na);
  gsub(/ *([(][a-z][a-z][)] *)?["}]*$/, "", na);
  gsub(/[&]amp[;]/, "\\&", na);  # "\\&" yields a literal "&" in the replacement.
  gsub(/[][ '()_.,;!?]/, "-", na);
  gsub(/[-][-]+/, "-", na);
  gsub(/^[-]+/, "", na);
  gsub(/[-]+$/, "", na);
  na = utf8_to_url(na);
  if (na == "") { na = "[UNNAMED]"; }
  if (na == "DELETED") { na = "[DELETED]"; }
  return na;
}

function utf8_to_url(x,   c) {
  # Percent-encodes in {x} the characters "%", "&", "+", ".", "<", ">"
  # and every byte from octal 177 to 377 (0x7F..0xFF), as "%XX" with
  # upper-case hex digits.  Returns the encoded string.

  # "%" must be handled first, so that the "%" signs introduced by the
  # later substitutions are not encoded again.
  gsub(/[%]/,    "%25", x);
  gsub(/[&]/,    "%26", x);
  gsub(/[\053]/, "%2B", x);
  gsub(/[\056]/, "%2E", x);
  gsub(/[\074]/, "%3C", x);
  gsub(/[\076]/, "%3E", x);

  # DEL and all non-ASCII bytes, including octal 220..237 (0x90..0x9F).
  # The dynamic regexp "[\NNN]" is the same octal escape that a regexp
  # constant /[\NNN]/ would contain; the replacement has only "%" and hex
  # digits, so it is never re-matched by a later iteration.
  for (c = 127; c <= 255; c++) {
    gsub(("[\\" sprintf("%o", c) "]"), sprintf("%%%02X", c), x);
  }
  return x;
}

function data_error(msg) {
  # Reports an error in the current input line and aborts with status 1.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " [%s]\n", $0 > "/dev/stderr";
  abort = 1;
  exit abort;
}

function prog_error(msg) {
  # Reports an internal inconsistency of this script and aborts with status 1.
  printf "%s:%d: ** PROG ERROR: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " [%s]\n", $0 > "/dev/stderr";
  abort = 1;
  exit abort;
}