#! /usr/bin/gawk -f
# Last edited on 2014-12-03 00:10:42 by stolfilocal

BEGIN {
    # Converts an HTML page fetched from "http://www.walletexplorer.com/"
    # into a plain ".txt" table.  Each input data line (one transaction) is
    # split into one or more output lines, one per transaction input or output.
    #
    # Note that the page lists transactions in inverse chronological order,
    # and that addresses are combined into "wallets" by some unspecified
    # clustering logic.

    # Error status: negative means "no error so far"; the error-reporting
    # functions set it to a non-negative exit code.
    abort = -1;

    # The wallet name is required; it labels intra-wallet transactions:
    if (wallet == "") { arg_error("must define {wallet}"); }

    # Keep 8 decimals after the point whenever a number is turned into a
    # string, both when printed (OFMT) and when converted (CONVFMT):
    OFMT = CONVFMT = "%.8f";

    # Line counters:
    nin = 0;    # Number of data lines in the input page.
    nout = 0;   # Number of data lines in the output file.

    # Total received and total sent in the file:
    vbtc_tot_R = vbtc_tot_S = 0;

    # Largest (in absolute value) received and sent amounts in the file:
    vbtc_max_R = vbtc_max_S = 0;

    # Number of receives, sends, and rearrangements:
    nout_R = nout_S = nout_I = 0;
}
  
# Stop at once, with status {abort}, if an error has already been flagged:
(abort >= 0) { exit abort; }

# Normalize whitespace in every record before the patterns below are tested.
# The tab is written as the "\t" escape rather than a literal TAB character,
# so that editors and copy/paste cannot silently turn it into a blank.
{
    gsub(/\t/, " ");        # Expand tabs.
    gsub(/[&]nbsp;/, " ");  # Expand HTML non-breaking spaces.
}

# Discard page boilerplate: every line matched here is skipped silently.
# Each pattern must match the whole line (leading/trailing blanks allowed).

# Blank lines and the document prologue:
/^ *$/                                                              { next }
/^ *<!DOCTYPE html> *$/                                             { next }
/^ *<html lang="en"> *$/                                            { next }
/^ *<meta charset="utf-8"[/]> *$/                                   { next }
/^ *<title>Wallet .*<[/]title> *$/                                  { next }
/^ *<link href="[/]styles.css" rel="stylesheet"> *$/                { next }

# Top bar with the site title and the search form:
/^ *<div id="topbar"> *$/                                           { next }
/^ *<h1><a href="[/]">.*<[/]a> - smart block explorer<[/]h1> *$/    { next }
/^ *<form action="[/]"> *$/                                         { next }
/^ *<input type="text" name="q" value=""[/]> *$/                    { next }
/^ *<input type="submit" value="Search address[/]wallet"[/]> *$/    { next }
/^ *<[/]form> *$/                                                   { next }

# Any line holding only a closing "</div>" (a single rule covers them all):
/^ *<[/]div> *$/                                                    { next }

# Main section: wallet heading with the table header row, and table end:
/^ *<div id="main"> *$/                                             { next }
/^ *<h2>Wallet.*<[/]h2><div.*<[/]th><[/]tr> *$/                     { next }
/^ *<[/]table><div>.*<[/]div> *$/                                   { next }

# Page footer:
/^ *<div style=".*"><[/]div> *$/                                    { next }
/^ *<p>[&]copy; <a href=.*>.*<[/]a> .*<[/]p> *$/                    { next }
/^ *<[/]html> *$/                                                   { next }

# Clean up data lines.  Each data line is one "<tr>" element of class
# "received" or "sent" describing one transaction.  The HTML markup is
# rewritten into a flat list of "@{tag}={value}" fields, which is then
# passed to {output_as_txtable_rows}.
#
# Leading blanks are accepted before the "<tr ...>", like in the header
# rules and in the substitutions below; otherwise an indented data line
# would be rejected as "bad format".
/^ *<tr class="(received|sent)">.*<[/]tr> *$/ \
  {
    lin = $0;

    # Replace HTML formatting by tags, leaving a list of "@{tag}={value}".
    # Tags: K = kind (R or S), D = date, H = time, B = balance,
    # T = transaction id, IO = input/output list, V = amount, W = wallet.
    gsub(/^ *<tr class="received">/, "@K=R ", lin);
    gsub(/^ *<tr class="sent">/, "@K=S ", lin);
    gsub(/<[/]tr> *$/, "", lin); # Discard final </tr>.

    # Drop the colored wallet marker:
    gsub(/<span class="walletcolor" style="background-color: [#][0-9a-f]*"><[/]span>/, "", lin);

    gsub(/<td>-<[/]td>/, "@W=REWARD", lin); # Mined block reward.

    # Table cells identified by their class:
    gsub(/<td class="date">/, " @D=", lin);
    gsub(/<td class="txid">/, " @T=", lin);
    gsub(/<td class="inout">/, " @IO=", lin);
    gsub(/<td class="amount diff">/, " @V=", lin);
    gsub(/<td class="amount">/, " @B=", lin);
    gsub(/<td>/, "", lin);

    # The nested table becomes a "{ ... }" list; links become tags, and
    # the link text (everything up to the closing </a>) is discarded:
    gsub(/<table class="empty">/, "{ ", lin);
    lin = gensub(/<a href="[/]wallet[/]([^<>"]*)">/, "@W=\\1<a>", "g", lin);
    lin = gensub(/<a href="[/]txid[/]([^<>"]*)">/, "@T=\\1<a>", "g", lin);
    gsub(/<em>fee<[/]em>/, " @W=FEE ", lin);
    gsub(/<a>[^<>]*<[/]a>/, "", lin);
    gsub(/<[/]table>/, " } ", lin);

    gsub(/<em>/, "", lin);
    gsub(/<[/]em>/, "", lin);

    gsub(/<[/]td>/, "", lin);

    # Rows of the nested table become "{ ... }" items:
    gsub(/<tr>/, " { ", lin);
    gsub(/<[/]tr>/, " } ", lin); # Internal </tr>.

    # The txid cell and the txid link each produced a "@T=" tag; keep one:
    gsub(/@T=@T=/, "@T=", lin);

    # Remove "()" around fee amount:
    lin = gensub(/@V=[(]([-+]?[0-9.]*)[)]/, "@V=\\1", "g", lin);

    # Tag the time (the "hh:mm:ss" that follows the "yyyy-mm-dd" date):
    lin = gensub(/^(.*20[01][0-9]-[01][0-9]-[0-3][0-9]) +([0-2][0-9][:][0-5][0-9][:][0-6][0-9].*)$/, "\\1 @H=\\2", "g", lin);

    # Move input/output list to end of record (non-empty list, then empty list):
    lin = gensub(/^(.*) *(@IO=[{] *[{].*[}] *[}]) *(.*) *$/, "\\1 \\3 \\2", "g", lin);
    lin = gensub(/^(.*) *(@IO=[{] *[}]) *(.*) *$/, "\\1 \\3 \\2", "g", lin);

    output_as_txtable_rows(lin);

    nin++;
    next;
  }

# Catch-all: a line that reaches this point matched none of the rules above.
{ data_error("bad format"); }
  
END \
  { # Writes the final summary to stderr.

    # An {exit} executed in BEGIN or in a main rule still runs the END rule.
    # When an error has been flagged, skip the summary and keep the error status:
    if (abort >= 0) { exit abort; }

    printf "%d lines read, %d lines written\n", nin, nout > "/dev/stderr";
    printf "receives: count %6d total %+.8f max %+.8f\n", nout_R, vbtc_tot_R, vbtc_max_R > "/dev/stderr";
    printf "sends:    count %6d total %+.8f max %+.8f\n", nout_S, vbtc_tot_S, vbtc_max_S > "/dev/stderr";
    printf "shuffles: count %6d\n", nout_I > "/dev/stderr";
    printf "\n" > "/dev/stderr";
  }
  
function output_as_txtable_rows(lin,   fix,ios,fixN,fixF,kind,date,hour,fbal,txid,iosN,iosF,k,nio,wall,vbtc,kwall,kvbtc,vtot,pbal,vsgn)
  {
    # Takes a data line from the HTML file and outputs one or more lines,
    # one line for each input or for each output of the transaction.
    # Each output line has the format
    #   "{date} {hour} | {txid} | {index} | {amount} | {wallet} | {balance}".
    # Updates the global variables {nout,nout_R,nout_S,nout_I,vbtc_tot_R,vbtc_tot_S,vbtc_max_R,vbtc_max_S}.
    # Reads the global {wallet}.  Malformed input is reported with {data_error},
    # which terminates the program.
    
    # Assumes that the input data line {lin} has been cleaned up and reformatted as
    # a sequence of fields "@{tag}={value}".  For most {tag}s
    # the {value} is a signed number of bitcoins or a simple string without quotes.
    # For the "IO" tag, the {value} is a list of pairs
    # "@V={amount} @W={wallet}" delimited by "{{...}{...}...{...}}".
    
    # Split {lin} into the fixed-format part and the output list:
    fix = lin; gsub(/@IO=.*$/, "", fix);
    ios = lin; gsub(/^.*@IO=/, "", ios);
    
    # Split the fixed part into fields:
    # printf "%s\n", fix > "/dev/stderr";
    
    gsub(/[ ]/, "", fix);  # Remove all spaces.
    gsub(/^[@]/, "", fix); # Remove leading "@".  
    # After the split, {fixF} alternates tag, value, tag, value, ...
    fixN = split(fix, fixF, /[@=]/);
    if (fixN != 10) 
      { data_error(("wrong fixed field count = " fixN " \"" fix "\"")); }
    
    # Transaction kind:
    kind = get_tagged_field("K", fixF[1], fixF[2]);
    if (kind !~ /^[RS]$/) { data_error(("invalid {kind} \"" kind "\"")); }
    
    # Date and time of the transaction:
    date = get_tagged_field("D", fixF[3], fixF[4]);
    if (date !~ /^20[01][0-9]-[01][0-9]-[0-3][0-9]$/) { data_error(("invalid {date} \"" date "\"")); }
    
    hour = get_tagged_field("H", fixF[5], fixF[6]);
    if (hour !~ /^[0-2][0-9][:][0-5][0-9][:][0-6][0-9]$/) { data_error(("invalid {hour} \"" hour "\"")); }
    
    # Balance listed for the transaction:
    fbal = get_tagged_field("B", fixF[7], fixF[8]);
    if (fbal !~ /^[-+]?[0-9]+([.][0-9]*)?$/) { data_error(("invalid {fbal} \"" fbal "\"")); }
    
    # Transaction id: exactly 64 lowercase hex digits.
    # (The pattern uses a plain "+"; "+?" would also accept an empty string.)
    txid = get_tagged_field("T", fixF[9], fixF[10]);
    if (txid !~ /^[0-9a-f]+$/) {data_error(("invalid {txid} \"" txid "\"")); }
    if (length(txid) != 64) {data_error(("invalid {txid} length = " length(txid))); }
    
    # Split the input/output list, save in {wall[0..nio-1],vbtc[0..nio-1]}, compute total {vtot}:
    # printf "%s\n", ios > "/dev/stderr";
    nio = 0; # Number of inputs or outputs.
    vtot = 0; # Sum of all amounts; stays zero for an empty list.
    split("", wall); # Wallet of input or output, indexed {0..nio-1}.
    split("", vbtc); # Amount of input or output, indexed {0..nio-1}.
    gsub(/[{} ]/, "", ios); # Remove spaces and braces.
    gsub(/^[@]/, "", ios); # Remove leading "@"
    iosN = split(ios, iosF, /[@=]/);
    if (iosN == 0)
      { # Internal rearrangement transaction.
        kind = "I";
        vbtc[0] = 0;
        wall[0] = wallet; # !!! Should be argument.
        nio = 1;
      }
    else
      { # Each list entry contributes 4 fields: two tags and two values.
        if ((iosN % 4) != 0) { data_error(("wrong input/output field count = " iosN " \"" ios "\"")); } 
        vsgn = (kind == "R" ? +1.0 : -1.0); # Expected sign of amounts transferred.
        for (k = 1; k < iosN; k += 4)
          { # Get the next input/output, save it in {wall,vbtc}: 

            # For receive the wallet comes first, for send it comes second:
            kwall = (kind == "R" ? k : k+2);
            kvbtc = (kind == "R" ? k+2 : k);

            wall[nio] = get_tagged_field("W", iosF[kwall], iosF[kwall+1]);
            if (wall[nio] !~ /^[-A-Za-z0-9.]+$/) { data_error(("invalid wallet \"" wall[nio] "\"")); } 

            vbtc[nio] = get_tagged_field("V", iosF[kvbtc], iosF[kvbtc+1]);
            if (vbtc[nio] !~ /^[-+]?[0-9]+([.][0-9]*)?$/) { data_error(("invalid input/output amount \"" vbtc[nio] "\"")); } 
            if (vsgn*vbtc[nio] < 0) { data_error(("input/output amount " vbtc[nio] " has wrong sign for {kind} \"" kind "\"")); }
            if (vbtc[nio] == 0) { data_warning(("input/output amount " vbtc[nio] " is zero")); }

            vtot += vbtc[nio];

            nio++;
          }
      }
      
    # Write one table row for each input or output, from the last list entry
    # to the first.  The balance column starts at the listed balance {fbal};
    # each row's amount is then subtracted to give the balance of the next row:
    pbal = fbal;
    for (k = nio-1; k >= 0; k--)
      { 
        # printf "pbal = %s\n", pbal > "/dev/stderr";
        
        printf "%s %s", date, hour;
        printf " | %s | %5d", txid, k;
        printf " | %s | %s", (vbtc[k] == 0 ? "00.00000000" : sprintf("%+.8f", vbtc[k])), wall[k];
        printf " | %.8f\n", pbal;
        pbal -= vbtc[k];
        
        # Update global max and total amounts:
        if (kind == "R")
          { nout_R++;
            vbtc_tot_R += vbtc[k];
            if (vbtc[k] > vbtc_max_R) { vbtc_max_R = vbtc[k]; }
          } 
        else if (kind == "S")
          { nout_S++;
            vbtc_tot_S += vbtc[k];
            # Sent amounts are negative, so the largest one is the minimum:
            if (vbtc[k] < vbtc_max_S) { vbtc_max_S = vbtc[k]; }
          }
        else if (kind == "I")
          { nout_I++; }
        else
          { prog_error(("invalid kind \"" kind "\"")); }
        
        # Update global output line count:
        nout++;
      }
      
    # Sanity check: the running balance must equal {fbal} minus the total:
    check_balance(pbal, fbal - vtot);
  }
    
function check_balance(b1,b2,  x1, x2)
  { # Checks that the balances {b1} and {b2} agree when both are rounded
    # to 8 decimals.  Reports a program error (via {prog_error}) if they
    # differ; otherwise returns quietly.
    x1 = sprintf("%.8f", b1);
    x2 = sprintf("%.8f", b2);
    if (x1 == x2) { return; }
    prog_error(("inconsistent balance -- " x1 " should be " x2));
  }
    
function get_tagged_field(tag, f1, f2)
  { # Returns the value {f2} of a tag/value pair, after checking that the
    # pair's tag {f1} is the expected {tag}.  A mismatch is reported
    # with {data_error}.
    if (f1 == tag) { return f2; }
    data_error(("expected \"" tag "\", got \"" f1 "\""));
    return f2;
  }

function data_error(msg)
  { # Reports a problem in the input data: prints the file name, the line
    # number and {msg} to stderr, followed by the current record; then
    # flags the error in {abort} and exits with that status.
    abort = 1;
    printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
    printf "  %s\n", $0 > "/dev/stderr";
    exit abort;
  }
          
function arg_error(msg)
  { # Reports a problem with the program's arguments: prints {msg} to
    # stderr, flags the error in {abort} and exits with that status.
    abort = 1;
    printf "** %s\n", msg > "/dev/stderr";
    exit abort;
  }
          
function prog_error(msg)
  { # Reports an internal inconsistency of the program: prints {msg} to
    # stderr, flags the error in {abort} and exits with that status.
    abort = 1;
    printf "** PROGRAM ERROR: %s\n", msg > "/dev/stderr";
    exit abort;
  }