# Last edited on 2015-04-17 22:46:15 by stolfilocal

Fetching the presumed BitPay wallet data

  TO DO: change  {fetch-bitpay-wallet-data.sh} to save in "raw/${wallet}/${fetchdate}"
  instead of "bitpay-pages/${fetchdate}".
  
  Executed:

    fetch_wallet_data.sh 2014-11-27 "BitPay.com" 2600 3200
    
    fetch_wallet_data.sh 2014-11-28 "BitPay.com"    1 2610
    fetch_wallet_data.sh 2014-11-28 "BitPay.com" 3190 4600
    
    fetch_wallet_data.sh 2014-11-29 "BitPay.com" 4580 6600
    
    fetch_wallet_data.sh 2014-11-30 "BitPay.com"    1  300
    fetch_wallet_data.sh 2014-11-30 "BitPay.com" 6580 7600
    
 Again, to make sure we got all transactions:
    
    fetch_wallet_data.sh 2014-12-01 "BitPay.com"    1 1519
    fetch_wallet_data.sh 2014-12-01 "BitPay.com" 2550 2650
    
    fetch_wallet_data.sh 2014-12-01 "BitPay.com" 3150 3250
    fetch_wallet_data.sh 2014-12-01 "BitPay.com" 4530 4630
    fetch_wallet_data.sh 2014-12-01 "BitPay.com" 6530 6630
  
    fetch_wallet_data.sh 2014-12-03 "BitPay.com"    1  100
    fetch_wallet_data.sh 2014-12-03 "BitPay.com" 7600 7623
    
Cleanup

    # for fetchdate in 2014-11-27 2014-11-28 2014-11-29 2014-11-30 2014-12-01 2014-12-03 ; do 

    # Convert every fetched page "raw/${wallet}/${fetchdate}/NNNN.html" into
    # "txt/${wallet}/${fetchdate}/NNNN.txt" with {cleanup_wallet_data.gawk},
    # collecting the script's stderr for all pages of a batch in "log.txt".
    wallet="BitPay.com"
    for fetchdate in 2014-12-03; do
      hdir="raw/${wallet}/${fetchdate}"   # Input directory (fetched HTML pages).
      tdir="txt/${wallet}/${fetchdate}"   # Output directory (text pages and log).
      mkdir -p "${tdir}"
      logfile="${tdir}/log.txt"
      rm -f "${logfile}"                  # Start each run with a fresh log.
      # The glob expands in sorted order, so pages are processed by page number.
      for hfile in "${hdir}"/????.html ; do
        # An unmatched glob stays literal; skip it so an empty batch is a no-op.
        [[ -e "${hfile}" ]] || continue
        page="${hfile##*/}"; page="${page%%.*}"
        tfile="${tdir}/${page}.txt"
        # Prefix the messages of each page with the page number. The page name
        # is passed as data, never as the printf format string.
        printf '%s: ' "${page}" >> "${logfile}"
        cleanup_wallet_data.gawk \
            -v wallet="${wallet}" \
          "${hfile}" \
          > "${tfile}" \
          2>> "${logfile}"
      done
      ls -l "${logfile}"
    done
    
  Checking for errors:    

    # List log lines containing two consecutive '*' or '!' characters
    # (presumably the error/warning markers of the cleanup script -- confirm).
    # "grep -E" replaces the obsolescent "egrep".
    grep -E -e '[*!][*!]' txt/BitPay.com/*/log.txt

  Quick stats:

    # Collect the log lines containing "receives:" from all batches and sort
    # them by their 7th blank-separated field, numerically, largest first.
    # The result goes to the scratch file ".recvs" in the current directory.
    grep -e 'receives:' txt/BitPay.com/*/log.txt \
      | sort -b -k7,7gr \
      > .recvs
    
  Concatenating the files of each batch in reverse order,
  to files "txt/${wallet}/${fetchdate}/.all.txt":
  
    # For each batch, write ".all.txt" = the page files taken from the highest
    # page number to the lowest, each with its lines reversed by {tac}.
    wallet="BitPay.com"
    for fetchdate in 2014-11-27 2014-11-28 2014-11-29 2014-11-30 2014-12-01 2014-12-03; do
      tdir="txt/${wallet}/${fetchdate}"
      afile="${tdir}/.all.txt"
      pages=( "${tdir}"/????.txt )   # Sorted ascending by the shell.
      # With no page files {tac} would get no arguments and wait on stdin.
      if [[ ! -e "${pages[0]}" ]]; then
        echo "** no page files in ${tdir}" 1>&2
        continue
      fi
      # Reverse the page list so that the last page comes first.
      rpages=()
      for (( i = ${#pages[@]} - 1; i >= 0; i-- )); do
        rpages+=( "${pages[i]}" )
      done
      tac "${rpages[@]}" > "${afile}"
      ls -l "${afile}"
    done
    
  Counting records per day:

    # Prints how many input lines share each value of the first field.
    # NOTE(review): the input is read from stdin and was not recorded here;
    # presumably a ".all.txt" file, whose first field is the date -- confirm.
    gawk '//{ print $1; }' | sort | uniq -c     

  Edited the files and split them into year-by-year files, then month-by-month files:
  
    # Split each non-empty year file "${year}-${tag}.txt" into month files
    # "${year}-${month}-${tag}.txt", keeping the records in input order.
    # Records are appended (">>") to the month files, so remove stale month
    # files before re-running, or their records will be duplicated.
    wallet="BitPay.com"
    pushd "txt/${wallet}/monthly"
    for year in 2011 2012 2013 ; do
      for tag in a b c d e f g h i j k l m ; do
        yfile="${year}-${tag}.txt"
        echo "${yfile} ... " 1>&2
        if [[ -s "${yfile}" ]]; then
          echo "ok" 1>&2
          gawk -v tag="${tag}" \
              ' BEGIN { omo = ""; ofile = ""; }
                # Dated record: route it to the file of its year-month.
                /^[ ]*20[01][0-9]-[0-1][0-9]/ {
                  mo = substr($1,1,7);
                  if (mo != omo)
                    { # Month changed: close the previous output file.
                      if (omo != "") { close(ofile); }
                      omo = mo; ofile = (mo "-" tag ".txt");
                    }
                  print >> ofile;
                  next;
                }
                # Any other line is unexpected; flag it on stdout.
                //{ print "** duh"; print; }
                END { if (omo != "") { close(ofile); } }
              ' \
            "${yfile}"
        fi
      done
    done
    # Return to the starting directory, as the other snippets in these notes do.
    popd
                        
Database shift

  Between one fetch and the other, the database was updated and some
  wallets were merged. Whenever that happens, the wallet numbers in the
  older file are synonyms of the new numbers in the source site.
    
  Because of these wallet merges, the balances are inconsistent whenever
  the merged file switches from one fetch session to the next. Basically
  the balance after the splice gets incremented with the balances of the
  wallets that were merged with it. Should redo the fetch of the entire
  wallet in one swoop, in the hope of avoiding the database update.
  
  Looking for such inconsistencies:
  
    # For each month, list the records of the secondary-tag file that do not
    # appear in the reference-tag file (whole-line comparison).
    wallet="BitPay.com"
    pushd "txt/${wallet}/monthly"
    tags=( a c ) # tags[0] is the reference, tags[1] the secondary.
    for yrmo in 2014-10 2014-11 ; do
      # Receives the transactions in secondary file not in ref file.
      dfile=".${yrmo}-new.txt"
      rm -f ${dfile} .tmp-?.txt
      # Sorted copy of the file of each tag, as input for {bool}.
      for tag in "${tags[@]}" ; do
        sort ${yrmo}-${tag}.txt > .tmp-${tag}.txt
      done
      # Presumably "bool 1-2" outputs the lines of its first file that are
      # missing from the second (set difference) -- confirm with the tool.
      bool 1-2 .tmp-${tags[1]}.txt .tmp-${tags[0]}.txt > ${dfile}
      ls -l ${dfile}
    done
    popd
  
  Some of the above inconsistencies were fixed by hand in the files
  "txt/${wallet}/monthly/${yrmo}-${tag}.txt".  However, the balances
  were not fixed.
  
  Because of the database shift above, in order to check the basic
  consistency between output files with the same year-month but
  different ${tag}s, we compare date, hour, transaction id, input/output
  index, and input/output amount (ignoring source/destination wallet and
  balance):
  
    # Same comparison as for whole records, but restricted to blank-separated
    # fields 1, 2, 4, 6 and 8 of each record, so that differences in the
    # remaining fields are ignored.
    wallet="BitPay.com"
    pushd "txt/${wallet}/monthly"
    tags=( a c ) # tags[0] is the reference, tags[1] the secondary.
    for yrmo in 2014-11 ; do
      # Receives the transactions in secondary file not in ref file.
      dfile=".${yrmo}-new.txt"
      rm -f ${dfile} .tmp-?.txt
      # Project each file onto the compared fields and sort the result.
      for tag in "${tags[@]}" ; do
        gawk '{ print $1, $2, $4, $6, $8 ; }' ${yrmo}-${tag}.txt \
          | sort \
          > .tmp-${tag}.txt
      done
      # Presumably "bool 1-2" outputs the lines of its first file that are
      # missing from the second (set difference) -- confirm with the tool.
      bool 1-2 .tmp-${tags[1]}.txt .tmp-${tags[0]}.txt > ${dfile}
      ls -l ${dfile}
    done
    popd

Date errors

  There were some entries out of order in the fetched pages:

    In "txt/BitPay.com/2014-12-03/7606.txt" :
      
    2012-03-09 03:48:25 | ba06611298c37367... | 0 | +0.50000000 | 001656dd6d6aa746 | 28.17797000
    2012-03-09 03:22:28 | c8408f663dbbec9d... | 0 | +1.00000000 | 06b0d10e9435d5c1 | 27.67797000
    2012-03-09 03:14:35 | a4a85bafd72849eb... | 0 | +0.50000000 | 08d25895a105b70f | 26.67797000
    2012-03-09 03:22:54 | 3c4f335e33b6b255... | 0 | +0.50000000 | 13ce7d8f7ec97b25 | 26.17797000
      
  Apparently the date and hour are not those of when the transaction was
  confirmed (block mining), but of when it was issued.

Plotting

  Plotting the wallet traffic:
  
    
    wallet="BitPay.com"
    compute_and_plot_wallet_traffic.sh ${wallet} 2011-07-02 2014-11-30
    
    compute_and_plot_wallet_traffic.sh ${wallet} 2013-01-01 2014-11-30
    

  Plotting the histograms by operation amount:
    
    wallet="BitPay.com"
    compute_and_plot_wallet_histograms.sh ${wallet} 2011-07-02 2014-11-30
    
    compute_and_plot_wallet_histograms.sh ${wallet} 2013-01-01 2013-10-31
    compute_and_plot_wallet_histograms.sh ${wallet} 2013-11-01 2013-11-29
    compute_and_plot_wallet_histograms.sh ${wallet} 2014-01-01 2014-11-30
    
Finding the largest deposits:

  # Take the 1000 records with the largest 8th field (numeric, descending),
  # keep fields 1, 2, 4, 8 and 10 of each, and re-sort the selection by its
  # leading fields before saving it.
  wallet="BitPay.com"
  cat txt/${wallet}/monthly/20??-??-?.txt \
    | sort -b -k8,8gr \
    | gawk '{ printf "%s %s %s %16s %s\n", $1, $2, $4, $8, $10; }' \
    | head -1000 \
    | sort \
    > .biggest-1000.txt

Finding the deposits from a certain sender:

  # Select the records whose 10th field matches the regular expression
  # ${sender}, keep fields 1, 2, 4, 8 and 10, and save them sorted.
  wallet="BitPay.com"
  sender="KnC"
  cat txt/${wallet}/monthly/20??-??-?.txt \
    | gawk -v sender="${sender}" '$10 ~ sender { printf "%s %s %s %16s %s\n", $1, $2, $4, $8, $10; }' \
    | sort \
    > .from-${sender}.txt

Looking at small entries:

  # Select the records whose 8th field, read as a number, is positive and at
  # most 1.0; keep fields 1, 2, 4, 8 and 10, and save them sorted.
  # ${wallet} is set here, as in the other snippets, so that this one can be
  # run on its own.
  wallet="BitPay.com"
  ( cd "txt/${wallet}/monthly/" && cat 20??-??-?.txt ) \
    | gawk '//{ v = $8 + 0; if ((v > 0) && (v <= 1.0)) printf "%s %s %s %16s %s\n", $1, $2, $4, $8, $10; }' \
    | sort \
    > .under-1-BTC.txt