├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── bitcoin-cli-dump ├── blockHashes.txt ├── dumpBlockHashes.scala └── processBlocks.scala ├── graphx-notebook ├── .gitingore ├── Dockerfile ├── Makefile ├── README.md ├── blockchain.snb.ipynb └── entrypoint.sh ├── notebook ├── .gitignore ├── Dockerfile ├── Makefile ├── Makefile_bak ├── README.md ├── blockchain.ipynb ├── buildImg.sh ├── custom │ ├── custom.css │ └── custom.js └── js │ └── sigma-graph.js ├── parquet-converter ├── .gitignore ├── README.md ├── build.sbt ├── data │ └── example1 │ │ ├── input │ │ └── blk00003.dat │ │ └── output │ │ ├── edges │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-f7851458-cd56-4835-88bf-f3fddf79e24e-c000.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── part-00000-f7851458-cd56-4835-88bf-f3fddf79e24e-c000.snappy.parquet │ │ └── nodes │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-9c206410-a79b-4313-a364-d1ba4d168480-c000.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── part-00000-9c206410-a79b-4313-a364-d1ba4d168480-c000.snappy.parquet ├── project │ ├── assembly.sbt │ ├── build.properties │ └── plugins.sbt ├── run-ds.sh ├── run-new.sh ├── run.sh └── src │ ├── main │ └── scala │ │ └── io │ │ └── radanalytics │ │ └── bitcoin │ │ ├── ConverterUtil.scala │ │ ├── Model.scala │ │ ├── ParquetConverter.scala │ │ ├── ParquetConverterDS.scala │ │ └── ParquetConverterNewArch.scala │ └── test │ ├── resources │ └── serialized │ └── scala │ └── io │ └── radanalytics │ └── bitcoin │ ├── HashAlgSpec.scala │ ├── SparkGraphxBitcoinSpec.scala │ └── TransactionsSpec.scala └── workers-with-data ├── Dockerfile ├── Dockerfile_3_graphs ├── Makefile ├── README.md └── buildImg.sh /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .idea 3 | metastore_db/ 4 | derby.log 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | 3 | scala: 4 | - 2.11.11 5 | 6 | services: 7 | - docker 8 | 9 | script: 10 | - cd parquet-converter 11 | - sbt ++2.11.11 clean assembly test 12 | #- cd ../notebook 13 | #- GRAPH_DATA=../parquet-converter/data/example1/output/ make build 14 | #- cd ../workers-with-data 15 | #- GRAPH_DATA=../parquet-converter/data/example1/output make build 16 | #- cd ../graphx-notebook 17 | #- GRAPH_DATA=../parquet-converter/data/example1/output make build 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 Red Hat, Inc. and/or its affiliates 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bitcoin Insights 2 | [![Build status](https://travis-ci.org/Jiri-Kremser/bitcoin-insights.svg?branch=master)](https://travis-ci.org/Jiri-Kremser/bitcoin-insights) 3 | [![Docker build](https://img.shields.io/docker/automated/jkremser/bitcoin-notebook.svg)](https://hub.docker.com/r/jkremser/bitcoin-notebook) 4 | [![Layers info](https://images.microbadger.com/badges/image/jkremser/bitcoin-notebook.svg)](https://microbadger.com/images/jkremser/bitcoin-notebook) 5 | 6 | ## Quick start 7 | Follow the tutorial on https://radanalytics.io/examples/blockchain 8 | 9 | or, for an ultra-quick start with some prepared data, just run: 10 | ```bash 11 | docker run --rm -ti -p 9000:9000 jkremser/bitcoin-spark-notebook:tutorial-1.0.0 12 | ``` 13 | 14 | ## Converter to Parquet 15 | 16 | Bitcoin stores all transactions in the binary format described [here](https://webbtc.com/api/schema); however, Spark works much better with columnar data formats such as Parquet, so we provide a simple converter from the blockchain binary format into Parquet files. The converter was inspired by the examples in the [hadoopcryptoledger](https://github.com/ZuInnoTe/hadoopcryptoledger/wiki/Using-Hive-to-analyze-Bitcoin-Blockchain-data) project and uses some of its classes for parsing the original format. 17 | 18 | ### Building 19 | 20 | ```bash 21 | sbt clean assembly 22 | ``` 23 | This will create a jar file at `./target/scala-2.11/bitcoin-insights.jar`. 24 | 25 | ### Running 26 | 27 | The converter reads binary block data from the input directory and writes Parquet files to the output directory; the paths to both directories are passed as arguments. Depending on the size of the task (in other words, how many blocks you want to convert), tweak the memory parameters of the following example command: 28 | ```bash 29 | ls ~/bitcoin/input/ 30 | # blk00003.dat 31 | 32 | spark-submit \ 33 | --driver-memory 2G \ 34 | --executor-memory 2G \ 35 | --class io.radanalytics.bitcoin.ParquetConverter \ 36 | --master local[8] \ 37 | ./target/scala-2.11/bitcoin-insights.jar ~/bitcoin/input ~/bitcoin/output 38 | 39 | # ... ...
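# (illustrative note, not part of the original output) each directory created by the
# converter is a regular Spark Parquet dataset: a _SUCCESS marker plus part-*.snappy.parquet
# files, as in data/example1/output in this repo. It can therefore be loaded back with
# e.g. spark.read.parquet("<output-dir>/transactions"), or baked into the notebook images
# below via their Makefiles.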
40 | 41 | ls ~/bitcoin/output 42 | # addresses blocks edges transactions 43 | ``` 44 | 45 | There are prepared bash scripts that can be used to automate the conversion tasks and make them idempotent: [run.sh](parquet-converter/run.sh), [run-new.sh](parquet-converter/run-new.sh), and [run-ds.sh](parquet-converter/run-ds.sh). 46 | 47 | ## Notebook 48 | ```bash 49 | cd notebook 50 | make build 51 | make run 52 | ``` 53 | -------------------------------------------------------------------------------- /bitcoin-cli-dump/dumpBlockHashes.scala: -------------------------------------------------------------------------------- 1 | import sys.process._ 2 | import java.io._ 3 | import scala.language.postfixOps 4 | 5 | val writer = new PrintWriter(new File("blockHashes.txt")) 6 | val totalBlocks = ("bitcoin-cli getblockcount" !!).trim.toInt 7 | val hashes = (0 to totalBlocks).par.map(height => s"bitcoin-cli getblockhash $height" !!) 8 | hashes.foreach(writer.write) 9 | writer.close() -------------------------------------------------------------------------------- /bitcoin-cli-dump/processBlocks.scala: -------------------------------------------------------------------------------- 1 | import sys.process._ 2 | import java.io._ 3 | import scala.language.postfixOps 4 | 5 | val blocks = scala.io.Source.fromFile("blockHashes.txt").mkString.split("\n").toList.take(1) 6 | println(blocks) 7 | 8 | blocks.foreach(block => { 9 | val output = Seq("/bin/sh", "-c", s"bitcoin-cli getblock $block | jq -c '.tx'").!!.trim 10 | val transactions = output.substring(1, output.length - 1).split(",").toList 11 | transactions.foreach(tx => { 12 | val txId = tx.substring(1, tx.length - 1) 13 | // todo: enhance this 14 | println(s"bitcoin-cli getrawtransaction $txId 1".!!) 15 | }) 16 | }) -------------------------------------------------------------------------------- /graphx-notebook/.gitingore: -------------------------------------------------------------------------------- 1 | blockchain.snb.ipynb_bak 2 | edges 3 | nodes 4 | -------------------------------------------------------------------------------- /graphx-notebook/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM andypetrella/spark-notebook:0.9.0-SNAPSHOT-scala-2.11.8-spark-2.2.0-hadoop-2.7.2 2 | #FROM jkremser/spark-notebook:0.9.0-SNAPSHOT-scala-2.11.8-spark-2.1.1-hadoop-2.7.3 3 | 4 | ADD blockchain.snb.ipynb /opt/docker/notebooks/ 5 | ADD addresses /tmp/addresses.parquet 6 | ADD blocks /tmp/blocks.parquet 7 | ADD transactions /tmp/transactions.parquet 8 | ADD edges /tmp/edges.parquet 9 | ADD entrypoint.sh /entrypoint.sh 10 | 11 | 12 | LABEL io.k8s.description="spark-notebook Notebook with blockchain analysis." \ 13 | io.k8s.display-name="spark-notebook Notebook with blockchain analysis." \ 14 | io.openshift.expose-services="9000:http" 15 | 16 | # /usr/sbin is needed because akka creates this /usr/sbin/.coursier/cache/v1/.structure.lock 17 | RUN chmod ug+rw /etc/passwd \ 18 | && chmod o+w /usr/sbin/ \ 19 | && chown daemon:daemon /opt/docker/notebooks/blockchain.snb.ipynb \ 20 | && find /opt/docker/notebooks/ !
-path /opt/docker/notebooks/ -type d -exec rm -rf {} + \ 21 | && apt-get update \ 22 | && apt-get install -y python \ 23 | && apt-get clean all 24 | 25 | USER daemon 26 | 27 | EXPOSE 9000 8000 28 | ENTRYPOINT ["/entrypoint.sh", "/opt/docker/bin/spark-notebook", "-Dpidfile.path=/tmp/pid"] 29 | -------------------------------------------------------------------------------- /graphx-notebook/Makefile: -------------------------------------------------------------------------------- 1 | LOCAL_IMAGE=$(USER)/bitcoin-spark-notebook 2 | GRAPH_DATA ?= $(HOME)/bitcoin/output/ 3 | 4 | 5 | .PHONY: build clean run test prettify open 6 | 7 | build: clean prettify 8 | -echo -e "\n--- Copying graph data from: $(GRAPH_DATA) ---\n" 9 | cp -r $(GRAPH_DATA)/* . 10 | -sed -i'_bak' "s/\([[:space:]]*\"name\": \"Blockchain\".*\)/\1 \"data-info\": \""$${GRAPH_DATA#*-}"\",/g" blockchain.snb.ipynb 11 | # $(MAKE) -f $(THIS_FILE) prettify 12 | docker build -t $(LOCAL_IMAGE) . 13 | -rm -rf edges addresses blocks transactions 14 | -mv blockchain.snb.ipynb_bak blockchain.snb.ipynb 15 | 16 | clean: 17 | -docker rmi -f $(LOCAL_IMAGE) 18 | 19 | run: 20 | # remove the container to avoid the name colision 21 | -docker rm -f spark-notebook || true 22 | # remove the other notebooks (in 5 seconds) 23 | #-(sleep 5 && docker exec spark-notebook find /opt/docker/notebooks/ ! -path /opt/docker/notebooks/ -type d -exec rm -rf {} +)& 24 | # and run it 25 | docker run --name spark-notebook --rm -ti -p 9000:9000 $(LOCAL_IMAGE) 26 | 27 | open: 28 | -(sleep 5.5 && firefox http://localhost:9000 &> /dev/null)& 29 | 30 | prettify: 31 | cp ./blockchain.snb.ipynb ./blockchain.snb.ipynb_bak 32 | cat ./blockchain.snb.ipynb_bak | python -m json.tool > ./blockchain.snb.ipynb 33 | rm ./blockchain.snb.ipynb_bak 34 | -------------------------------------------------------------------------------- /graphx-notebook/README.md: -------------------------------------------------------------------------------- 1 | [![Docker build](https://img.shields.io/docker/automated/jkremser/bitcoin-spark-notebook.svg)](https://hub.docker.com/r/jkremser/bitcoin-spark-notebook) 2 | [![Layers info](https://images.microbadger.com/badges/image/jkremser/bitcoin-spark-notebook.svg)](https://microbadger.com/images/jkremser/bitcoin-spark-notebook) 3 | ## Info 4 | 5 | This is the docker image with a spark-notebook using the GraphX. 6 | 7 | ### Building 8 | 9 | ```bash 10 | make build 11 | ``` 12 | This will build a docker image with the notebook and the example data. It assumes 13 | the parquet data generated by the converter. 14 | Some example data can be found in `../parquet-converter/data/example1/output/`, but 15 | I suggest creating own from the `~/.bitcoin/blocks/blk00xyz.dat` files using the 16 | `parquet-converter`. 17 | 18 | ### Running 19 | 20 | ```bash 21 | make open run 22 | ``` 23 | 24 | Then the notebook is listening on http://localhost:9000. 
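If you prefer not to go through `make`, the `run` target boils down to a plain `docker run`; a minimal sketch, assuming the image was built locally as `$USER/bitcoin-spark-notebook` (the Makefile's default `LOCAL_IMAGE`):

```bash
# equivalent of `make run`: drop a possibly existing container with the same name,
# then start the notebook on port 9000
docker rm -f spark-notebook || true
docker run --name spark-notebook --rm -ti -p 9000:9000 $USER/bitcoin-spark-notebook
```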
25 | -------------------------------------------------------------------------------- /graphx-notebook/blockchain.snb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "1BD94847B63C4394AF1B3AAFABA54F4C" 7 | }, 8 | "source": "# Analyse Blockchain with GraphX" 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "id": "1C6EE6059BED491E873A4D067054036E" 14 | }, 15 | "source": "_Trying identify interesting addresses in the blockchain transaction graph_" 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "9D487F6FAEDE4C6B8976FAABA268FCAD" 21 | }, 22 | "source": "## Basic setup\n\nHere we will create spark session that is necessary for further dataframe processing.\n" 23 | }, 24 | { 25 | "cell_type": "code", 26 | "metadata": { 27 | "collapsed": false, 28 | "id": "2C7CCB3C43D2425BAD7017C0F0780AF7", 29 | "input_collapsed": false, 30 | "trusted": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "val spark = SparkSession.builder\n", 35 | " .master(\"local[4]\")\n", 36 | " .getOrCreate()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "6BAD062F226548978710E3B28A975836" 43 | }, 44 | "source": "## Check the data on disk\nGraph data is stored on the dist as two Parquet files. One with vertices and the second one with the edges." 45 | }, 46 | { 47 | "cell_type": "code", 48 | "metadata": { 49 | "collapsed": false, 50 | "id": "DABBD04745134831822739BFF782EF9F", 51 | "input_collapsed": false, 52 | "trusted": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | ":sh du -h /tmp/nodes.parquet" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "metadata": { 62 | "collapsed": false, 63 | "id": "0605F16CF90642DE8A71D7A676CE5CA5", 64 | "input_collapsed": false, 65 | "trusted": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | ":sh du -h /tmp/edges.parquet" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "id": "2DA4D8DC9A01453ABF0832C1A461AC71" 76 | }, 77 | "source": "## Load the data" 78 | }, 79 | { 80 | "cell_type": "code", 81 | "metadata": { 82 | "collapsed": false, 83 | "id": "58F5BF50D84345C193F523A533DEC6E9", 84 | "input_collapsed": false, 85 | "trusted": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "val rawNodes = spark.read.load(\"/tmp/nodes.parquet\")\n", 90 | "rawNodes.show(5, false)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "id": "5C9D921E0A8744A18015E46326C4EB34" 97 | }, 98 | "source": "#### Number of vertices" 99 | }, 100 | { 101 | "cell_type": "code", 102 | "metadata": { 103 | "collapsed": false, 104 | "id": "14CFF48EA8F94EC48BA1E896BA7B0DD3", 105 | "input_collapsed": false, 106 | "trusted": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "rawNodes.count" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "id": "4C715D1FA97341C98A3A7A3E87747855" 117 | }, 118 | "source": "### Clean the data" 119 | }, 120 | { 121 | "cell_type": "code", 122 | "metadata": { 123 | "collapsed": false, 124 | "id": "71BC35B9AF0244C38005D16168B622E3", 125 | "input_collapsed": false, 126 | "trusted": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "import org.apache.spark.sql.functions.regexp_replace\n", 131 | "\n", 132 | "val nodes = rawNodes.na.drop()\n", 133 | " .withColumnRenamed(\"_1\", \"id\")\n", 134 | " .withColumnRenamed(\"_2\", \"address\")\n", 135 | " .withColumn(\"address\", 
regexp_replace($\"address\", \"bitcoinaddress_\", \"\"))\n", 136 | "nodes.show(5, false)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "collapsed": false, 143 | "id": "E6EE8C40BD2645378DA7868C9B91A1DA", 144 | "input_collapsed": false, 145 | "trusted": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "val edges = spark.read.load(\"/tmp/edges.parquet\")\n", 150 | " .drop($\"attr\")\n", 151 | "edges.show(5)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "id": "5C9D921E0A8744A18015E46326C4EB34" 158 | }, 159 | "source": "#### Number of edges" 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "collapsed": false, 165 | "id": "DB8F7E1290F94FD8A101CC98F8007FE1", 166 | "input_collapsed": false, 167 | "trusted": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "edges.count()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": { 177 | "id": "D07A81FBF86A4D7FAA6C14469730D4C4" 178 | }, 179 | "source": "# Creating the Graph\nGraphX library expects RDDs, so we need to do the conversion from the dataframes here" 180 | }, 181 | { 182 | "cell_type": "code", 183 | "metadata": { 184 | "collapsed": false, 185 | "id": "0060C72E46054322B4C6F8AB8AF21F9E", 186 | "input_collapsed": false, 187 | "trusted": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "// todo: ugly\n", 192 | "import org.apache.spark.graphx._\n", 193 | "val nodesRdd: RDD[(VertexId, String)] = nodes.rdd.map(row => (row(0).asInstanceOf[Long], row(1).asInstanceOf[String]))\n", 194 | "val edgesRdd: RDD[Edge[Option[String]]] = edges.rdd.map(row => Edge(row(0).asInstanceOf[Long], row(1).asInstanceOf[Long]))\n" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "metadata": { 200 | "collapsed": false, 201 | "id": "064059438BE447369AAA0E9268B9DC5F", 202 | "input_collapsed": false, 203 | "trusted": true 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "val graph = Graph(nodesRdd, edgesRdd)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "id": "56D05A8F5C364252B90D311FB6545A70" 214 | }, 215 | "source": "## Calculate the Page Rank\n\nThis may take couple of minutes depending on the size of the data. The implementation of the algorithm is described [here](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.graphx.lib.PageRank$)." 216 | }, 217 | { 218 | "cell_type": "code", 219 | "metadata": { 220 | "collapsed": false, 221 | "id": "F447A11E0445478C86437D6DE93DEA92", 222 | "input_collapsed": false, 223 | "trusted": true 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "val ranks = graph.pageRank(0.001)\n", 228 | " .vertices\n", 229 | " .toDF(\"id\", \"rank\")\n", 230 | "\n", 231 | "ranks.show" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": { 237 | "id": "FC227B64660044C28631DE9832169B6D" 238 | }, 239 | "source": "Now we can sort the vertices by their calculated page ranks." 
240 | }, 241 | { 242 | "cell_type": "code", 243 | "metadata": { 244 | "collapsed": false, 245 | "id": "306648DE2A9742788B669AB3E366CF41", 246 | "input_collapsed": false, 247 | "trusted": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "val sortedRanks = ranks.join(nodes, \"id\")\n", 252 | " .sort(desc(\"rank\"))\n", 253 | "\n", 254 | "sortedRanks.show(5, false)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "metadata": { 260 | "collapsed": false, 261 | "id": "A98EF9E40EE34BB88EF64CD145BB2612", 262 | "input_collapsed": false, 263 | "presentation": { 264 | "pivot_chart_state": "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}", 265 | "tabs_state": "{\n \"tab_id\": \"#tab1059669048-0\"\n}" 266 | }, 267 | "trusted": true 268 | }, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": "top10: Array[String] = Array(C825A1ECF2A6830C4401620C3A16F1995057C2AB, DE21D51F82F065DF011CFB3CDCE09C6F71FC716B, D63066643AFA128CE4BEBB2523242ADF5F07A0A9, AA3750AA18B8A0F3F0590731E1FAB934856680CF, 4FA170CFDE2372AC91D479F989DC4DB5AA8D47E0, 9A4E5250E56CA29765635022FB11624116B226BE, 200413B74F3B34198333778C79AF1728AC9A912A, 7773B5B0576CCC2FC79E94098B7D879CCE8BB377, 7C154ED1DC59609E3D26ABB2DF2EA3D587CD8C41, 9B71CA50A249F283DCE5848A6259EFDD2E47FA4B)\nres299: Array[String] = Array(C825A1ECF2A6830C4401620C3A16F1995057C2AB, DE21D51F82F065DF011CFB3CDCE09C6F71FC716B, D63066643AFA128CE4BEBB2523242ADF5F07A0A9, AA3750AA18B8A0F3F0590731E1FAB934856680CF, 4FA170CFDE2372AC91D479F989DC4DB5AA8D47E0, 9A4E5250E56CA29765635022FB11624116B226BE, 200413B74F3B34198333778C79AF1728AC9A912A, 7773B5B0576CCC2FC79E9409..." 274 | }, 275 | { 276 | "data": { 277 | "text/html": "
[saved HTML cell output elided: spark-notebook tabbed table widget rendering of the top10 result; extraction stripped the markup and left only fragments such as 'entries total']
" 278 | }, 279 | "execution_count": 182, 280 | "metadata": {}, 281 | "output_type": "execute_result", 282 | "time": "Took: 4.559s, at 2017-09-30 14:45" 283 | } 284 | ], 285 | "source": [ 286 | "val top10 = sortedRanks.take(10).map(_(2).toString)\n", 287 | "\n", 288 | "top10" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": { 294 | "id": "B4A4BDBC889444F39BEF8D9A2DB1A7F8" 295 | }, 296 | "source": "### Helper functions\n\nBitcoin address is essentially a hash or fingerprint of the public key. In the blockchain for the addresses Bitcoin uses internally `hash160` with zero redundancy. However, humans tend to make mistakes and in order to mittigate the risk of sending money to wrong address by making a typo in the address, there is also address that uses a checksum. It's possible to convert between the two forms of the address.\n\nWe will be using `blockchain.info` API for fetching some useful information about the top ten addresses in our Page Rank calculation. To do that we need to define couple of helper functions." 297 | }, 298 | { 299 | "cell_type": "code", 300 | "metadata": { 301 | "collapsed": false, 302 | "id": "F158F501F8544DBD8453FF6C1738157D", 303 | "input_collapsed": false, 304 | "presentation": { 305 | "pivot_chart_state": "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}", 306 | "tabs_state": "{\n \"tab_id\": \"#tab744741656-0\"\n}" 307 | }, 308 | "trusted": true 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "import scala.io.Source.fromURL\n", 313 | "\n", 314 | "def makeFunc(path: String)(param: String) = \n", 315 | " fromURL(s\"https://blockchain.info/q/$path/$param\").mkString\n", 316 | "\n", 317 | "def hashToAddress = makeFunc(\"hashtoaddress\") _\n", 318 | "def balance = makeFunc(\"addressbalance\") _\n", 319 | "def totalReceived = makeFunc(\"getreceivedbyaddress\") _\n", 320 | "def totalSent = makeFunc(\"getsentbyaddress\") _\n", 321 | "def firstSeen = makeFunc(\"addressfirstseen\") _\n", 322 | "val rawJson = (addr: String) => fromURL(s\"https://blockchain.info/rawaddr/$addr?limit=0\").mkString\n", 323 | "\n", 324 | "val parseJson = (jsonStr: String) => {\n", 325 | " val result = scala.util.parsing.json.JSON.parseFull(jsonStr)\n", 326 | " result match {\n", 327 | " case Some(hash: Map[String, Any]) => List(\"address\", \"total_received\", \"total_sent\", \"final_balance\", \"n_tx\")\n", 328 | " .map(x => hash(x))\n", 329 | " case _ => Nil\n", 330 | " }\n", 331 | "}\n", 332 | "\n", 333 | "val getInfo = rawJson.andThen(parseJson)\n", 334 | "val satoshi2BTC = (input: Double) => input / 1.0E8\n", 335 | "\n", 336 | "// https://blockchain.info/ticker\n", 337 | "val btcInUsd = 4279.92\n", 338 | "val BTC2USD = (input: Double) => input * btcInUsd\n", 339 | "val toUSD = satoshi2BTC.andThen(BTC2USD)\n", 340 | "val formatter = java.text.NumberFormat.getCurrencyInstance\n", 341 | "val toReadable = satoshi2BTC.andThen(BTC2USD).andThen(formatter.format(_))" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": { 347 | "id": "2593C099C1694CE8B6C98F072E0B0F63" 348 | }, 349 | "source": "Now, let's apply the `getInfo` function to our top 10 addresses." 
350 | }, 351 | { 352 | "cell_type": "code", 353 | "metadata": { 354 | "collapsed": false, 355 | "id": "2967D4FE1DA14F0996DE30AAE7871CB4", 356 | "input_collapsed": false, 357 | "presentation": { 358 | "pivot_chart_state": "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}", 359 | "tabs_state": "{\n \"tab_id\": \"#tab360202904-0\"\n}" 360 | }, 361 | "trusted": true 362 | }, 363 | "outputs": [], 364 | "source": [ 365 | "val top10detailed = top10.map(getInfo)\n", 366 | "top10detailed" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "id": "6251CD095E48489E88ABE9F331CBDF67" 373 | }, 374 | "source": "And present the results in an HTML table." 375 | }, 376 | { 377 | "cell_type": "code", 378 | "metadata": { 379 | "collapsed": false, 380 | "id": "12C1B66003C147128E2BC40B169826E0", 381 | "input_collapsed": false, 382 | "presentation": { 383 | "pivot_chart_state": "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}", 384 | "tabs_state": "{\n \"tab_id\": \"#tab434507380-0\"\n}" 385 | }, 386 | "trusted": true 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "\n", 391 | " \n", 392 | " \n", 393 | "{\n", 394 | "top10detailed.map(record => {\n", 395 | " val address = record(0)\n", 396 | " val totalRcv = toReadable(record(1).toString.toDouble)\n", 397 | " val totalSnt = toReadable(record(2).toString.toDouble)\n", 398 | " val balance = toReadable(record(3).toString.toDouble)\n", 399 | " val txNumber = record(4)\n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | "})\n", 407 | "}\n", 408 | "
<tr><th>Address</th><th>Received Ttl</th><th>Sent Ttl</th><th>Balance</th><th>Transactions</th></tr>
<tr><td>{address}</td><td>{totalRcv}</td><td>{totalSnt}</td><td>{balance}</td><td>{txNumber}</td></tr>
" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": { 414 | "id": "D57F8A582AE04F9D91EA083A30FD4838" 415 | }, 416 | "source": "We can also display the detailed information about any given Bitcoin address." 417 | }, 418 | { 419 | "cell_type": "code", 420 | "metadata": { 421 | "collapsed": false, 422 | "id": "9DB5325B8EA5496088E5C0D3D37A16CD", 423 | "input_collapsed": false, 424 | "trusted": true 425 | }, 426 | "outputs": [ 427 | { 428 | "name": "stdout", 429 | "output_type": "stream", 430 | "text": "displayAddress: (address: String)scala.xml.Elem\nres318: scala.xml.Elem = \n" 431 | }, 432 | { 433 | "data": { 434 | "text/html": "" 435 | }, 436 | "execution_count": 193, 437 | "metadata": {}, 438 | "output_type": "execute_result", 439 | "time": "Took: 2.202s, at 2017-09-30 14:58" 440 | } 441 | ], 442 | "source": [ 443 | "def displayAddress(address: String) = \n", 446 | "\n", 447 | "displayAddress(\"1MFXYK1XucKFfhPhW9HDHD3vsM9BKey4qm\")" 448 | ] 449 | } 450 | ], 451 | "metadata": { 452 | "auto_save_timestamp": "1970-01-01T01:00:00.000Z", 453 | "customArgs": null, 454 | "customDeps": null, 455 | "customImports": null, 456 | "customLocalRepo": null, 457 | "customRepos": null, 458 | "customSparkConf": null, 459 | "customVars": null, 460 | "id": "580d9f21-537b-4f36-ab14-755f340c0632", 461 | "language_info": { 462 | "codemirror_mode": "text/x-scala", 463 | "file_extension": "scala", 464 | "name": "scala" 465 | }, 466 | "name": "Blockchain", 467 | "sparkNotebook": null, 468 | "trusted": true, 469 | "user_save_timestamp": "1970-01-01T01:00:00.000Z" 470 | }, 471 | "nbformat": 4 472 | } 473 | -------------------------------------------------------------------------------- /graphx-notebook/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | myuid=$(id -u) 4 | mygid=$(id -g) 5 | uidentry=$(getent passwd $myuid) 6 | 7 | if [ -z "$uidentry" ] ; then 8 | # assumes /etc/passwd has root-group (gid 0) ownership 9 | echo "$myuid:x:$myuid:$mygid:anonymous uid:/tmp:/bin/false" >> /etc/passwd 10 | fi 11 | 12 | /bin/bash "$@" 13 | -------------------------------------------------------------------------------- /notebook/.gitignore: -------------------------------------------------------------------------------- 1 | nodes/ 2 | edges/ 3 | nodes1/ 4 | edges1/ 5 | nodes2/ 6 | edges2/ 7 | nodes3/ 8 | edges3/ 9 | -------------------------------------------------------------------------------- /notebook/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jkremser/base-notebook 2 | 3 | USER root 4 | RUN mkdir /data 5 | 6 | ENV NB_USER=nbuser \ 7 | NB_UID=1011 \ 8 | PYSPARK_DRIVER_PYTHON="jupyter" \ 9 | PYSPARK_DRIVER_PYTHON_OPTS="notebook" \ 10 | SPARK_MASTER=""\ 11 | PYSPARK_SUBMIT_ARGS="--packages graphframes:graphframes:0.5.0-spark2.1-s_2.11 --driver-memory 4G" 12 | 13 | 14 | EXPOSE 8888 15 | 16 | USER $NB_UID 17 | RUN mkdir /tmp/data/ 18 | 19 | ADD addresses /tmp/addresses.parquet 20 | ADD blocks /tmp/blocks.parquet 21 | ADD transactions /tmp/transactions.parquet 22 | ADD edges /tmp/edges.parquet 23 | # ADD nodes3 /tmp/nodes3.parquet 24 | # ADD edges3 /tmp/edges3.parquet 25 | # ADD graphframes.ipynb /notebooks/graphframes.ipynb 26 | ADD blockchain.ipynb /notebooks/blockchain.ipynb 27 | ADD custom /home/$NB_USER/.jupyter/custom 28 | ADD js /notebooks/js 29 | 30 | USER root 31 | 32 | RUN curl -L -O --progress-bar 
http://dl.bintray.com/spark-packages/maven/graphframes/graphframes/0.5.0-spark2.1-s_2.11/graphframes-0.5.0-spark2.1-s_2.11.jar \ 33 | && curl -L -O --progress-bar http://central.maven.org/maven2/com/typesafe/scala-logging/scala-logging-slf4j_2.11/2.1.2/scala-logging-slf4j_2.11-2.1.2.jar \ 34 | && curl -L -O --progress-bar http://central.maven.org/maven2/com/typesafe/scala-logging/scala-logging-api_2.11/2.1.2/scala-logging-api_2.11-2.1.2.jar \ 35 | && echo "56e120e4839005098ec0dfa5b69677c124788048 `ls graphframes-*`" | sha1sum -c - \ 36 | && echo "7628485be79eacac9e7f19bf049f581394d45ea3 `ls scala-logging-slf4j_*`" | sha1sum -c - \ 37 | && echo "fed5521041c26b8c1daa2d5e5dc5f01b9bde4fe6 `ls scala-logging-api_*`" | sha1sum -c - \ 38 | && su $NB_USER -c "echo \"c.NotebookApp.token = ''\" >> /home/$NB_USER/.jupyter/jupyter_notebook_config.py" \ 39 | #&& conda install -y --quiet -c conda-forge jupyter_contrib_nbextensions \ 40 | && pip install https://github.com/ipython-contrib/jupyter_contrib_nbextensions/tarball/master \ 41 | && pip install networkx \ 42 | && jupyter contrib nbextension install --user \ 43 | && jupyter nbextension enable toc2/main \ 44 | && jupyter nbextension enable python-markdown/main \ 45 | && jupyter nbextension enable init_cell/main \ 46 | && jupyter trust /notebooks/blockchain.ipynb \ 47 | # && jupyter trust /notebooks/graphframes.ipynb \ 48 | && chown 500:500 *.jar \ 49 | && chmod 666 *.jar \ 50 | && mv *.jar /opt/spark/jars/ \ 51 | && sed -i'' -e 's/^\(exec jupyter notebook\)$/#\1/' /start.sh \ 52 | && echo 'ADDITIONAL_ARGS=`[[ ! -z "$SPARK_MASTER" ]] && echo "--master $SPARK_MASTER" || echo ""`' >> /start.sh \ 53 | && echo "pyspark \$ADDITIONAL_ARGS \$PYSPARK_SUBMIT_ARGS" >> /start.sh \ 54 | && chown -R $NB_USER:root /home/$NB_USER /data \ 55 | && find /home/$NB_USER -type d -exec chmod g+rwx,o+rx {} \; \ 56 | && find /home/$NB_USER -type f -exec chmod g+rw {} \; \ 57 | && find /data -type d -exec chmod g+rwx,o+rx {} \; \ 58 | && find /data -type f -exec chmod g+rw {} \; \ 59 | && chmod -f g+rw /notebooks/* 60 | # && conda remove --quiet --yes --force qt pyqt \ 61 | # && conda remove --quiet --yes --force --feature mkl ; conda clean -tipsy 62 | 63 | USER $NB_UID 64 | ENV HOME /home/$NB_USER 65 | 66 | LABEL io.k8s.description="PySpark Jupyter Notebook with blockchain analysis." \ 67 | io.k8s.display-name="PySpark Jupyter Notebook with blockchain analysis." \ 68 | io.openshift.expose-services="8888:http" 69 | 70 | CMD ["/entrypoint", "/start.sh"] 71 | -------------------------------------------------------------------------------- /notebook/Makefile: -------------------------------------------------------------------------------- 1 | LOCAL_IMAGE=$(USER)/bitcoin-notebook 2 | GRAPH_DATA ?= $(HOME)/bitcoin/output/ 3 | 4 | 5 | .PHONY: build clean run test 6 | 7 | build: clean 8 | -echo -e "\n--- Copying graph data from: $(GRAPH_DATA) ---\n" 9 | cp -r $(GRAPH_DATA)/edges edges 10 | cp -r $(GRAPH_DATA)/addresses addresses 11 | cp -r $(GRAPH_DATA)/blocks blocks 12 | cp -r $(GRAPH_DATA)/transactions transactions 13 | # cp -r $(GRAPH_DATA)/edges edges3 14 | # cp -r $(GRAPH_DATA)/nodes nodes3 15 | docker build -t $(LOCAL_IMAGE) . 
16 | -rm -rf edges* addresses* blocks* transactions* 17 | 18 | clean: 19 | -docker rmi -f $(LOCAL_IMAGE) 20 | 21 | run: 22 | docker run --rm -ti -p 8888:8888 $(LOCAL_IMAGE) 23 | 24 | open: 25 | -(sleep 2 && xdg-open http://localhost:8888 &> /dev/null)& 26 | 27 | -------------------------------------------------------------------------------- /notebook/Makefile_bak: -------------------------------------------------------------------------------- 1 | LOCAL_IMAGE=$(USER)/bitcoin-notebook 2 | GRAPH_DATA1 ?= $(HOME)/bitcoin/output/ 3 | GRAPH_DATA2 ?= $(HOME)/bitcoin/output/ 4 | #GRAPH_DATA3 ?= $(HOME)/bitcoin/output/ 5 | 6 | 7 | .PHONY: build clean run test 8 | 9 | build: clean 10 | -echo -e "\n--- Copying graph data from: $(GRAPH_DATA1) ---\n" 11 | -echo -e "\n--- Copying graph data from: $(GRAPH_DATA2) ---\n" 12 | # -echo -e "\n--- Copying graph data from: $(GRAPH_DATA3) ---\n" 13 | cp -r $(GRAPH_DATA1)/edges edges1 14 | cp -r $(GRAPH_DATA1)/nodes nodes1 15 | cp -r $(GRAPH_DATA2)/edges edges2 16 | cp -r $(GRAPH_DATA2)/nodes nodes2 17 | # cp -r $(GRAPH_DATA3)/edges edges3 18 | # cp -r $(GRAPH_DATA3)/nodes nodes3 19 | docker build -t $(LOCAL_IMAGE) . 20 | -rm -rf edges* nodes* 21 | 22 | clean: 23 | -docker rmi -f $(LOCAL_IMAGE) 24 | 25 | run: 26 | docker run --rm -ti -p 8888:8888 $(LOCAL_IMAGE) 27 | 28 | open: 29 | -(sleep 2 && xdg-open http://localhost:8888 &> /dev/null)& 30 | 31 | -------------------------------------------------------------------------------- /notebook/README.md: -------------------------------------------------------------------------------- 1 | [![Docker build](https://img.shields.io/docker/automated/jkremser/bitcoin-notebook.svg)](https://hub.docker.com/r/jkremser/bitcoin-notebook) 2 | [![Layers info](https://images.microbadger.com/badges/image/jkremser/bitcoin-notebook.svg)](https://microbadger.com/images/jkremser/bitcoin-notebook) 3 | ## Info 4 | 5 | This is the docker image with a Jupyter notebook using the GraphFrames. 6 | It contains two notebooks: `blockchain.ipynb` and `graphframes.ipynb`. 7 | 8 | ### Building 9 | 10 | ```bash 11 | make build 12 | ``` 13 | This will build a docker image with the notebook and the example data. It assumes 14 | the parquet data generated by the converter. Check the `buildImg.sh` for example. 15 | Some example data can be found in `../parquet-converter/data/example1/output/`, but 16 | I suggest creating own from the `~/.bitcoin/blocks/blk00xyz.dat` files using the 17 | `parquet-converter`. 18 | 19 | ### Running 20 | 21 | ```bash 22 | make open run 23 | ``` 24 | 25 | Then the notebook is listening on http://localhost:8888. 
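The image is built from the graph data under `$(HOME)/bitcoin/output/` by default; to use a different directory produced by the `parquet-converter`, override the `GRAPH_DATA` variable. A minimal sketch (the path below is just a placeholder):

```bash
# build against your own converter output, then start the notebook on port 8888
GRAPH_DATA=/path/to/parquet-converter/output make build
make run
```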
26 | -------------------------------------------------------------------------------- /notebook/buildImg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GRAPH_DATA1=/home/jkremser/bitcoin/output-0051-0054 \ 3 | GRAPH_DATA2=/home/jkremser/bitcoin/output-0401-0404 \ 4 | make build 5 | -------------------------------------------------------------------------------- /notebook/custom/custom.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jkremser/bitcoin-insights/97783cf8d465dc149959fadeb3b667a90e6d5d7e/notebook/custom/custom.css -------------------------------------------------------------------------------- /notebook/custom/custom.js: -------------------------------------------------------------------------------- 1 | var neighbors = function(graph, nodeId) { 2 | var k, 3 | neighbors = {}, 4 | index = graph.allNeighborsIndex.get(nodeId).keyList() || {}; 5 | 6 | for (k in index) 7 | neighbors[k] = graph.nodesIndex.get(index[k]); 8 | 9 | return neighbors; 10 | }; 11 | 12 | // ugly 13 | window.neighbors = neighbors; 14 | 15 | require(['base/js/namespace']) { 16 | // setup 'ctrl-l' as shortcut for clearing current output 17 | Jupyter.keyboard_manager.command_shortcuts 18 | .add_shortcut('ctrl-shift-l', 'jupyter-notebook:clear-cell-output'); 19 | } 20 | -------------------------------------------------------------------------------- /notebook/js/sigma-graph.js: -------------------------------------------------------------------------------- 1 | var g = $graph_data ; 2 | 3 | s = new sigma({graph: g, container: '$container', settings: { defaultNodeColor: '#ec5148', 4 | defaultEdgeColor: '#999', 5 | edgeColor: 'default', 6 | borderSize: 2, 7 | defaultNodeHoverColor: '#555', 8 | drawEdgeLabels: true 9 | } }); 10 | 11 | s.graph.nodes().forEach(function(n) { 12 | n.originalColor = n.color; 13 | }); 14 | s.graph.edges().forEach(function(e) { 15 | e.originalColor = e.color; 16 | }); 17 | 18 | s.bind('clickNode', function(e) { 19 | var nodeId = e.data.node.id, 20 | toKeep = neighbors(s.graph, nodeId); 21 | toKeep[nodeId] = e.data.node; 22 | 23 | s.graph.nodes().forEach(function(n) { 24 | if (toKeep[n.id]) 25 | n.color = n.originalColor; 26 | else 27 | n.color = '#eee'; 28 | }); 29 | 30 | s.graph.edges().forEach(function(e) { 31 | if (toKeep[e.source] && toKeep[e.target]) 32 | e.color = e.originalColor; 33 | else 34 | e.color = '#eee'; 35 | }); 36 | 37 | s.refresh(); 38 | }); 39 | 40 | s.bind('clickStage', function(e) { 41 | s.graph.nodes().forEach(function(n) { 42 | n.color = n.originalColor; 43 | }); 44 | 45 | s.graph.edges().forEach(function(e) { 46 | e.color = e.originalColor; 47 | }); 48 | 49 | s.refresh(); 50 | }); 51 | -------------------------------------------------------------------------------- /parquet-converter/.gitignore: -------------------------------------------------------------------------------- 1 | derby.log 2 | metastore_db/ 3 | hs_err_* 4 | -------------------------------------------------------------------------------- /parquet-converter/README.md: -------------------------------------------------------------------------------- 1 | ### Building 2 | 3 | Make sure your sbt version is at least `0.13.5` 4 | 5 | ```bash 6 | cd 7 | sbt clean assembly 8 | ``` 9 | will create a jar file in `./target/scala-2.11/bitcoin-insights.jar` 10 | 11 | ### Running 12 | 13 | Converter assumes the binary block data in the input directory and writes parquet files to the output 
directory. Paths to input and output directories are passed to the converter. Depending on the size of the task, or in other words, how many blocks you want to convert, tweak the memory parameters of the following example command: 14 | ```bash 15 | ls ~/bitcoin/input/ 16 | # blk00003.dat 17 | 18 | spark-submit \ 19 | --driver-memory 2G \ 20 | --executor-memory 2G \ 21 | --class io.radanalytics.bitcoin.ParquetConverter \ 22 | --master local[8] \ 23 | ./target/scala-2.11/bitcoin-insights.jar ~/bitcoin/input ~/bitcoin/output 24 | 25 | # ... ... 26 | 27 | ls ~/bitcoin/output 28 | # addresses blocks edges transactions 29 | ``` 30 | -------------------------------------------------------------------------------- /parquet-converter/build.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Red Hat, Inc. and/or its affiliates 3 | * and other contributors as indicated by the @author tags. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | import sbt._ 19 | import Keys._ 20 | 21 | 22 | lazy val root = (project in file(".")) 23 | .settings( 24 | name := "bitcoin-insights", 25 | organization := "io.radanalytics", 26 | version := "0.0.1-SNAPSHOT", 27 | scalaVersion := "2.11.11" 28 | ) 29 | 30 | 31 | resolvers += Resolver.mavenLocal 32 | 33 | fork := true 34 | 35 | assemblyJarName in assembly := "bitcoin-insights.jar" 36 | 37 | libraryDependencies += "com.github.zuinnote" % "hadoopcryptoledger-fileformat" % "1.0.4" % "compile" 38 | 39 | libraryDependencies += "org.apache.spark" %% "spark-core" % "2.2.0" % "provided" withSources() withJavadoc() 40 | 41 | libraryDependencies += "org.apache.spark" %% "spark-graphx" % "2.2.0" % "provided" withSources() withJavadoc() 42 | 43 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.2.0" % "provided" withSources() withJavadoc() 44 | 45 | libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.7.0" % "provided" 46 | 47 | libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.1" % "test" 48 | -------------------------------------------------------------------------------- /parquet-converter/data/example1/input/blk00003.dat: -------------------------------------------------------------------------------- 1 | this is a placeholder, because the file itself is large (~130 MB), this file can be found in ~/.bitcoin/blocks/ 2 | -------------------------------------------------------------------------------- /parquet-converter/data/example1/output/edges/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /parquet-converter/data/example1/output/edges/.part-00000-f7851458-cd56-4835-88bf-f3fddf79e24e-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jkremser/bitcoin-insights/97783cf8d465dc149959fadeb3b667a90e6d5d7e/parquet-converter/data/example1/output/edges/.part-00000-f7851458-cd56-4835-88bf-f3fddf79e24e-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /parquet-converter/data/example1/output/edges/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jkremser/bitcoin-insights/97783cf8d465dc149959fadeb3b667a90e6d5d7e/parquet-converter/data/example1/output/edges/_SUCCESS -------------------------------------------------------------------------------- /parquet-converter/data/example1/output/edges/part-00000-f7851458-cd56-4835-88bf-f3fddf79e24e-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jkremser/bitcoin-insights/97783cf8d465dc149959fadeb3b667a90e6d5d7e/parquet-converter/data/example1/output/edges/part-00000-f7851458-cd56-4835-88bf-f3fddf79e24e-c000.snappy.parquet -------------------------------------------------------------------------------- /parquet-converter/data/example1/output/nodes/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /parquet-converter/data/example1/output/nodes/.part-00000-9c206410-a79b-4313-a364-d1ba4d168480-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jkremser/bitcoin-insights/97783cf8d465dc149959fadeb3b667a90e6d5d7e/parquet-converter/data/example1/output/nodes/.part-00000-9c206410-a79b-4313-a364-d1ba4d168480-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /parquet-converter/data/example1/output/nodes/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jkremser/bitcoin-insights/97783cf8d465dc149959fadeb3b667a90e6d5d7e/parquet-converter/data/example1/output/nodes/_SUCCESS -------------------------------------------------------------------------------- /parquet-converter/data/example1/output/nodes/part-00000-9c206410-a79b-4313-a364-d1ba4d168480-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jkremser/bitcoin-insights/97783cf8d465dc149959fadeb3b667a90e6d5d7e/parquet-converter/data/example1/output/nodes/part-00000-9c206410-a79b-4313-a364-d1ba4d168480-c000.snappy.parquet -------------------------------------------------------------------------------- /parquet-converter/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 2 | -------------------------------------------------------------------------------- /parquet-converter/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.16 2 | -------------------------------------------------------------------------------- /parquet-converter/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.0") 2 | 3 | addSbtPlugin("de.johoop" % "jacoco4sbt" % "2.1.6") 4 | 
-------------------------------------------------------------------------------- /parquet-converter/run-ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | INPUT_DIR="$HOME/bitcoin/input" 5 | OUTPUT_DIR="$HOME/bitcoin/output" 6 | 7 | mkdir -p $INPUT_DIR 8 | mkdir -p $HOME/tmp 9 | rm -rf $INPUT_DIR/* 10 | 11 | # following command assumes the default bitcoin client to be installed and present in the home directory 12 | # copy a file that contains serialized blocks 13 | cp $HOME/.bitcoin/blocks/blk00003.dat $INPUT_DIR 14 | 15 | #cp $HOME/.bitcoin/blocks/blk000{0..5}{0..9}.dat $INPUT_DIR 16 | 17 | # first 200 18 | #cp $HOME/.bitcoin/blocks/blk00{0,1}{0..9}{0..9}.dat $INPUT_DIR 19 | #cp $HOME/.bitcoin/blocks/blk00200.dat $INPUT_DIR 20 | 21 | rm -Rf $OUTPUT_DIR 22 | 23 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 24 | pushd $DIR 25 | sbt clean assembly 26 | 27 | spark-submit \ 28 | --driver-memory 10G \ 29 | --executor-memory 1.25G \ 30 | --class io.radanalytics.bitcoin.ParquetConverterDS \ 31 | --master local[4] \ 32 | --conf "spark.executor.extraJavaOptions=-XX:+PrintGCDetails -XX:+PrintGCTimeStamps" \ 33 | --conf "spark.local.dir=$HOME/tmp" \ 34 | ./target/scala-2.11/bitcoin-insights.jar $INPUT_DIR $OUTPUT_DIR $@ 35 | 36 | popd 37 | -------------------------------------------------------------------------------- /parquet-converter/run-new.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | INPUT_DIR="$HOME/bitcoin/input" 5 | OUTPUT_DIR="$HOME/bitcoin/output" 6 | 7 | mkdir -p $INPUT_DIR 8 | mkdir -p $HOME/tmp 9 | rm -rf $INPUT_DIR/* 10 | 11 | # following command assumes the default bitcoin client to be installed and present in the home directory 12 | # copy a file that contains serialized blocks 13 | cp $HOME/.bitcoin/blocks/blk00003.dat $INPUT_DIR 14 | 15 | # first 200 16 | #cp $HOME/.bitcoin/blocks/blk00{0,1}{0..9}{0..9}.dat $INPUT_DIR 17 | #cp $HOME/.bitcoin/blocks/blk00200.dat $INPUT_DIR 18 | 19 | rm -Rf $OUTPUT_DIR 20 | 21 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 22 | pushd $DIR 23 | sbt clean assembly 24 | 25 | spark-submit \ 26 | --driver-memory 9G \ 27 | --executor-memory 1.25G \ 28 | --class io.radanalytics.bitcoin.ParquetConverterNewArch \ 29 | --master local[4] \ 30 | --conf "spark.executor.extraJavaOptions=-XX:+PrintGCDetails -XX:+PrintGCTimeStamps" \ 31 | --conf "spark.local.dir=$HOME/tmp" \ 32 | ./target/scala-2.11/bitcoin-insights.jar $INPUT_DIR $OUTPUT_DIR $@ 33 | 34 | popd 35 | -------------------------------------------------------------------------------- /parquet-converter/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | INPUT_DIR="$HOME/bitcoin/input" 5 | OUTPUT_DIR="$HOME/bitcoin/output" 6 | 7 | mkdir -p $INPUT_DIR 8 | mkdir -p $HOME/tmp 9 | rm -rf $INPUT_DIR/* 10 | 11 | # following command assumes the default bitcoin client to be installed and present in the home directory 12 | # copy a file that contains serialized blocks 13 | cp $HOME/.bitcoin/blocks/blk00003.dat $INPUT_DIR 14 | 15 | # first 200 16 | #cp $HOME/.bitcoin/blocks/blk00{0,1}{0..9}{0..9}.dat $INPUT_DIR 17 | #cp $HOME/.bitcoin/blocks/blk00200.dat $INPUT_DIR 18 | 19 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 20 | pushd $DIR 21 | rm -Rf $OUTPUT_DIR 22 | 23 | sbt clean assembly 24 | 25 | spark-submit \ 26 | --driver-memory 9G \ 27 | --executor-memory 
1.25G \ 28 | --class io.radanalytics.bitcoin.ParquetConverter \ 29 | --master local[4] \ 30 | --conf "spark.executor.extraJavaOptions=-XX:+PrintGCDetails -XX:+PrintGCTimeStamps" \ 31 | --conf "spark.local.dir=$HOME/tmp" \ 32 | ./target/scala-2.11/bitcoin-insights.jar $INPUT_DIR $OUTPUT_DIR $@ 33 | 34 | popd 35 | -------------------------------------------------------------------------------- /parquet-converter/src/main/scala/io/radanalytics/bitcoin/ConverterUtil.scala: -------------------------------------------------------------------------------- 1 | package io.radanalytics.bitcoin 2 | 3 | import java.security.{MessageDigest, NoSuchAlgorithmException} 4 | import java.util 5 | 6 | import org.bouncycastle.crypto.digests.{RIPEMD160Digest, SHA256Digest} 7 | import org.zuinnote.hadoop.bitcoin.format.common.BitcoinUtil 8 | 9 | class ConverterUtil extends Serializable 10 | 11 | object ConverterUtil { 12 | 13 | def sha256(input: Array[Byte]): Array[Byte] = { 14 | val d = new SHA256Digest 15 | d.update(input, 0, input.length) 16 | val result = new Array[Byte](d.getDigestSize) 17 | d.doFinal(result, 0) 18 | result 19 | } 20 | 21 | def ripemd160(input: Array[Byte]): Array[Byte] = { 22 | try { 23 | val sha256 = MessageDigest.getInstance("SHA-256").digest(input) 24 | val digest = new RIPEMD160Digest 25 | digest.update(sha256, 0, sha256.length) 26 | val out = new Array[Byte](20) 27 | digest.doFinal(out, 0) 28 | return out 29 | } catch { 30 | case e: NoSuchAlgorithmException => 31 | throw new RuntimeException(e) // Cannot happen. 32 | } 33 | } 34 | 35 | 36 | // ugly Java code that was copy pasted and altered 37 | def getPaymentDestination(scriptPubKey: Array[Byte]): String = { 38 | if (scriptPubKey == null) return null 39 | // test if anyone can spend output 40 | if (scriptPubKey.length == 0) return "anyone" // need to check also ScriptSig for OP_TRUE 41 | // test if standard transaction to Bitcoin address 42 | val payToHash: String = checkPayToHash(scriptPubKey) 43 | if (payToHash != null) return payToHash 44 | // test if obsolete transaction to public key 45 | val payToPubKey: String = checkPayToPubKey(scriptPubKey) 46 | if (payToPubKey != null) return payToPubKey 47 | // test if puzzle 48 | if ((scriptPubKey.length > 0) && ((scriptPubKey(0) & 0xFF) == 0xAA) && ((scriptPubKey(scriptPubKey.length - 1) & 0xFF) == 0x87)) { 49 | val puzzle: Array[Byte] = util.Arrays.copyOfRange(scriptPubKey, 1, scriptPubKey.length - 2) 50 | return "puzzle_" + BitcoinUtil.convertByteArrayToHexString(puzzle) 51 | } 52 | // test if unspendable 53 | if ((scriptPubKey.length > 0) && ((scriptPubKey(0) & 0xFF) == 0x6a)) return "unspendable" 54 | null 55 | } 56 | 57 | private def checkPayToHash(scriptPubKey: Array[Byte]): String = { // test start 58 | val validLength = scriptPubKey.length == 25 59 | if (!validLength) return null 60 | val validStart = ((scriptPubKey(0) & 0xFF) == 0x76) && ((scriptPubKey(1) & 0xFF) == 0xA9) && ((scriptPubKey(2) & 0xFF) == 0x14) 61 | val validEnd = ((scriptPubKey(23) & 0xFF) == 0x88) && ((scriptPubKey(24) & 0xFF) == 0xAC) 62 | if (validStart && validEnd) { 63 | val bitcoinAddress = util.Arrays.copyOfRange(scriptPubKey, 3, 23) 64 | return BitcoinUtil.convertByteArrayToHexString(bitcoinAddress) 65 | } 66 | null 67 | } 68 | 69 | private def checkPayToPubKey(scriptPubKey: Array[Byte]): String = { 70 | if ((scriptPubKey.length > 0) && ((scriptPubKey(scriptPubKey.length - 1) & 0xFF) == 0xAC)) { 71 | return BitcoinUtil.convertByteArrayToHexString(ripemd160(util.Arrays.copyOfRange(scriptPubKey, 1, 
scriptPubKey.length - 1))) 72 | } 73 | null 74 | } 75 | 76 | 77 | def getNiceAddress(ugly: Array[Byte]): String = { 78 | val niceAddress = Option(ConverterUtil.getPaymentDestination(ugly)) 79 | // val niceAddressStripped = niceAddress.map(_.stripPrefix("bitcoinaddress_")) 80 | niceAddress.getOrElse("unknown") 81 | } 82 | } 83 | 84 | -------------------------------------------------------------------------------- /parquet-converter/src/main/scala/io/radanalytics/bitcoin/Model.scala: -------------------------------------------------------------------------------- 1 | package io.radanalytics.bitcoin 2 | 3 | import org.zuinnote.hadoop.bitcoin.format.common.BitcoinTransaction 4 | 5 | object CustomTypes { 6 | type BlockOrderId = Long 7 | type TransactionOrderId = Long 8 | type VertexOrderId = Long 9 | 10 | type IO_REF = (String, Long) 11 | type TxData = BitcoinTransaction 12 | } 13 | import CustomTypes._ 14 | 15 | 16 | case class TxRef(val hash: String, val ord: Long) extends Serializable { 17 | 18 | } 19 | 20 | case class Input(var value: Long, var address: String, val txOutputRef: IO_REF, val tx: String) extends Serializable { 21 | override def toString: String = { 22 | s" - val=$value ; adr=$address\n txOutputRef=$txOutputRef; tx=$tx" 23 | } 24 | } 25 | 26 | case class Input2(var value: Long, var address: String, val txOutputRef: TxRef, val tx: String) extends Serializable { 27 | override def toString: String = { 28 | s" - val=$value ; adr=$address\n txOutputRef=$txOutputRef; tx=$tx" 29 | } 30 | } 31 | 32 | case class Output2(val value: Long, val address: String, val txRef: TxRef) extends Serializable { 33 | override def toString: String = { 34 | s" - val=$value ; adr=$address\n txRef=$txRef" 35 | } 36 | } 37 | 38 | case class Output(val value: Long, val address: String, val txRef: IO_REF) extends Serializable { 39 | override def toString: String = { 40 | s" - val=$value ; adr=$address\n txRef=$txRef" 41 | } 42 | } 43 | 44 | case class Transaction(val hash: String, val time: Int, val block: String, val inputs: Array[Input], val outputs: Array[Output]) extends Serializable { 45 | override def toString: String = { 46 | s"txId: $hash\ntime: ${new java.util.Date(time * 1000L).toString}\nblock: $block\n\ninputs:\n${inputs.foldLeft("")(_+_)}\noutputs:\n${outputs.foldLeft("")(_+_)}\n----------\n" 47 | } 48 | } 49 | 50 | case class Transaction2(val hash: String, val time: Int, val block: String) extends Serializable { 51 | override def toString: String = { 52 | s"txId: $hash\ntime: ${new java.util.Date(time * 1000L).toString}\nblock: $block\n" 53 | } 54 | } 55 | 56 | case class Block(val hash: String, val prevHash: String, val time: Int) extends Serializable { 57 | override def toString: String = { 58 | s"block id: $hash\ntime: ${new java.util.Date(time * 1000L).toString}\n prev: $prevHash\n" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /parquet-converter/src/main/scala/io/radanalytics/bitcoin/ParquetConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Red Hat, Inc. and/or its affiliates 3 | * and other contributors as indicated by the @author tags. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | /** 19 | * This simple parquet converter was inspired by example projects in https://github.com/ZuInnoTe/hadoopcryptoledger 20 | * 21 | * For the structure of the blocks consult https://webbtc.com/api/schema 22 | */ 23 | package io.radanalytics.bitcoin 24 | 25 | import org.apache.spark.SparkContext 26 | import org.apache.spark.SparkConf 27 | import org.apache.hadoop.conf._ 28 | import org.apache.spark.graphx._ 29 | import org.apache.spark.graphx.Edge 30 | import org.apache.spark.rdd.RDD 31 | import org.apache.hadoop.io._ 32 | import org.apache.spark.sql.SparkSession 33 | import org.zuinnote.hadoop.bitcoin.format.common._ 34 | import org.zuinnote.hadoop.bitcoin.format.mapreduce._ 35 | 36 | object ParquetConverter { 37 | var debug = 0 38 | 39 | type TXHashBytes = Array[Byte] 40 | type TXHashByteArray = ByteArray 41 | type TXIoIndex = Long 42 | type TX_ID = (TXHashByteArray, TXIoIndex) 43 | type Input = TX_ID 44 | type Output = TX_ID 45 | type BtcAddress = String 46 | type TxData = (BtcAddress, TXHashBytes, TXIoIndex, TXHashBytes, TXIoIndex) 47 | type VertexId = Long 48 | 49 | def main(args: Array[String]): Unit = { 50 | val conf = new SparkConf().setAppName("Bitcoin insights - ParquetConverter)") 51 | val sc = new SparkContext(conf) 52 | val hadoopConf = new Configuration() 53 | if (args.size == 3) debug = args(2).toInt 54 | convert(sc, hadoopConf, args(0), args(1)) 55 | sc.stop() 56 | } 57 | 58 | def convert(sc: SparkContext, hadoopConf: Configuration, inputFile: String, outputDir: String) = { 59 | if (debug >= 1) println("\n\n\n\n\n ********************invoking convert()**************** \n\n\n\n\n\n\n\n") 60 | val bitcoinBlocksRDD = sc.newAPIHadoopFile(inputFile, classOf[BitcoinBlockFileInputFormat], classOf[BytesWritable], classOf[BitcoinBlock], hadoopConf) 61 | 62 | // extract a tuple per transaction containing Bitcoin destination address, the input transaction hash, 63 | // the input transaction output index, and the current transaction hash, the current transaction output index, a (generated) long identifier 64 | val bitcoinTransactionTuples: RDD[TxData] = bitcoinBlocksRDD.flatMap(hadoopKeyValueTuple => extractTransactionData(hadoopKeyValueTuple._2)) 65 | // bitcoinTransactionTuples.cache() 66 | 67 | // // RDD[(BytesWritable, BitcoinBlock)] 68 | // val foo = bitcoinBlocksRDD.collect() 69 | // println("\n\n\n\n\n" + foo.size + "\n\n\n\n") 70 | // foo.foreach(hadoopKeyValueTuple => extractTransactionData(hadoopKeyValueTuple._2)) 71 | // println(foo) 72 | 73 | // create the vertex (Bitcoin destination address, vertexId), keep in mind that the flat table contains the same bitcoin address several times 74 | val bitcoinAddressIndexed: RDD[(BtcAddress, VertexId)] = bitcoinTransactionTuples.map(bitcoinTransactions => bitcoinTransactions._1).distinct().zipWithIndex() 75 | // bitcoinAddressIndexed.cache() 76 | 77 | // create the edges (bitcoinAddress,(byteArrayTransaction, TransactionIndex) 78 | val inputTransactionTuple: RDD[(BtcAddress, Input)] = bitcoinTransactionTuples.map(bitcoinTransactions => 79 | (bitcoinTransactions._1, (new 
ByteArray(bitcoinTransactions._2), bitcoinTransactions._3))) 80 | 81 | // (bitcoinAddress,((byteArrayTransaction, TransactionIndex),vertexId)) 82 | val inputTransactionTupleWithIndex: RDD[(BtcAddress, (Input, VertexId))] = inputTransactionTuple.join(bitcoinAddressIndexed) 83 | 84 | // (byteArrayTransaction, TransactionIndex), (vertexId, bitcoinAddress) 85 | val inputTransactionTupleByHashIdx: RDD[(Input, VertexId)] = inputTransactionTupleWithIndex.map(iTTuple => (iTTuple._2._1, iTTuple._2._2)) 86 | 87 | val currentTransactionTuple: RDD[(BtcAddress, Output)] = bitcoinTransactionTuples.map(bitcoinTransactions => 88 | (bitcoinTransactions._1, (new ByteArray(bitcoinTransactions._4), bitcoinTransactions._5))) 89 | val currentTransactionTupleWithIndex: RDD[(BtcAddress, (Output, VertexId))] = currentTransactionTuple.join(bitcoinAddressIndexed) 90 | 91 | // (byteArrayTransaction, TransactionIndex), (vertexId, bitcoinAddress) 92 | val currentTransactionTupleByHashIdx: RDD[(Output, VertexId)] = currentTransactionTupleWithIndex.map { cTTuple => (cTTuple._2._1, cTTuple._2._2) } 93 | 94 | // the join creates ((ByteArray, Idx), (srcIdx,srcAddress), (destIdx,destAddress) 95 | val joinedTransactions: RDD[(TX_ID, (VertexId, VertexId))] = inputTransactionTupleByHashIdx.join(currentTransactionTupleByHashIdx) 96 | 97 | // create vertices => vertexId,bitcoinAddress 98 | val bitcoinTransactionVertices: RDD[(VertexId, BtcAddress)] = bitcoinAddressIndexed.map { case (k, v) => (v, k) } 99 | 100 | // create edges 101 | val bitcoinTransactionEdges: RDD[Edge[Int]] = joinedTransactions.map(joinTuple => Edge(joinTuple._2._1, joinTuple._2._2)) 102 | 103 | if (debug >= 1) println("\n\n\n\n\n ********************saving parquet files**************** \n\n\n\n\n\n\n\n") 104 | 105 | // create two parquet files, one with nodes and second with edges 106 | val spark = SparkSession 107 | .builder() 108 | .getOrCreate() 109 | import spark.implicits._ 110 | bitcoinTransactionVertices.toDF().write.save(s"$outputDir/nodes") 111 | bitcoinTransactionEdges.toDF().write.save(s"$outputDir/edges") 112 | } 113 | 114 | // extract relevant data 115 | def extractTransactionData(bitcoinBlock: BitcoinBlock): Array[TxData] = { 116 | if (debug >= 1) println("\n\n\n\n\n ********************invoking extractTransactionData()**************** \n\n\n\n\n\n\n\n") 117 | 118 | // first we need to determine the size of the result set by calculating the total number of inputs 119 | // multiplied by the outputs of each transaction in the block 120 | val transactionCount = bitcoinBlock.getTransactions().size() 121 | var resultSize = 0 122 | for (i <- 0 to transactionCount - 1) { 123 | resultSize += bitcoinBlock.getTransactions().get(i).getListOfInputs().size() * bitcoinBlock.getTransactions().get(i).getListOfOutputs().size() 124 | } 125 | 126 | // then we can create a tuple for each transaction input: Destination Address (which can be found in the output!), Input Transaction Hash, Current Transaction Hash, Current Transaction Output 127 | // as you can see there is no 1:1 or 1:n mapping from input to output in the Bitcoin blockchain, but n:m (all inputs are assigned to all outputs), cf. 
https://en.bitcoin.it/wiki/From_address 128 | val result: Array[TxData] = new Array[TxData](resultSize) 129 | var resultCounter: Int = 0 130 | for (i <- 0 to transactionCount - 1) { // for each transaction 131 | val currentTransaction = bitcoinBlock.getTransactions().get(i) 132 | val currentTransactionHash = BitcoinUtil.getTransactionHash(currentTransaction) 133 | for (j <- 0 to currentTransaction.getListOfInputs().size() - 1) { // for each input 134 | val currentTransactionInput = currentTransaction.getListOfInputs().get(j) 135 | val currentTransactionInputHash = currentTransactionInput.getPrevTransactionHash() 136 | val currentTransactionInputOutputIndex = currentTransactionInput.getPreviousTxOutIndex() 137 | for (k <- 0 to currentTransaction.getListOfOutputs().size() - 1) { 138 | val currentTransactionOutput = currentTransaction.getListOfOutputs().get(k) 139 | val currentTransactionOutputIndex = k.toLong 140 | 141 | val btcAddress = BitcoinScriptPatternParser.getPaymentDestination(currentTransactionOutput.getTxOutScript()).stripPrefix("bitcoinaddress_") 142 | 143 | // example of multi-input multi-output tx: https://blockchain.info/tx/7c666411f52a2515f0593fc7ccd6e50a6b24150eb73df4b37fc8c1e174f5da15 144 | // example of 'normal' tx: https://blockchain.info/tx/fb308839d7410a9b5ee9f4a7a36ab38908219d14141b607965640edf727445d1 145 | // i.e. 1 sender, 1 receiver and sending back the rest 146 | if (debug >= 2) { 147 | println("\n\n\n\n\n ********************TX**************** \n\n\n\n\n\n\n\n") 148 | println("currentTransactionInputHash = " + currentTransactionInputHash) 149 | println("currentTransactionHash = " + currentTransactionHash) 150 | println("paymentDestination = " + btcAddress) 151 | println("currentTransactionOutputIndex = " + currentTransactionOutputIndex) 152 | println("value = " + currentTransactionOutput.getValue()) 153 | } 154 | 155 | result(resultCounter) = (btcAddress, currentTransactionInputHash, currentTransactionInputOutputIndex, currentTransactionHash, currentTransactionOutputIndex) 156 | resultCounter += 1 157 | } 158 | } 159 | 160 | } 161 | result 162 | } 163 | 164 | 165 | } 166 | 167 | 168 | /** 169 | * Helper class to make byte arrays comparable 170 | * 171 | */ 172 | class ByteArray(val bArray: Array[Byte]) extends Serializable { 173 | override val hashCode = bArray.deep.hashCode 174 | 175 | override def equals(obj: Any) = obj.isInstanceOf[ByteArray] && obj.asInstanceOf[ByteArray].bArray.deep == this.bArray.deep 176 | } 177 | 178 | -------------------------------------------------------------------------------- /parquet-converter/src/main/scala/io/radanalytics/bitcoin/ParquetConverterDS.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Red Hat, Inc. and/or its affiliates 3 | * and other contributors as indicated by the @author tags. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | /** 19 | * This simple parquet converter was inspired by example projects in https://github.com/ZuInnoTe/hadoopcryptoledger 20 | * 21 | * For the structure of the blocks consult https://webbtc.com/api/schema 22 | */ 23 | package io.radanalytics.bitcoin 24 | 25 | import org.apache.hadoop.conf._ 26 | import org.apache.hadoop.io._ 27 | import org.apache.spark.{SparkConf, SparkContext} 28 | import org.apache.spark.rdd.RDD 29 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 30 | import org.zuinnote.hadoop.bitcoin.format.common._ 31 | import org.zuinnote.hadoop.bitcoin.format.mapreduce.BitcoinBlockFileInputFormat 32 | 33 | import scala.collection.JavaConverters 34 | import scala.collection.mutable.MutableList 35 | 36 | object ParquetConverterDS { 37 | var debug = 0 38 | 39 | import CustomTypes._ 40 | 41 | def main(args: Array[String]): Unit = { 42 | val conf = new SparkConf().setAppName("Bitcoin insights - ParquetConverter)") 43 | val sc = new SparkContext(conf) 44 | val hadoopConf = new Configuration() 45 | if (args.size == 3) debug = args(2).toInt 46 | convert(sc, hadoopConf, args(0), args(1)) 47 | sc.stop 48 | } 49 | 50 | def convert(sc: SparkContext, hadoopConf: Configuration, inputFile: String, outputDir: String) = { 51 | // TODO: !!! currently all the entities have ids from the same range, so there are collisions 52 | 53 | val spark = SparkSession 54 | .builder() 55 | .getOrCreate() 56 | import spark.implicits._ 57 | 58 | 59 | if (debug >= 1) println("\n\n\n\n\n ********************invoking convert()**************** \n\n\n\n\n\n\n\n") 60 | val bitcoinBlocksRDD = sc.newAPIHadoopFile(inputFile, classOf[BitcoinBlockFileInputFormat], classOf[BytesWritable], classOf[BitcoinBlock], hadoopConf) 61 | 62 | val blockchainDataRDD: RDD[(Block, Array[Transaction2], Array[Input2], Array[Output2])] = bitcoinBlocksRDD.map(hadoopKeyValueTuple => extractData(hadoopKeyValueTuple._2)) 63 | val allInputs: RDD[Input2] = blockchainDataRDD.flatMap(t => t._3) 64 | val allInputsDS: Dataset[Input2] = spark.createDataset(allInputs) 65 | val allOutputs: RDD[Output2] = blockchainDataRDD.flatMap(t => t._4) 66 | val allOutputsDS: Dataset[Output2] = spark.createDataset(allOutputs) 67 | 68 | if (debug >= 2) { 69 | println("\n\n\n\n\n allOutputs: RDD[Output] = \n\n") 70 | allOutputs.sample(true, 0.001).take(5).foreach(println) 71 | } 72 | 73 | // note: distinct causes shuffle 74 | val addressesCached: RDD[String] = allOutputs.map(_.address).distinct.cache 75 | val addresses: RDD[(String, VertexOrderId)] = addressesCached.zipWithIndex() 76 | val addressesDS: Dataset[(String, VertexOrderId)] = spark.createDataset(addresses).cache 77 | if (debug >= 2) { 78 | val strange = addresses.filter(a => a._1.contains("_")) 79 | println("\n\n\n\n\n\nNON-STANDARD #: " + strange.count()) 80 | println("\n\n\n\n\n\nNON-STANDARD (bitcoinpubkey_)#: " + strange.filter(s => s._1.contains("bitcoinpubkey")).count()) 81 | println("\n\n\n\n\n\nNON-STANDARD :" + strange.sample(true, 0.1).take(5).foreach(println)) 82 | // other possible prefixes: "puzzle_" "anyone" "unspendable" 83 | } 84 | 85 | if (debug >= 1) { 86 | println("\n\n\n\n\n addresses: RDD[String] = \n\n") 87 | addresses.sample(true, 0.001).take(5).foreach(println) 88 | } 89 | 90 | val allBlocksCached: RDD[Block] = blockchainDataRDD.map(t => t._1).cache 91 | val allBlocks: RDD[(Block, BlockOrderId)] = allBlocksCached.zipWithIndex 92 | val allBlocksDS: Dataset[(Block, BlockOrderId)] = spark.createDataset(allBlocks) 93 | 94 | if (debug >= 2) { 95 | 
println("\n\n\n\n\n allBlocksDS = \n\n") 96 | allBlocksDS.sample(true, 0.001).take(5).foreach(println) 97 | allBlocksDS.show 98 | } 99 | 100 | val allTransactionsCached: RDD[Transaction2] = blockchainDataRDD.flatMap(t => t._2).cache 101 | val allTransactions: RDD[(Transaction2, TransactionOrderId)] = allTransactionsCached.zipWithIndex() 102 | val allTransactionsDS: Dataset[(Transaction2, TransactionOrderId)] = spark.createDataset(allTransactions).cache 103 | 104 | 105 | val addressesDF: DataFrame = addressesDS.map(a => (a._2, a._1)).toDF("id", "address") 106 | addressesDF.write.mode("overwrite").format("parquet").option("compression", "gzip").mode("overwrite").save(s"$outputDir/addresses") 107 | 108 | val allBlocksDF = allBlocksDS.map(b => (b._2, b._1.hash, b._1.time)).toDF("id", "hash", "time") 109 | allBlocksDF.write.mode("overwrite").format("parquet").option("compression", "gzip").mode("overwrite").save(s"$outputDir/blocks") 110 | 111 | val allTransactionsDF = allTransactionsDS.map(t => (t._2, t._1.hash)).toDF("id", "hash") 112 | allTransactionsDF.write.mode("overwrite").format("parquet").option("compression", "gzip").mode("overwrite").save(s"$outputDir/transactions") 113 | 114 | // |address|txRef|ord|value| : long, long, long, long 115 | val allOutputsAux: DataFrame = allOutputsDS.join(addressesDS, allOutputsDS("address") === addressesDS("_1")) 116 | .withColumnRenamed("_2", "aux").drop("_1") 117 | .join(allTransactionsDS, $"txRef.hash" === $"_1.hash") 118 | .select($"aux".as("address"), $"_2".as("txRef"), $"txRef.ord", $"value") 119 | .cache 120 | 121 | // |address|txOutputRef|ord| tx| : long, long, long, long 122 | val allInputsAux: DataFrame = allInputsDS.join(allTransactionsDS, $"txOutputRef.hash" === $"_1.hash") 123 | .select($"_2".as("txOutputRef"), $"txOutputRef.ord", $"tx") 124 | .join(allTransactionsDS, $"tx" === $"_1.hash") 125 | .drop("_1", "tx") 126 | .withColumnRenamed("_2", "tx") 127 | 128 | if (debug >= 2) { 129 | println("\n\n\n\n\n allInputsDS = \n\n") 130 | allInputsDS.sample(true, 0.001).take(5).foreach(println) 131 | allInputsDS.show 132 | println("step0:\n\n") 133 | println("allInputsDS:\n\n") 134 | allInputsDS.show 135 | println("step1:\n\n") 136 | allInputsDS.join(allTransactionsDS, $"txOutputRef.hash" === $"_1.hash") 137 | .show 138 | println("step1.5:\n\n") 139 | allInputsDS.join(allTransactionsDS, $"txOutputRef.hash" === $"_1.hash") 140 | .select($"_2".as("txOutputRef"), $"txOutputRef.ord", $"tx") 141 | .show 142 | println("step2:\n\n") 143 | allInputsDS.join(allTransactionsDS, $"txOutputRef.hash" === $"_1.hash") 144 | .select($"_2".as("txOutputRef"), $"txOutputRef.ord", $"tx") 145 | .join(allTransactionsDS, $"tx" === $"_1.hash") 146 | .show 147 | } 148 | 149 | if (debug >= 2) { 150 | println("\n\n\n\n\n allInputsAux = \n\n") 151 | allInputsAux.sample(true, 0.001).take(5).foreach(println) 152 | allInputsAux.show 153 | } 154 | 155 | 156 | import org.apache.spark.sql.functions._ 157 | // block -> TX 158 | val blockTxEdges = allTransactionsDS.join(allBlocksDS, allTransactionsDS("_1.block") === allBlocksDS("_1.hash")) 159 | .drop("_1") 160 | .withColumn("foo" , lit(0L)) 161 | printCount("blockTxEdges", blockTxEdges) 162 | 163 | // block -> previous block 164 | val blockBlockEdges = allBlocksDS.alias("ds1").join(allBlocksDS.alias("ds2"), $"ds1._1.hash" === $"ds2._1.prevHash") 165 | .drop("_1").withColumn("foo" , lit(0L)) 166 | printCount("blockBlockEdges", blockBlockEdges) 167 | 168 | // TX -> address (outputs) 169 | val txAddressEdges = 
allOutputsAux.select($"txRef", $"address", $"value") 170 | printCount("txAddressEdges", txAddressEdges) 171 | 172 | // clear caches that are not needed anymore 173 | // allBlocksCached.unpersist(true) 174 | // addressesCached.unpersist(true) 175 | // addressesDS.unpersist(true) 176 | 177 | // address => TX (inputs) 178 | val addressTxEdges = allOutputsAux.alias("ds1") 179 | .join(allInputsAux.alias("ds2"), $"txOutputRef" === $"txRef" && $"ds1.ord" === $"ds2.ord") 180 | .select($"address", $"tx", $"value") 181 | printCount("addressTxEdges", addressTxEdges) 182 | 183 | val allEdgesDF = blockTxEdges 184 | .union(addressTxEdges) 185 | .union(txAddressEdges) 186 | .union(blockBlockEdges) 187 | .toDF("src", "dst", "value") 188 | 189 | allEdgesDF.repartition(2).write.mode("overwrite").format("parquet").option("compression", "gzip").mode("overwrite").save(s"$outputDir/edges") 190 | } 191 | 192 | def extractData(bitcoinBlock: BitcoinBlock): (Block, Array[Transaction2], Array[Input2], Array[Output2]) = { 193 | val blockTime: Int = bitcoinBlock.getTime 194 | val blockHashHex: String = BitcoinUtil.convertByteArrayToHexString(BitcoinUtil.getBlockHash(bitcoinBlock)) 195 | val prevHash: String = BitcoinUtil.convertByteArrayToHexString(BitcoinUtil.reverseByteArray(bitcoinBlock.getHashPrevBlock)) 196 | val rawTransactions: Array[TxData] = bitcoinBlock.getTransactions().toArray(Array[BitcoinTransaction]()) 197 | val inputs: MutableList[Input2] = MutableList[Input2]() 198 | val outputs: MutableList[Output2] = MutableList[Output2]() 199 | 200 | val transactions: Array[Transaction2] = rawTransactions.map((tx: BitcoinTransaction) => { 201 | val txHash: Array[Byte] = BitcoinUtil.reverseByteArray(BitcoinUtil.getTransactionHash(tx)) 202 | val txHashHex: String = BitcoinUtil.convertByteArrayToHexString(txHash) 203 | 204 | inputs ++= JavaConverters.asScalaBufferConverter(tx.getListOfInputs).asScala.map(input => { 205 | convertBitcoinTransactionInput2Input(input, txHashHex) 206 | }) 207 | 208 | outputs ++= JavaConverters.asScalaBufferConverter(tx.getListOfOutputs).asScala.zipWithIndex.map(output => { 209 | convertBitcoinTransactionOutput2Output(output._1, txHashHex, output._2) 210 | }) 211 | 212 | new Transaction2(hash = txHashHex, time = blockTime, block = blockHashHex) 213 | }) 214 | val block = new Block(hash = blockHashHex, prevHash = prevHash, time = blockTime) 215 | (block, transactions, inputs.toArray, outputs.toArray) 216 | } 217 | 218 | def convertBitcoinTransactionOutput2Output(output: BitcoinTransactionOutput, txHash: String, index: Long): Output2 = { 219 | val address = ConverterUtil.getNiceAddress(output.getTxOutScript) 220 | new Output2(value = output.getValue, address = address, txRef = TxRef(txHash, index)) 221 | } 222 | 223 | def convertBitcoinTransactionInput2Input(input: BitcoinTransactionInput, txHash: String): Input2 = { 224 | val address = ConverterUtil.getNiceAddress(input.getTxInScript) 225 | val prevTxOutIndex: Long = input.getPreviousTxOutIndex 226 | val prevTransactionHash: String = BitcoinUtil.convertByteArrayToHexString(BitcoinUtil.reverseByteArray(input.getPrevTransactionHash)) 227 | new Input2(value = 0L, address = address, TxRef(prevTransactionHash, prevTxOutIndex), txHash) 228 | } 229 | 230 | private def printCount(name: String, df: DataFrame) { 231 | if (debug >= 2) { 232 | println(s"\n\n$name count: ${df.count}") 233 | } 234 | } 235 | } 236 | -------------------------------------------------------------------------------- 
/parquet-converter/src/main/scala/io/radanalytics/bitcoin/ParquetConverterNewArch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Red Hat, Inc. and/or its affiliates 3 | * and other contributors as indicated by the @author tags. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | /** 19 | * This simple parquet converter was inspired by example projects in https://github.com/ZuInnoTe/hadoopcryptoledger 20 | * 21 | * For the structure of the blocks consult https://webbtc.com/api/schema 22 | */ 23 | package io.radanalytics.bitcoin 24 | 25 | import org.apache.spark.SparkContext 26 | import org.apache.spark.SparkConf 27 | import org.apache.hadoop.conf._ 28 | import org.apache.spark.rdd.RDD 29 | import org.apache.hadoop.io._ 30 | import org.apache.spark.sql.{DataFrame, SparkSession} 31 | import org.zuinnote.hadoop.bitcoin.format.common._ 32 | import org.zuinnote.hadoop.bitcoin.format.mapreduce.BitcoinBlockFileInputFormat 33 | 34 | import scala.collection.JavaConverters 35 | import scala.collection.mutable.MutableList 36 | 37 | object ParquetConverterNewArch { 38 | var debug = 0 39 | 40 | import CustomTypes._ 41 | 42 | def main(args: Array[String]): Unit = { 43 | val conf = new SparkConf().setAppName("Bitcoin insights - ParquetConverter)") 44 | val sc = new SparkContext(conf) 45 | val hadoopConf = new Configuration() 46 | if (args.size == 3) debug = args(2).toInt 47 | convert(sc, hadoopConf, args(0), args(1)) 48 | sc.stop() 49 | } 50 | 51 | def convert(sc: SparkContext, hadoopConf: Configuration, inputFile: String, outputDir: String) = { 52 | if (debug >= 1) println("\n\n\n\n\n ********************invoking convert()**************** \n\n\n\n\n\n\n\n") 53 | val bitcoinBlocksRDD = sc.newAPIHadoopFile(inputFile, classOf[BitcoinBlockFileInputFormat], classOf[BytesWritable], classOf[BitcoinBlock], hadoopConf) 54 | 55 | val blockchainData: RDD[(Block, Array[Transaction2], Array[Input], Array[Output])] = bitcoinBlocksRDD.map(hadoopKeyValueTuple => extractData(hadoopKeyValueTuple._2)) 56 | val allInputs: RDD[Input] = blockchainData.flatMap(t => t._3) 57 | val allOutputs: RDD[Output] = blockchainData.flatMap(t => t._4) 58 | 59 | if (debug >= 2) { 60 | println("\n\n\n\n\n allOutputs: RDD[Output] = \n\n") 61 | allOutputs.sample(true, 0.001).take(5).foreach(println) 62 | } 63 | 64 | val indexedInputs: RDD[((IO_REF), Input)] = allInputs.map(in => (in.txOutputRef, in)) 65 | val indexedOutputs: RDD[((IO_REF), Output)] = allOutputs.map(out => (out.txRef, out)) 66 | 67 | // match inputs to outputs => output that doesn't have corresponding input is considered as unspent and the sum of 68 | // its values per address represents the balance 69 | val joined: RDD[(Output, Option[Input])] = indexedOutputs.leftOuterJoin(indexedInputs).values 70 | 71 | if (debug >= 1) { 72 | println("\n\n\n\n\n joinedFixed: RDD[(Output, Input)] = \n\n") 73 | joined.sample(true, 0.001).take(5).foreach(println) 74 | } 75 | 
76 | // note: distinct shuffles 77 | val addresses: RDD[String] = allOutputs.map(_.address).distinct() 78 | if (debug >= 2) { 79 | val strange = addresses.filter(a => a.contains("_")) 80 | println("\n\n\n\n\n\nNON-STANDARD #: " + strange.count()) 81 | println("\n\n\n\n\n\nNON-STANDARD (bitcoinpubkey_)#: " + strange.filter(s => s.contains("bitcoinpubkey")).count()) 82 | println("\n\n\n\n\n\nNON-STANDARD :" + strange.sample(true, 0.1).take(5).foreach(println)) 83 | // other possible prefixes: "puzzle_" "anyone" "unspendable" 84 | } 85 | 86 | if (debug >= 1) { 87 | println("\n\n\n\n\n addresses: RDD[String] = \n\n") 88 | addresses.sample(true, 0.001).take(5).foreach(println) 89 | } 90 | 91 | val allTransactions: RDD[Transaction2] = blockchainData.flatMap(t => t._2) 92 | val allBlocks: RDD[Block] = blockchainData.map(t => t._1) 93 | 94 | // create two parquet files, one with nodes and second with edges 95 | val spark = SparkSession 96 | .builder() 97 | .getOrCreate() 98 | import spark.implicits._ 99 | 100 | val addressesDF: DataFrame = addresses.toDF("address") 101 | addressesDF.write.mode("overwrite").format("parquet").option("compression", "gzip").mode("overwrite").save(s"$outputDir/addresses") 102 | 103 | val allBlocksDF = allBlocks.map(b => (b.hash, b.time)).toDF("hash", "time") 104 | allBlocksDF.write.mode("overwrite").format("parquet").option("compression", "gzip").mode("overwrite").save(s"$outputDir/blocks") 105 | 106 | val allTransactionsDF = allTransactions.map(t => (t.hash)).toDF("hash") 107 | allTransactionsDF.write.mode("overwrite").format("parquet").option("compression", "gzip").mode("overwrite").save(s"$outputDir/transactions") 108 | 109 | if (debug >= 2) { 110 | println("\n\n\n\n\n allOutputs.map(o => (o.txRef._1, o.address, o.value)) = \n\n") 111 | allOutputs.map(o => (o.txRef._1, o.address, o.value)).sample(true, 0.001).take(10).foreach(println) 112 | } 113 | 114 | val allEdgesDF = allTransactions.map(t => (t.block, t.hash, 0L)) // block -> TX 115 | .union(joined.flatMap(io => if (io._2.isDefined) Array((io._1.address, io._2.get.tx, io._1.value)) else Array[(String, String, Long)]())) // address => TX 116 | .union(allOutputs.map(o => (o.txRef._1, o.address, o.value))) // TX -> address 117 | .union(allBlocks.map(b => (b.hash, b.prevHash, 0L))) // block -> previous block 118 | .toDF("src", "dst", "value") 119 | 120 | allEdgesDF.write.mode("overwrite").format("parquet").option("compression", "gzip").mode("overwrite").save(s"$outputDir/edges") 121 | } 122 | 123 | def extractData(bitcoinBlock: BitcoinBlock): (Block, Array[Transaction2], Array[Input], Array[Output]) = { 124 | val blockTime: Int = bitcoinBlock.getTime 125 | val blockHashHex: String = BitcoinUtil.convertByteArrayToHexString(BitcoinUtil.getBlockHash(bitcoinBlock)) 126 | val prevHash: String = BitcoinUtil.convertByteArrayToHexString(BitcoinUtil.reverseByteArray(bitcoinBlock.getHashPrevBlock)) 127 | val rawTransactions: Array[TxData] = bitcoinBlock.getTransactions().toArray(Array[BitcoinTransaction]()) 128 | val inputs: MutableList[Input] = MutableList[Input]() 129 | val outputs: MutableList[Output] = MutableList[Output]() 130 | 131 | val transactions: Array[Transaction2] = rawTransactions.map((tx: BitcoinTransaction) => { 132 | val txHash: Array[Byte] = BitcoinUtil.reverseByteArray(BitcoinUtil.getTransactionHash(tx)) 133 | val txHashHex: String = BitcoinUtil.convertByteArrayToHexString(txHash) 134 | 135 | inputs ++= JavaConverters.asScalaBufferConverter(tx.getListOfInputs).asScala.map(input => { 136 | 
convertBitcoinTransactionInput2Input(input, txHashHex) 137 | }) 138 | 139 | outputs ++= JavaConverters.asScalaBufferConverter(tx.getListOfOutputs).asScala.zipWithIndex.map(output => { 140 | convertBitcoinTransactionOutput2Output(output._1, txHashHex, output._2) 141 | }) 142 | 143 | new Transaction2(hash = txHashHex, time = blockTime, block = blockHashHex) 144 | }) 145 | val block = new Block(hash = blockHashHex, prevHash = prevHash, time = blockTime) 146 | (block, transactions, inputs.toArray, outputs.toArray) 147 | } 148 | 149 | 150 | 151 | def convertBitcoinTransactionOutput2Output(output: BitcoinTransactionOutput, txHash: String, index: Long): Output = { 152 | val address = ConverterUtil.getNiceAddress(output.getTxOutScript) 153 | new Output(value = output.getValue, address = address, txRef = (txHash, index)) 154 | } 155 | 156 | def convertBitcoinTransactionInput2Input(input: BitcoinTransactionInput, txHash: String): Input = { 157 | val address = ConverterUtil.getNiceAddress(input.getTxInScript) 158 | val prevTxOutIndex: Long = input.getPreviousTxOutIndex 159 | val prevTransactionHash: String = BitcoinUtil.convertByteArrayToHexString(BitcoinUtil.reverseByteArray(input.getPrevTransactionHash)) 160 | new Input(value = 0L, address = address, (prevTransactionHash, prevTxOutIndex), txHash) 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /parquet-converter/src/test/resources/serialized: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jkremser/bitcoin-insights/97783cf8d465dc149959fadeb3b667a90e6d5d7e/parquet-converter/src/test/resources/serialized -------------------------------------------------------------------------------- /parquet-converter/src/test/scala/io/radanalytics/bitcoin/HashAlgSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * This test "unit tests" the BitcoinTransactionInput object 4 | * 5 | */ 6 | package io.radanalytics.bitcoin 7 | 8 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, GivenWhenThen, Matchers} 9 | import org.zuinnote.hadoop.bitcoin.format.common.{BitcoinTransactionInput, BitcoinUtil} 10 | 11 | 12 | class HashAlgSpec extends FlatSpec with BeforeAndAfterAll with GivenWhenThen with Matchers { 13 | 14 | 15 | "ConverterUtil.sha256 applied on 0430DB17C4.." should "return D6A739B77.." in { 16 | val inputHex = "0430DB17C4AE5A12418F911284F35DFDB814DA33434202B9735BC42CF171F468C01E386B827FC683C3B750CAD7522692A942C30529E14D80C523A89BF880DBCFD5" 17 | val desiredOutputHex = "D6A739B77ED9E701D1FCFBD2E4589BFF6B38C651125E3DBEE7CC9FCB1A6CB3C5" 18 | assert(BitcoinUtil.convertByteArrayToHexString(ConverterUtil.sha256(BitcoinUtil.convertHexStringToByteArray(inputHex))) == desiredOutputHex) 19 | } 20 | 21 | "ConverterUtil.ripemd160 applied on 0430DB17C4.." should "return 4861455.." 
in { 22 | val inputHex = "0430DB17C4AE5A12418F911284F35DFDB814DA33434202B9735BC42CF171F468C01E386B827FC683C3B750CAD7522692A942C30529E14D80C523A89BF880DBCFD5" 23 | val desiredOutputHex = "48614559893DF9816B8795B9F41C8C903EA7F332" 24 | assert(BitcoinUtil.convertByteArrayToHexString(ConverterUtil.ripemd160(BitcoinUtil.convertHexStringToByteArray(inputHex))) == desiredOutputHex) 25 | } 26 | } -------------------------------------------------------------------------------- /parquet-converter/src/test/scala/io/radanalytics/bitcoin/SparkGraphxBitcoinSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * This test "unit tests" the application with a local Spark master 4 | * 5 | */ 6 | package io.radanalytics.bitcoin 7 | 8 | 9 | import org.scalatest.{FlatSpec, BeforeAndAfterAll, GivenWhenThen, Matchers} 10 | 11 | 12 | class SparkGraphxBitcoinSpec extends FlatSpec with BeforeAndAfterAll with GivenWhenThen with Matchers { 13 | 14 | 15 | override def beforeAll(): Unit = { 16 | super.beforeAll() 17 | 18 | } 19 | 20 | 21 | override def afterAll(): Unit = { 22 | 23 | super.afterAll() 24 | } 25 | 26 | 27 | 28 | "Simple Test" should "be always successful" in { 29 | Given("Nothing") 30 | Then("success") 31 | // fetch results 32 | assert(42==42) 33 | } 34 | 35 | 36 | } 37 | -------------------------------------------------------------------------------- /parquet-converter/src/test/scala/io/radanalytics/bitcoin/TransactionsSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * This test "unit tests" the BitcoinTransactionInput object 4 | * 5 | */ 6 | package io.radanalytics.bitcoin 7 | 8 | 9 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, GivenWhenThen, Matchers} 10 | import org.zuinnote.hadoop.bitcoin.format.common.{BitcoinTransactionInput, BitcoinUtil} 11 | 12 | 13 | class TransactionsSpec extends FlatSpec with BeforeAndAfterAll with GivenWhenThen with Matchers { 14 | 15 | var input: BitcoinTransactionInput = null; 16 | 17 | 18 | override def beforeAll(): Unit = { 19 | super.beforeAll() 20 | import java.io.FileInputStream 21 | import java.io.ObjectInputStream 22 | val source = getClass.getResource("/serialized").getPath 23 | val ois = new ObjectInputStream(new FileInputStream(source)) 24 | input = ois.readObject.asInstanceOf[BitcoinTransactionInput] 25 | } 26 | 27 | 28 | override def afterAll(): Unit = { 29 | super.afterAll() 30 | input = null; 31 | } 32 | 33 | 34 | "Prev hash of giver input in hex" should "be 9609319FA.." in { 35 | assert(BitcoinUtil.convertByteArrayToHexString(input.getPrevTransactionHash) == "9609319FA8102E0506E86E179A142AC4D1B187CD56F508E9F58FC18B0E4DEB11") 36 | } 37 | 38 | "Prev hash of giver input in hex with reversed endianness" should "be 11EB4D0E8BC.." in { 39 | assert(BitcoinUtil.convertByteArrayToHexString(BitcoinUtil.reverseByteArray(input.getPrevTransactionHash)) == "11EB4D0E8BC18FF5E908F556CD87B1D1C42A149A176EE806052E10A89F310996") 40 | } 41 | 42 | "Input hashed in hex" should "be 49304602.." in { 43 | assert(BitcoinUtil.convertByteArrayToHexString(input.getTxInScript) == "493046022100829C046C9D820F6290EB3281E510D1D102AB5BBFD0B0F51DBEF37BA845C237F50221009A43D1F81F2A37AAD357EF0A1243AD66AD9C7F4F223554380AF1921CC41B051601") 44 | } 45 | 46 | "Input hashed in hex with reversed endianness" should "be 0116051BC4.." 
in { 47 | assert(BitcoinUtil.convertByteArrayToHexString(BitcoinUtil.reverseByteArray(input.getTxInScript)) == "0116051BC41C92F10A385435224F7F9CAD66AD43120AEF57D3AA372A1FF8D1439A002102F537C245A87BF3BE1DF5B0D0BF5BAB02D1D110E58132EB90620F829D6C049C82002102463049") 48 | } 49 | 50 | } 51 |
-------------------------------------------------------------------------------- /workers-with-data/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM radanalyticsio/openshift-spark:2.2-latest 2 | 3 | ADD nodes /tmp/nodes.parquet 4 | ADD edges /tmp/edges.parquet 5 | 6 | CMD ["/opt/spark/bin/launch.sh"] 7 |
-------------------------------------------------------------------------------- /workers-with-data/Dockerfile_3_graphs: -------------------------------------------------------------------------------- 1 | FROM radanalyticsio/openshift-spark:2.2-latest 2 | 3 | ADD nodes1 /tmp/nodes1.parquet 4 | ADD edges1 /tmp/edges1.parquet 5 | 6 | ADD nodes2 /tmp/nodes2.parquet 7 | ADD edges2 /tmp/edges2.parquet 8 | 9 | ADD nodes3 /tmp/nodes3.parquet 10 | ADD edges3 /tmp/edges3.parquet 11 | 12 | CMD ["/opt/spark/bin/launch.sh"] 13 |
-------------------------------------------------------------------------------- /workers-with-data/Makefile: -------------------------------------------------------------------------------- 1 | LOCAL_IMAGE=$(USER)/openshift-spark-with-data 2 | GRAPH_DATA ?= $(HOME)/bitcoin/output/ 3 | GRAPH_DATA1 ?= $(HOME)/bitcoin/output/ 4 | GRAPH_DATA2 ?= $(HOME)/bitcoin/output/ 5 | GRAPH_DATA3 ?= $(HOME)/bitcoin/output/ 6 | 7 | 8 | .PHONY: build clean run test 9 | 10 | build: clean 11 | -echo -e "\n--- Copying graph data from: $(GRAPH_DATA) ---\n" 12 | cp -r $(GRAPH_DATA)/{edges,nodes} . 13 | #echo $(GRAPH_DATA) >> ./nodes/data_from 14 | docker build -t $(LOCAL_IMAGE) . 15 | -rm -rf edges nodes 16 | 17 | build-3-graphs: clean 18 | -echo -e "\n--- Copying graph data from: $(GRAPH_DATA1) ---\n" 19 | -echo -e "\n--- Copying graph data from: $(GRAPH_DATA2) ---\n" 20 | -echo -e "\n--- Copying graph data from: $(GRAPH_DATA3) ---\n" 21 | cp -r $(GRAPH_DATA1)/edges edges1 22 | cp -r $(GRAPH_DATA1)/nodes nodes1 23 | cp -r $(GRAPH_DATA2)/edges edges2 24 | cp -r $(GRAPH_DATA2)/nodes nodes2 25 | cp -r $(GRAPH_DATA3)/edges edges3 26 | cp -r $(GRAPH_DATA3)/nodes nodes3 27 | docker build -f ./Dockerfile_3_graphs -t $(LOCAL_IMAGE) . 28 | -rm -rf edges* nodes* 29 | 30 | clean: 31 | -docker rmi -f $(LOCAL_IMAGE) 32 | 33 | run: 34 | -docker rm -f spark-with-data || true 35 | docker run --name spark-with-data --rm -ti -p 9000:9000 $(LOCAL_IMAGE) 36 |
-------------------------------------------------------------------------------- /workers-with-data/README.md: -------------------------------------------------------------------------------- 1 | [![Docker build](https://img.shields.io/docker/automated/jkremser/openshift-spark-with-data.svg)](https://hub.docker.com/r/jkremser/openshift-spark-with-data) 2 | [![Layers info](https://images.microbadger.com/badges/image/jkremser/openshift-spark-with-data.svg)](https://microbadger.com/images/jkremser/openshift-spark-with-data) 3 | ## Info 4 | 5 | This is a docker image based on `radanalyticsio/openshift-spark:2.2-latest`; 6 | all it does is add the converted parquet files to the Spark workers. This is a workaround 7 | for data distribution: if the notebook is connected to Spark running 8 | in the cluster, the I/O operations will fail if the data is not present on the workers.
9 | 10 | A proper solution would be to use persistent volumes and perhaps a distributed file 11 | system such as GlusterFS or Ceph, but that would make the demos long and boring. 12 | 13 | ### Building 14 | 15 | ```bash 16 | make build 17 | ``` 18 | This will build a docker image with the example graph data baked in. It assumes 19 | the parquet data was generated by the converter. 20 | Some example data can be found in `../parquet-converter/data/example1/output/`, but 21 | I suggest creating your own from the `~/.bitcoin/blocks/blk00xyz.dat` files using the 22 | `parquet-converter` (a single-graph build is sketched after `buildImg.sh` below). 23 | 24 | ### Running 25 | 26 | This image is not meant to be run on localhost. Its purpose is to be deployed via 27 | the Oshinko tool and used as the image for the Spark master and workers. It is still possible 28 | to run and debug it locally by invoking `make run`. 29 |
-------------------------------------------------------------------------------- /workers-with-data/buildImg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GRAPH_DATA1=/home/jkremser/bitcoin/output-0051-0054 \ 3 | GRAPH_DATA2=/home/jkremser/bitcoin/output-0401-0404 \ 4 | GRAPH_DATA3=/home/jkremser/bitcoin/output-0801-0804 make build-3-graphs 5 | --------------------------------------------------------------------------------
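As a usage note for the "Building" section of `workers-with-data/README.md` above: the sketch below is a minimal, hypothetical single-graph counterpart to `buildImg.sh`, assuming it is invoked from the `workers-with-data` directory and that the example output bundled under `parquet-converter/data/example1/output` (which already contains the `nodes` and `edges` parquet directories) serves as the graph data. `GRAPH_DATA` is the variable consumed by the default `build` target of the Makefile above; the absolute paths in `buildImg.sh` would normally point at output produced by one of the `run*.sh` scripts in `parquet-converter`.

```bash
#!/bin/bash
# Hypothetical single-graph build: point GRAPH_DATA at the example converter output
# shipped with the repository; the Makefile's `build` target then copies
# $(GRAPH_DATA)/{edges,nodes} into the docker context and builds the image.
GRAPH_DATA=../parquet-converter/data/example1/output make build
```

For the three-graph image, `GRAPH_DATA1`..`GRAPH_DATA3` would be exported the same way before running `make build-3-graphs`, which is exactly what `buildImg.sh` does with real converter output.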