├── Analysis ├── ContractMap.py ├── ParsedBlocks.py ├── TxnGraph.py ├── analysis_util.py └── tags.py ├── Forecasting ├── R │ ├── arima.R │ └── vol.R ├── model.py ├── pipeline.py ├── r_io_util.py └── sim.py ├── Preprocessing └── Crawler │ ├── Crawler.py │ ├── __init__.py │ └── crawler_util.py ├── README.md ├── Scripts ├── draw_graphs.py ├── extract.py └── preprocess.py ├── stream.py └── test ├── forecast.py └── verify_blocks.py /Analysis/ContractMap.py: -------------------------------------------------------------------------------- 1 | 2 | """Build a hash map of all contract addresses on the Ethereum network.""" 3 | 4 | from collections import defaultdict 5 | import requests 6 | import json 7 | import pickle 8 | import os 9 | import time 10 | import pymongo 11 | DIR = "." 12 | 13 | class ContractMap(object): 14 | """ 15 | A hash map of all contract addresses in the Ethereum network. 16 | 17 | Public functions: 18 | 19 | - find(): searches all blocks after self.last_block and adds them 20 | to the table. Updates self.last_block 21 | - save(): saves the object to a pickle file ".contracts.p" 22 | - load(): loads the object from pickle file ".contracts.p" 23 | 24 | Attributes: 25 | 26 | - addresses: defaultdict with default value of 0 for non-contracts and 27 | values of 1 for contract addresses. 28 | 29 | Usage: 30 | 31 | # If a mongo_client is passed, the ContractMap will scan geth via RPC 32 | # for new contract addresses starting at "last_block". 33 | cmap = ContractMap(mongo_client, last_block=90000, filepath="contracts.p") 34 | cmap.save() 35 | 36 | # If None is passed for a mongo_client, the ContractMap will automatically 37 | # load the map of addresses from the pickle file specified in "filepath", 38 | # ./contracts.p by default. 39 | cmap = ContractMap() 40 | 41 | """ 42 | 43 | def __init__(self, 44 | mongo_client=None, 45 | last_block=0, 46 | load=False, 47 | filepath="{}/.contracts.p".format(DIR)): 48 | """Initialize with a mongo client and an optional last block.""" 49 | self.client = mongo_client 50 | self.last_block = last_block 51 | self.url = "http://localhost:8545" 52 | self.headers = {"content-type": "application/json"} 53 | self.filepath = filepath 54 | 55 | self.addresses = defaultdict(int) 56 | 57 | if load: 58 | self.load() 59 | 60 | if self.client: 61 | self.find() 62 | self.save() 63 | 64 | def _checkGeth(self): 65 | """Make sure geth is running in RPC on port 8545.""" 66 | try: 67 | self._rpcRequest("eth_getBlockByNumber", [hex(1), True], "id") 68 | return 69 | except Exception as err: 70 | assert not err, "Geth cannot be reached: {}".format(err) 71 | 72 | def _rpcRequest(self, method, params, key): 73 | """Make an RPC request to geth on port 8545.""" 74 | payload = { 75 | "method": method, 76 | "params": params, 77 | "jsonrpc": "2.0", 78 | "id": 0 79 | } 80 | 81 | res = requests.post(self.url, 82 | data=json.dumps(payload), 83 | headers=self.headers).json() 84 | 85 | # Geth will sometimes crash if overloaded with requests 86 | time.sleep(0.005) 87 | 88 | return res[key] 89 | 90 | def find(self): 91 | """ 92 | Build a hash table of contract addresses. 93 | 94 | Iterate through all blocks and search for new contract addresses. 95 | Append them to self.addresses if found. 
96 | """ 97 | blocks = self.client.find( 98 | {"number": {"$gt": self.last_block}}, 99 | sort=[("number", pymongo.ASCENDING)] 100 | ) 101 | counter = 0 102 | for block in blocks: 103 | if block["transactions"]: 104 | # Loop through all of the transactions in the current block 105 | # Add all the nodes to a global set (self.nodes) 106 | for txn in block["transactions"]: 107 | if txn["to"] and not self.addresses[txn["to"]]: 108 | # Get the code at the "to" address. 109 | code = self._rpcRequest( 110 | "eth_getCode", 111 | [txn["to"], "latest"], 112 | "result") 113 | # Add addressees if there is non-empty data 114 | if code != "0x": 115 | self.addresses[txn["to"]] = 1 116 | 117 | self.last_block = block["number"] 118 | counter += 1 119 | # Save the list every 10000 blocks in case geth crashes 120 | # midway through the procedure 121 | if not counter % 10000: 122 | print("Done with block {}...".format(self.last_block)) 123 | self.save() 124 | 125 | def save(self): 126 | """Pickle the object and save it to a file.""" 127 | state = (self.last_block, self.addresses) 128 | pickle.dump(state, open(self.filepath, "wb")) 129 | 130 | def load(self): 131 | """Load the contract map from a file.""" 132 | no_file = "Error loading ContractMap: No file exists in that path." 133 | assert os.path.isfile(self.filepath), no_file 134 | state = pickle.load(open(self.filepath, "rb")) 135 | self.addresses = state[1] 136 | self.last_block = state[0] 137 | -------------------------------------------------------------------------------- /Analysis/ParsedBlocks.py: -------------------------------------------------------------------------------- 1 | """Interace to parse aggregate data from snapshots of the Ethereum network.""" 2 | 3 | import tags 4 | from ContractMap import ContractMap 5 | import os 6 | import csv 7 | import requests 8 | 9 | 10 | class ParsedBlocks(object): 11 | """ 12 | Build a set of aggregate data from a snapshot (using a TxnGraph). 13 | 14 | Description: 15 | ------------ 16 | Parse the network graphs at each timestamp. 17 | Time period is every X blocks. 18 | For each time period, look at aggregate stats. 19 | 20 | Iterate over all edges in the graph snapshot and calculate: 21 | - Total number of transactions in the network 22 | - Sum of all transaction amounts 23 | - Sum of all outflow from exchanges; suggests people entering long term 24 | - Sum of all inflow to exchanges (suggests people exiting) 25 | - Number of transactions to contracts (with data) 26 | - Number of transactions to crowdsale wallets (no data) 27 | - Number of transactions to peers, but with data, i.e. sending altcoins 28 | - Number of p2p transactions 29 | - Number of new addresses 30 | - Distribution of wealth (mean, std) across addresses that are NOT: 31 | 32 | Tagged addresses consitute: 33 | A: Exchanges 34 | B: Mining pools 35 | C: Crowdsale wallets/contract addresses 36 | 37 | Also tagged are all contract addresses to which data has been sent. 38 | 39 | Lastly, we also want to get the price of ETH (in USD) at the 40 | timestamp listed in the LAST block of the block range. 41 | 42 | Parameters: 43 | ----------- 44 | txn_graph: TxnGraph instance (with a prebuilt graph) 45 | run: boolean, optional. Calculate the data when instantiated. 
46 |
47 |     """
48 |
49 |     def __init__(self, txn_graph, run=True, csv_file="blockchain.csv"):
50 |         """Initialize the graph, address hash maps, and data fields."""
51 |         self.txn_graph = txn_graph
52 |         self.csv_file = csv_file
53 |
54 |         # Global data:
55 |         # -------------
56 |         # Tagged addresses (exchanges, mining pools, contracts)
57 |         # 1: Exchanges, 2: Crowdsale contracts, 3: mining pools, 0: Other
58 |         self.tags = tags.tags
59 |         # 1: Contracts, 0: Other
60 |         self.contracts = ContractMap(load=True).addresses
61 |
62 |         # Snapshot specific data:
63 |         # ------------------------
64 |         # Bookkeeping
65 |         self.start_block = txn_graph.start_block
66 |         self.end_block = txn_graph.end_block
67 |         self.start_timestamp = txn_graph.start_timestamp
68 |         self.end_timestamp = txn_graph.end_timestamp
69 |
70 |         # Relevant metrics:
71 |         # Note that the total supply is 5*block_n + the supply
72 |         # at genesis. This neglects uncle rewards, which are
73 |         # about 0.06% of the total supply.
74 |         # -----------------
75 |         self.data = {
76 |             "timestamp_start": self.start_timestamp,
77 |             "timestamp_end": self.end_timestamp,
78 |             "block_start": self.start_block,
79 |             "block_end": self.end_block,
80 |             "transaction_sum": 0,
81 |             "transaction_count": 0,
82 |             "exchange_out_sum": 0,
83 |             "exchange_out_count": 0,
84 |             "exchange_in_sum": 0,
85 |             "exchange_in_count": 0,
86 |             "contract_txn_sum": 0,
87 |             "contract_txn_count": 0,
88 |             "crowdsale_txn_sum": 0,
89 |             "crowdsale_txn_count": 0,
90 |             "p2p_txn_sum": 0,
91 |             "p2p_txn_count": 0,
92 |             "peer_txns_w_data": 0,
93 |             "num_addr": 0,
94 |             "total_supply": 7200990.5 + 5.0*self.end_block,
95 |             "priceUSD": self._getPrice(self.start_timestamp, self.end_timestamp)
96 |         }
97 |
98 |         self.peer_wealth = list()
99 |         self.headers = None
100 |
101 |         if run:
102 |             self._setHeaders()
103 |             self.parse()
104 |             self.saveData()
105 |
106 |     # PRIVATE METHODS
107 |
108 |     def _setHeaders(self):
109 |         """Get the headers that will be used in the CSV data file."""
110 |         self.headers = sorted(self.data.keys())
111 |
112 |     def _getData(self):
113 |         """Return a list of the data in the order of the headers."""
114 |         return [str(self.data[h]) for h in self.headers]
115 |
116 |     def _startCSV(self):
117 |         """Create a CSV file if none exists."""
118 |         with open(self.csv_file, "w") as f:
119 |             w = csv.DictWriter(f, fieldnames=self.headers)
120 |             w.writeheader()
121 |
122 |     def _getPrice(self, start, end, period=300):
123 |         """
124 |         Get data from the Poloniex API given a period.
125 |         Start and end are both UNIX timestamps (integers).
126 |         This will return the price at the close of the last period between
127 |         these blocks.
128 |         """
129 |         base = "https://poloniex.com/public?command=returnChartData"
130 |         pair = "USDT_ETH"
131 |         start = start
132 |         end = end
133 |         period = period
134 |         req_str = "{}&currencyPair={}&start={}&end={}&period={}".format(
135 |             base, pair, start, end, period
136 |         )
137 |         data = requests.get(req_str).json()
138 |         return data[len(data)-1]['close']
139 |
140 |     def _isPeer(self, addr):
141 |         """
142 |         Determine if an address corresponds to a peer.
143 |
144 |         This means it is not a contract, crowdsale, exchange, or mining pool.
145 |         """
146 |         if not self.contracts[addr] and not self.tags[addr]:
147 |             return True
148 |         return False
149 |
150 |     # PUBLIC METHODS
151 |
152 |     def parse(self):
153 |         """Iterate through the graph to calculate metrics of interest."""
154 |         if not self.headers:
155 |             self._setHeaders()
156 |
157 |         vWeights = self.txn_graph.graph.vertex_properties["weight"]
158 |         eWeights = self.txn_graph.graph.edge_properties["weight"]
159 |         address_prop = self.txn_graph.graph.vertex_properties["address"]
160 |
161 |         # Balances held by peer addresses (i.e. not tagged, not contracts)
162 |         balances = list()
163 |
164 |         # Iterate over vertices (i.e. addresses)
165 |         for v in self.txn_graph.graph.vertices():
166 |             if self._isPeer(address_prop[v]):
167 |                 balances.append(vWeights[v])
168 |
169 |         # Iterate over Edge instances (i.e. transactions)
170 |
171 |         # All of the addresses encountered
172 |         address_dump = list()
173 |
174 |         for e in self.txn_graph.graph.edges():
175 |             to_addr = address_prop[e.target()]
176 |             from_addr = address_prop[e.source()]
177 |             address_dump.append(to_addr)
178 |             address_dump.append(from_addr)
179 |
180 |             amount = eWeights[e]
181 |             # The edgeWeight of this edge is the amount of the transaction
182 |             self.data["transaction_count"] += 1
183 |             self.data["transaction_sum"] += amount
184 |
185 |             # If the target/source of the txn is an exchange:
186 |             if self.tags[from_addr] == 1:
187 |                 self.data["exchange_out_sum"] += amount
188 |                 self.data["exchange_out_count"] += 1
189 |             elif self.tags[to_addr] == 1:
190 |                 self.data["exchange_in_sum"] += amount
191 |                 self.data["exchange_in_count"] += 1
192 |
193 |             # If the target is a crowdsale wallet:
194 |             if self.tags[to_addr] == 2:
195 |                 self.data["crowdsale_txn_sum"] += amount
196 |                 self.data["crowdsale_txn_count"] += 1
197 |
198 |             # If the target is a contract:
199 |             if self.contracts[to_addr]:
200 |                 self.data["contract_txn_sum"] += amount
201 |                 self.data["contract_txn_count"] += 1
202 |
203 |             # If source and target are both peer nodes
204 |             if self._isPeer(to_addr) and self._isPeer(from_addr):
205 |                 self.data["p2p_txn_sum"] += amount
206 |                 self.data["p2p_txn_count"] += 1
207 |
208 |         # Record all unique addresses up to this point
209 |         addr_set = set(address_dump)
210 |         self.data["num_addr"] = len(addr_set)
211 |
212 |     def saveData(self):
213 |         """Save the data to a line in the CSV file."""
214 |         if not os.path.isfile(self.csv_file):
215 |             self._startCSV()
216 |         with open(self.csv_file, "a") as f:
217 |             w = csv.DictWriter(f, fieldnames=self.headers)
218 |             w.writerow(self.data)
219 |
--------------------------------------------------------------------------------
/Analysis/TxnGraph.py:
--------------------------------------------------------------------------------
1 | """Create a snapshot of the Ethereum network."""
2 |
3 | import six.moves.cPickle as pickle
4 | from graph_tool.all import *
5 | import pymongo
6 | import os
7 | import subprocess
8 | import signal
9 | import copy
10 | from tags import tags
11 | import analysis_util
12 | env = analysis_util.set_env()
13 | DIR = env["mongo"] + "/data"
14 | DATADIR = env["txn_data"]
15 |
16 | class TxnGraph(object):
17 |     """
18 |     Create a snapshot of the Ethereum network.
19 |
20 |     Description:
21 |     ------------
22 |     Create a snapshot, which contains a graph, out of transactions stored in a
23 |     mongo collection. Each snapshot must start at some time t0 (start_block)
24 |     and end at time tf (end_block). It will include all nodes that sent or
25 |     received a transaction between t0 and tf.
26 |
27 |
28 |     Parameters:
29 |     -----------
30 |     start_block          # The lower bound of the block range to
31 |                          # be analysed.
32 |     end_block            # The upper bound of the block range to
33 |                          # be analysed.
34 |     previous             # Previous graph and its end_block
35 |     snap (default=True)  # Build the graph upon instantiation.
36 |     save (default=True)  # Save the graph automatically
37 |     load (default=False) # Skip building the graph and load a saved one.
38 |
39 |
40 |     Usage:
41 |     ------
42 |     Initialize with a previous graph:
43 |
44 |         g = TxnGraph(previous={graph: <Graph>, end_block: <int>})
45 |
46 |     Draw the image (saved by default to DATADIR/snapshots/a_b.png,
47 |     where a=start_block, b=end_block):
48 |
49 |         g.draw()
50 |
51 |     Save the state of the object (including the graph):
52 |
53 |         g.save()
54 |
55 |     Load a graph with start_block=a, end_block=b from DATADIR if it exists:
56 |
57 |         g.load(a, b)
58 |
59 |     """
60 |
61 |     # PRIVATE
62 |
63 |     def __init__(self,
64 |                  *args,
65 |                  snap=True,
66 |                  save=True,
67 |                  load=False,
68 |                  previous=None,
69 |                  **kwargs):
70 |
71 |         self.f_pickle = None
72 |         self.f_snapshot = None
73 |         self.start_block = max(args[0] if len(args) > 0 else 1, 1)
74 |         self.end_block = args[1] if len(args) > 1 else 2
75 |
76 |         self.start_timestamp = None
77 |         self.end_timestamp = None
78 |
79 |         # A lookup table mapping ethereum address --> graph node
80 |         self.nodes = dict()
81 |         self.edges = list()
82 |         # A graph_tool Graph object
83 |         self.graph = None
84 |         # Store the graph separately in a file
85 |         self.f_graph = None
86 |         # PropertyMap of edges weighted by eth value of transaction
87 |         self.edgeWeights = None
88 |         # PropertyMap of vertices weighted by eth value they hold
89 |         # at the time of the end_block.
90 |         self.vertexWeights = None
91 |         # All addresses (each node has an address)
92 |         self.addresses = None
93 |         # Record big exchange addresses
94 |         self.exchanges = list()
95 |         # Record all contracts
96 |         self.contracts = list()
97 |         # Run
98 |         self._init(snap, save, load, previous)
99 |
100 |     def _init(self, snap, save, load, previous):
101 |         self.graph = Graph()
102 |
103 |         # Accept a previous graph as an argument
104 |         if previous:
105 |             a_str = "prev is of form {'graph': <Graph>, 'end_block': <int>}"
106 |             assert "graph" in previous, a_str
107 |             self.graph = previous["graph"]
108 |             assert "end_block" in previous, a_str
109 |             self.start_block = previous["end_block"]
110 |
111 |         # Set filepaths
112 |         self._setFilePaths()
113 |
114 |         # Load a previous graph
115 |         if load:
116 |             self.load(self.start_block, self.end_block)
117 |
118 |         else:
119 |             # Take a snapshot
120 |             if snap:
121 |                 self.snap()
122 |
123 |             # Save this graph automatically
124 |             if save:
125 |                 self.save()
126 |
127 |     def _setFilePaths(self, start=None, end=None):
128 |         """Set the file paths based on the start/end block numbers."""
129 |         if not start:
130 |             start = self.start_block
131 |         if not end:
132 |             end = self.end_block
133 |
134 |         self.f_pickle = "{}/pickles/{}_{}.p".format(DATADIR, start, end)
135 |         self.f_graph = "{}/graphs/{}_{}.gt".format(DATADIR, start, end)
136 |         self.f_snapshot = "{}/snapshots/{}_{}.png".format(DATADIR, start, end)
137 |
138 |     def _getMongoClient(self):
139 |         """Connect to a mongo client (assuming one is running)."""
140 |         try:
141 |             # Try a connection to mongo and force a findOne request.
142 |             # See if it makes it through.
143 |             client = pymongo.MongoClient(serverSelectionTimeoutMS=1000)
144 |             transactions = client["blockchain"]["transactions"]
145 |             test = transactions.find_one({"number": {"$gt": 1}})
146 |             popen = None
147 |         except Exception as err:
148 |             # If not, open up a mongod subprocess
149 |             cmd = "(mongod --dbpath {} > {}/mongo.log 2>&1) &".format(
150 |                 os.environ["BLOCKCHAIN_MONGO_DATA_DIR"],
151 |                 os.environ["BLOCKCHAIN_ANALYSIS_LOGS"])
152 |
153 |             popen = subprocess.Popen(cmd, shell=True)
154 |             client = pymongo.MongoClient(serverSelectionTimeoutMS=1000)
155 |             transactions = client["blockchain"]["transactions"]
156 |
157 |         # Update timestamps
158 |         transactions = self._updateTimestamps(transactions)
159 |
160 |         return transactions, popen
161 |
162 |     def _updateTimestamps(self, client):
163 |         """Look up timestamps associated with start/end blocks and set them."""
164 |         start = client.find_one({"number": self.start_block})
165 |         end = client.find_one({"number": self.end_block})
166 |         self.start_timestamp = start["timestamp"]
167 |         self.end_timestamp = end["timestamp"]
168 |         return client
169 |
170 |     def _addEdgeWeight(self, newEdge, value):
171 |         """
172 |         Add to the weight of a given edge (i.e. the amount of ether that has
173 |         flowed through it). Create a new one if needed.
174 |         """
175 |         if self.edgeWeights[newEdge] is not None:
176 |             self.edgeWeights[newEdge] += value
177 |         else:
178 |             self.edgeWeights[newEdge] = 0
179 |
180 |     def _addVertexWeight(self, from_v, to_v, value):
181 |         """
182 |         Add to the weight of a given vertex (i.e. the amount of ether)
183 |         it holds. Create a new weight if needed.
184 |         """
185 |         if self.vertexWeights[to_v] is not None:
186 |             self.vertexWeights[to_v] += value
187 |         else:
188 |             self.vertexWeights[to_v] = 0
189 |         if self.vertexWeights[from_v] is not None:
190 |             # We shouldn't need to worry about overspending
191 |             # as the ethereum protocol should not let you spend
192 |             # more ether than you have!
193 | self.vertexWeights[from_v] -= value 194 | else: 195 | self.vertexWeights[from_v] = 0 196 | 197 | def _addBlocks(self, client, start, end): 198 | """Add new blocks to current graph attribute.""" 199 | # Get a cursor containing all of the blocks 200 | # between the start/end blocks 201 | blocks = client.find( 202 | {"number": {"$gt": start, "$lt": end}}, 203 | sort=[("number", pymongo.ASCENDING)] 204 | ) 205 | for block in blocks: 206 | if block["transactions"]: 207 | # Loop through all of the transactions in the current block 208 | # Add all the nodes to a global set (self.nodes) 209 | for txn in block["transactions"]: 210 | 211 | # Graph vetices will be referenced temporarily, but the 212 | # unique addresses will persist in self.nodes 213 | to_v = None 214 | from_v = None 215 | 216 | # Exclude self referencing transactions 217 | if txn["to"] == txn["from"]: 218 | continue 219 | 220 | # Set the "to" vertex 221 | if txn["to"] not in self.nodes: 222 | to_v = self.graph.add_vertex() 223 | self.nodes[txn["to"]] = to_v 224 | self.addresses[to_v] = txn["to"] 225 | 226 | # If there is data, this is going to a contract 227 | if "data" in txn: 228 | if txn["data"] != "0x": 229 | self.contracts.append(txn["to"]) 230 | else: 231 | to_v = self.nodes[txn["to"]] 232 | 233 | # Set the "from" vertex 234 | if txn["from"] not in self.nodes: 235 | from_v = self.graph.add_vertex() 236 | self.nodes[txn["from"]] = from_v 237 | self.addresses[from_v] = txn["from"] 238 | else: 239 | from_v = self.nodes[txn["from"]] 240 | 241 | # Add a directed edge 242 | newEdge = self.graph.add_edge(from_v, to_v) 243 | self.edges.append(newEdge) 244 | 245 | # Update the weights 246 | self._addEdgeWeight(newEdge, txn["value"]) 247 | self._addVertexWeight(from_v, to_v, txn["value"]) 248 | self._addPropertyMaps() 249 | 250 | def _addPropertyMaps(self): 251 | """Add PropertyMap attributes to Graph instance.""" 252 | self.graph.vertex_properties["weight"] = self.vertexWeights 253 | self.graph.vertex_properties["address"] = self.addresses 254 | self.graph.edge_properties["weight"] = self.edgeWeights 255 | 256 | # PUBLIC 257 | # ------ 258 | def snap(self): 259 | """ 260 | Take a snapshot of the graph of transactions. 261 | 262 | Description: 263 | ------------ 264 | This essentially builds a graph with addresses (vertices) and 265 | transactions (edges). It also adds a PropertyMap of s to the 266 | graph corresponding to transaction amounts (i.e. weights). The default 267 | behavior of this is to initialize a new graph with data between 268 | start_block and end_block, however it can be used with the 'extend' 269 | method. 270 | 271 | Parameters: 272 | ----------- 273 | start , default self.start_block: the absolute block to start with 274 | end , default self.end_block: the absolute block to end with 275 | """ 276 | 277 | # Set up the mongo client 278 | client, popen = self._getMongoClient() 279 | 280 | # Add PropertyMaps 281 | self.edgeWeights = self.graph.new_edge_property("double") 282 | self.vertexWeights = self.graph.new_vertex_property("double") 283 | self.addresses = self.graph.new_vertex_property("string") 284 | 285 | # Add blocks to the graph 286 | self._addBlocks(client, self.start_block, self.end_block) 287 | 288 | # Kill the mongo client if it was spawned in this process 289 | if popen: 290 | # TODO get this to work 291 | popen.kill() 292 | 293 | def save(self): 294 | """Pickle TxnGraph. 
Save the graph_tool Graph object separately."""
295 |         if not os.path.exists(DATADIR+"/pickles"):
296 |             os.makedirs(DATADIR+"/pickles")
297 |         if not os.path.exists(DATADIR+"/graphs"):
298 |             os.makedirs(DATADIR+"/graphs")
299 |         if not os.path.exists(DATADIR+"/snapshots"):
300 |             os.makedirs(DATADIR+"/snapshots")
301 |
302 |         # graph_tool objects cannot be pickled, so we need to stash
303 |         # them in a temporary object
304 |         tmp = {
305 |             "nodes": self.nodes,
306 |             "edges": self.edges,
307 |             "edgeWeights": self.edgeWeights,
308 |             "vertexWeights": self.vertexWeights,
309 |             "addresses": self.addresses,
310 |             "graph": self.graph
311 |         }
312 |         # Empty the graph_tool objects
313 |         self.nodes = dict()
314 |         self.edges = list()
315 |         self.edgeWeights = None
316 |         self.vertexWeights = None
317 |         self.addresses = None
318 |
319 |         # Save the graph to a file (but not if it is empty)
320 |         if len(tmp["nodes"]) > 0:
321 |             self.graph.save(self.f_graph, fmt="gt")
322 |
323 |         self.graph = None
324 |
325 |         # Save the rest of this object to a pickle
326 |         with open(self.f_pickle, "wb") as output:
327 |             pickle.dump(self.__dict__, output)
328 |             output.close()
329 |
330 |         # Reload from tmp
331 |         self.nodes = tmp["nodes"]
332 |         self.edges = tmp["edges"]
333 |         self.edgeWeights = tmp["edgeWeights"]
334 |         self.vertexWeights = tmp["vertexWeights"]
335 |         self.addresses = tmp["addresses"]
336 |         self.graph = tmp["graph"]
337 |
338 |     def load(self, start_block, end_block):
339 |         """
340 |         Load a TxnGraph.
341 |
342 |         Description:
343 |         ------------
344 |         Load a pickle of a saved TxnGraph state as well as a saved Graph
345 |         object as TxnGraph.graph. This can be called upon instantiation with
346 |         load=True OR can be called any time by passing new start/end block
347 |         params.
348 |
349 |         Parameters:
350 |         -----------
351 |         start_block
352 |         end_block
353 |         """
354 |         self._setFilePaths(start_block, end_block)
355 |
356 |         # Load the graph from file
357 |         tmp_graph = load_graph(self.f_graph)
358 |
359 |         # Load the object from a pickle
360 |         with open(self.f_pickle, "rb") as input:
361 |             tmp = pickle.load(input)
362 |             self.__dict__.update(tmp)
363 |             self.graph = tmp_graph
364 |             input.close()
365 |
366 |     def draw(self, **kwargs):
367 |         """
368 |         Draw the graph.
369 | 370 | Description: 371 | ------------ 372 | Draw the graph and save to a .png file indexed by the start and 373 | end block of the TxnGraph 374 | 375 | Parameters: 376 | ----------- 377 | w (optional, default=5000): width 378 | h (optional, default=5000): height 379 | """ 380 | w = kwargs["w"] if "w" in kwargs else 1920*2 381 | h = kwargs["h"] if "h" in kwargs else 1080*2 382 | 383 | # We want the vertices to be sized proportional to the number of 384 | # transactions they are part of 385 | # deg = self.graph.degree_property_map("total") 386 | deg = copy.deepcopy(self.graph.vertex_properties['weight']) 387 | 388 | # Don't draw an empty graph 389 | if not self.graph.num_vertices(): 390 | print("Nothing to draw!") 391 | return 392 | 393 | # Testing to allow negative numbers 394 | deg.a = abs(deg.a)**0.5 395 | 396 | # For some reason this works 397 | # (TODO figure out how to scale this consistently) 398 | # deg.a = deg.a**0.5 399 | 400 | # We want the largest node to be roughly 10% 401 | # of the width of the image (somewhat arbitrary) 402 | scale = (0.03*w)/max(deg.a) 403 | deg.a = deg.a*scale 404 | 405 | # For some reason this doesn't work 406 | # deg.a = deg.a*scale # For some reason this blows up the output 407 | 408 | # Set K=scale because we want the average edge length 409 | # to be the size of the largest node 410 | pos = random_layout(self.graph, shape=(w, h), dim=2) 411 | 412 | # Draw the graph 413 | graph_draw(self.graph, 414 | pos=pos, 415 | vertex_size=deg, 416 | vertex_fill_color=deg, 417 | pen_width=0, 418 | bg_color=[1,1,1,1], 419 | output=self.f_snapshot, 420 | output_size=(w,h), 421 | fit_view=True 422 | ) 423 | 424 | def extend(self, n, save=True): 425 | """ 426 | Add n blocks to the current TxnGraph instance. 427 | 428 | Description: 429 | ------------ 430 | Rather than creating a bunch of TxnGraph instances from scratch, 431 | this method can be used to add n blocks to the existing TxnGraph 432 | instance. It can be called multiple times to iterate over the block 433 | chain with resolution of n blocks. The extended TxnGraph will be 434 | saved by default. 
435 | 436 | Parameters: 437 | ----------- 438 | n : number of blocks to add (from the last_block) 439 | save , default True: save the new state automatically 440 | """ 441 | old_end = self.end_block 442 | new_end = self.end_block + n 443 | 444 | client, popen = self._getMongoClient() 445 | self._addBlocks(client, old_end, new_end) 446 | self.end_block = new_end 447 | self._setFilePaths() 448 | 449 | if save: 450 | self.save() 451 | -------------------------------------------------------------------------------- /Analysis/analysis_util.py: -------------------------------------------------------------------------------- 1 | """Util functions for Analysis process.""" 2 | import os 3 | 4 | 5 | def set_env(): 6 | """Set the analysis environment directory.""" 7 | env = { 8 | "mongo": ".", # Where the mongo data is stored 9 | "txn_data": "./data" # Where the TxnGraphs are stored 10 | } 11 | if 'BLOCKCHAIN_MONGO_DATA_DIR' in os.environ: 12 | env["mongo"] = os.environ['BLOCKCHAIN_MONGO_DATA_DIR'] 13 | 14 | if 'BLOCKCHAIN_DATA_DIR' in os.environ: 15 | env["tnx_data"] = os.environ['BLOCKCHAIN_DATA_DIR'] 16 | 17 | return env 18 | -------------------------------------------------------------------------------- /Analysis/tags.py: -------------------------------------------------------------------------------- 1 | """Various 'special' addresses that should be tagged.""" 2 | 3 | from collections import defaultdict 4 | 5 | # Exchange wallets = 1 6 | # Crowdsale wallets = 2 7 | # Mining pools = 3 8 | tags = defaultdict(int, { 9 | "0x32be343b94f860124dc4fee278fdcbd38c102d88": 1, # Polo hot wallet 10 | "0xb794f5ea0ba39494ce839613fffba74279579268": 1, # Polo cold wallet 11 | "0x2910543af39aba0cd09dbb2d50200b3e800a63d2": 1, # Kraken 12 | "0x120a270bbc009644e35f0bb6ab13f95b8199c4ad": 1, # Shapeshift 13 | "0xcafb10ee663f465f9d10588ac44ed20ed608c11e": 1, # Bitfinix 14 | "0x40b9b889a21ff1534d018d71dc406122ebcf3f5a": 1, # Gatecoin 15 | "0x42da8a05cb7ed9a43572b5ba1b8f82a0a6e263dc": 1, # Yunbi 1 16 | "0xd94c9ff168dc6aebf9b6cc86deff54f3fb0afc33": 1, # Yunbi 2 17 | "0xbb9bc244d798123fde783fcc1c72d3bb8c189413": 2, # DAO 18 | "0x807640a13483f8ac783c557fcdf27be11ea4ac7a": 2, # DAOextrabalance 19 | "0xf0160428a8552ac9bb7e050d90eeade4ddd52843": 2, # Digix 20 | "0x2a65aca4d5fc5b5c859090a6c34d164135398226": 3, # Dwarfpool 21 | "0x151255dd9e38e44db38ea06ec66d0d113d6cbe37": 3, # Dwarfpool2 22 | "0x63a9975ba31b0b9626b34300f7f627147df1f526": 3, # eth.supernova.cc 23 | "0xf8b483dba2c3b7176a3da549ad41a48bb3121069": 3, # coinotron 24 | "0xea674fdde714fd979de3edf0f56aa9716b898ec8": 3, # ethermine 25 | "0x4bb96091ee9d802ed039c4d1a5f6216f90f81b01": 3, # ethpool 26 | "0x1dcb8d1f0fcc8cbc8c2d76528e877f915e299fbe": 3, # supernova 27 | "0xa027231f42c80ca4125b5cb962a21cd4f812e88f": 3, # eth.ppa.ua 28 | "0x0c729be7c39543c3d549282a40395299d987cec2": 3, # ? 
29 | "0x52bc44d5378309ee2abf1539bf71de1b7d7be3b5": 3, # Nanopool 30 | "0x68795c4aa09d6f4ed3e5deddf8c2ad3049a601da": 3, # coinmine.pl 31 | "0x61c808d82a3ac53231750dadc13c777b59310bd9": 3, # f2pool 32 | "0xe6a7a1d47ff21b6321162aea7c6cb457d5476bca": 3, # ethpool 33 | "0x9d551f41fed6fc27b719777c224dfecce170004d": 3, # ethereumpool 34 | "0xd1e56c2e765180aa0371928fd4d1e41fbcda34d4": 3, # weipool 35 | "0xf3b9d2c81f2b24b0fa0acaaa865b7d9ced5fc2fb": 3, # bitclubpool 36 | "0xb2930b35844a230f00e51431acae96fe543a0347": 3 # mininggpoolhub 37 | }) 38 | -------------------------------------------------------------------------------- /Forecasting/R/arima.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | library("forecast", lib.loc="/home/alex/miniconda2/lib/R/library") 3 | 4 | # Command line args 5 | options(echo=TRUE) 6 | args <- commandArgs(trailingOnly=TRUE) 7 | p = strtoi(args[1], base=0L) 8 | d = strtoi(args[2], base=0L) 9 | q = strtoi(args[3], base=0L) 10 | 11 | 12 | endog = read.csv("R/endog.csv") 13 | exog = read.csv("R/exog.csv") 14 | 15 | fit <- Arima(endog[,1], order=c(p,d,q)) 16 | pred = predict(fit, 1) 17 | 18 | write.csv(pred, file="R/tmp.csv") 19 | -------------------------------------------------------------------------------- /Forecasting/R/vol.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | library("timeSeries", lib.loc="/home/alex/miniconda2/lib/R/library") 3 | library("fBasics", lib.loc="/home/alex/miniconda2/lib/R/library") 4 | library("fGarch", lib.loc="/home/alex/miniconda2/lib/R/library") 5 | 6 | # Command line args 7 | endog = read.csv("R/endog.csv") 8 | exog = read.csv("R/exog.csv") 9 | 10 | vol <- garchFit(data=endog[,1]) 11 | pred_volatility = predict(vol, n.ahead=1) 12 | 13 | write.csv(pred_volatility[0], file="R/tmpvol.csv") 14 | 15 | -------------------------------------------------------------------------------- /Forecasting/model.py: -------------------------------------------------------------------------------- 1 | """Forecast given timeseries data.""" 2 | from pipeline import * 3 | from sklearn.cross_validation import train_test_split 4 | from statsmodels.tsa import arima_model 5 | 6 | 7 | class Forecast(object): 8 | """ 9 | A forecasting model. Given timeseries data, make predictions. 10 | 11 | Input: 12 | ------ 13 | filename: path for the CSV file containing data. 14 | 15 | Methods: 16 | -------- 17 | split(test_size): split data for cross validation with a given test_size. 18 | predict(n): predict whether to buy, sell, or hold at block n. 19 | """ 20 | 21 | def __init__(self, filename, USD=100, ETH=1000): 22 | """Initialize the model with a filename and asset quantities.""" 23 | self.USD = USD 24 | self.ETH = ETH 25 | self.p = 0 26 | self.d = 0 27 | self.q = 0 28 | self.model = None 29 | self._getData(parse_df(filename)) 30 | 31 | def _getData(self, filename): 32 | """Run through the pipeline and load the data.""" 33 | endog, exog, self.end_blocks = pipeline(filename) 34 | self.endog = endog 35 | self.exog = exog 36 | 37 | def _pointPredict(self): 38 | """Predict the next value of the ARIMA model.""" 39 | _last = self.model.nobs 40 | _next = _last + 1 41 | _lastExog = self.exog[-1:] 42 | pred = self.model.predict(_last, _next, exog=_lastExog) 43 | return pred 44 | 45 | def fitARIMAsm(self, p, d, q, _endog, _exog): 46 | """Fit an ARIMA model give a set of parameters. 
Returns model."""
47 |         endog = np.array(_endog)
48 |         exog = np.array(_exog)
49 |         model = arima_model.ARIMA(
50 |             endog,
51 |             order=(p, d, q),
52 |             exog=exog).fit(
53 |                 transparams=False
54 |             )
55 |         return model
56 |
57 |     def optimizeARIMAsm(self, Ap, Ad, Aq, endog, exog):
58 |         """
59 |         Find an optimal ARIMA model given lists of p, d, and q.
60 |
61 |         Split the data to test/train sets and then find the best model.
62 |
63 |         Optimization criterion is AIC.
64 |         """
65 |         best_model = None
66 |         best_aic = None
67 |         for p in Ap:
68 |             for d in Ad:
69 |                 for q in Aq:
70 |                     # Replace the model if AIC is lower
71 |                     try:
72 |                         _model = self.fitARIMAsm(p, d, q, endog, exog)
73 |                         if not best_aic:
74 |                             print("Updating model ({}, {}, {})".format(p, d, q))
75 |                             best_model = _model
76 |                             best_aic = _model.aic
77 |                             self.p = p
78 |                             self.d = d
79 |                             self.q = q
80 |                         elif _model.aic < best_aic:
81 |                             print("Updating model ({}, {}, {})".format(p, d, q))
82 |                             best_model = _model
83 |                             best_aic = _model.aic
84 |                             self.p = p
85 |                             self.d = d
86 |                             self.q = q
87 |                     except:
88 |                         pass
89 |
90 |         # Reset the global model
91 |         self.model = best_model
92 |
93 |     def predictARIMAsm(self, start, end, exog=None, dynamic=False):
94 |         """
95 |         Make a series of n predictions given an ARIMA model.
96 |
97 |         By default, the predictions will be made on top of self.endog_train
98 |
99 |         Note that extra lagged exogenous time slices may need to be passed
100 |         depending on the p level. (Pass end-start + p exogenous slices)
101 |         """
102 |         if exog is None:
103 |             exog = self.exog[start:end]
104 |
105 |         prediction = self.model.predict(
106 |             start, end, exog=self.exog[start:end], dynamic=dynamic
107 |         )
108 |         return prediction
109 |
110 |     # Predictions in R
111 |     #####################
112 |
113 |     def predictARIMA_R(self, p, d, q, endog=None, exog=None):
114 |         """
115 |         Pointwise prediction using the forecast package in R.
116 |         """
117 |         # Define endog and exog vars
118 |         if endog is None:
119 |             endog = self.endog
120 |         if exog is None:
121 |             exog = self.exog
122 |
123 |         # Use at most 300 points to predict the future
124 |         if len(endog) > 300:
125 |             endog = endog[-300:]
126 |         if len(exog) > 300:
127 |             exog = exog[-300:]
128 |
129 |         # Pipe data through R and use its Arima model
130 |         R_push_csv(endog, exog)
131 |         R_predict(p, d, q)
132 |         pred = R_pull_csv()
133 |         R_cleanup()
134 |
135 |         return pred
136 |
--------------------------------------------------------------------------------
/Forecasting/pipeline.py:
--------------------------------------------------------------------------------
1 | """A pipeline taking in data from a CSV and formatting it for forecasting."""
2 | import copy
3 | import numpy as np
4 | import pandas as pd
5 | from r_io_util import *
6 |
7 |
8 | def pipeline(df):
9 |     """
10 |     Process a dataframe for forecasting.
11 |
12 |     Input:
13 |     ------
14 |     df: Pandas dataframe containing both exogenous and endogenous variables.
15 |
16 |     Output:
17 |     -------
18 |     endog (numpy array), exog (numpy array), ts (numpy array): arrays
19 |     representing timeseries data of:
20 |     price, exogenous features, and block_end, respectively.
21 |
22 |     Note that block_end replaces the traditional timestamp.
23 | """ 24 | diff_cols = [ 25 | "contract_txn_count", 26 | "contract_txn_sum", 27 | "crowdsale_txn_count", 28 | "crowdsale_txn_sum", 29 | "exchange_in_count", 30 | "exchange_in_sum", 31 | "exchange_out_count", 32 | "exchange_out_sum", 33 | "num_addr", 34 | "p2p_txn_count", 35 | "p2p_txn_sum", 36 | # "peer_txns_w_data", 37 | "transaction_count", 38 | "transaction_sum", 39 | "priceUSD" 40 | ] 41 | # Get the time domain (i.e. block_end) 42 | block_end = np.array(df["block_end"]) 43 | 44 | # Do single lag differencing 45 | lag = 1 46 | df = difference(df, diff_cols, lag=lag) 47 | 48 | # Split into endog and exog 49 | endog, exog = endog_exog(df, diff_cols, lag=lag) 50 | 51 | return np.array(endog), np.array(exog), block_end 52 | 53 | 54 | def endog_exog(df, cols, lag=1): 55 | """ 56 | Convert dataframe into endog and exog numpy arrays. 57 | 58 | Since everything is differenced, remove the first item in each array. 59 | """ 60 | diff_cols = ["d_{}_{}".format(lag, col) for col in cols] 61 | exog = df[diff_cols][1:] 62 | endog = df["d_{}_priceUSD".format(lag)][1:] 63 | 64 | return endog, exog 65 | 66 | 67 | def difference(df, cols, lag=1): 68 | """ 69 | Perform differencing on some columns in a dataframe. 70 | 71 | Input: 72 | ------ 73 | df: pandas dataframe containing the timeseries data. 74 | cols: list of strings indicating which columns to difference 75 | """ 76 | df2 = copy.deepcopy(df) 77 | 78 | # Difference based on the lag provided. 79 | for i in range(1, len(df2["block_end"])): 80 | for L in cols: 81 | curr = df2.loc[i, L] 82 | prev = df2.loc[i-lag, L] 83 | df2.loc[i, "d_{}_{}".format(lag, L)] = curr - prev 84 | 85 | return df2 86 | 87 | 88 | def parse_df(filename): 89 | """Given a filename, load the data into a dataframe.""" 90 | df = pd.read_csv(filename) 91 | return df 92 | -------------------------------------------------------------------------------- /Forecasting/r_io_util.py: -------------------------------------------------------------------------------- 1 | """A util file for I/O related to R.""" 2 | import subprocess 3 | import pandas as pd 4 | import numpy as np 5 | import os 6 | 7 | def R_push_csv(endog, exog): 8 | """Save current endog and exog dfs to CSV files in the R directory.""" 9 | np.savetxt("R/endog.csv", endog, delimiter=",") 10 | np.savetxt("R/exog.csv", exog, delimiter=",") 11 | return 12 | 13 | 14 | def R_pull_csv(): 15 | """Read from the CSV produced by R and return the prediction.""" 16 | return pd.read_csv("R/tmp.csv")['pred'][0] 17 | 18 | 19 | def R_predict(p, d, q): 20 | """Run Rscript to produce a pointwise prediction given CSV files.""" 21 | subprocess.call(["Rscript", "R/arima.R", str(p), str(d), str(q)]) 22 | 23 | 24 | def R_cleanup(): 25 | """Delete all CSV files in the R directory.""" 26 | dir = os.path.dirname(os.path.realpath(__file__)) 27 | for file in os.listdir("R"): 28 | if file.endswith(".csv"): 29 | try: 30 | os.remove(os.path.join(dir, file)) 31 | except: 32 | pass 33 | -------------------------------------------------------------------------------- /Forecasting/sim.py: -------------------------------------------------------------------------------- 1 | """Simulate a trading bot by predicting a series of values using train/test sets.""" 2 | from model import Forecast 3 | import numpy as np 4 | import copy 5 | from multiprocessing import Pool 6 | 7 | def simulate(p=1, d=0, q=0): 8 | """ 9 | This bot will perform the following steps. 10 | 11 | 1. Load data, pipeline, and split it into training and test sets. 12 | 2. 
Train an optimized ARIMA model on the training data. 13 | 3. Make a series of point forecasts and store the predictions in a list. 14 | Prediction requires exogenous variables, so append the next data point 15 | to both the endogenous and exogenous variables in the Forecast object 16 | before making the next prediction. 17 | """ 18 | #print("Loading data...") 19 | f = Forecast('blockchain.csv') 20 | 21 | # Define an index on which to split (like 80% of the way) 22 | ixSplit = int(0.8 * f.endog.shape[0]) 23 | 24 | # Define training and test sets 25 | train_endog = f.endog[:ixSplit] 26 | train_exog = f.exog[:ixSplit] 27 | test_endog = f.endog[ixSplit:] 28 | test_exog = f.exog[ixSplit:] 29 | 30 | # Update the instance 31 | f.endog = train_endog 32 | f.exog = train_exog 33 | 34 | # Copy test exogenous variables to compare with the predictions 35 | endog_expected = copy.deepcopy(test_endog) 36 | 37 | # Make a series of predictions 38 | #print("Making predictions...") 39 | preds = list() 40 | for i in range(len(test_exog)): 41 | # Make the prediction 42 | pred = f.predictARIMA_R(p, d, q, endog=f.endog, exog=f.exog) 43 | preds.append(pred) 44 | # Append the model's data with the first data in the test arrays 45 | # Note that np.delete is analagous to pop, but -1 indicates the first 46 | # item in the array. 47 | f.exog = np.append(f.exog, [test_exog[0]], axis=0) 48 | test_exog = np.delete(test_exog, 0, axis=0) 49 | f.endog = np.append(f.endog, [test_endog[0]], axis=0) 50 | test_endog = np.delete(test_endog, 0) 51 | 52 | return preds, endog_expected 53 | 54 | 55 | def decisionRule(): 56 | """Decide whether to buy, sell, or hold.""" 57 | pass 58 | 59 | 60 | def score_simulation(preds, endog_expected): 61 | """Score a simulation based on mean squared error.""" 62 | MSE = 0 63 | for i in range(len(preds)): 64 | MSE += (preds[i] - endog_expected[i])**2 65 | return MSE 66 | 67 | 68 | def test_f(gen): 69 | p = gen[0][0] 70 | d = gen[0][1] 71 | q = gen[0][2] 72 | try: 73 | preds, exog_expected = simulate(p, d, q) 74 | score = score_simulation(preds, exog_expected) 75 | except: 76 | score = 0 77 | return (score, p, d, q) 78 | 79 | 80 | if __name__ == "__main__": 81 | POOL = Pool(maxtasksperchild=500) 82 | p_range = range(5) 83 | d_range = range(5) 84 | q_range = [0] * 5 85 | gen = list() 86 | for _p in p_range: 87 | for _d in d_range: 88 | gen.append((_p, _d, 0)) 89 | _gen = zip(gen) 90 | x = POOL.map(test_f, _gen) 91 | print("Done") 92 | print(x) 93 | # [(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (29.292981789631671, 1, 0, 0), (0, 1, 1, 0), (0, 1, 2, 0), (0, 1, 3, 0), (0, 1, 4, 0), (0, 2, 0, 0), (0, 2, 1, 0), (0, 2, 2, 0), (0, 2, 3, 0), (0, 2, 4, 0), (0, 3, 0, 0), (0, 3, 1, 0), (54.253053572867898, 3, 2, 0), (0, 3, 3, 0), (0, 3, 4, 0), (0, 4, 0, 0), (0, 4, 1, 0), (0, 4, 2, 0), (0, 4, 3, 0), (250.45917084881501, 4, 4, 0)] 94 | -------------------------------------------------------------------------------- /Preprocessing/Crawler/Crawler.py: -------------------------------------------------------------------------------- 1 | """A client to interact with node and to save data to mongo.""" 2 | 3 | from pymongo import MongoClient 4 | import crawler_util 5 | import requests 6 | import json 7 | import sys 8 | import os 9 | import logging 10 | import time 11 | import tqdm 12 | sys.path.append(os.path.realpath(os.path.dirname(__file__))) 13 | 14 | DIR = os.environ['BLOCKCHAIN_MONGO_DATA_DIR'] 15 | LOGFIL = "crawler.log" 16 | if "BLOCKCHAIN_ANALYSIS_LOGS" in os.environ: 17 | LOGFIL = 
"{}/{}".format(os.environ['BLOCKCHAIN_ANALYSIS_LOGS'], LOGFIL) 18 | crawler_util.refresh_logger(LOGFIL) 19 | logging.basicConfig(filename=LOGFIL, level=logging.DEBUG) 20 | logging.getLogger("urllib3").setLevel(logging.WARNING) 21 | 22 | 23 | class Crawler(object): 24 | """ 25 | A client to migrate blockchain from geth to mongo. 26 | 27 | Description: 28 | ------------ 29 | Before starting, make sure geth is running in RPC (port 8545 by default). 30 | Initializing a Crawler object will automatically scan the blockchain from 31 | the last block saved in mongo to the most recent block in geth. 32 | 33 | Parameters: 34 | ----------- 35 | rpc_port: default 8545 # The port on which geth RPC can be called 36 | host: default "http://localhost" # The geth host 37 | start: default True # Create the graph upon instantiation 38 | 39 | Usage: 40 | ------ 41 | Default behavior: 42 | crawler = Crawler() 43 | 44 | Interactive mode: 45 | crawler = Crawler(start=False) 46 | 47 | Get the data from a particular block: 48 | block = crawler.getBlock(block_number) 49 | 50 | Save the block to mongo. This will fail if the block already exists: 51 | crawler.saveBlock(block) 52 | 53 | """ 54 | 55 | def __init__( 56 | self, 57 | start=True, 58 | rpc_port=8545, 59 | host="http://localhost", 60 | delay=0.0001 61 | ): 62 | """Initialize the Crawler.""" 63 | logging.debug("Starting Crawler") 64 | self.url = "{}:{}".format(host, rpc_port) 65 | self.headers = {"content-type": "application/json"} 66 | 67 | # Initializes to default host/port = localhost/27017 68 | self.mongo_client = crawler_util.initMongo(MongoClient()) 69 | # The max block number that is in mongo 70 | self.max_block_mongo = None 71 | # The max block number in the public blockchain 72 | self.max_block_geth = None 73 | # Record errors for inserting block data into mongo 74 | self.insertion_errors = list() 75 | # Make a stack of block numbers that are in mongo 76 | self.block_queue = crawler_util.makeBlockQueue(self.mongo_client) 77 | # The delay between requests to geth 78 | self.delay = delay 79 | 80 | if start: 81 | self.max_block_mongo = self.highestBlockMongo() 82 | self.max_block_geth = self.highestBlockEth() 83 | self.run() 84 | 85 | def _rpcRequest(self, method, params, key): 86 | """Make an RPC request to geth on port 8545.""" 87 | payload = { 88 | "method": method, 89 | "params": params, 90 | "jsonrpc": "2.0", 91 | "id": 0 92 | } 93 | time.sleep(self.delay) 94 | res = requests.post( 95 | self.url, 96 | data=json.dumps(payload), 97 | headers=self.headers).json() 98 | return res[key] 99 | 100 | def getBlock(self, n): 101 | """Get a specific block from the blockchain and filter the data.""" 102 | data = self._rpcRequest("eth_getBlockByNumber", [hex(n), True], "result") 103 | block = crawler_util.decodeBlock(data) 104 | return block 105 | 106 | def highestBlockEth(self): 107 | """Find the highest numbered block in geth.""" 108 | num_hex = self._rpcRequest("eth_blockNumber", [], "result") 109 | return int(num_hex, 16) 110 | 111 | def saveBlock(self, block): 112 | """Insert a given parsed block into mongo.""" 113 | e = crawler_util.insertMongo(self.mongo_client, block) 114 | if e: 115 | self.insertion_errors.append(e) 116 | 117 | def highestBlockMongo(self): 118 | """Find the highest numbered block in the mongo database.""" 119 | highest_block = crawler_util.highestBlock(self.mongo_client) 120 | logging.info("Highest block found in mongodb:{}".format(highest_block)) 121 | return highest_block 122 | 123 | def add_block(self, n): 124 | """Add a block to 
mongo."""
125 |         b = self.getBlock(n)
126 |         if b:
127 |             self.saveBlock(b)
128 |             time.sleep(0.001)
129 |         else:
130 |             self.saveBlock({"number": n, "transactions": []})
131 |
132 |     def run(self):
133 |         """
134 |         Run the process.
135 |
136 |         Iterate through the blockchain on geth and fill up mongodb
137 |         with block data.
138 |         """
139 |         logging.debug("Processing geth blockchain:")
140 |         logging.info("Highest block found as: {}".format(self.max_block_geth))
141 |         logging.info("Number of blocks to process: {}".format(
142 |             len(self.block_queue)))
143 |
144 |         # Make sure the database isn't missing any blocks up to this point
145 |         logging.debug("Verifying that mongo isn't missing any blocks...")
146 |         self.max_block_mongo = 1
147 |         if len(self.block_queue) > 0:
148 |             print("Looking for missing blocks...")
149 |             self.max_block_mongo = self.block_queue.pop()
150 |             for n in tqdm.tqdm(range(1, self.max_block_mongo)):
151 |                 if len(self.block_queue) == 0:
152 |                     # If we have reached the max index of the queue,
153 |                     # break the loop
154 |                     break
155 |                 else:
156 |                     # -If a block with number = current index is not in
157 |                     #  the queue, add it to mongo.
158 |                     # -If the lowest block number in the queue (_n) is
159 |                     #  not the current running index (n), then _n > n
160 |                     #  and we must add block n to mongo. After doing so,
161 |                     #  we will add _n back to the queue.
162 |                     _n = self.block_queue.popleft()
163 |                     if n != _n:
164 |                         self.add_block(n)
165 |                         self.block_queue.appendleft(_n)
166 |                         logging.info("Added block {}".format(n))
167 |
168 |         # Get all new blocks
169 |         print("Processing remainder of the blockchain...")
170 |         for n in tqdm.tqdm(range(self.max_block_mongo, self.max_block_geth)):
171 |             self.add_block(n)
172 |
173 |         print("Done!\n")
174 |
--------------------------------------------------------------------------------
/Preprocessing/Crawler/__init__.py:
--------------------------------------------------------------------------------
1 | from Crawler import Crawler
2 | from crawler_util import *
3 |
--------------------------------------------------------------------------------
/Preprocessing/Crawler/crawler_util.py:
--------------------------------------------------------------------------------
1 | """Util functions for interacting with geth and mongo."""
2 | import pymongo
3 | from collections import deque
4 | import os
5 | import pdb
6 |
7 | DB_NAME = "blockchain"
8 | COLLECTION = "transactions"
9 |
10 | # mongodb
11 | # -------
12 | def initMongo(client):
13 |     """
14 |     Given a mongo client instance, create db/collection if either doesn't exist.
15 |
16 |     Parameters:
17 |     -----------
18 |     client
19 |
20 |     Returns:
21 |     --------
22 |
23 |     """
24 |     db = client[DB_NAME]
25 |     try:
26 |         db.create_collection(COLLECTION)
27 |     except:
28 |         pass
29 |     try:
30 |         # Index the block number so duplicate records cannot be made
31 |         db[COLLECTION].create_index(
32 |             [("number", pymongo.DESCENDING)],
33 |             unique=True
34 |         )
35 |     except:
36 |         pass
37 |
38 |     return db[COLLECTION]
39 |
40 |
41 | def insertMongo(client, d):
42 |     """
43 |     Insert a document into mongo client with collection selected.
44 |
45 |     Params:
46 |     -------
47 |     client
48 |     d
49 |
50 |     Returns:
51 |     --------
52 |     error
53 |     """
54 |     try:
55 |         client.insert_one(d)
56 |         return None
57 |     except Exception as err:
58 |         return err
59 |
60 |
61 | def highestBlock(client):
62 |     """
63 |     Get the highest numbered block in the collection.
64 | 65 | Params: 66 | ------- 67 | client 68 | 69 | Returns: 70 | -------- 71 | 72 | """ 73 | n = client.find_one(sort=[("number", pymongo.DESCENDING)]) 74 | if not n: 75 | # If the database is empty, the highest block # is 0 76 | return 0 77 | assert "number" in n, "Highest block is incorrectly formatted" 78 | return n["number"] 79 | 80 | 81 | def makeBlockQueue(client): 82 | """ 83 | Form a queue of blocks that are recorded in mongo. 84 | 85 | Params: 86 | ------- 87 | client 88 | 89 | Returns: 90 | -------- 91 | 92 | """ 93 | queue = deque() 94 | all_n = client.find({}, {"number":1, "_id":0}, 95 | sort=[("number", pymongo.ASCENDING)]) 96 | for i in all_n: 97 | queue.append(i["number"]) 98 | return queue 99 | 100 | # Geth 101 | # ---- 102 | def decodeBlock(block): 103 | """ 104 | Decode various pieces of information (from hex) for a block and return the parsed data. 105 | 106 | Note that the block is of the form: 107 | { 108 | "id": 0, 109 | "jsonrpc": "2.0", 110 | "result": { 111 | "number": "0xf4241", 112 | "hash": "0xcb5cab7266694daa0d28cbf40496c08dd30bf732c41e0455e7ad389c10d79f4f", 113 | "parentHash": "0x8e38b4dbf6b11fcc3b9dee84fb7986e29ca0a02cecd8977c161ff7333329681e", 114 | "nonce": "0x9112b8c2b377fbe8", 115 | "sha3Uncles": "0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347", 116 | "logsBloom": "0x0", 117 | "transactionsRoot": "0xc61c50a0a2800ddc5e9984af4e6668de96aee1584179b3141f458ffa7d4ecec6", 118 | "stateRoot": "0x7dd4aabb93795feba9866821c0c7d6a992eda7fbdd412ea0f715059f9654ef23", 119 | "receiptRoot": "0xb873ddefdb56d448343d13b188241a4919b2de10cccea2ea573acf8dbc839bef", 120 | "miner": "0x2a65aca4d5fc5b5c859090a6c34d164135398226", 121 | "difficulty": "0xb6b4bbd735f", 122 | "totalDifficulty": "0x63056041aaea71c9", 123 | "size": "0x292", 124 | "extraData": "0xd783010303844765746887676f312e352e31856c696e7578", 125 | "gasLimit": "0x2fefd8", 126 | "gasUsed": "0x5208", 127 | "timestamp": "0x56bfb41a", 128 | "transactions": [ 129 | { 130 | "hash": "0xefb6c796269c0d1f15fdedb5496fa196eb7fb55b601c0fa527609405519fd581", 131 | "nonce": "0x2a121", 132 | "blockHash": "0xcb5cab7266694daa0d28cbf40496c08dd30bf732c41e0455e7ad389c10d79f4f", 133 | "blockNumber": "0xf4241", 134 | "transactionIndex": "0x0", 135 | "from": "0x2a65aca4d5fc5b5c859090a6c34d164135398226", 136 | "to": "0x819f4b08e6d3baa33ba63f660baed65d2a6eb64c", 137 | "value": "0xe8e43bc79c88000", 138 | "gas": "0x15f90", 139 | "gasPrice": "0xba43b7400", 140 | "input": "0x" 141 | } 142 | ], 143 | "uncles": [] 144 | } 145 | } 146 | """ 147 | try: 148 | b = block 149 | if "result" in block: 150 | b = block["result"] 151 | # Filter the block 152 | new_block = { 153 | "number": int(b["number"], 16), 154 | "timestamp": int(b["timestamp"], 16), # Timestamp is in unix time 155 | "transactions": [] 156 | } 157 | # Filter and decode each transaction and add it back 158 | # Value, gas, and gasPrice are all converted to ether 159 | for t in b["transactions"]: 160 | new_t = { 161 | "from": t["from"], 162 | "to": t["to"], 163 | "value": float(int(t["value"], 16))/1000000000000000000., 164 | "data": t["input"] 165 | } 166 | new_block["transactions"].append(new_t) 167 | return new_block 168 | 169 | except: 170 | return None 171 | 172 | 173 | def refresh_logger(filename): 174 | """Remove old logs and create new ones.""" 175 | if os.path.isfile(filename): 176 | try: 177 | os.remove(filename) 178 | except Exception: 179 | pass 180 | open(filename, 'a').close() 181 | 
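# Illustrative usage (not part of the original module): decode an RPC-style
# payload shaped like the example documented in decodeBlock's docstring above.
# Hex fields are converted to integers and the value from Wei to ether.
if __name__ == "__main__":
    sample = {
        "result": {
            "number": "0xf4241",
            "timestamp": "0x56bfb41a",
            "transactions": [{
                "from": "0x2a65aca4d5fc5b5c859090a6c34d164135398226",
                "to": "0x819f4b08e6d3baa33ba63f660baed65d2a6eb64c",
                "value": "0xe8e43bc79c88000",
                "input": "0x"
            }]
        }
    }
    # Prints a filtered block dict with number 1000001 (0xf4241), a unix
    # timestamp, and one decoded transaction.
    print(decodeBlock(sample))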
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ethereum Blockchain Parser 2 | 3 | This is a project to parse the Ethereum blockchain from a local geth node. Blockchains are perfect data sets because they contain every transaction ever made on the network. This is valuable data if you want to analyze the network, but Ethereum stores its blockchain in [RLP](https://github.com/ethereum/wiki/wiki/RLP) encoded binary blobs within a series of LevelDB files and these are surprisingly difficult to access, even given the available tools. This project takes the approach of querying a local node via [JSON-RPC](https://github.com/ethereum/wiki/wiki/JSON-RPC), which returns unencoded transactional data, and then moves that data to a mongo database. 4 | 5 | ![Blocks 1 to 120000](.content/1_120000.jpg) 6 | 7 | 8 | ## Usage 9 | 10 | ### Streaming data 11 | 12 | To stream blockchain data for real-time analysis, make sure you have both geth and mongo running and start the process with: 13 | 14 | python3 stream.py 15 | 16 | Note that this will automatically backfill your mongo database with blocks that it is missing. 17 | 18 | ### Backfilling your Mongo database 19 | 20 | To get data from the blockchain as it exists now and then stop parsing, simply run the following scripts, which are located in the `Scripts` directory. Note that at the time of writing, the Ethereum blockchain has about 1.5 million blocks so this will likely take several hours. 21 | 22 | 1. Funnel the data from geth to MongoDB: 23 | 24 | 25 | python3 preprocess.py 26 | 27 | 2. Create a series of snapshots of the blockchain through time and for each snapshot, calculate key metrics. Dump the data into a CSV file: 28 | 29 | 30 | python3 extract.py 31 | 32 | 33 | 34 | ## Prerequisites: 35 | 36 | Before using this tool to analyze your copy of the blockchain, you need the following things: 37 | 38 | ### Geth 39 | [Geth](https://github.com/ethereum/go-ethereum/wiki/Geth) is the Go implementation of a full Ethereum node. We will need to run it with the `--rpc` flag in order to request data (**WARNING** if you run this on a geth client containing an account that has ether in it, make sure you put a firewall 8545 or whatever port you run geth RPC on). 40 | 41 | A geth instance downloads the blockchain and processes it, saving the blocks as LevelDB files in the specified data directory (`~/.ethereum/chaindata` by default). The geth instance can be queried via RPC with the `eth_getBlockByNumber([block, true])` endpoint (see [here](https://github.com/ethereum/wiki/wiki/JSON-RPC#eth_getblockbynumber)) to get the `X-th` block (with `true` indicating we want the transactional data included), which returns data of the form: 42 | 43 | { 44 | number: 1000000, 45 | timestamp: 1465003569, 46 | ... 
47 |         transactions: [
48 |           {
49 |             blockHash: "0x2052ce710a08094b81b5047ea9df5119773ce4b263a23d86659fa7293251055e",
50 |             blockNumber: 1284937,
51 |             from: "0x1f57f826caf594f7a837d9fc092456870a289365",
52 |             gas: 22050,
53 |             gasPrice: 20000000000,
54 |             hash: "0x654ac26084ee6e40767e8735f38274ef5f594454a4d34cfdd70c93aa95be0c64",
55 |             input: "0x",
56 |             nonce: 6610,
57 |             to: "0xfbb1b73c4f0bda4f67dca266ce6ef42f520fbb98",
58 |             transactionIndex: 27,
59 |             value: 201544820000000000
60 |           }
61 |         ]
62 |     }
63 |
64 | Since I am only interested in `number`, `timestamp`, and `transactions` for this application, I have omitted the rest of the data, but there is lots of additional information in the block (explore [here](https://etherchain.org/blocks)), including a few Merkle trees to maintain hashes of state, transactions, and receipts (read [here](https://blog.ethereum.org/2015/11/15/merkling-in-ethereum/)).
65 |
66 | Using the `from` and `to` addresses in the `transactions` array, I can map the flow of ether through the network as time progresses. Note that the value, gas, and gasPrice are in Wei, where 1 Ether = 10^18 Wei. The numbers are converted into Ether automatically with this tool.
67 |
68 | ### MongoDB
69 |
70 | We will use mongo to essentially copy each block served by Geth, preserving its structure. The data outside the scope of this analysis will be omitted. Note that this project also requires pymongo.
71 |
72 | ### graph-tool
73 |
74 | [graph-tool](https://graph-tool.skewed.de/) is a python library written in C++ to construct graphs quickly, and it has a flexible feature set for mapping properties to its edges and vertices. Depending on your system, this may be tricky to install, so be sure to follow their instructions carefully. I recommend you find some way to install it with a package manager because building from source is a pain.
75 |
76 | ### python3
77 |
78 | This was written for python 3.4 with the packages: contractmap, tqdm and requests. Some things will probably break if you try to do this analysis in python 2.
79 |
80 |
81 | ## Workflow
82 |
83 | The following outlines the procedure used to turn the data from bytes on the blockchain into data in a CSV file.
84 |
85 | ### 1. Process the blockchain
86 |
87 | Preprocessing is done with the `Crawler` class, which can be found in the `Preprocessing/Crawler` directory. Before instantiating a `Crawler` object, you need to have geth and mongo processes running. Starting a `Crawler()` instance will go through the processes of requesting and processing the blockchain from geth and copying it over to a Mongo collection named `transactions`. Once copied over, you can close the `Crawler()` instance.
88 |
89 | ### 2. Take a snapshot of the blockchain
90 |
91 | A snapshot of the network (i.e. all of the transactions occurring between two timestamps, or numbered blocks in the blockchain) can be taken with a `TxnGraph()` instance. This class can be found in the `Analysis` directory. Create an instance with:
92 |
93 |     snapshot = TxnGraph(a, b)
94 |
95 | where a is the starting block (int) and b is the ending block (int). This will build a directed graph of all ethereum addresses that made transactions between the two specified blocks. It will also weight vertices by the total amount of Ether they hold at the time that the ending block was mined and edges by the amount of ether sent in the transaction.
96 |
97 | To move on to the next snapshot (i.e. forward in time):
98 |
99 |     snapshot.extend(c)
100 |
101 | where `c` is the number of blocks to proceed.
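For example, to build an initial snapshot and then walk forward through the chain in fixed increments (an illustrative sketch; the block numbers and step size here are arbitrary):

    # Build a snapshot of blocks 1-100000, then extend it
    # forward in 100000-block steps.
    snapshot = TxnGraph(1, 100000)
    for _ in range(5):
        snapshot.extend(100000)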
102 | 
103 | At each snapshot, the instance will automatically pickle its state and save it to a local file (disable this on instantiation with `save=False`).
104 | 
105 | #### Drawing an image:
106 | 
107 | Once a `TxnGraph` is created, it builds a graph out of all of the data in the blocks between `a` and `b`. An image can be drawn by calling `TxnGraph.draw()`, and specific dimensions can be passed using `TxnGraph.draw(w=A, h=B)`, where `A` and `B` are ints corresponding to numbers of pixels. By default, the image is saved to the `Analysis/data/snapshots` directory.
108 | 
109 | #### Saving/Loading State (using pickle)
110 | 
111 | The `TxnGraph` instance state can be (and automatically is) pickled with `TxnGraph.save()`, where the filename is parameterized by the start/end blocks. By default, this saves to the `Analysis/data/pickles` directory. If another instance was pickled with a different set of start/end blocks, it can be reloaded with `TxnGraph.load(a, b)`.
112 | 
113 | ### 3. (Optional) Add a lookup table for smart contract transactions
114 | 
115 | An important consideration when analyzing the Ethereum network is the presence of smart contract addresses. Much ether flows to and from contracts, which you may want to distinguish from simple peer-to-peer transactions. This can be done by loading a `ContractMap` instance. It is recommended that you pass the most recent block in the blockchain as `last_block`, as this will find all contracts that were transacted with up to that point in history:
116 | 
117 |     # If a mongo_client is passed, the ContractMap will scan geth via RPC
118 |     # for new contract addresses starting at "last_block".
119 |     cmap = ContractMap(mongo_client, last_block=90000, filepath="./contracts.p")
120 |     cmap.save()
121 | 
122 |     # If None is passed for a mongo_client, the ContractMap will automatically
123 |     # load the map of addresses from the pickle file specified in "filepath",
124 |     # ./contracts.p by default.
125 |     cmap = ContractMap()
126 | 
127 | This will create a hash table of all contract addresses using a `defaultdict` and will save it to a pickle file.
128 | 
129 | ### 4. Aggregate data and analyze
130 | 
131 | Once a snapshot has been created, initialize an instance of `ParsedBlocks` with a `TxnGraph` instance. This will automatically aggregate the data and save it to a local CSV file, which can then be analyzed.
132 | 
--------------------------------------------------------------------------------
/Scripts/draw_graphs.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("./../Analysis")
3 | from TxnGraph import TxnGraph
4 | import os
5 | 
6 | 
7 | # Build and draw a graph snapshot based on a tuple of form (start_block, end_block)
8 | def build(blocks, old_graph):
9 |     path_exists = os.path.exists("data/snapshots/")
10 |     assert path_exists, "No path exists to store the snapshots."
11 | 
12 |     print("Start=%s, End=%s; Building graph." % (blocks[0], blocks[1]))
13 | 
14 |     if old_graph:
15 |         previous = {"graph": old_graph, "end_block": blocks[0]}
16 |     else:
17 |         previous = None
18 |     tmp = TxnGraph(blocks[0], blocks[1], previous=previous)
19 |     tmp.draw()
20 |     return tmp.graph, blocks[1]
21 | 
22 | 
23 | if __name__ == "__main__":
24 |     # Take a bunch of snapshots based on the resolution.
25 |     # Between each snapshot, pass the previous graph object and the previous
26 |     # end_block number as the start_block in the new snapshot.
27 |     resolution = 100000
28 |     block_max = 1000000
29 |     tmp_graph = None
30 |     tmp_last_block = 0
31 |     # Snapshot windows: (0, resolution), (resolution, 2*resolution), ...
32 |     for i in range(1, block_max // resolution + 1):
33 |         tmp_graph, tmp_last_block = build(
34 |             (tmp_last_block, resolution * i), tmp_graph
35 |         )
--------------------------------------------------------------------------------
/Scripts/extract.py:
--------------------------------------------------------------------------------
1 | """
2 | Parse a bunch of snapshots of the blockchain and dump contents into a CSV file.
3 | """
4 | import sys
5 | sys.path.append("./../Analysis")
6 | import os
7 | os.environ['ETH_BLOCKCHAIN_ANALYSIS_DIR'] = './../Analysis/'
8 | from ParsedBlocks import ParsedBlocks
9 | from TxnGraph import TxnGraph
10 | import tqdm
11 | 
12 | 
13 | def syncCSV(filename):
14 |     """Resume populating the CSV file."""
15 |     block = 0
16 |     with open(filename, "r") as f:
17 |         for line in f:
18 |             data = line.split(",")
19 |             try:
20 |                 if int(data[0]) > block:
21 |                     block = int(data[0])
22 |             except ValueError:
23 |                 # Skip the header row and any malformed lines.
24 |                 pass
25 |     return block
26 | 
27 | 
28 | if __name__ == "__main__":
29 |     max_block = 1600000
30 |     resolution = 1000
31 |     CSVFILE = "blockchain.csv"
32 |     STEP = 1000
33 |     prev_max_block = 0
34 | 
35 |     if os.path.exists(CSVFILE):
36 |         prev_max_block = syncCSV(CSVFILE)
37 | 
38 |     # Always start at block 1 because the data is cumulative.
39 |     # Resume at previous block + 1000
40 |     t = TxnGraph(1, prev_max_block + STEP)
41 |     for i in tqdm.tqdm(range(max_block // resolution)):
42 |         if t.end_block > prev_max_block:
43 |             blocks = ParsedBlocks(t)
44 |             t.extend(STEP)
45 |         else:
46 |             t.end_block += STEP
--------------------------------------------------------------------------------
/Scripts/preprocess.py:
--------------------------------------------------------------------------------
1 | """Pull data from geth and parse it into mongo."""
2 | 
3 | import subprocess
4 | import sys
5 | sys.path.append("./../Preprocessing")
6 | sys.path.append("./../Analysis")
7 | import os
8 | os.environ['ETH_BLOCKCHAIN_ANALYSIS_DIR'] = './../Preprocessing/'
9 | from Crawler import Crawler
10 | from ContractMap import ContractMap
11 | 
12 | LOGDIR = "./../Preprocessing/logs"
13 | 
14 | 
15 | # With shell=True each command must be passed as a single string, so geth and
16 | # mongod are launched with separate calls.
17 | subprocess.call(
18 |     "(geth --rpc --rpcport 8545 > {}/geth.log 2>&1) &".format(LOGDIR),
19 |     shell=True)
20 | subprocess.call(
21 |     "(mongod --dbpath mongo/data --port 27017 > {}/mongo.log 2>&1) &".format(LOGDIR),
22 |     shell=True)
23 | 
24 | print("Booting processes.")
25 | # Catch up with the crawler
26 | c = Crawler()
27 | 
28 | print("Updating contract hash map.")
29 | # Update the contract addresses that have been interacted with
30 | ContractMap(c.mongo_client, last_block=c.max_block_mongo)
31 | 
32 | print("Update complete.")
33 | subprocess.call(
34 |     "(geth --rpc --rpcport 8545 > {}/geth.log 2>&1) &".format(LOGDIR),
35 |     shell=True)
36 | subprocess.call(
37 |     "(mongod --dbpath mongo/data --port 27017 > {}/mongo.log 2>&1) &".format(LOGDIR),
38 |     shell=True)
--------------------------------------------------------------------------------
/stream.py:
--------------------------------------------------------------------------------
1 | """Stream updates to the blockchain from geth to mongo."""
2 | import sys
3 | import os
4 | sys.path.append("Preprocessing/Crawler")
5 | from Crawler import Crawler
6 | sys.path.append("Analysis")
7 | from TxnGraph import TxnGraph
8 | from ParsedBlocks import ParsedBlocks
9 | sys.path.append("Scripts")
10 | from extract import syncCSV
11 | import tqdm
12 | 
13 | 
14 | def syncMongo(c):
15 |     """Sync mongo with geth blocks."""
16 |     gethBlock = c.highestBlockEth()
17 |     mongoBlock = c.highestBlockMongo()
18 |     counter = 0
19 |     if gethBlock > mongoBlock:
20 |         print("Syncing Mongo...")
21 |         for i in range(gethBlock - mongoBlock):
22 |             c.add_block(mongoBlock + i)
23 |             counter += 1
24 |             if counter >= 100:
25 |                 print("Successfully parsed {} blocks.".format(counter))
26 |                 print("Currently at block {} of {}".format(mongoBlock + i, gethBlock))
27 |                 counter = 0
28 | 
29 | if __name__ == "__main__":
30 |     # Print success every N iterations
31 |     n = 100
32 | 
33 |     # Initialize a crawler that will catch the mongodb up
34 |     c = Crawler()
35 |     syncMongo(c)
36 | 
37 |     # Initialize a TxnGraph and save it every N blocks
38 |     N = 1000
39 |     t = None
40 | 
41 |     # Global vars
42 |     CSVFILE = "Scripts/blockchain.csv"
43 |     STEP = 1000
44 |     prev_max_block = 0
45 | 
46 |     # Sync with the CSV file
47 |     if os.path.exists(CSVFILE):
48 |         prev_max_block = syncCSV(CSVFILE)
49 | 
50 |     # Catch the CSV data up
51 |     _highestBlockMongo = c.highestBlockMongo()
52 | 
53 |     if prev_max_block + STEP <= _highestBlockMongo:
54 |         t = TxnGraph(1, prev_max_block + STEP)
55 |         for i in tqdm.tqdm(range(_highestBlockMongo // STEP)):
56 |             if t.end_block > prev_max_block:
57 |                 blocks = ParsedBlocks(t)
58 |                 t.extend(STEP)
59 |             else:
60 |                 t.end_block += STEP
61 | 
62 |     while True:
63 |         # Sync
64 |         syncMongo(c)
65 | 
66 |         # Initialize TxnGraph if it doesn't exist yet
67 |         if not t:
68 |             t = TxnGraph(1, c.highestBlockMongo())
69 | 
70 |         # Do the next iteration of the TxnGraph if applicable
71 |         if t.end_block + STEP <= c.highestBlockMongo():
72 |             t.extend(STEP)
73 | 
74 |         # Print an update at a certain resolution
75 |         if not t.end_block % 10000:
76 |             print("Streaming at block {}".format(t.end_block))
--------------------------------------------------------------------------------
/test/forecast.py:
--------------------------------------------------------------------------------
1 | """Test workflow of forecasting model."""
2 | import sys
3 | sys.path.append("../Forecasting")
4 | import model
5 | 
6 | 
7 | def test_forecast():
8 |     """Optimize an ARIMA model and predict a few data points."""
9 |     START = 5
10 |     END = 10
11 |     print("Forecasting...")
12 |     f = model.Forecast('../Forecasting/blockchain.csv')
13 |     f.optimizeARIMA(
14 |         range(5), range(5), range(5), f.endog, f.exog
15 |     )
16 |     pred = f.predictARIMA(START, END)
17 |     assert len(pred) == (END - START)
--------------------------------------------------------------------------------
/test/verify_blocks.py:
--------------------------------------------------------------------------------
1 | """Test that the transactions in local blocks are correct."""
2 | import requests
3 | import random
4 | import json
5 | import sys
6 | sys.path.append("../Preprocessing")
7 | from Crawler import Crawler
8 | import pprint
9 | 
10 | def test_blocks():
11 |     """
12 |     Check transactions in each of a random sample of blocks.
13 | 
14 |     Send a request to https://etherchain.org/api/block/:block/tx to get a list
15 |     of all transactions that occurred in that block. Cross-reference with the
16 |     transactions in the local block (in mongo).
17 |     """
18 |     c = Crawler.Crawler(start=False)
19 |     client = c.mongo_client
20 | 
21 |     sample = random.sample(range(1, 1700000), 100)
22 |     N = len(sample)
23 | 
24 |     # Track the number of times the number of transactions is different.
25 |     wrong_blocks = list()
26 |     num_error = "Incorrect number of transactions in {}% of {} blocks."
27 | 
28 |     blocks = client.find({"number": {"$in": sample}})
29 |     for block in blocks:
30 |         n = block["number"]
31 |         uri = "https://etherchain.org/api/block/{}/tx".format(n)
32 |         ethchain = json.loads(requests.get(uri).text)
33 | 
34 |         # Check the number of transactions in the block
35 |         if len(ethchain["data"]) != len(block["transactions"]):
36 |             wrong_blocks.append(n)
37 | 
38 |     wrong_nums = len(wrong_blocks)
39 |     pprint.pprint(wrong_blocks)
40 |     assert wrong_nums == 0, num_error.format(100. * wrong_nums / N, N)
41 | 
--------------------------------------------------------------------------------
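Both test modules define module-level `test_*` functions, so they can be collected by pytest (assuming pytest is installed). Because they append relative paths such as `../Preprocessing` to `sys.path`, the suggested invocation below should be run from inside the `test` directory, with geth, mongo, and an internet connection available where the individual tests require them:

    python3 -m pytest forecast.py verify_blocks.py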