├── Analysis ├── ContractMap.py ├── ParsedBlocks.py ├── TxnGraph.py ├── analysis_util.py └── tags.py ├── Forecasting ├── R │ ├── arima.R │ └── vol.R ├── model.py ├── pipeline.py ├── r_io_util.py └── sim.py ├── Preprocessing └── Crawler │ ├── Crawler.py │ ├── __init__.py │ └── crawler_util.py ├── README.md ├── Scripts ├── draw_graphs.py ├── extract.py └── preprocess.py ├── stream.py └── test ├── forecast.py └── verify_blocks.py /Analysis/ContractMap.py: -------------------------------------------------------------------------------- 1 | 2 | """Build a hash map of all contract addresses on the Ethereum network.""" 3 | 4 | from collections import defaultdict 5 | import requests 6 | import json 7 | import pickle 8 | import os 9 | import time 10 | import pymongo 11 | DIR = "." 12 | 13 | class ContractMap(object): 14 | """ 15 | A hash map of all contract addresses in the Ethereum network. 16 | 17 | Public functions: 18 | 19 | - find(): searches all blocks after self.last_block and adds them 20 | to the table. Updates self.last_block 21 | - save(): saves the object to a pickle file ".contracts.p" 22 | - load(): loads the object from pickle file ".contracts.p" 23 | 24 | Attributes: 25 | 26 | - addresses: defaultdict with default value of 0 for non-contracts and 27 | values of 1 for contract addresses. 28 | 29 | Usage: 30 | 31 | # If a mongo_client is passed, the ContractMap will scan geth via RPC 32 | # for new contract addresses starting at "last_block". 33 | cmap = ContractMap(mongo_client, last_block=90000, filepath="contracts.p") 34 | cmap.save() 35 | 36 | # If None is passed for a mongo_client, the ContractMap will automatically 37 | # load the map of addresses from the pickle file specified in "filepath", 38 | # ./contracts.p by default. 39 | cmap = ContractMap() 40 | 41 | """ 42 | 43 | def __init__(self, 44 | mongo_client=None, 45 | last_block=0, 46 | load=False, 47 | filepath="{}/.contracts.p".format(DIR)): 48 | """Initialize with a mongo client and an optional last block.""" 49 | self.client = mongo_client 50 | self.last_block = last_block 51 | self.url = "http://localhost:8545" 52 | self.headers = {"content-type": "application/json"} 53 | self.filepath = filepath 54 | 55 | self.addresses = defaultdict(int) 56 | 57 | if load: 58 | self.load() 59 | 60 | if self.client: 61 | self.find() 62 | self.save() 63 | 64 | def _checkGeth(self): 65 | """Make sure geth is running in RPC on port 8545.""" 66 | try: 67 | self._rpcRequest("eth_getBlockByNumber", [hex(1), True], "id") 68 | return 69 | except Exception as err: 70 | assert not err, "Geth cannot be reached: {}".format(err) 71 | 72 | def _rpcRequest(self, method, params, key): 73 | """Make an RPC request to geth on port 8545.""" 74 | payload = { 75 | "method": method, 76 | "params": params, 77 | "jsonrpc": "2.0", 78 | "id": 0 79 | } 80 | 81 | res = requests.post(self.url, 82 | data=json.dumps(payload), 83 | headers=self.headers).json() 84 | 85 | # Geth will sometimes crash if overloaded with requests 86 | time.sleep(0.005) 87 | 88 | return res[key] 89 | 90 | def find(self): 91 | """ 92 | Build a hash table of contract addresses. 93 | 94 | Iterate through all blocks and search for new contract addresses. 95 | Append them to self.addresses if found. 
96 | """ 97 | blocks = self.client.find( 98 | {"number": {"$gt": self.last_block}}, 99 | sort=[("number", pymongo.ASCENDING)] 100 | ) 101 | counter = 0 102 | for block in blocks: 103 | if block["transactions"]: 104 | # Loop through all of the transactions in the current block 105 | # Add all the nodes to a global set (self.nodes) 106 | for txn in block["transactions"]: 107 | if txn["to"] and not self.addresses[txn["to"]]: 108 | # Get the code at the "to" address. 109 | code = self._rpcRequest( 110 | "eth_getCode", 111 | [txn["to"], "latest"], 112 | "result") 113 | # Add addressees if there is non-empty data 114 | if code != "0x": 115 | self.addresses[txn["to"]] = 1 116 | 117 | self.last_block = block["number"] 118 | counter += 1 119 | # Save the list every 10000 blocks in case geth crashes 120 | # midway through the procedure 121 | if not counter % 10000: 122 | print("Done with block {}...".format(self.last_block)) 123 | self.save() 124 | 125 | def save(self): 126 | """Pickle the object and save it to a file.""" 127 | state = (self.last_block, self.addresses) 128 | pickle.dump(state, open(self.filepath, "wb")) 129 | 130 | def load(self): 131 | """Load the contract map from a file.""" 132 | no_file = "Error loading ContractMap: No file exists in that path." 133 | assert os.path.isfile(self.filepath), no_file 134 | state = pickle.load(open(self.filepath, "rb")) 135 | self.addresses = state[1] 136 | self.last_block = state[0] 137 | -------------------------------------------------------------------------------- /Analysis/ParsedBlocks.py: -------------------------------------------------------------------------------- 1 | """Interace to parse aggregate data from snapshots of the Ethereum network.""" 2 | 3 | import tags 4 | from ContractMap import ContractMap 5 | import os 6 | import csv 7 | import requests 8 | 9 | 10 | class ParsedBlocks(object): 11 | """ 12 | Build a set of aggregate data from a snapshot (using a TxnGraph). 13 | 14 | Description: 15 | ------------ 16 | Parse the network graphs at each timestamp. 17 | Time period is every X blocks. 18 | For each time period, look at aggregate stats. 19 | 20 | Iterate over all edges in the graph snapshot and calculate: 21 | - Total number of transactions in the network 22 | - Sum of all transaction amounts 23 | - Sum of all outflow from exchanges; suggests people entering long term 24 | - Sum of all inflow to exchanges (suggests people exiting) 25 | - Number of transactions to contracts (with data) 26 | - Number of transactions to crowdsale wallets (no data) 27 | - Number of transactions to peers, but with data, i.e. sending altcoins 28 | - Number of p2p transactions 29 | - Number of new addresses 30 | - Distribution of wealth (mean, std) across addresses that are NOT: 31 | 32 | Tagged addresses consitute: 33 | A: Exchanges 34 | B: Mining pools 35 | C: Crowdsale wallets/contract addresses 36 | 37 | Also tagged are all contract addresses to which data has been sent. 38 | 39 | Lastly, we also want to get the price of ETH (in USD) at the 40 | timestamp listed in the LAST block of the block range. 41 | 42 | Parameters: 43 | ----------- 44 | txn_graph: TxnGraph instance (with a prebuilt graph) 45 | run: boolean, optional. Calculate the data when instantiated. 
46 |
47 |     """
48 |
49 |     def __init__(self, txn_graph, run=True, csv_file="blockchain.csv"):
50 |         """Initialize the graph, address hash maps, and data fields."""
51 |         self.txn_graph = txn_graph
52 |         self.csv_file = csv_file
53 |
54 |         # Global data:
55 |         # -------------
56 |         # Tagged addresses (exchanges, mining pools, contracts)
57 |         # 1: Exchanges, 2: Crowdsale contracts, 3: mining pools, 0: Other
58 |         self.tags = tags.tags
59 |         # 1: Contracts, 0: Other
60 |         self.contracts = ContractMap(load=True).addresses
61 |
62 |         # Snapshot specific data:
63 |         # ------------------------
64 |         # Bookkeeping
65 |         self.start_block = txn_graph.start_block
66 |         self.end_block = txn_graph.end_block
67 |         self.start_timestamp = txn_graph.start_timestamp
68 |         self.end_timestamp = txn_graph.end_timestamp
69 |
70 |         # Relevant metrics:
71 |         # Note that the total supply is 5*block_n + the supply
72 |         # at genesis. This neglects uncle rewards, which are
73 |         # about 0.06% of the total supply.
74 |         # -----------------
75 |         self.data = {
76 |             "timestamp_start": self.start_timestamp,
77 |             "timestamp_end": self.end_timestamp,
78 |             "block_start": self.start_block,
79 |             "block_end": self.end_block,
80 |             "transaction_sum": 0,
81 |             "transaction_count": 0,
82 |             "exchange_out_sum": 0,
83 |             "exchange_out_count": 0,
84 |             "exchange_in_sum": 0,
85 |             "exchange_in_count": 0,
86 |             "contract_txn_sum": 0,
87 |             "contract_txn_count": 0,
88 |             "crowdsale_txn_sum": 0,
89 |             "crowdsale_txn_count": 0,
90 |             "p2p_txn_sum": 0,
91 |             "p2p_txn_count": 0,
92 |             "peer_txns_w_data": 0,
93 |             "num_addr": 0,
94 |             "total_supply": 7200990.5 + 5.0*self.end_block,
95 |             "priceUSD": self._getPrice(self.start_timestamp, self.end_timestamp)
96 |         }
97 |
98 |         self.peer_wealth = list()
99 |         self.headers = None
100 |
101 |         if run:
102 |             self._setHeaders()
103 |             self.parse()
104 |             self.saveData()
105 |
106 |     # PRIVATE METHODS
107 |
108 |     def _setHeaders(self):
109 |         """Get the headers that will be used in the CSV data file."""
110 |         self.headers = sorted(self.data.keys())
111 |
112 |     def _getData(self):
113 |         """Return a list of the data in the order of the headers."""
114 |         return [str(self.data[h]) for h in self.headers]
115 |
116 |     def _startCSV(self):
117 |         """Create a CSV file if none exists."""
118 |         with open(self.csv_file, "w") as f:
119 |             w = csv.DictWriter(f, fieldnames=self.headers)
120 |             w.writeheader()
121 |
122 |     def _getPrice(self, start, end, period=300):
123 |         """
124 |         Get data from the Poloniex API given a period.
125 |         Start and end are both UNIX timestamps (integers).
126 |         This will return the price at the close of the last period between
127 |         these blocks.
128 |         """
129 |         base = "https://poloniex.com/public?command=returnChartData"
130 |         pair = "USDT_ETH"
131 |         start = start
132 |         end = end
133 |         period = period
134 |         req_str = "{}&currencyPair={}&start={}&end={}&period={}".format(
135 |             base, pair, start, end, period
136 |         )
137 |         data = requests.get(req_str).json()
138 |         return data[len(data)-1]['close']
139 |
140 |     def _isPeer(self, addr):
141 |         """
142 |         Determine if an address corresponds to a peer.
143 |
144 |         This means it is not a contract, crowdsale, exchange, or mining pool.
145 |         """
146 |         if not self.contracts[addr] and not self.tags[addr]:
147 |             return True
148 |         return False
149 |
150 |     # PUBLIC METHODS
151 |
152 |     def parse(self):
153 |         """Iterate through the graph to calculate metrics of interest."""
154 |         if not self.headers:
155 |             self._setHeaders()
156 |
157 |         vWeights = self.txn_graph.graph.vertex_properties["weight"]
158 |         eWeights = self.txn_graph.graph.edge_properties["weight"]
159 |         address_prop = self.txn_graph.graph.vertex_properties["address"]
160 |
161 |         # Balances held by peer addresses (i.e. not tagged, not contracts)
162 |         balances = list()
163 |
164 |         # Iterate over vertices (i.e. addresses)
165 |         for v in self.txn_graph.graph.vertices():
166 |             if self._isPeer(address_prop[v]):
167 |                 balances.append(vWeights[v])
168 |
169 |         # Iterate over Edge instances (i.e. transactions)
170 |
171 |         # All of the addresses encountered
172 |         address_dump = list()
173 |
174 |         for e in self.txn_graph.graph.edges():
175 |             to_addr = address_prop[e.target()]
176 |             from_addr = address_prop[e.source()]
177 |             address_dump.append(to_addr)
178 |             address_dump.append(from_addr)
179 |
180 |             amount = eWeights[e]
181 |             # The edgeWeight of this edge is the amount of the transaction
182 |             self.data["transaction_count"] += 1
183 |             self.data["transaction_sum"] += amount
184 |
185 |             # If the target/source of the txn is an exchange:
186 |             if self.tags[from_addr] == 1:
187 |                 self.data["exchange_out_sum"] += amount
188 |                 self.data["exchange_out_count"] += 1
189 |             elif self.tags[to_addr] == 1:
190 |                 self.data["exchange_in_sum"] += amount
191 |                 self.data["exchange_in_count"] += 1
192 |
193 |             # If the target is a crowdsale wallet:
194 |             if self.tags[to_addr] == 2:
195 |                 self.data["crowdsale_txn_sum"] += amount
196 |                 self.data["crowdsale_txn_count"] += 1
197 |
198 |             # If the target is a contract:
199 |             if self.contracts[to_addr]:
200 |                 self.data["contract_txn_sum"] += amount
201 |                 self.data["contract_txn_count"] += 1
202 |
203 |             # If source and target are both peer nodes
204 |             if self._isPeer(to_addr) and self._isPeer(from_addr):
205 |                 self.data["p2p_txn_sum"] += amount
206 |                 self.data["p2p_txn_count"] += 1
207 |
208 |         # Record all unique addresses up to this point
209 |         addr_set = set(address_dump)
210 |         self.data["num_addr"] = len(addr_set)
211 |
212 |     def saveData(self):
213 |         """Save the data to a line in the CSV file."""
214 |         if not os.path.isfile(self.csv_file):
215 |             self._startCSV()
216 |         with open(self.csv_file, "a") as f:
217 |             w = csv.DictWriter(f, fieldnames=self.headers)
218 |             w.writerow(self.data)
219 |
--------------------------------------------------------------------------------
/Analysis/TxnGraph.py:
--------------------------------------------------------------------------------
1 | """Create a snapshot of the Ethereum network."""
2 |
3 | import six.moves.cPickle as pickle
4 | from graph_tool.all import *
5 | import pymongo
6 | import os
7 | import subprocess
8 | import signal
9 | import copy
10 | from tags import tags
11 | import analysis_util
12 | env = analysis_util.set_env()
13 | DIR = env["mongo"] + "/data"
14 | DATADIR = env["txn_data"]
15 |
16 | class TxnGraph(object):
17 |     """
18 |     Create a snapshot of the Ethereum network.
19 |
20 |     Description:
21 |     ------------
22 |     Create a snapshot, which contains a graph, out of transactions stored in a
23 |     mongo collection. Each snapshot must start at some time t0 (start_block)
24 |     and end at time tf (end_block). It will include all nodes that sent or
25 |     received a transaction between t0 and tf.
26 |
27 |
28 |     Parameters:
29 |     -----------
30 |     start_block          # The lower bound of the block range to
31 |                          # be analysed.
32 |     end_block            # The upper bound of the block range to
33 |                          # be analysed.
34 |     previous             # Previous graph and its end_block
35 |     snap (default=True)  # Build the graph upon instantiation.
36 |     save (default=True)  # Save the graph automatically
37 |     load (default=False) # Skip building the graph and load a saved one.
38 |
39 |
40 |     Usage:
41 |     ------
42 |     Initialize with a previous graph:
43 |
44 |         g = TxnGraph(previous={graph: <Graph>, end_block: <int>})
45 |
46 |     Draw the image (saved by default to DATADIR/snapshots/a_b.png,
47 |     where a=start_block, b=end_block):
48 |
49 |         g.draw()
50 |
51 |     Save the state of the object (including the graph):
52 |
53 |         g.save()
54 |
55 |     Load a graph with start_block=a, end_block=b from DATADIR if it exists:
56 |
57 |         g.load(a, b)
58 |
59 |     """
60 |
61 |     # PRIVATE
62 |
63 |     def __init__(self,
64 |                  *args,
65 |                  snap=True,
66 |                  save=True,
67 |                  load=False,
68 |                  previous=None,
69 |                  **kwargs):
70 |
71 |         self.f_pickle = None
72 |         self.f_snapshot = None
73 |         self.start_block = max(args[0] if len(args) > 0 else 1, 1)
74 |         self.end_block = args[1] if len(args) > 1 else 2
75 |
76 |         self.start_timestamp = None
77 |         self.end_timestamp = None
78 |
79 |         # A lookup table mapping ethereum address --> graph node
80 |         self.nodes = dict()
81 |         self.edges = list()
82 |         # A graph_tool Graph object
83 |         self.graph = None
84 |         # Store the graph separately in a file
85 |         self.f_graph = None
86 |         # PropertyMap of edges weighted by eth value of transaction
87 |         self.edgeWeights = None
88 |         # PropertyMap of vertices weighted by eth value they hold
89 |         # at the time of the end_block.
90 |         self.vertexWeights = None
91 |         # All addresses (each node has an address)
92 |         self.addresses = None
93 |         # Record big exchange addresses
94 |         self.exchanges = list()
95 |         # Record all contracts
96 |         self.contracts = list()
97 |         # Run
98 |         self._init(snap, save, load, previous)
99 |
100 |     def _init(self, snap, save, load, previous):
101 |         self.graph = Graph()
102 |
103 |         # Accept a previous graph as an argument
104 |         if previous:
105 |             a_str = "prev is of form {'graph': <Graph>, 'end_block': <int>}"
106 |             assert "graph" in previous, a_str
107 |             self.graph = previous["graph"]
108 |             assert "end_block" in previous, a_str
109 |             self.start_block = previous["end_block"]
110 |
111 |         # Set filepaths
112 |         self._setFilePaths()
113 |
114 |         # Load a previous graph
115 |         if load:
116 |             self.load(self.start_block, self.end_block)
117 |
118 |         else:
119 |             # Take a snapshot
120 |             if snap:
121 |                 self.snap()
122 |
123 |             # Save this graph automatically
124 |             if save:
125 |                 self.save()
126 |
127 |     def _setFilePaths(self, start=None, end=None):
128 |         """Set the file paths based on the start/end block numbers."""
129 |         if not start:
130 |             start = self.start_block
131 |         if not end:
132 |             end = self.end_block
133 |
134 |         self.f_pickle = "{}/pickles/{}_{}.p".format(DATADIR, start, end)
135 |         self.f_graph = "{}/graphs/{}_{}.gt".format(DATADIR, start, end)
136 |         self.f_snapshot = "{}/snapshots/{}_{}.png".format(DATADIR, start, end)
137 |
138 |     def _getMongoClient(self):
139 |         """Connect to a mongo client (assuming one is running)."""
140 |         try:
141 |             # Try a connection to mongo and force a findOne request.
142 |             # See if it makes it through.
143 |             client = pymongo.MongoClient(serverSelectionTimeoutMS=1000)
144 |             transactions = client["blockchain"]["transactions"]
145 |             test = transactions.find_one({"number": {"$gt": 1}})
146 |             popen = None
147 |         except Exception as err:
148 |             # If not, open up a mongod subprocess
149 |             cmd = "(mongod --dbpath {} > {}/mongo.log 2>&1) &".format(
150 |                 os.environ["BLOCKCHAIN_MONGO_DATA_DIR"],
151 |                 os.environ["BLOCKCHAIN_ANALYSIS_LOGS"])
152 |
153 |             popen = subprocess.Popen(cmd, shell=True)
154 |             client = pymongo.MongoClient(serverSelectionTimeoutMS=1000)
155 |             transactions = client["blockchain"]["transactions"]
156 |
157 |         # Update timestamps
158 |         transactions = self._updateTimestamps(transactions)
159 |
160 |         return transactions, popen
161 |
162 |     def _updateTimestamps(self, client):
163 |         """Look up timestamps associated with start/end blocks and set them."""
164 |         start = client.find_one({"number": self.start_block})
165 |         end = client.find_one({"number": self.end_block})
166 |         self.start_timestamp = start["timestamp"]
167 |         self.end_timestamp = end["timestamp"]
168 |         return client
169 |
170 |     def _addEdgeWeight(self, newEdge, value):
171 |         """
172 |         Add to the weight of a given edge (i.e. the amount of ether that has
173 |         flowed through it). Create a new one if needed.
174 |         """
175 |         if self.edgeWeights[newEdge] is not None:
176 |             self.edgeWeights[newEdge] += value
177 |         else:
178 |             self.edgeWeights[newEdge] = 0
179 |
180 |     def _addVertexWeight(self, from_v, to_v, value):
181 |         """
182 |         Add to the weight of a given vertex (i.e. the amount of ether)
183 |         it holds. Create a new weight if needed.
184 |         """
185 |         if self.vertexWeights[to_v] is not None:
186 |             self.vertexWeights[to_v] += value
187 |         else:
188 |             self.vertexWeights[to_v] = 0
189 |         if self.vertexWeights[from_v] is not None:
190 |             # We shouldn't need to worry about overspending
191 |             # as the ethereum protocol should not let you spend
192 |             # more ether than you have!
193 | self.vertexWeights[from_v] -= value 194 | else: 195 | self.vertexWeights[from_v] = 0 196 | 197 | def _addBlocks(self, client, start, end): 198 | """Add new blocks to current graph attribute.""" 199 | # Get a cursor containing all of the blocks 200 | # between the start/end blocks 201 | blocks = client.find( 202 | {"number": {"$gt": start, "$lt": end}}, 203 | sort=[("number", pymongo.ASCENDING)] 204 | ) 205 | for block in blocks: 206 | if block["transactions"]: 207 | # Loop through all of the transactions in the current block 208 | # Add all the nodes to a global set (self.nodes) 209 | for txn in block["transactions"]: 210 | 211 | # Graph vetices will be referenced temporarily, but the 212 | # unique addresses will persist in self.nodes 213 | to_v = None 214 | from_v = None 215 | 216 | # Exclude self referencing transactions 217 | if txn["to"] == txn["from"]: 218 | continue 219 | 220 | # Set the "to" vertex 221 | if txn["to"] not in self.nodes: 222 | to_v = self.graph.add_vertex() 223 | self.nodes[txn["to"]] = to_v 224 | self.addresses[to_v] = txn["to"] 225 | 226 | # If there is data, this is going to a contract 227 | if "data" in txn: 228 | if txn["data"] != "0x": 229 | self.contracts.append(txn["to"]) 230 | else: 231 | to_v = self.nodes[txn["to"]] 232 | 233 | # Set the "from" vertex 234 | if txn["from"] not in self.nodes: 235 | from_v = self.graph.add_vertex() 236 | self.nodes[txn["from"]] = from_v 237 | self.addresses[from_v] = txn["from"] 238 | else: 239 | from_v = self.nodes[txn["from"]] 240 | 241 | # Add a directed edge 242 | newEdge = self.graph.add_edge(from_v, to_v) 243 | self.edges.append(newEdge) 244 | 245 | # Update the weights 246 | self._addEdgeWeight(newEdge, txn["value"]) 247 | self._addVertexWeight(from_v, to_v, txn["value"]) 248 | self._addPropertyMaps() 249 | 250 | def _addPropertyMaps(self): 251 | """Add PropertyMap attributes to Graph instance.""" 252 | self.graph.vertex_properties["weight"] = self.vertexWeights 253 | self.graph.vertex_properties["address"] = self.addresses 254 | self.graph.edge_properties["weight"] = self.edgeWeights 255 | 256 | # PUBLIC 257 | # ------ 258 | def snap(self): 259 | """ 260 | Take a snapshot of the graph of transactions. 261 | 262 | Description: 263 | ------------ 264 | This essentially builds a graph with addresses (vertices) and 265 | transactions (edges). It also adds a PropertyMap of s to the 266 | graph corresponding to transaction amounts (i.e. weights). The default 267 | behavior of this is to initialize a new graph with data between 268 | start_block and end_block, however it can be used with the 'extend' 269 | method. 270 | 271 | Parameters: 272 | ----------- 273 | start , default self.start_block: the absolute block to start with 274 | end , default self.end_block: the absolute block to end with 275 | """ 276 | 277 | # Set up the mongo client 278 | client, popen = self._getMongoClient() 279 | 280 | # Add PropertyMaps 281 | self.edgeWeights = self.graph.new_edge_property("double") 282 | self.vertexWeights = self.graph.new_vertex_property("double") 283 | self.addresses = self.graph.new_vertex_property("string") 284 | 285 | # Add blocks to the graph 286 | self._addBlocks(client, self.start_block, self.end_block) 287 | 288 | # Kill the mongo client if it was spawned in this process 289 | if popen: 290 | # TODO get this to work 291 | popen.kill() 292 | 293 | def save(self): 294 | """Pickle TxnGraph. 
Save the graph_tool Graph object separately."""
295 |         if not os.path.exists(DATADIR+"/pickles"):
296 |             os.makedirs(DATADIR+"/pickles")
297 |         if not os.path.exists(DATADIR+"/graphs"):
298 |             os.makedirs(DATADIR+"/graphs")
299 |         if not os.path.exists(DATADIR+"/snapshots"):
300 |             os.makedirs(DATADIR+"/snapshots")
301 |
302 |         # graph_tool objects cannot be pickled, so we need to stash
303 |         # them in a temporary object
304 |         tmp = {
305 |             "nodes": self.nodes,
306 |             "edges": self.edges,
307 |             "edgeWeights": self.edgeWeights,
308 |             "vertexWeights": self.vertexWeights,
309 |             "addresses": self.addresses,
310 |             "graph": self.graph
311 |         }
312 |         # Empty the graph_tool objects
313 |         self.nodes = dict()
314 |         self.edges = list()
315 |         self.edgeWeights = None
316 |         self.vertexWeights = None
317 |         self.addresses = None
318 |
319 |         # Save the graph to a file (but not if it is empty)
320 |         if len(tmp["nodes"]) > 0:
321 |             self.graph.save(self.f_graph, fmt="gt")
322 |
323 |         self.graph = None
324 |
325 |         # Save the rest of this object to a pickle
326 |         with open(self.f_pickle, "wb") as output:
327 |             pickle.dump(self.__dict__, output)
328 |             output.close()
329 |
330 |         # Reload from tmp
331 |         self.nodes = tmp["nodes"]
332 |         self.edges = tmp["edges"]
333 |         self.edgeWeights = tmp["edgeWeights"]
334 |         self.vertexWeights = tmp["vertexWeights"]
335 |         self.addresses = tmp["addresses"]
336 |         self.graph = tmp["graph"]
337 |
338 |     def load(self, start_block, end_block):
339 |         """
340 |         Load a TxnGraph.
341 |
342 |         Description:
343 |         ------------
344 |         Load a pickle of a saved TxnGraph state as well as a saved Graph
345 |         object as TxnGraph.graph. This can be called upon instantiation with
346 |         load=True OR can be called any time by passing new start/end block
347 |         params.
348 |
349 |         Parameters:
350 |         -----------
351 |         start_block
352 |         end_block
353 |         """
354 |         self._setFilePaths(start_block, end_block)
355 |
356 |         # Load the graph from file
357 |         tmp_graph = load_graph(self.f_graph)
358 |
359 |         # Load the object from a pickle
360 |         with open(self.f_pickle, "rb") as input:
361 |             tmp = pickle.load(input)
362 |             self.__dict__.update(tmp)
363 |             self.graph = tmp_graph
364 |             input.close()
365 |
366 |     def draw(self, **kwargs):
367 |         """
368 |         Draw the graph.
369 | 370 | Description: 371 | ------------ 372 | Draw the graph and save to a .png file indexed by the start and 373 | end block of the TxnGraph 374 | 375 | Parameters: 376 | ----------- 377 | w (optional, default=5000): width 378 | h (optional, default=5000): height 379 | """ 380 | w = kwargs["w"] if "w" in kwargs else 1920*2 381 | h = kwargs["h"] if "h" in kwargs else 1080*2 382 | 383 | # We want the vertices to be sized proportional to the number of 384 | # transactions they are part of 385 | # deg = self.graph.degree_property_map("total") 386 | deg = copy.deepcopy(self.graph.vertex_properties['weight']) 387 | 388 | # Don't draw an empty graph 389 | if not self.graph.num_vertices(): 390 | print("Nothing to draw!") 391 | return 392 | 393 | # Testing to allow negative numbers 394 | deg.a = abs(deg.a)**0.5 395 | 396 | # For some reason this works 397 | # (TODO figure out how to scale this consistently) 398 | # deg.a = deg.a**0.5 399 | 400 | # We want the largest node to be roughly 10% 401 | # of the width of the image (somewhat arbitrary) 402 | scale = (0.03*w)/max(deg.a) 403 | deg.a = deg.a*scale 404 | 405 | # For some reason this doesn't work 406 | # deg.a = deg.a*scale # For some reason this blows up the output 407 | 408 | # Set K=scale because we want the average edge length 409 | # to be the size of the largest node 410 | pos = random_layout(self.graph, shape=(w, h), dim=2) 411 | 412 | # Draw the graph 413 | graph_draw(self.graph, 414 | pos=pos, 415 | vertex_size=deg, 416 | vertex_fill_color=deg, 417 | pen_width=0, 418 | bg_color=[1,1,1,1], 419 | output=self.f_snapshot, 420 | output_size=(w,h), 421 | fit_view=True 422 | ) 423 | 424 | def extend(self, n, save=True): 425 | """ 426 | Add n blocks to the current TxnGraph instance. 427 | 428 | Description: 429 | ------------ 430 | Rather than creating a bunch of TxnGraph instances from scratch, 431 | this method can be used to add n blocks to the existing TxnGraph 432 | instance. It can be called multiple times to iterate over the block 433 | chain with resolution of n blocks. The extended TxnGraph will be 434 | saved by default. 
435 | 436 | Parameters: 437 | ----------- 438 | n : number of blocks to add (from the last_block) 439 | save , default True: save the new state automatically 440 | """ 441 | old_end = self.end_block 442 | new_end = self.end_block + n 443 | 444 | client, popen = self._getMongoClient() 445 | self._addBlocks(client, old_end, new_end) 446 | self.end_block = new_end 447 | self._setFilePaths() 448 | 449 | if save: 450 | self.save() 451 | -------------------------------------------------------------------------------- /Analysis/analysis_util.py: -------------------------------------------------------------------------------- 1 | """Util functions for Analysis process.""" 2 | import os 3 | 4 | 5 | def set_env(): 6 | """Set the analysis environment directory.""" 7 | env = { 8 | "mongo": ".", # Where the mongo data is stored 9 | "txn_data": "./data" # Where the TxnGraphs are stored 10 | } 11 | if 'BLOCKCHAIN_MONGO_DATA_DIR' in os.environ: 12 | env["mongo"] = os.environ['BLOCKCHAIN_MONGO_DATA_DIR'] 13 | 14 | if 'BLOCKCHAIN_DATA_DIR' in os.environ: 15 | env["tnx_data"] = os.environ['BLOCKCHAIN_DATA_DIR'] 16 | 17 | return env 18 | -------------------------------------------------------------------------------- /Analysis/tags.py: -------------------------------------------------------------------------------- 1 | """Various 'special' addresses that should be tagged.""" 2 | 3 | from collections import defaultdict 4 | 5 | # Exchange wallets = 1 6 | # Crowdsale wallets = 2 7 | # Mining pools = 3 8 | tags = defaultdict(int, { 9 | "0x32be343b94f860124dc4fee278fdcbd38c102d88": 1, # Polo hot wallet 10 | "0xb794f5ea0ba39494ce839613fffba74279579268": 1, # Polo cold wallet 11 | "0x2910543af39aba0cd09dbb2d50200b3e800a63d2": 1, # Kraken 12 | "0x120a270bbc009644e35f0bb6ab13f95b8199c4ad": 1, # Shapeshift 13 | "0xcafb10ee663f465f9d10588ac44ed20ed608c11e": 1, # Bitfinix 14 | "0x40b9b889a21ff1534d018d71dc406122ebcf3f5a": 1, # Gatecoin 15 | "0x42da8a05cb7ed9a43572b5ba1b8f82a0a6e263dc": 1, # Yunbi 1 16 | "0xd94c9ff168dc6aebf9b6cc86deff54f3fb0afc33": 1, # Yunbi 2 17 | "0xbb9bc244d798123fde783fcc1c72d3bb8c189413": 2, # DAO 18 | "0x807640a13483f8ac783c557fcdf27be11ea4ac7a": 2, # DAOextrabalance 19 | "0xf0160428a8552ac9bb7e050d90eeade4ddd52843": 2, # Digix 20 | "0x2a65aca4d5fc5b5c859090a6c34d164135398226": 3, # Dwarfpool 21 | "0x151255dd9e38e44db38ea06ec66d0d113d6cbe37": 3, # Dwarfpool2 22 | "0x63a9975ba31b0b9626b34300f7f627147df1f526": 3, # eth.supernova.cc 23 | "0xf8b483dba2c3b7176a3da549ad41a48bb3121069": 3, # coinotron 24 | "0xea674fdde714fd979de3edf0f56aa9716b898ec8": 3, # ethermine 25 | "0x4bb96091ee9d802ed039c4d1a5f6216f90f81b01": 3, # ethpool 26 | "0x1dcb8d1f0fcc8cbc8c2d76528e877f915e299fbe": 3, # supernova 27 | "0xa027231f42c80ca4125b5cb962a21cd4f812e88f": 3, # eth.ppa.ua 28 | "0x0c729be7c39543c3d549282a40395299d987cec2": 3, # ? 
29 | "0x52bc44d5378309ee2abf1539bf71de1b7d7be3b5": 3, # Nanopool 30 | "0x68795c4aa09d6f4ed3e5deddf8c2ad3049a601da": 3, # coinmine.pl 31 | "0x61c808d82a3ac53231750dadc13c777b59310bd9": 3, # f2pool 32 | "0xe6a7a1d47ff21b6321162aea7c6cb457d5476bca": 3, # ethpool 33 | "0x9d551f41fed6fc27b719777c224dfecce170004d": 3, # ethereumpool 34 | "0xd1e56c2e765180aa0371928fd4d1e41fbcda34d4": 3, # weipool 35 | "0xf3b9d2c81f2b24b0fa0acaaa865b7d9ced5fc2fb": 3, # bitclubpool 36 | "0xb2930b35844a230f00e51431acae96fe543a0347": 3 # mininggpoolhub 37 | }) 38 | -------------------------------------------------------------------------------- /Forecasting/R/arima.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | library("forecast", lib.loc="/home/alex/miniconda2/lib/R/library") 3 | 4 | # Command line args 5 | options(echo=TRUE) 6 | args <- commandArgs(trailingOnly=TRUE) 7 | p = strtoi(args[1], base=0L) 8 | d = strtoi(args[2], base=0L) 9 | q = strtoi(args[3], base=0L) 10 | 11 | 12 | endog = read.csv("R/endog.csv") 13 | exog = read.csv("R/exog.csv") 14 | 15 | fit <- Arima(endog[,1], order=c(p,d,q)) 16 | pred = predict(fit, 1) 17 | 18 | write.csv(pred, file="R/tmp.csv") 19 | -------------------------------------------------------------------------------- /Forecasting/R/vol.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | library("timeSeries", lib.loc="/home/alex/miniconda2/lib/R/library") 3 | library("fBasics", lib.loc="/home/alex/miniconda2/lib/R/library") 4 | library("fGarch", lib.loc="/home/alex/miniconda2/lib/R/library") 5 | 6 | # Command line args 7 | endog = read.csv("R/endog.csv") 8 | exog = read.csv("R/exog.csv") 9 | 10 | vol <- garchFit(data=endog[,1]) 11 | pred_volatility = predict(vol, n.ahead=1) 12 | 13 | write.csv(pred_volatility[0], file="R/tmpvol.csv") 14 | 15 | -------------------------------------------------------------------------------- /Forecasting/model.py: -------------------------------------------------------------------------------- 1 | """Forecast given timeseries data.""" 2 | from pipeline import * 3 | from sklearn.cross_validation import train_test_split 4 | from statsmodels.tsa import arima_model 5 | 6 | 7 | class Forecast(object): 8 | """ 9 | A forecasting model. Given timeseries data, make predictions. 10 | 11 | Input: 12 | ------ 13 | filename: path for the CSV file containing data. 14 | 15 | Methods: 16 | -------- 17 | split(test_size): split data for cross validation with a given test_size. 18 | predict(n): predict whether to buy, sell, or hold at block n. 19 | """ 20 | 21 | def __init__(self, filename, USD=100, ETH=1000): 22 | """Initialize the model with a filename and asset quantities.""" 23 | self.USD = USD 24 | self.ETH = ETH 25 | self.p = 0 26 | self.d = 0 27 | self.q = 0 28 | self.model = None 29 | self._getData(parse_df(filename)) 30 | 31 | def _getData(self, filename): 32 | """Run through the pipeline and load the data.""" 33 | endog, exog, self.end_blocks = pipeline(filename) 34 | self.endog = endog 35 | self.exog = exog 36 | 37 | def _pointPredict(self): 38 | """Predict the next value of the ARIMA model.""" 39 | _last = self.model.nobs 40 | _next = _last + 1 41 | _lastExog = self.exog[-1:] 42 | pred = self.model.predict(_last, _next, exog=_lastExog) 43 | return pred 44 | 45 | def fitARIMAsm(self, p, d, q, _endog, _exog): 46 | """Fit an ARIMA model give a set of parameters. 
Returns model."""
47 |         endog = np.array(_endog)
48 |         exog = np.array(_exog)
49 |         model = arima_model.ARIMA(
50 |             endog,
51 |             order=(p, d, q),
52 |             exog=exog).fit(
53 |                 transparams=False
54 |             )
55 |         return model
56 |
57 |     def optimizeARIMAsm(self, Ap, Ad, Aq, endog, exog):
58 |         """
59 |         Find an optimal ARIMA model given lists of p, d, and q.
60 |
61 |         Split the data to test/train sets and then find the best model.
62 |
63 |         Optimization criterion is AIC.
64 |         """
65 |         best_model = None
66 |         best_aic = None
67 |         for p in Ap:
68 |             for d in Ad:
69 |                 for q in Aq:
70 |                     # Replace the model if AIC is lower
71 |                     try:
72 |                         _model = self.fitARIMAsm(p, d, q, endog, exog)
73 |                         if not best_aic:
74 |                             print("Updating model ({}, {}, {})".format(p, d, q))
75 |                             best_model = _model
76 |                             best_aic = _model.aic
77 |                             self.p = p
78 |                             self.d = d
79 |                             self.q = q
80 |                         elif _model.aic < best_aic:
81 |                             print("Updating model ({}, {}, {})".format(p, d, q))
82 |                             best_model = _model
83 |                             best_aic = _model.aic
84 |                             self.p = p
85 |                             self.d = d
86 |                             self.q = q
87 |                     except:
88 |                         pass
89 |
90 |         # Reset the global model
91 |         self.model = best_model
92 |
93 |     def predictARIMAsm(self, start, end, exog=None, dynamic=False):
94 |         """
95 |         Make a series of n predictions given an ARIMA model.
96 |
97 |         By default, the predictions will be made on top of self.endog_train
98 |
99 |         Note that extra lagged exogenous time slices may need to be passed
100 |         depending on the p level. (Pass end-start + p exogenous slices)
101 |         """
102 |         if exog is None:
103 |             exog = self.exog[start:end]
104 |
105 |         prediction = self.model.predict(
106 |             start, end, exog=self.exog[start:end], dynamic=dynamic
107 |         )
108 |         return prediction
109 |
110 |     # Predictions in R
111 |     #####################
112 |
113 |     def predictARIMA_R(self, p, d, q, endog=None, exog=None):
114 |         """
115 |         Pointwise prediction using the forecast package in R.
116 |         """
117 |         # Define endog and exog vars
118 |         if endog is None:
119 |             endog = self.endog
120 |         if exog is None:
121 |             exog = self.exog
122 |
123 |         # Use at most 300 points to predict the future
124 |         if len(endog) > 300:
125 |             endog = endog[-300:]
126 |         if len(exog) > 300:
127 |             exog = exog[-300:]
128 |
129 |         # Pipe data through R and use its Arima model
130 |         R_push_csv(endog, exog)
131 |         R_predict(p, d, q)
132 |         pred = R_pull_csv()
133 |         R_cleanup()
134 |
135 |         return pred
136 |
--------------------------------------------------------------------------------
/Forecasting/pipeline.py:
--------------------------------------------------------------------------------
1 | """A pipeline taking in data from a CSV and formatting it for forecasting."""
2 | import copy
3 | import numpy as np
4 | import pandas as pd
5 | from r_io_util import *
6 |
7 |
8 | def pipeline(df):
9 |     """
10 |     Process a dataframe for forecasting.
11 |
12 |     Input:
13 |     ------
14 |     df: Pandas dataframe containing both exogenous and endogenous variables.
15 |
16 |     Output:
17 |     -------
18 |     endog (numpy array), exog (numpy array), ts (numpy array): arrays
19 |     representing timeseries data of:
20 |     price, exogenous features, and block_end, respectively.
21 |
22 |     Note that block_end replaces the traditional timestamp.
23 | """ 24 | diff_cols = [ 25 | "contract_txn_count", 26 | "contract_txn_sum", 27 | "crowdsale_txn_count", 28 | "crowdsale_txn_sum", 29 | "exchange_in_count", 30 | "exchange_in_sum", 31 | "exchange_out_count", 32 | "exchange_out_sum", 33 | "num_addr", 34 | "p2p_txn_count", 35 | "p2p_txn_sum", 36 | # "peer_txns_w_data", 37 | "transaction_count", 38 | "transaction_sum", 39 | "priceUSD" 40 | ] 41 | # Get the time domain (i.e. block_end) 42 | block_end = np.array(df["block_end"]) 43 | 44 | # Do single lag differencing 45 | lag = 1 46 | df = difference(df, diff_cols, lag=lag) 47 | 48 | # Split into endog and exog 49 | endog, exog = endog_exog(df, diff_cols, lag=lag) 50 | 51 | return np.array(endog), np.array(exog), block_end 52 | 53 | 54 | def endog_exog(df, cols, lag=1): 55 | """ 56 | Convert dataframe into endog and exog numpy arrays. 57 | 58 | Since everything is differenced, remove the first item in each array. 59 | """ 60 | diff_cols = ["d_{}_{}".format(lag, col) for col in cols] 61 | exog = df[diff_cols][1:] 62 | endog = df["d_{}_priceUSD".format(lag)][1:] 63 | 64 | return endog, exog 65 | 66 | 67 | def difference(df, cols, lag=1): 68 | """ 69 | Perform differencing on some columns in a dataframe. 70 | 71 | Input: 72 | ------ 73 | df: pandas dataframe containing the timeseries data. 74 | cols: list of strings indicating which columns to difference 75 | """ 76 | df2 = copy.deepcopy(df) 77 | 78 | # Difference based on the lag provided. 79 | for i in range(1, len(df2["block_end"])): 80 | for L in cols: 81 | curr = df2.loc[i, L] 82 | prev = df2.loc[i-lag, L] 83 | df2.loc[i, "d_{}_{}".format(lag, L)] = curr - prev 84 | 85 | return df2 86 | 87 | 88 | def parse_df(filename): 89 | """Given a filename, load the data into a dataframe.""" 90 | df = pd.read_csv(filename) 91 | return df 92 | -------------------------------------------------------------------------------- /Forecasting/r_io_util.py: -------------------------------------------------------------------------------- 1 | """A util file for I/O related to R.""" 2 | import subprocess 3 | import pandas as pd 4 | import numpy as np 5 | import os 6 | 7 | def R_push_csv(endog, exog): 8 | """Save current endog and exog dfs to CSV files in the R directory.""" 9 | np.savetxt("R/endog.csv", endog, delimiter=",") 10 | np.savetxt("R/exog.csv", exog, delimiter=",") 11 | return 12 | 13 | 14 | def R_pull_csv(): 15 | """Read from the CSV produced by R and return the prediction.""" 16 | return pd.read_csv("R/tmp.csv")['pred'][0] 17 | 18 | 19 | def R_predict(p, d, q): 20 | """Run Rscript to produce a pointwise prediction given CSV files.""" 21 | subprocess.call(["Rscript", "R/arima.R", str(p), str(d), str(q)]) 22 | 23 | 24 | def R_cleanup(): 25 | """Delete all CSV files in the R directory.""" 26 | dir = os.path.dirname(os.path.realpath(__file__)) 27 | for file in os.listdir("R"): 28 | if file.endswith(".csv"): 29 | try: 30 | os.remove(os.path.join(dir, file)) 31 | except: 32 | pass 33 | -------------------------------------------------------------------------------- /Forecasting/sim.py: -------------------------------------------------------------------------------- 1 | """Simulate a trading bot by predicting a series of values using train/test sets.""" 2 | from model import Forecast 3 | import numpy as np 4 | import copy 5 | from multiprocessing import Pool 6 | 7 | def simulate(p=1, d=0, q=0): 8 | """ 9 | This bot will perform the following steps. 10 | 11 | 1. Load data, pipeline, and split it into training and test sets. 12 | 2. 
Train an optimized ARIMA model on the training data. 13 | 3. Make a series of point forecasts and store the predictions in a list. 14 | Prediction requires exogenous variables, so append the next data point 15 | to both the endogenous and exogenous variables in the Forecast object 16 | before making the next prediction. 17 | """ 18 | #print("Loading data...") 19 | f = Forecast('blockchain.csv') 20 | 21 | # Define an index on which to split (like 80% of the way) 22 | ixSplit = int(0.8 * f.endog.shape[0]) 23 | 24 | # Define training and test sets 25 | train_endog = f.endog[:ixSplit] 26 | train_exog = f.exog[:ixSplit] 27 | test_endog = f.endog[ixSplit:] 28 | test_exog = f.exog[ixSplit:] 29 | 30 | # Update the instance 31 | f.endog = train_endog 32 | f.exog = train_exog 33 | 34 | # Copy test exogenous variables to compare with the predictions 35 | endog_expected = copy.deepcopy(test_endog) 36 | 37 | # Make a series of predictions 38 | #print("Making predictions...") 39 | preds = list() 40 | for i in range(len(test_exog)): 41 | # Make the prediction 42 | pred = f.predictARIMA_R(p, d, q, endog=f.endog, exog=f.exog) 43 | preds.append(pred) 44 | # Append the model's data with the first data in the test arrays 45 | # Note that np.delete is analagous to pop, but -1 indicates the first 46 | # item in the array. 47 | f.exog = np.append(f.exog, [test_exog[0]], axis=0) 48 | test_exog = np.delete(test_exog, 0, axis=0) 49 | f.endog = np.append(f.endog, [test_endog[0]], axis=0) 50 | test_endog = np.delete(test_endog, 0) 51 | 52 | return preds, endog_expected 53 | 54 | 55 | def decisionRule(): 56 | """Decide whether to buy, sell, or hold.""" 57 | pass 58 | 59 | 60 | def score_simulation(preds, endog_expected): 61 | """Score a simulation based on mean squared error.""" 62 | MSE = 0 63 | for i in range(len(preds)): 64 | MSE += (preds[i] - endog_expected[i])**2 65 | return MSE 66 | 67 | 68 | def test_f(gen): 69 | p = gen[0][0] 70 | d = gen[0][1] 71 | q = gen[0][2] 72 | try: 73 | preds, exog_expected = simulate(p, d, q) 74 | score = score_simulation(preds, exog_expected) 75 | except: 76 | score = 0 77 | return (score, p, d, q) 78 | 79 | 80 | if __name__ == "__main__": 81 | POOL = Pool(maxtasksperchild=500) 82 | p_range = range(5) 83 | d_range = range(5) 84 | q_range = [0] * 5 85 | gen = list() 86 | for _p in p_range: 87 | for _d in d_range: 88 | gen.append((_p, _d, 0)) 89 | _gen = zip(gen) 90 | x = POOL.map(test_f, _gen) 91 | print("Done") 92 | print(x) 93 | # [(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (29.292981789631671, 1, 0, 0), (0, 1, 1, 0), (0, 1, 2, 0), (0, 1, 3, 0), (0, 1, 4, 0), (0, 2, 0, 0), (0, 2, 1, 0), (0, 2, 2, 0), (0, 2, 3, 0), (0, 2, 4, 0), (0, 3, 0, 0), (0, 3, 1, 0), (54.253053572867898, 3, 2, 0), (0, 3, 3, 0), (0, 3, 4, 0), (0, 4, 0, 0), (0, 4, 1, 0), (0, 4, 2, 0), (0, 4, 3, 0), (250.45917084881501, 4, 4, 0)] 94 | -------------------------------------------------------------------------------- /Preprocessing/Crawler/Crawler.py: -------------------------------------------------------------------------------- 1 | """A client to interact with node and to save data to mongo.""" 2 | 3 | from pymongo import MongoClient 4 | import crawler_util 5 | import requests 6 | import json 7 | import sys 8 | import os 9 | import logging 10 | import time 11 | import tqdm 12 | sys.path.append(os.path.realpath(os.path.dirname(__file__))) 13 | 14 | DIR = os.environ['BLOCKCHAIN_MONGO_DATA_DIR'] 15 | LOGFIL = "crawler.log" 16 | if "BLOCKCHAIN_ANALYSIS_LOGS" in os.environ: 17 | LOGFIL = 
"{}/{}".format(os.environ['BLOCKCHAIN_ANALYSIS_LOGS'], LOGFIL) 18 | crawler_util.refresh_logger(LOGFIL) 19 | logging.basicConfig(filename=LOGFIL, level=logging.DEBUG) 20 | logging.getLogger("urllib3").setLevel(logging.WARNING) 21 | 22 | 23 | class Crawler(object): 24 | """ 25 | A client to migrate blockchain from geth to mongo. 26 | 27 | Description: 28 | ------------ 29 | Before starting, make sure geth is running in RPC (port 8545 by default). 30 | Initializing a Crawler object will automatically scan the blockchain from 31 | the last block saved in mongo to the most recent block in geth. 32 | 33 | Parameters: 34 | ----------- 35 | rpc_port: default 8545 # The port on which geth RPC can be called 36 | host: default "http://localhost" # The geth host 37 | start: default True # Create the graph upon instantiation 38 | 39 | Usage: 40 | ------ 41 | Default behavior: 42 | crawler = Crawler() 43 | 44 | Interactive mode: 45 | crawler = Crawler(start=False) 46 | 47 | Get the data from a particular block: 48 | block = crawler.getBlock(block_number) 49 | 50 | Save the block to mongo. This will fail if the block already exists: 51 | crawler.saveBlock(block) 52 | 53 | """ 54 | 55 | def __init__( 56 | self, 57 | start=True, 58 | rpc_port=8545, 59 | host="http://localhost", 60 | delay=0.0001 61 | ): 62 | """Initialize the Crawler.""" 63 | logging.debug("Starting Crawler") 64 | self.url = "{}:{}".format(host, rpc_port) 65 | self.headers = {"content-type": "application/json"} 66 | 67 | # Initializes to default host/port = localhost/27017 68 | self.mongo_client = crawler_util.initMongo(MongoClient()) 69 | # The max block number that is in mongo 70 | self.max_block_mongo = None 71 | # The max block number in the public blockchain 72 | self.max_block_geth = None 73 | # Record errors for inserting block data into mongo 74 | self.insertion_errors = list() 75 | # Make a stack of block numbers that are in mongo 76 | self.block_queue = crawler_util.makeBlockQueue(self.mongo_client) 77 | # The delay between requests to geth 78 | self.delay = delay 79 | 80 | if start: 81 | self.max_block_mongo = self.highestBlockMongo() 82 | self.max_block_geth = self.highestBlockEth() 83 | self.run() 84 | 85 | def _rpcRequest(self, method, params, key): 86 | """Make an RPC request to geth on port 8545.""" 87 | payload = { 88 | "method": method, 89 | "params": params, 90 | "jsonrpc": "2.0", 91 | "id": 0 92 | } 93 | time.sleep(self.delay) 94 | res = requests.post( 95 | self.url, 96 | data=json.dumps(payload), 97 | headers=self.headers).json() 98 | return res[key] 99 | 100 | def getBlock(self, n): 101 | """Get a specific block from the blockchain and filter the data.""" 102 | data = self._rpcRequest("eth_getBlockByNumber", [hex(n), True], "result") 103 | block = crawler_util.decodeBlock(data) 104 | return block 105 | 106 | def highestBlockEth(self): 107 | """Find the highest numbered block in geth.""" 108 | num_hex = self._rpcRequest("eth_blockNumber", [], "result") 109 | return int(num_hex, 16) 110 | 111 | def saveBlock(self, block): 112 | """Insert a given parsed block into mongo.""" 113 | e = crawler_util.insertMongo(self.mongo_client, block) 114 | if e: 115 | self.insertion_errors.append(e) 116 | 117 | def highestBlockMongo(self): 118 | """Find the highest numbered block in the mongo database.""" 119 | highest_block = crawler_util.highestBlock(self.mongo_client) 120 | logging.info("Highest block found in mongodb:{}".format(highest_block)) 121 | return highest_block 122 | 123 | def add_block(self, n): 124 | """Add a block to 
mongo."""
125 |         b = self.getBlock(n)
126 |         if b:
127 |             self.saveBlock(b)
128 |             time.sleep(0.001)
129 |         else:
130 |             self.saveBlock({"number": n, "transactions": []})
131 |
132 |     def run(self):
133 |         """
134 |         Run the process.
135 |
136 |         Iterate through the blockchain on geth and fill up mongodb
137 |         with block data.
138 |         """
139 |         logging.debug("Processing geth blockchain:")
140 |         logging.info("Highest block found as: {}".format(self.max_block_geth))
141 |         logging.info("Number of blocks to process: {}".format(
142 |             len(self.block_queue)))
143 |
144 |         # Make sure the database isn't missing any blocks up to this point
145 |         logging.debug("Verifying that mongo isn't missing any blocks...")
146 |         self.max_block_mongo = 1
147 |         if len(self.block_queue) > 0:
148 |             print("Looking for missing blocks...")
149 |             self.max_block_mongo = self.block_queue.pop()
150 |             for n in tqdm.tqdm(range(1, self.max_block_mongo)):
151 |                 if len(self.block_queue) == 0:
152 |                     # If we have reached the max index of the queue,
153 |                     # break the loop
154 |                     break
155 |                 else:
156 |                     # -If a block with number = current index is not in
157 |                     #  the queue, add it to mongo.
158 |                     # -If the lowest block number in the queue (_n) is
159 |                     #  not the current running index (n), then _n > n
160 |                     #  and we must add block n to mongo. After doing so,
161 |                     #  we will add _n back to the queue.
162 |                     _n = self.block_queue.popleft()
163 |                     if n != _n:
164 |                         self.add_block(n)
165 |                         self.block_queue.appendleft(_n)
166 |                         logging.info("Added block {}".format(n))
167 |
168 |         # Get all new blocks
169 |         print("Processing remainder of the blockchain...")
170 |         for n in tqdm.tqdm(range(self.max_block_mongo, self.max_block_geth)):
171 |             self.add_block(n)
172 |
173 |         print("Done!\n")
174 |
--------------------------------------------------------------------------------
/Preprocessing/Crawler/__init__.py:
--------------------------------------------------------------------------------
1 | from Crawler import Crawler
2 | from crawler_util import *
3 |
--------------------------------------------------------------------------------
/Preprocessing/Crawler/crawler_util.py:
--------------------------------------------------------------------------------
1 | """Util functions for interacting with geth and mongo."""
2 | import pymongo
3 | from collections import deque
4 | import os
5 | import pdb
6 |
7 | DB_NAME = "blockchain"
8 | COLLECTION = "transactions"
9 |
10 | # mongodb
11 | # -------
12 | def initMongo(client):
13 |     """
14 |     Given a mongo client instance, create db/collection if either doesn't exist.
15 |
16 |     Parameters:
17 |     -----------
18 |     client
19 |
20 |     Returns:
21 |     --------
22 |
23 |     """
24 |     db = client[DB_NAME]
25 |     try:
26 |         db.create_collection(COLLECTION)
27 |     except:
28 |         pass
29 |     try:
30 |         # Index the block number so duplicate records cannot be made
31 |         db[COLLECTION].create_index(
32 |             [("number", pymongo.DESCENDING)],
33 |             unique=True
34 |         )
35 |     except:
36 |         pass
37 |
38 |     return db[COLLECTION]
39 |
40 |
41 | def insertMongo(client, d):
42 |     """
43 |     Insert a document into mongo client with collection selected.
44 |
45 |     Params:
46 |     -------
47 |     client
48 |     d
49 |
50 |     Returns:
51 |     --------
52 |     error
53 |     """
54 |     try:
55 |         client.insert_one(d)
56 |         return None
57 |     except Exception as err:
58 |         return err
59 |
60 |
61 | def highestBlock(client):
62 |     """
63 |     Get the highest numbered block in the collection.
64 | 65 | Params: 66 | ------- 67 | client 68 | 69 | Returns: 70 | -------- 71 | 72 | """ 73 | n = client.find_one(sort=[("number", pymongo.DESCENDING)]) 74 | if not n: 75 | # If the database is empty, the highest block # is 0 76 | return 0 77 | assert "number" in n, "Highest block is incorrectly formatted" 78 | return n["number"] 79 | 80 | 81 | def makeBlockQueue(client): 82 | """ 83 | Form a queue of blocks that are recorded in mongo. 84 | 85 | Params: 86 | ------- 87 | client 88 | 89 | Returns: 90 | -------- 91 | 92 | """ 93 | queue = deque() 94 | all_n = client.find({}, {"number":1, "_id":0}, 95 | sort=[("number", pymongo.ASCENDING)]) 96 | for i in all_n: 97 | queue.append(i["number"]) 98 | return queue 99 | 100 | # Geth 101 | # ---- 102 | def decodeBlock(block): 103 | """ 104 | Decode various pieces of information (from hex) for a block and return the parsed data. 105 | 106 | Note that the block is of the form: 107 | { 108 | "id": 0, 109 | "jsonrpc": "2.0", 110 | "result": { 111 | "number": "0xf4241", 112 | "hash": "0xcb5cab7266694daa0d28cbf40496c08dd30bf732c41e0455e7ad389c10d79f4f", 113 | "parentHash": "0x8e38b4dbf6b11fcc3b9dee84fb7986e29ca0a02cecd8977c161ff7333329681e", 114 | "nonce": "0x9112b8c2b377fbe8", 115 | "sha3Uncles": "0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347", 116 | "logsBloom": "0x0", 117 | "transactionsRoot": "0xc61c50a0a2800ddc5e9984af4e6668de96aee1584179b3141f458ffa7d4ecec6", 118 | "stateRoot": "0x7dd4aabb93795feba9866821c0c7d6a992eda7fbdd412ea0f715059f9654ef23", 119 | "receiptRoot": "0xb873ddefdb56d448343d13b188241a4919b2de10cccea2ea573acf8dbc839bef", 120 | "miner": "0x2a65aca4d5fc5b5c859090a6c34d164135398226", 121 | "difficulty": "0xb6b4bbd735f", 122 | "totalDifficulty": "0x63056041aaea71c9", 123 | "size": "0x292", 124 | "extraData": "0xd783010303844765746887676f312e352e31856c696e7578", 125 | "gasLimit": "0x2fefd8", 126 | "gasUsed": "0x5208", 127 | "timestamp": "0x56bfb41a", 128 | "transactions": [ 129 | { 130 | "hash": "0xefb6c796269c0d1f15fdedb5496fa196eb7fb55b601c0fa527609405519fd581", 131 | "nonce": "0x2a121", 132 | "blockHash": "0xcb5cab7266694daa0d28cbf40496c08dd30bf732c41e0455e7ad389c10d79f4f", 133 | "blockNumber": "0xf4241", 134 | "transactionIndex": "0x0", 135 | "from": "0x2a65aca4d5fc5b5c859090a6c34d164135398226", 136 | "to": "0x819f4b08e6d3baa33ba63f660baed65d2a6eb64c", 137 | "value": "0xe8e43bc79c88000", 138 | "gas": "0x15f90", 139 | "gasPrice": "0xba43b7400", 140 | "input": "0x" 141 | } 142 | ], 143 | "uncles": [] 144 | } 145 | } 146 | """ 147 | try: 148 | b = block 149 | if "result" in block: 150 | b = block["result"] 151 | # Filter the block 152 | new_block = { 153 | "number": int(b["number"], 16), 154 | "timestamp": int(b["timestamp"], 16), # Timestamp is in unix time 155 | "transactions": [] 156 | } 157 | # Filter and decode each transaction and add it back 158 | # Value, gas, and gasPrice are all converted to ether 159 | for t in b["transactions"]: 160 | new_t = { 161 | "from": t["from"], 162 | "to": t["to"], 163 | "value": float(int(t["value"], 16))/1000000000000000000., 164 | "data": t["input"] 165 | } 166 | new_block["transactions"].append(new_t) 167 | return new_block 168 | 169 | except: 170 | return None 171 | 172 | 173 | def refresh_logger(filename): 174 | """Remove old logs and create new ones.""" 175 | if os.path.isfile(filename): 176 | try: 177 | os.remove(filename) 178 | except Exception: 179 | pass 180 | open(filename, 'a').close() 181 | 
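# Illustrative usage (not part of the original module): decode an RPC-style
# payload shaped like the example documented in decodeBlock's docstring above.
# Hex fields are converted to integers and the value from Wei to ether.
if __name__ == "__main__":
    sample = {
        "result": {
            "number": "0xf4241",
            "timestamp": "0x56bfb41a",
            "transactions": [{
                "from": "0x2a65aca4d5fc5b5c859090a6c34d164135398226",
                "to": "0x819f4b08e6d3baa33ba63f660baed65d2a6eb64c",
                "value": "0xe8e43bc79c88000",
                "input": "0x"
            }]
        }
    }
    # Prints a filtered block dict with number 1000001 (0xf4241), a unix
    # timestamp, and one decoded transaction.
    print(decodeBlock(sample))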
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ethereum Blockchain Parser 2 | 3 | This is a project to parse the Ethereum blockchain from a local geth node. Blockchains are perfect data sets because they contain every transaction ever made on the network. This is valuable data if you want to analyze the network, but Ethereum stores its blockchain in [RLP](https://github.com/ethereum/wiki/wiki/RLP) encoded binary blobs within a series of LevelDB files and these are surprisingly difficult to access, even given the available tools. This project takes the approach of querying a local node via [JSON-RPC](https://github.com/ethereum/wiki/wiki/JSON-RPC), which returns unencoded transactional data, and then moves that data to a mongo database. 4 | 5 | ![Blocks 1 to 120000](.content/1_120000.jpg) 6 | 7 | 8 | ## Usage 9 | 10 | ### Streaming data 11 | 12 | To stream blockchain data for real-time analysis, make sure you have both geth and mongo running and start the process with: 13 | 14 | python3 stream.py 15 | 16 | Note that this will automatically backfill your mongo database with blocks that it is missing. 17 | 18 | ### Backfilling your Mongo database 19 | 20 | To get data from the blockchain as it exists now and then stop parsing, simply run the following scripts, which are located in the `Scripts` directory. Note that at the time of writing, the Ethereum blockchain has about 1.5 million blocks so this will likely take several hours. 21 | 22 | 1. Funnel the data from geth to MongoDB: 23 | 24 | 25 | python3 preprocess.py 26 | 27 | 2. Create a series of snapshots of the blockchain through time and for each snapshot, calculate key metrics. Dump the data into a CSV file: 28 | 29 | 30 | python3 extract.py 31 | 32 | 33 | 34 | ## Prerequisites: 35 | 36 | Before using this tool to analyze your copy of the blockchain, you need the following things: 37 | 38 | ### Geth 39 | [Geth](https://github.com/ethereum/go-ethereum/wiki/Geth) is the Go implementation of a full Ethereum node. We will need to run it with the `--rpc` flag in order to request data (**WARNING** if you run this on a geth client containing an account that has ether in it, make sure you put a firewall 8545 or whatever port you run geth RPC on). 40 | 41 | A geth instance downloads the blockchain and processes it, saving the blocks as LevelDB files in the specified data directory (`~/.ethereum/chaindata` by default). The geth instance can be queried via RPC with the `eth_getBlockByNumber([block, true])` endpoint (see [here](https://github.com/ethereum/wiki/wiki/JSON-RPC#eth_getblockbynumber)) to get the `X-th` block (with `true` indicating we want the transactional data included), which returns data of the form: 42 | 43 | { 44 | number: 1000000, 45 | timestamp: 1465003569, 46 | ... 
47 |         transactions: [
48 |           {
49 |             blockHash: "0x2052ce710a08094b81b5047ea9df5119773ce4b263a23d86659fa7293251055e",
50 |             blockNumber: 1284937,
51 |             from: "0x1f57f826caf594f7a837d9fc092456870a289365",
52 |             gas: 22050,
53 |             gasPrice: 20000000000,
54 |             hash: "0x654ac26084ee6e40767e8735f38274ef5f594454a4d34cfdd70c93aa95be0c64",
55 |             input: "0x",
56 |             nonce: 6610,
57 |             to: "0xfbb1b73c4f0bda4f67dca266ce6ef42f520fbb98",
58 |             transactionIndex: 27,
59 |             value: 201544820000000000
60 |           }
61 |         ]
62 |     }
63 |
64 | Since I am only interested in `number`, `timestamp`, and `transactions` for this application, I have omitted the rest of the data, but there is lots of additional information in the block (explore [here](https://etherchain.org/blocks)), including a few Merkle trees to maintain hashes of state, transactions, and receipts (read [here](https://blog.ethereum.org/2015/11/15/merkling-in-ethereum/)).
65 |
66 | Using the `from` and `to` addresses in the `transactions` array, I can map the flow of ether through the network as time progresses. Note that the value, gas, and gasPrice are in Wei, where 1 Ether = 10^18 Wei. The numbers are converted into Ether automatically with this tool.
67 |
68 | ### MongoDB
69 |
70 | We will use mongo to essentially copy each block served by Geth, preserving its structure. The data outside the scope of this analysis will be omitted. Note that this project also requires pymongo.
71 |
72 | ### graph-tool
73 |
74 | [graph-tool](https://graph-tool.skewed.de/) is a python library written in C++ to construct graphs quickly, and it has a flexible feature set for mapping properties to its edges and vertices. Depending on your system, this may be tricky to install, so be sure to follow their instructions carefully. I recommend you find some way to install it with a package manager because building from source is a pain.
75 |
76 | ### python3
77 |
78 | This was written for python 3.4 with the packages: contractmap, tqdm and requests. Some things will probably break if you try to do this analysis in python 2.
79 |
80 |
81 | ## Workflow
82 |
83 | The following outlines the procedure used to turn the data from bytes on the blockchain into data in a CSV file.
84 |
85 | ### 1. Process the blockchain
86 |
87 | Preprocessing is done with the `Crawler` class, which can be found in the `Preprocessing/Crawler` directory. Before instantiating a `Crawler` object, you need to have geth and mongo processes running. Starting a `Crawler()` instance will go through the processes of requesting and processing the blockchain from geth and copying it over to a Mongo collection named `transactions`. Once copied over, you can close the `Crawler()` instance.
88 |
89 | ### 2. Take a snapshot of the blockchain
90 |
91 | A snapshot of the network (i.e. all of the transactions occurring between two timestamps, or numbered blocks in the blockchain) can be taken with a `TxnGraph()` instance. This class can be found in the `Analysis` directory. Create an instance with:
92 |
93 |     snapshot = TxnGraph(a, b)
94 |
95 | where a is the starting block (int) and b is the ending block (int). This will build a directed graph of all ethereum addresses that made transactions between the two specified blocks. It will also weight vertices by the total amount of Ether they hold at the time that the ending block was mined and edges by the amount of ether sent in the transaction.
96 |
97 | To move on to the next snapshot (i.e. forward in time):
98 |
99 |     snapshot.extend(c)
100 |
101 | where `c` is the number of blocks to proceed.
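For example, to build an initial snapshot and then walk forward through the chain in fixed increments (an illustrative sketch; the block numbers and step size here are arbitrary):

    # Build a snapshot of blocks 1-100000, then extend it
    # forward in 100000-block steps.
    snapshot = TxnGraph(1, 100000)
    for _ in range(5):
        snapshot.extend(100000)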
102 | 
103 | At each snapshot, the instance will automatically pickle its state and save it to a local file (disable this on instantiation with `save=False`).
104 | 
105 | #### Drawing an image:
106 | 
107 | Once a `TxnGraph` is created, it builds a graph out of all of the data in the blocks between `a` and `b`. An image can be drawn by calling `TxnGraph.draw()`, and specific dimensions can be passed using `TxnGraph.draw(w=A, h=B)`, where `A` and `B` are ints corresponding to numbers of pixels. By default, the image is saved to the `Analysis/data/snapshots` directory.
108 | 
109 | #### Saving/Loading State (using pickle)
110 | 
111 | The `TxnGraph` instance state can be (and automatically is) pickled with `TxnGraph.save()`, where the filename is parameterized by the start/end blocks. By default, this saves to the `Analysis/data/pickles` directory. If another instance was pickled with a different set of start/end blocks, it can be reloaded with `TxnGraph.load(a, b)`.
112 | 
113 | ### 3. (Optional) Add a lookup table for smart contract transactions
114 | 
115 | An important consideration when analyzing the Ethereum network is the presence of smart contract addresses. Much ether flows to and from contracts, which you may want to distinguish from simple peer-to-peer transactions. This can be done by loading a `ContractMap` instance. It is recommended that you pass the most recent block in the blockchain as `last_block`, as this will find all contracts that were transacted with up to that point in history:
116 | 
117 |     # If a mongo_client is passed, the ContractMap will scan geth via RPC
118 |     # for new contract addresses starting at "last_block".
119 |     cmap = ContractMap(mongo_client, last_block=90000, filepath="./contracts.p")
120 |     cmap.save()
121 | 
122 |     # If None is passed for a mongo_client, the ContractMap will automatically
123 |     # load the map of addresses from the pickle file specified in "filepath",
124 |     # ./contracts.p by default.
125 |     cmap = ContractMap()
126 | 
127 | This will create a hash table of all contract addresses using a `defaultdict` and will save it to a pickle file.
128 | 
129 | ### 4. Aggregate data and analyze
130 | 
131 | Once a snapshot has been created, initialize an instance of `ParsedBlocks` with a `TxnGraph` instance. This will automatically aggregate the data and save it to a local CSV file, which can then be analyzed.
132 | 
--------------------------------------------------------------------------------
/Scripts/draw_graphs.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("./../Analysis")
3 | from TxnGraph import TxnGraph
4 | import os
5 | 
6 | 
7 | # Build and draw a graph snapshot based on a tuple of form (start_block, end_block)
8 | def build(blocks, old_graph):
9 |     path_exists = os.path.exists("data/snapshots/")
10 |     assert path_exists, "No path exists to store the snapshots."
11 | 
12 |     print("Start=%s, End=%s; Building graph." % (blocks[0], blocks[1]))
13 | 
14 |     if old_graph:
15 |         previous = {"graph": old_graph, "end_block": blocks[0]}
16 |     else:
17 |         previous = None
18 |     tmp = TxnGraph(blocks[0], blocks[1], previous=previous)
19 |     tmp.draw()
20 |     return tmp.graph, blocks[1]
21 | 
22 | 
23 | if __name__ == "__main__":
24 |     # Take a bunch of snapshots based on the resolution.
25 |     # Between each snapshot, pass the previous graph object and the previous
26 |     # end_block number as the start_block in the new snapshot.
27 |     resolution = 100000
28 |     block_max = 1000000
29 |     tmp_graph = None
30 |     tmp_last_block = 0
31 |     # Snapshot windows: (0, resolution), (resolution, 2*resolution), ...
32 |     for i in range(1, block_max // resolution + 1):
33 |         tmp_graph, tmp_last_block = build(
34 |             (tmp_last_block, resolution * i), tmp_graph
35 |         )
--------------------------------------------------------------------------------
/Scripts/extract.py:
--------------------------------------------------------------------------------
1 | """
2 | Parse a bunch of snapshots of the blockchain and dump contents into a CSV file.
3 | """
4 | import sys
5 | sys.path.append("./../Analysis")
6 | import os
7 | os.environ['ETH_BLOCKCHAIN_ANALYSIS_DIR'] = './../Analysis/'
8 | from ParsedBlocks import ParsedBlocks
9 | from TxnGraph import TxnGraph
10 | import tqdm
11 | 
12 | 
13 | def syncCSV(filename):
14 |     """Resume populating the CSV file."""
15 |     block = 0
16 |     with open(filename, "r") as f:
17 |         for line in f:
18 |             data = line.split(",")
19 |             try:
20 |                 if int(data[0]) > block:
21 |                     block = int(data[0])
22 |             except ValueError:
23 |                 # Skip the header row and any malformed lines.
24 |                 pass
25 |     return block
26 | 
27 | 
28 | if __name__ == "__main__":
29 |     max_block = 1600000
30 |     resolution = 1000
31 |     CSVFILE = "blockchain.csv"
32 |     STEP = 1000
33 |     prev_max_block = 0
34 | 
35 |     if os.path.exists(CSVFILE):
36 |         prev_max_block = syncCSV(CSVFILE)
37 | 
38 |     # Always start at block 1 because the data is cumulative.
39 |     # Resume at previous block + 1000
40 |     t = TxnGraph(1, prev_max_block + STEP)
41 |     for i in tqdm.tqdm(range(max_block // resolution)):
42 |         if t.end_block > prev_max_block:
43 |             blocks = ParsedBlocks(t)
44 |             t.extend(STEP)
45 |         else:
46 |             t.end_block += STEP
--------------------------------------------------------------------------------
/Scripts/preprocess.py:
--------------------------------------------------------------------------------
1 | """Pull data from geth and parse it into mongo."""
2 | 
3 | import subprocess
4 | import sys
5 | sys.path.append("./../Preprocessing")
6 | sys.path.append("./../Analysis")
7 | import os
8 | os.environ['ETH_BLOCKCHAIN_ANALYSIS_DIR'] = './../Preprocessing/'
9 | from Crawler import Crawler
10 | from ContractMap import ContractMap
11 | 
12 | LOGDIR = "./../Preprocessing/logs"
13 | 
14 | 
15 | # With shell=True each command must be passed as a single string, so geth and
16 | # mongod are launched with separate calls.
17 | subprocess.call(
18 |     "(geth --rpc --rpcport 8545 > {}/geth.log 2>&1) &".format(LOGDIR),
19 |     shell=True)
20 | subprocess.call(
21 |     "(mongod --dbpath mongo/data --port 27017 > {}/mongo.log 2>&1) &".format(LOGDIR),
22 |     shell=True)
23 | 
24 | print("Booting processes.")
25 | # Catch up with the crawler
26 | c = Crawler()
27 | 
28 | print("Updating contract hash map.")
29 | # Update the contract addresses that have been interacted with
30 | ContractMap(c.mongo_client, last_block=c.max_block_mongo)
31 | 
32 | print("Update complete.")
33 | subprocess.call(
34 |     "(geth --rpc --rpcport 8545 > {}/geth.log 2>&1) &".format(LOGDIR),
35 |     shell=True)
36 | subprocess.call(
37 |     "(mongod --dbpath mongo/data --port 27017 > {}/mongo.log 2>&1) &".format(LOGDIR),
38 |     shell=True)
--------------------------------------------------------------------------------
/stream.py:
--------------------------------------------------------------------------------
1 | """Stream updates to the blockchain from geth to mongo."""
2 | import sys
3 | import os
4 | sys.path.append("Preprocessing/Crawler")
5 | from Crawler import Crawler
6 | sys.path.append("Analysis")
7 | from TxnGraph import TxnGraph
8 | from ParsedBlocks import ParsedBlocks
9 | sys.path.append("Scripts")
10 | from extract import syncCSV
11 | import tqdm
12 | 
13 | 
14 | def syncMongo(c):
15 |     """Sync mongo with geth blocks."""
16 |     gethBlock = c.highestBlockEth()
17 |     mongoBlock = c.highestBlockMongo()
18 |     counter = 0
19 |     if gethBlock > mongoBlock:
20 |         print("Syncing Mongo...")
21 |         for i in range(gethBlock - mongoBlock):
22 |             c.add_block(mongoBlock + i)
23 |             counter += 1
24 |             if counter >= 100:
25 |                 print("Successfully parsed {} blocks.".format(counter))
26 |                 print("Currently at block {} of {}".format(mongoBlock + i, gethBlock))
27 |                 counter = 0
28 | 
29 | if __name__ == "__main__":
30 |     # Print success every N iterations
31 |     n = 100
32 | 
33 |     # Initialize a crawler that will catch the mongodb up
34 |     c = Crawler()
35 |     syncMongo(c)
36 | 
37 |     # Initialize a TxnGraph and save it every N blocks
38 |     N = 1000
39 |     t = None
40 | 
41 |     # Global vars
42 |     CSVFILE = "Scripts/blockchain.csv"
43 |     STEP = 1000
44 |     prev_max_block = 0
45 | 
46 |     # Sync with the CSV file
47 |     if os.path.exists(CSVFILE):
48 |         prev_max_block = syncCSV(CSVFILE)
49 | 
50 |     # Catch the CSV data up
51 |     _highestBlockMongo = c.highestBlockMongo()
52 | 
53 |     if prev_max_block + STEP <= _highestBlockMongo:
54 |         t = TxnGraph(1, prev_max_block + STEP)
55 |         for i in tqdm.tqdm(range(_highestBlockMongo // STEP)):
56 |             if t.end_block > prev_max_block:
57 |                 blocks = ParsedBlocks(t)
58 |                 t.extend(STEP)
59 |             else:
60 |                 t.end_block += STEP
61 | 
62 |     while True:
63 |         # Sync
64 |         syncMongo(c)
65 | 
66 |         # Initialize TxnGraph if it doesn't exist yet
67 |         if not t:
68 |             t = TxnGraph(1, c.highestBlockMongo())
69 | 
70 |         # Do the next iteration of the TxnGraph if applicable
71 |         if t.end_block + STEP <= c.highestBlockMongo():
72 |             t.extend(STEP)
73 | 
74 |         # Print an update at a certain resolution
75 |         if not t.end_block % 10000:
76 |             print("Streaming at block {}".format(t.end_block))
--------------------------------------------------------------------------------
/test/forecast.py:
--------------------------------------------------------------------------------
1 | """Test workflow of forecasting model."""
2 | import sys
3 | sys.path.append("../Forecasting")
4 | import model
5 | 
6 | 
7 | def test_forecast():
8 |     """Optimize an ARIMA model and predict a few data points."""
9 |     START = 5
10 |     END = 10
11 |     print("Forecasting...")
12 |     f = model.Forecast('../Forecasting/blockchain.csv')
13 |     f.optimizeARIMA(
14 |         range(5), range(5), range(5), f.endog, f.exog
15 |     )
16 |     pred = f.predictARIMA(START, END)
17 |     assert len(pred) == (END - START)
--------------------------------------------------------------------------------
/test/verify_blocks.py:
--------------------------------------------------------------------------------
1 | """Test that the transactions in local blocks are correct."""
2 | import requests
3 | import random
4 | import json
5 | import sys
6 | sys.path.append("../Preprocessing")
7 | from Crawler import Crawler
8 | import pprint
9 | 
10 | def test_blocks():
11 |     """
12 |     Check transactions in each of a random sample of blocks.
13 | 
14 |     Send a request to https://etherchain.org/api/block/:block/tx to get a list
15 |     of all transactions that occurred in that block. Cross-reference with the
16 |     transactions in the local block (in mongo).
17 |     """
18 |     c = Crawler.Crawler(start=False)
19 |     client = c.mongo_client
20 | 
21 |     sample = random.sample(range(1, 1700000), 100)
22 |     N = len(sample)
23 | 
24 |     # Track the number of times the number of transactions is different.
25 |     wrong_blocks = list()
26 |     num_error = "Incorrect number of transactions in {}% of {} blocks."
27 | 
28 |     blocks = client.find({"number": {"$in": sample}})
29 |     for block in blocks:
30 |         n = block["number"]
31 |         uri = "https://etherchain.org/api/block/{}/tx".format(n)
32 |         ethchain = json.loads(requests.get(uri).text)
33 | 
34 |         # Check the number of transactions in the block
35 |         if len(ethchain["data"]) != len(block["transactions"]):
36 |             wrong_blocks.append(n)
37 | 
38 |     wrong_nums = len(wrong_blocks)
39 |     pprint.pprint(wrong_blocks)
40 |     assert wrong_nums == 0, num_error.format(100. * wrong_nums / N, N)
41 | 
--------------------------------------------------------------------------------
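Both test modules define module-level `test_*` functions, so they can be collected by pytest (assuming pytest is installed). Because they append relative paths such as `../Preprocessing` to `sys.path`, the suggested invocation below should be run from inside the `test` directory, with geth, mongo, and an internet connection available where the individual tests require them:

    python3 -m pytest forecast.py verify_blocks.py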