├── .gitignore
├── LICENSE
├── README.md
├── nodeapi_utils
│   ├── __init__.py
│   └── utils.py
├── requirements.txt
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

output/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Web3Analytic

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Node API Utilities

This is a toolkit for making large batches of asynchronous queries to an RPC provider API in order to download a historical dataset of blocks and transactions. See below for more information on the supported providers:

- [QuickNode](https://www.quicknode.com/core-api)
- [Alchemy](https://docs.alchemy.com/reference/api-overview)
- [Infura](https://www.infura.io/product/overview)
- [Chainstack](https://chainstack.com/solution)

Making serial API calls to `eth_getBlockByNumber` costs roughly 0.15 seconds per call. Parallelizing across $N$ threads roughly cuts that down to $\frac{0.15}{N}$ seconds per call. For instance, downloading 42.5M blocks from Arbitrum with 1 thread would take ~74 days, while 10 threads would finish in roughly a week (~0.015 seconds per call). For machines with a large number of threads, this can greatly reduce the time cost.
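
As a rough sanity check on these numbers (assuming the ~0.15 s serial latency above and ideal scaling across threads; real throughput depends on your provider and its rate limits):

```python
# Back-of-the-envelope runtime estimate (illustrative only).
SECONDS_PER_CALL = 0.15   # approximate serial latency per eth_getBlockByNumber call
NUM_BLOCKS = 42_500_000   # e.g. the Arbitrum history mentioned above
NUM_THREADS = 10

seconds = NUM_BLOCKS * SECONDS_PER_CALL / NUM_THREADS
print(f"~{seconds / 86400:.1f} days")  # ~7.4 days with 10 threads, ~74 days with 1
```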

## Disclaimer

Parallelization does not reduce the cost of using an RPC provider API. Please monitor your usage and keep track of method call costs:

- [QuickNode's Credits](https://www.quicknode.com/api-credits)
- [Alchemy's Compute Units](https://docs.alchemy.com/reference/compute-units)
- [Infura's Requests](https://www.infura.io/pricing)
- [Chainstack's Requests](https://chainstack.com/pricing)

## Setup

After cloning this repo, run:

```
pip install -e .
```

If you want to upload data to Google Cloud Storage, you will need to point the environment variable `GOOGLE_APPLICATION_CREDENTIALS` at a service account credentials file.
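
For example, you can set it from Python before creating the storage client (the path below is a placeholder for your own key file):

```python
import os

# Hypothetical path to your service-account key file
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account.json"
```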

## Usage

Here is an example code snippet that downloads all blocks and transactions from Arbitrum. It saves a [jsonlines](https://pypi.org/project/jsonlines/) file every 100k blocks.

```python
from nodeapi_utils import DatasetBuilder

builder = DatasetBuilder(
    rpc_provider=...,        # RPC provider name (e.g. quicknode, alchemy, infura, chainstack)
    rpc_provider_url=...,    # Your RPC provider URL (optional if using api_key; required for quicknode & chainstack)
    api_key=None,            # Your API key (optional if using rpc_provider_url)
    out_dir='./output',      # Optional: output directory to save API responses to
    chain='arbitrum',        # Supports ethereum, arbitrum, optimism, polygon, etc. (optional if using rpc_provider_url)
    start_block=16092775,    # Block to begin pulling data from
    save_every=100000,       # Saves a file for every 100k blocks
)
# Increase the number of threads for faster performance
builder.async_get(num_threads=10)

# After that completes, upload to storage buckets
# NOTE: requires Google Cloud credentials (see Setup above)
builder.upload_buckets(
    'some_unique_bucket_name',
    create_bucket=True,        # Creates the bucket
    delete_post_upload=False,  # Delete raw files after upload
)
```

The code only supports the `eth_getBlockByNumber` RPC method, which is available for the following chains (depending on the provider selected): `ethereum`, `arbitrum`, `arbitrum-nova`, `optimism`, `polygon`, `avalanche`, `celo`, `fantom`, `binance-smart-chain`, `gnosis`. Reach out for inquiries on additional methods to include.
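
Once a run finishes, the saved files can be streamed back with `jsonlines`. A minimal sketch (the file name is illustrative; each record is a standard `eth_getBlockByNumber` result, so numeric fields such as `number` are hex strings):

```python
import jsonlines

# Example output file; actual names follow blocks-<start>-to-<end>.jsonl
with jsonlines.open('./output/blocks-16092775-to-16192775.jsonl') as reader:
    for block in reader:
        block_number = int(block['number'], 16)
        num_txs = len(block['transactions'])
        print(block_number, num_txs)
```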

--------------------------------------------------------------------------------
/nodeapi_utils/__init__.py:
--------------------------------------------------------------------------------
from .utils import DatasetBuilder

--------------------------------------------------------------------------------
/nodeapi_utils/utils.py:
--------------------------------------------------------------------------------
import json
import asyncio
import requests
import jsonlines
import numpy as np
from tqdm import tqdm
from glob import glob

from os import makedirs, remove
from os.path import join, basename

from timeit import default_timer
from concurrent.futures import ThreadPoolExecutor

from google.cloud import storage

# Internal variable used for logging
START_TIME = default_timer()

# List of supported RPC providers
SUPPORTED_PROVIDERS = ['quicknode', 'alchemy', 'infura', 'chainstack']


class DatasetBuilder:
    r"""Builds a dataset of historical block and transaction data.
    Arguments:
    --
    rpc_provider (str): RPC provider name (e.g. quicknode, alchemy, infura, chainstack).
    rpc_provider_url (Optional[str], default=None): RPC endpoint. Optional if using `api_key`;
        required for quicknode and chainstack.
    api_key (Optional[str], default=None): RPC provider API key. Optional if using `rpc_provider_url`.
    out_dir (Optional[str], default='./output'): Output directory to save API responses to.
    chain (str, default='ethereum'): Which chain to pull data from. Optional if using `rpc_provider_url`.
    start_block (int, default=1): Block number to start pulling data from.
    end_block (Optional[int], default=None): Block number to stop pulling data at.
        If None is supplied, defaults to the latest block.
    save_every (int, default=100000): Blocks will be saved in batches of this size.
        For example, a dataset of 1M rows will be saved across 10 files if `save_every` is 100k.
    """
    def __init__(self,
                 rpc_provider,
                 rpc_provider_url=None,
                 api_key=None,
                 out_dir='./output',
                 chain='ethereum',
                 start_block=1,
                 end_block=None,
                 save_every=100000,
                 ):
        assert rpc_provider in SUPPORTED_PROVIDERS, f'Provider {rpc_provider} not supported.'

        if (rpc_provider_url is None) and (api_key is None):
            raise Exception('You have to provide at least one of the following parameters: `rpc_provider_url` or `api_key`.')

        if rpc_provider in ['quicknode', 'chainstack']:
            assert rpc_provider_url is not None, f'{rpc_provider} requires you to specify the `rpc_provider_url`.'

        # Get the URL to send requests to
        rpc_url = get_rpc_provider(rpc_provider, rpc_provider_url, chain, api_key)

        if end_block is None:
            # Ping RPC to get the latest block
            last_block = get_current_block(rpc_url)
            end_block = last_block

        # Create directory to save output if not existing yet
        makedirs(out_dir, exist_ok=True)

        # `save_every` cannot be bigger than the number of blocks
        save_every = min(save_every, end_block - start_block)

        # Save to class
        self.rpc_url = rpc_url
        self.out_dir = out_dir
        self.start_block = start_block
        self.end_block = end_block
        self.save_every = save_every

    def async_get(self, num_threads=10):
        r"""Parallel API calls.
        Arguments:
        --
        num_threads (int, default=10): Number of parallel threads.
        """
        chunks = np.arange(self.start_block, self.end_block + 1, self.save_every)
        # Make sure the final (possibly partial) chunk reaches the end block;
        # otherwise trailing blocks would be silently skipped.
        if chunks[-1] != self.end_block + 1:
            chunks = np.append(chunks, self.end_block + 1)

        for i in range(len(chunks) - 1):
            start_block_i = int(chunks[i])
            end_block_i = int(chunks[i+1])
            out_file = join(self.out_dir,
                            f'blocks-{start_block_i}-to-{end_block_i}.jsonl',
                            )

            loop = asyncio.get_event_loop()
            future = asyncio.ensure_future(
                async_make_api_requests(url=self.rpc_url,
                                        start_block=start_block_i,
                                        end_block=end_block_i,
                                        num_threads=num_threads,
                                        out_file=out_file,
                                        )
            )
            loop.run_until_complete(future)

    def upload_buckets(self,
                       bucket_name,
                       create_bucket=False,
                       delete_post_upload=False,
                       ):
        r"""Upload cached JSONL files to a Google Cloud Storage bucket.
        Arguments:
        --
        bucket_name (str): Name of the storage bucket to upload to.
        create_bucket (bool, default=False): If True, create the bucket.
        delete_post_upload (bool, default=False): Remove each local file after uploading if True.
        Notes:
        --
        A progress bar will be printed upon execution.
        """
        storage_client = storage.Client()

        if create_bucket:
            # If designated, create the storage bucket
            success = create_storage_bucket(bucket_name)

            if not success:
                raise Exception(f'Could not create bucket {bucket_name}. Try again?')

        # Check that the bucket exists
        assert storage_bucket_exists(bucket_name)

        # Fetch the bucket
        bucket = storage_client.bucket(bucket_name)

        # Find all files we want to upload
        data_files = glob(join(self.out_dir, '*.jsonl'))

        pbar = tqdm(total=len(data_files), desc='uploading to bucket')
        for data_file in data_files:
            blob_name = basename(data_file)
            blob = bucket.blob(blob_name)

            blob.upload_from_filename(data_file)

            # Delete the local file if requested
            if delete_post_upload:
                remove(data_file)

            pbar.update()
        pbar.close()


def get_rpc_provider(rpc_provider, rpc_provider_url, chain, api_key):
    r"""Returns the chain URL for an RPC provider.
    Arguments:
    --
    rpc_provider (str): RPC provider name.
    rpc_provider_url (Optional[str]): RPC endpoint, if supplied directly.
    chain (str): Chain to pull data from.
        Choices: ethereum | polygon | optimism | arbitrum
    api_key (Optional[str]): API key for the provider.
    Returns:
    --
    provider_url (str): RPC url
    """
    if rpc_provider == 'quicknode':
        # NOTE: QuickNode generates custom URLs for every project.
        provider_url = rpc_provider_url
    elif rpc_provider == 'chainstack':
        # NOTE: Chainstack assumes the provider URL is directly given.
        provider_url = rpc_provider_url
    elif rpc_provider == 'alchemy':
        if rpc_provider_url is not None:
            provider_url = rpc_provider_url
        else:
            # NOTE: This is not exhaustive. More can be added on request. Or use `rpc_provider_url`.
            if chain == 'ethereum':
                provider_url = f'https://eth-mainnet.g.alchemy.com/v2/{api_key}'
            elif chain == 'polygon':
                provider_url = f'https://polygon-mainnet.g.alchemy.com/v2/{api_key}'
            elif chain == 'optimism':
                provider_url = f'https://opt-mainnet.g.alchemy.com/v2/{api_key}'
            elif chain == 'arbitrum':
                provider_url = f'https://arb-mainnet.g.alchemy.com/v2/{api_key}'
            else:
                raise Exception(f'Chain {chain} not supported.')
    elif rpc_provider == 'infura':
        if rpc_provider_url is not None:
            provider_url = rpc_provider_url
        else:
            # NOTE: This is not exhaustive. More can be added on request. Or use `rpc_provider_url`.
            if chain == 'ethereum':
                provider_url = f'https://mainnet.infura.io/v3/{api_key}'
            elif chain == 'polygon':
                provider_url = f'https://polygon-mainnet.infura.io/v3/{api_key}'
            elif chain == 'optimism':
                provider_url = f'https://optimism-mainnet.infura.io/v3/{api_key}'
            elif chain == 'arbitrum':
                provider_url = f'https://arbitrum-mainnet.infura.io/v3/{api_key}'
            else:
                raise Exception(f'Chain {chain} not supported.')
    else:
        raise Exception(f'Provider {rpc_provider} not supported.')

    return provider_url
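
# Illustrative example (hypothetical key), not called anywhere in this module:
# get_rpc_provider('alchemy', None, 'arbitrum', 'MY_KEY') resolves to
# 'https://arb-mainnet.g.alchemy.com/v2/MY_KEY'.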

def get_current_block(url):
    r"""Get the current block.
    Arguments:
    --
    url (str): RPC endpoint URL.
    Returns:
    --
    block_number (int): Latest block number.
    """
    payload = {
        "id": 1,
        "jsonrpc": "2.0",
        "method": "eth_blockNumber"
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json"
    }
    response = requests.post(url, json=payload, headers=headers)
    if response.status_code != 200:
        raise Exception('Failed to fetch latest block number. RPC Provider response: ' + str(json.loads(response.text)))

    response_data = json.loads(response.text)
    block_number = int(response_data['result'], 0)

    return block_number


async def async_make_api_requests(url,
                                  start_block,
                                  end_block,
                                  out_file,
                                  num_threads=10,
                                  ):
    r"""Make async API requests.
    Arguments:
    --
    url (str): API endpoint.
    start_block (int): Block number to start pulling at (inclusive).
    end_block (int): Block number to stop pulling at (exclusive).
    out_file (str): Where to save API results.
    num_threads (int, default=10): Number of threads to use.
    """
    print("{0:<30} {1:>20}".format("Block number", "Completed at"))

    # Reset the module-level timer so per-block timestamps are relative to this chunk
    global START_TIME

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        with requests.Session() as session:
            loop = asyncio.get_event_loop()
            START_TIME = default_timer()  # update global start time
            tasks = [
                loop.run_in_executor(
                    executor,
                    make_api_request,
                    *(session, block, url)
                )
                for block in range(start_block, end_block)
            ]

            # Discard any failed responses
            dataset = []
            for block_data in await asyncio.gather(*tasks):
                if block_data is not None:
                    dataset.append(block_data)

            # Write to file
            with jsonlines.open(out_file, mode='w') as writer:
                writer.write_all(dataset)


def make_api_request(session, block_number, url):
    r"""Pings the method `eth_getBlockByNumber`.
    Arguments:
    --
    session (requests.Session): HTTP session reused across calls.
    block_number (int): Number of the block.
    url (str): URL for the RPC API.
    Returns:
    --
    response_data (Dict[str, any]): Block and transaction JSON.
    Notes:
    --
    Prints block #'s and timestamps as it runs.
    """
    payload = {
        "id": 1,
        "jsonrpc": "2.0",
        "method": "eth_getBlockByNumber",
        "params": [hex(block_number), True],
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json"
    }
    with session.post(url, json=payload, headers=headers) as response:
        if response.status_code != 200:
            return None

        response_json = json.loads(response.text)
        response_data = response_json['result']

        # Print update
        elapsed_time = default_timer() - START_TIME
        completed_at = "{:5.2f}s".format(elapsed_time)
        print("{0:<30} {1:>20}".format(block_number, completed_at))

        return response_data


def storage_bucket_exists(gcloud_bucket):
    r"""Checks if a storage bucket exists.
    Arguments:
    --
    gcloud_bucket (str): Name of the storage bucket.
    Returns:
    --
    exists (bool): True if the bucket exists; False otherwise.
    """
    client = storage.Client()
    exists = storage.Bucket(client, gcloud_bucket).exists()

    return exists


def create_storage_bucket(gcloud_bucket):
    r"""Creates a new storage bucket.
    Arguments:
    --
    gcloud_bucket (str): Name of the storage bucket.
    Returns:
    --
    success (bool): True if the bucket was created. False otherwise.
    Notes:
    --
    Assumes the storage bucket does not exist yet. We do not explicitly
    check for this.
    """
    client = storage.Client()
    client.create_bucket(gcloud_bucket)
    success = storage_bucket_exists(gcloud_bucket)
    return success

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
jsonlines==3.1.0
numpy==1.21.1
protobuf==4.21.10
requests==2.28.1
setuptools==41.2.0
tqdm==4.62.3
google-cloud-storage==2.6.0

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

LONG_DESCRIPTION = open("README.md", "r").read()

setup(
    name="nodeapi_utils",
    version="0.1.0",
    author="web3analytic",
    author_email="mike@paretolabs.xyz",
    packages=find_packages(),
    scripts=[],
    description="Toolkit for parallel API requests to fetch transactions.",
    long_description=LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    url="https://github.com/web3analytic-xyz/nodeapi-utils",
    install_requires=[
        "jsonlines==3.1.0",
        "numpy==1.21.1",
        "protobuf==4.21.10",
        "requests==2.28.1",
        "setuptools==41.2.0",
        "tqdm==4.62.3",
        "google-cloud-storage==2.6.0"
    ]
)
--------------------------------------------------------------------------------