├── .gitignore
├── LICENSE
├── README.md
├── nodeapi_utils
│   ├── __init__.py
│   └── utils.py
├── requirements.txt
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

output/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Web3Analytic

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Node API Utilities

This is a toolkit for making large batches of asynchronous queries to an RPC provider API in order to download a historical dataset of blocks and transactions. See below for more information on the supported providers:

- [QuickNode](https://www.quicknode.com/core-api)
- [Alchemy](https://docs.alchemy.com/reference/api-overview)
- [Infura](https://www.infura.io/product/overview)
- [Chainstack](https://chainstack.com/solution)

Making serial API calls to `eth_getBlockByNumber` costs roughly 0.15 seconds per call. Parallelizing across $N$ threads roughly cuts that down to $\frac{0.15}{N}$ seconds per call. For instance, downloading 42.5M blocks from Arbitrum with 1 thread would take ~74 days, while 10 threads would finish in roughly a week (~0.015 seconds per call). For machines with a large number of threads, this can greatly reduce the time cost.
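
As a rough sanity check on these numbers (assuming the ~0.15 s serial latency above and ideal scaling across threads; real throughput depends on your provider and its rate limits):

```python
# Back-of-the-envelope runtime estimate (illustrative only).
SECONDS_PER_CALL = 0.15   # approximate serial latency per eth_getBlockByNumber call
NUM_BLOCKS = 42_500_000   # e.g. the Arbitrum history mentioned above
NUM_THREADS = 10

seconds = NUM_BLOCKS * SECONDS_PER_CALL / NUM_THREADS
print(f"~{seconds / 86400:.1f} days")  # ~7.4 days with 10 threads, ~74 days with 1
```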

## Disclaimer

Parallelization does not reduce the cost of using an RPC provider API. Please monitor your usage and keep track of method call costs:

- [QuickNode's Credits](https://www.quicknode.com/api-credits)
- [Alchemy's Compute Units](https://docs.alchemy.com/reference/compute-units)
- [Infura's Requests](https://www.infura.io/pricing)
- [Chainstack's Requests](https://chainstack.com/pricing)

## Setup

After cloning this repo, run:

```
pip install -e .
```

If you want to upload data to Google Cloud Storage, you will need to point the environment variable `GOOGLE_APPLICATION_CREDENTIALS` at a service account credentials file.
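
For example, you can set it from Python before creating the storage client (the path below is a placeholder for your own key file):

```python
import os

# Hypothetical path to your service-account key file
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account.json"
```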

## Usage

Here is an example code snippet that downloads all blocks and transactions from Arbitrum. It saves a [jsonlines](https://pypi.org/project/jsonlines/) file every 100k blocks.

```python
from nodeapi_utils import DatasetBuilder

builder = DatasetBuilder(
    rpc_provider=...,        # RPC provider name (e.g. quicknode, alchemy, infura, chainstack)
    rpc_provider_url=...,    # Your RPC provider URL (optional if using api_key; required for quicknode & chainstack)
    api_key=None,            # Your API key (optional if using rpc_provider_url)
    out_dir='./output',      # Optional: output directory to save API responses to
    chain='arbitrum',        # Supports ethereum, arbitrum, optimism, polygon, etc. (optional if using rpc_provider_url)
    start_block=16092775,    # Block to begin pulling data from
    save_every=100000,       # Saves a file for every 100k blocks
)
# Increase the number of threads for faster performance
builder.async_get(num_threads=10)

# After that completes, upload to storage buckets
# NOTE: requires Google Cloud credentials (see Setup above)
builder.upload_buckets(
    'some_unique_bucket_name',
    create_bucket=True,        # Creates the bucket
    delete_post_upload=False,  # Delete raw files after upload
)
```

The code only supports the `eth_getBlockByNumber` RPC method, which is available for the following chains (depending on the provider selected): `ethereum`, `arbitrum`, `arbitrum-nova`, `optimism`, `polygon`, `avalanche`, `celo`, `fantom`, `binance-smart-chain`, `gnosis`. Reach out for inquiries on additional methods to include.
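
Once a run finishes, the saved files can be streamed back with `jsonlines`. A minimal sketch (the file name is illustrative; each record is a standard `eth_getBlockByNumber` result, so numeric fields such as `number` are hex strings):

```python
import jsonlines

# Example output file; actual names follow blocks-<start>-to-<end>.jsonl
with jsonlines.open('./output/blocks-16092775-to-16192775.jsonl') as reader:
    for block in reader:
        block_number = int(block['number'], 16)
        num_txs = len(block['transactions'])
        print(block_number, num_txs)
```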

--------------------------------------------------------------------------------
/nodeapi_utils/__init__.py:
--------------------------------------------------------------------------------
from .utils import DatasetBuilder

--------------------------------------------------------------------------------
/nodeapi_utils/utils.py:
--------------------------------------------------------------------------------
import json
import asyncio
import requests
import jsonlines
import numpy as np
from tqdm import tqdm
from glob import glob

from os import makedirs, remove
from os.path import join, basename

from timeit import default_timer
from concurrent.futures import ThreadPoolExecutor

from google.cloud import storage

# Internal variable used for logging
START_TIME = default_timer()

# List of supported RPC providers
SUPPORTED_PROVIDERS = ['quicknode', 'alchemy', 'infura', 'chainstack']


class DatasetBuilder:
    r"""Builds a dataset of historical block and transaction data.
    Arguments:
    --
    rpc_provider (str): RPC provider name (e.g. quicknode, alchemy, infura, chainstack).
    rpc_provider_url (Optional[str], default=None): RPC endpoint. Optional if using `api_key`;
        required for quicknode and chainstack.
    api_key (Optional[str], default=None): RPC provider API key. Optional if using `rpc_provider_url`.
    out_dir (Optional[str], default='./output'): Output directory to save API responses to.
    chain (str, default='ethereum'): Which chain to pull data from. Optional if using `rpc_provider_url`.
    start_block (int, default=1): Block number to start pulling data from.
    end_block (Optional[int], default=None): Block number to stop pulling data at.
        If None is supplied, defaults to the latest block.
    save_every (int, default=100000): Blocks will be saved in batches of this size.
        For example, a dataset of 1M rows will be saved across 10 files if `save_every` is 100k.
    """
    def __init__(self,
                 rpc_provider,
                 rpc_provider_url=None,
                 api_key=None,
                 out_dir='./output',
                 chain='ethereum',
                 start_block=1,
                 end_block=None,
                 save_every=100000,
                 ):
        assert rpc_provider in SUPPORTED_PROVIDERS, f'Provider {rpc_provider} not supported.'

        if (rpc_provider_url is None) and (api_key is None):
            raise Exception('You have to provide at least one of the following parameters: `rpc_provider_url` or `api_key`.')

        if rpc_provider in ['quicknode', 'chainstack']:
            assert rpc_provider_url is not None, f'{rpc_provider} requires you to specify the `rpc_provider_url`.'

        # Get the URL to send requests to
        rpc_url = get_rpc_provider(rpc_provider, rpc_provider_url, chain, api_key)

        if end_block is None:
            # Ping RPC to get the latest block
            last_block = get_current_block(rpc_url)
            end_block = last_block

        # Create directory to save output if not existing yet
        makedirs(out_dir, exist_ok=True)

        # `save_every` cannot be bigger than the number of blocks
        save_every = min(save_every, end_block - start_block)

        # Save to class
        self.rpc_url = rpc_url
        self.out_dir = out_dir
        self.start_block = start_block
        self.end_block = end_block
        self.save_every = save_every

    def async_get(self, num_threads=10):
        r"""Parallel API calls.
        Arguments:
        --
        num_threads (int, default=10): Number of parallel threads.
        """
        chunks = np.arange(self.start_block, self.end_block + 1, self.save_every)
        # Make sure the final (possibly partial) chunk reaches the end block;
        # otherwise trailing blocks would be silently skipped.
        if chunks[-1] != self.end_block + 1:
            chunks = np.append(chunks, self.end_block + 1)

        for i in range(len(chunks) - 1):
            start_block_i = int(chunks[i])
            end_block_i = int(chunks[i+1])
            out_file = join(self.out_dir,
                            f'blocks-{start_block_i}-to-{end_block_i}.jsonl',
                            )

            loop = asyncio.get_event_loop()
            future = asyncio.ensure_future(
                async_make_api_requests(url=self.rpc_url,
                                        start_block=start_block_i,
                                        end_block=end_block_i,
                                        num_threads=num_threads,
                                        out_file=out_file,
                                        )
            )
            loop.run_until_complete(future)

    def upload_buckets(self,
                       bucket_name,
                       create_bucket=False,
                       delete_post_upload=False,
                       ):
        r"""Upload cached JSONL files to a Google Cloud Storage bucket.
        Arguments:
        --
        bucket_name (str): Name of the storage bucket to upload to.
        create_bucket (bool, default=False): If True, create the bucket.
        delete_post_upload (bool, default=False): Remove each local file after uploading if True.
        Notes:
        --
        A progress bar will be printed upon execution.
        """
        storage_client = storage.Client()

        if create_bucket:
            # If designated, create the storage bucket
            success = create_storage_bucket(bucket_name)

            if not success:
                raise Exception(f'Could not create bucket {bucket_name}. Try again?')

        # Check that the bucket exists
        assert storage_bucket_exists(bucket_name)

        # Fetch the bucket
        bucket = storage_client.bucket(bucket_name)

        # Find all files we want to upload
        data_files = glob(join(self.out_dir, '*.jsonl'))

        pbar = tqdm(total=len(data_files), desc='uploading to bucket')
        for data_file in data_files:
            blob_name = basename(data_file)
            blob = bucket.blob(blob_name)

            blob.upload_from_filename(data_file)

            # Delete the local file if requested
            if delete_post_upload:
                remove(data_file)

            pbar.update()
        pbar.close()


def get_rpc_provider(rpc_provider, rpc_provider_url, chain, api_key):
    r"""Returns the chain URL for an RPC provider.
    Arguments:
    --
    rpc_provider (str): RPC provider name.
    rpc_provider_url (Optional[str]): RPC endpoint, if supplied directly.
    chain (str): Chain to pull data from.
        Choices: ethereum | polygon | optimism | arbitrum
    api_key (Optional[str]): API key for the provider.
    Returns:
    --
    provider_url (str): RPC url
    """
    if rpc_provider == 'quicknode':
        # NOTE: QuickNode generates custom URLs for every project.
        provider_url = rpc_provider_url
    elif rpc_provider == 'chainstack':
        # NOTE: Chainstack assumes the provider URL is directly given.
        provider_url = rpc_provider_url
    elif rpc_provider == 'alchemy':
        if rpc_provider_url is not None:
            provider_url = rpc_provider_url
        else:
            # NOTE: This is not exhaustive. More can be added on request. Or use `rpc_provider_url`.
            if chain == 'ethereum':
                provider_url = f'https://eth-mainnet.g.alchemy.com/v2/{api_key}'
            elif chain == 'polygon':
                provider_url = f'https://polygon-mainnet.g.alchemy.com/v2/{api_key}'
            elif chain == 'optimism':
                provider_url = f'https://opt-mainnet.g.alchemy.com/v2/{api_key}'
            elif chain == 'arbitrum':
                provider_url = f'https://arb-mainnet.g.alchemy.com/v2/{api_key}'
            else:
                raise Exception(f'Chain {chain} not supported.')
    elif rpc_provider == 'infura':
        if rpc_provider_url is not None:
            provider_url = rpc_provider_url
        else:
            # NOTE: This is not exhaustive. More can be added on request. Or use `rpc_provider_url`.
            if chain == 'ethereum':
                provider_url = f'https://mainnet.infura.io/v3/{api_key}'
            elif chain == 'polygon':
                provider_url = f'https://polygon-mainnet.infura.io/v3/{api_key}'
            elif chain == 'optimism':
                provider_url = f'https://optimism-mainnet.infura.io/v3/{api_key}'
            elif chain == 'arbitrum':
                provider_url = f'https://arbitrum-mainnet.infura.io/v3/{api_key}'
            else:
                raise Exception(f'Chain {chain} not supported.')
    else:
        raise Exception(f'Provider {rpc_provider} not supported.')

    return provider_url
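
# Illustrative example (hypothetical key), not called anywhere in this module:
# get_rpc_provider('alchemy', None, 'arbitrum', 'MY_KEY') resolves to
# 'https://arb-mainnet.g.alchemy.com/v2/MY_KEY'.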

def get_current_block(url):
    r"""Get the current block.
    Arguments:
    --
    url (str): RPC endpoint URL.
    Returns:
    --
    block_number (int): Latest block number.
    """
    payload = {
        "id": 1,
        "jsonrpc": "2.0",
        "method": "eth_blockNumber"
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json"
    }
    response = requests.post(url, json=payload, headers=headers)
    if response.status_code != 200:
        raise Exception('Failed to fetch latest block number. RPC Provider response: ' + str(json.loads(response.text)))

    response_data = json.loads(response.text)
    block_number = int(response_data['result'], 0)

    return block_number


async def async_make_api_requests(url,
                                  start_block,
                                  end_block,
                                  out_file,
                                  num_threads=10,
                                  ):
    r"""Make async API requests.
    Arguments:
    --
    url (str): API endpoint.
    start_block (int): Block number to start pulling at (inclusive).
    end_block (int): Block number to stop pulling at (exclusive).
    out_file (str): Where to save API results.
    num_threads (int, default=10): Number of threads to use.
    """
    print("{0:<30} {1:>20}".format("Block number", "Completed at"))

    # Reset the module-level timer so per-block timestamps are relative to this chunk
    global START_TIME

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        with requests.Session() as session:
            loop = asyncio.get_event_loop()
            START_TIME = default_timer()  # update global start time
            tasks = [
                loop.run_in_executor(
                    executor,
                    make_api_request,
                    *(session, block, url)
                )
                for block in range(start_block, end_block)
            ]

            # Discard any failed responses
            dataset = []
            for block_data in await asyncio.gather(*tasks):
                if block_data is not None:
                    dataset.append(block_data)

            # Write to file
            with jsonlines.open(out_file, mode='w') as writer:
                writer.write_all(dataset)


def make_api_request(session, block_number, url):
    r"""Pings the method `eth_getBlockByNumber`.
    Arguments:
    --
    session (requests.Session): HTTP session reused across calls.
    block_number (int): Number of the block.
    url (str): URL for the RPC API.
    Returns:
    --
    response_data (Dict[str, any]): Block and transaction JSON.
    Notes:
    --
    Prints block #'s and timestamps as it runs.
    """
    payload = {
        "id": 1,
        "jsonrpc": "2.0",
        "method": "eth_getBlockByNumber",
        "params": [hex(block_number), True],
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json"
    }
    with session.post(url, json=payload, headers=headers) as response:
        if response.status_code != 200:
            return None

        response_json = json.loads(response.text)
        response_data = response_json['result']

        # Print update
        elapsed_time = default_timer() - START_TIME
        completed_at = "{:5.2f}s".format(elapsed_time)
        print("{0:<30} {1:>20}".format(block_number, completed_at))

        return response_data


def storage_bucket_exists(gcloud_bucket):
    r"""Checks if a storage bucket exists.
    Arguments:
    --
    gcloud_bucket (str): Name of the storage bucket.
    Returns:
    --
    exists (bool): True if the bucket exists; False otherwise.
    """
    client = storage.Client()
    exists = storage.Bucket(client, gcloud_bucket).exists()

    return exists


def create_storage_bucket(gcloud_bucket):
    r"""Creates a new storage bucket.
    Arguments:
    --
    gcloud_bucket (str): Name of the storage bucket.
    Returns:
    --
    success (bool): True if the bucket was created. False otherwise.
    Notes:
    --
    Assumes the storage bucket does not exist yet. We do not explicitly
    check for this.
    """
    client = storage.Client()
    client.create_bucket(gcloud_bucket)
    success = storage_bucket_exists(gcloud_bucket)
    return success

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
jsonlines==3.1.0
numpy==1.21.1
protobuf==4.21.10
requests==2.28.1
setuptools==41.2.0
tqdm==4.62.3
google-cloud-storage==2.6.0

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

LONG_DESCRIPTION = open("README.md", "r").read()

setup(
    name="nodeapi_utils",
    version="0.1.0",
    author="web3analytic",
    author_email="mike@paretolabs.xyz",
    packages=find_packages(),
    scripts=[],
    description="Toolkit for parallel API requests to fetch transactions.",
    long_description=LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    url="https://github.com/web3analytic-xyz/nodeapi-utils",
    install_requires=[
        "jsonlines==3.1.0",
        "numpy==1.21.1",
        "protobuf==4.21.10",
        "requests==2.28.1",
        "setuptools==41.2.0",
        "tqdm==4.62.3",
        "google-cloud-storage==2.6.0"
    ]
)
--------------------------------------------------------------------------------