├── .gitignore ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── datasets ├── README.md ├── ethereum_contracts │ ├── LICENSE-CC0 │ ├── README.md │ └── dataset_manifest.json ├── ethereum_native_transfers │ ├── LICENSE-CC0 │ ├── README.md │ └── dataset_manifest.json ├── ethereum_slots │ ├── LICENSE-CC0 │ ├── README.md │ └── dataset_manifest.json └── global_manifest.json ├── notebooks └── explore_ethereum_contracts.ipynb ├── pdp ├── __init__.py ├── __main__.py ├── cli │ ├── __init__.py │ ├── cli_run.py │ └── commands │ │ ├── __init__.py │ │ ├── collect_command.py │ │ ├── dataset_command.py │ │ ├── download_command.py │ │ ├── ls_command.py │ │ ├── package_command.py │ │ ├── root_command.py │ │ ├── update_command.py │ │ ├── upload_command.py │ │ └── validate_command.py ├── config_utils.py ├── data_utils │ ├── __init__.py │ ├── collect_utils.py │ ├── download_utils.py │ ├── file_utils.py │ ├── job_utils.py │ ├── manifest_utils.py │ ├── query_utils.py │ ├── readme_utils.py │ ├── schema_utils.py │ └── update_utils.py ├── datasets │ ├── __init__.py │ ├── contracts │ │ ├── __init__.py │ │ ├── contracts_collect.py │ │ ├── contracts_queries.py │ │ └── contracts_spec.py │ ├── native_transfers │ │ ├── __init__.py │ │ ├── native_transfers_collect.py │ │ ├── native_transfers_queries.py │ │ └── native_transfers_spec.py │ └── slots │ │ ├── __init__.py │ │ ├── slots_collect.py │ │ ├── slots_queries.py │ │ └── slots_spec.py ├── py.typed └── spec.py ├── pyproject.toml └── tests ├── remote_tests └── test_manifests.py ├── test_collect.py └── test_validate.py /.gitignore: -------------------------------------------------------------------------------- 1 | # specific files 2 | CHANGELOG.md 3 | ROADMAP.md 4 | TODO.md 5 | pytestdebug.log 6 | tuna.log 7 | .DS_STORE 8 | 9 | # filetypes 10 | *.egg-info 11 | *.pyc 12 | *.csv 13 | *.parquet 14 | 15 | # folders 16 | */.hypothesis 17 | .coverage_html/* 18 | .hypothesis/ 19 | .tox/* 20 | __pycache__/* 21 | build/* 22 | dist/* 23 | roadmap/* 24 | data/* 25 | extras_ignore/* 26 | *.ipynb 27 | *.ipynb_checkpoints 28 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2022 Paradigm 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Paradigm Data Portal 3 | 4 | The Paradigm Data Portal is a collection of open source crypto datasets for researchers and tool builders 5 | 6 | ## Datasets 7 | - [`ethereum_contracts`](https://github.com/paradigmxyz/paradigm-data-portal/tree/main/datasets/ethereum_contracts): all historical contract deployments 8 | - [`ethereum_native_transfers`](https://github.com/paradigmxyz/paradigm-data-portal/tree/main/datasets/ethereum_native_transfers): all native transfers in similar format to ERC20 Transfers (excluding tx fees) 9 | - [`ethereum_slots`](https://github.com/paradigmxyz/paradigm-data-portal/tree/main/datasets/ethereum_slots): all slots of each contract, including historical usage metadata 10 | 11 | All datasets are released under a [CC0](https://creativecommons.org/share-your-work/public-domain/cc0/) license into the public domain unless otherwise noted. 12 | 13 | ## `pdp` 14 | 15 | `pdp` is a CLI tool that can be used to obtain and manage PDP datasets 16 | 17 | To install: `pip install paradigm-data-portal` 18 | 19 | 20 | #### Example Usage 21 | 22 | - List available datasets `pdp ls` 23 | - List dataset files `pdp ls <dataset>` 24 | - Download a dataset `pdp download <dataset>` 25 | 26 | Each command has multiple options, view help with `pdp -h` 27 | 28 | 29 | ## Dataset Versioning 30 | 31 | Every dataset has a version in `<major>.<minor>.<patch>` format, e.g. 
`1.2.8` 32 | - when a schema is updated, the major version is increased 33 | - when rows are added, removed, or modified, the minor version is increased 34 | - when rows are added due to new blocks, the patch is increased 35 | 36 | Updates will be documented in dataset changelogs 37 | 38 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Datasets 3 | - [`ethereum_contracts`](https://github.com/paradigmxyz/paradigm-data-portal/tree/main/datasets/ethereum_contracts): all historical contract deployments 4 | - [`ethereum_native_transfers`](https://github.com/paradigmxyz/paradigm-data-portal/tree/main/datasets/ethereum_native_transfers): all native transfers in similar format to ERC20 Transfers (excluding tx fees) 5 | - [`ethereum_slots`](https://github.com/paradigmxyz/paradigm-data-portal/tree/main/datasets/ethereum_slots): all slots of each contract, including historical usage metadata 6 | 7 | -------------------------------------------------------------------------------- /datasets/ethereum_contracts/LICENSE-CC0: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. 
A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. 
Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 
122 | -------------------------------------------------------------------------------- /datasets/ethereum_contracts/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Ethereum Contracts Dataset v1.0.0 3 | 4 | This is a dataset of all historical contract deployments 5 | 6 | The dataset was created by using [this script](https://github.com/paradigmxyz/paradigm-data-portal/blob/main/pdp/datasets/contracts/contracts_collect.py) 7 | 8 | Data is distributed as [parquet](https://data.paradigm.xyz/about) files and released into the public domain under a [CC0 license](https://creativecommons.org/share-your-work/public-domain/cc0/) 9 | 10 | ## Usage 11 | 12 | Some example uses of this dataset include: 13 | - look up all contracts deployed by an address 14 | - look up all contracts that have a given bytecode 15 | - analyze distribution of contract bytecode motifs 16 | 17 | An example notebook exploring this dataset can be found [here](https://github.com/paradigmxyz/paradigm-data-portal/blob/main/notebooks/explore_ethereum_contracts.ipynb) 18 | 19 | ## Schema 20 | 21 | #### `contracts` table 22 | each row corresponds to a contract create trace 23 | | column | type | description | 24 | | - | - | - | 25 | | block_number | INTEGER | block number when contract was created | 26 | | create_index | INTEGER | increased by 1 for each contract created in block | 27 | | transaction_hash | BINARY | hash of transaction that created contract | 28 | | contract_address | BINARY | address of deployed contract | 29 | | deployer | BINARY | EOA that deployed the contract | 30 | | factory | BINARY | the `from` field in the creation trace | 31 | | init_code | BINARY | initialization bytecode of contract | 32 | | code | BINARY | bytecode of contract | 33 | | init_code_hash | BINARY | keccak hash of contract initialization code | 34 | | code_hash | BINARY | keccak hash of contract bytecode | 35 | 36 | ## Download 37 | 38 | This dataset can be downloaded using either the `pdp` cli tool or the urls below 39 | 40 | The total dataset size is **6.57GB** 41 | 42 | ### Use `pdp` 43 | 44 | The command `pdp download ethereum_contracts` will download all files in this dataset 45 | 46 | See `pdp download -h` for available options 47 | 48 | ### Use URLs 49 | 50 | | | file | size | 51 | | - | - | - | 52 | | 1 | [ethereum_contracts__v1_0_0__00000000_to_00999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__00000000_to_00999999.parquet) | 2.96MB | 53 | | 2 | [ethereum_contracts__v1_0_0__01000000_to_01999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__01000000_to_01999999.parquet) | 13.08MB | 54 | | 3 | [ethereum_contracts__v1_0_0__02000000_to_02999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__02000000_to_02999999.parquet) | 24.86MB | 55 | | 4 | [ethereum_contracts__v1_0_0__03000000_to_03999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__03000000_to_03999999.parquet) | 83.30MB | 56 | | 5 | [ethereum_contracts__v1_0_0__04000000_to_04999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__04000000_to_04999999.parquet) | 295.85MB | 57 | | 6 | [ethereum_contracts__v1_0_0__05000000_to_05999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__05000000_to_05999999.parquet) | 313.06MB | 58 | | 7 | 
[ethereum_contracts__v1_0_0__06000000_to_06999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__06000000_to_06999999.parquet) | 384.52MB | 59 | | 8 | [ethereum_contracts__v1_0_0__07000000_to_07999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__07000000_to_07999999.parquet) | 338.28MB | 60 | | 9 | [ethereum_contracts__v1_0_0__08000000_to_08999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__08000000_to_08999999.parquet) | 318.73MB | 61 | | 10 | [ethereum_contracts__v1_0_0__09000000_to_09999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__09000000_to_09999999.parquet) | 401.13MB | 62 | | 11 | [ethereum_contracts__v1_0_0__10000000_to_10999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__10000000_to_10999999.parquet) | 484.85MB | 63 | | 12 | [ethereum_contracts__v1_0_0__11000000_to_11999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__11000000_to_11999999.parquet) | 529.76MB | 64 | | 13 | [ethereum_contracts__v1_0_0__12000000_to_12999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__12000000_to_12999999.parquet) | 618.64MB | 65 | | 14 | [ethereum_contracts__v1_0_0__13000000_to_13999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__13000000_to_13999999.parquet) | 567.07MB | 66 | | 15 | [ethereum_contracts__v1_0_0__14000000_to_14999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__14000000_to_14999999.parquet) | 761.28MB | 67 | | 16 | [ethereum_contracts__v1_0_0__15000000_to_15999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__15000000_to_15999999.parquet) | 909.94MB | 68 | | 17 | [ethereum_contracts__v1_0_0__16000000_to_16799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__16000000_to_16799999.parquet) | 677.91MB | 69 | -------------------------------------------------------------------------------- /datasets/ethereum_contracts/dataset_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "datatype": "contracts", 3 | "description": "all historical contract deployments", 4 | "files": [ 5 | { 6 | "hash": "755f40506dccfee0849f00d79e4336db", 7 | "n_bytes": 3107682, 8 | "name": "ethereum_contracts__v1_0_0__00000000_to_00999999.parquet" 9 | }, 10 | { 11 | "hash": "c77f1732bc2619e6814e4d2fe723b0c7", 12 | "n_bytes": 13713368, 13 | "name": "ethereum_contracts__v1_0_0__01000000_to_01999999.parquet" 14 | }, 15 | { 16 | "hash": "10e8590e91f44bdcb39b9fa7b0433ce9", 17 | "n_bytes": 26068725, 18 | "name": "ethereum_contracts__v1_0_0__02000000_to_02999999.parquet" 19 | }, 20 | { 21 | "hash": "0852e9c89d13e3af3c1defb28e6eb85c", 22 | "n_bytes": 87348694, 23 | "name": "ethereum_contracts__v1_0_0__03000000_to_03999999.parquet" 24 | }, 25 | { 26 | "hash": "c07dec107c820804e48ed613454d0095", 27 | "n_bytes": 310223347, 28 | "name": "ethereum_contracts__v1_0_0__04000000_to_04999999.parquet" 29 | }, 30 | { 31 | "hash": "e406e069f13e3b6c53bc7a634b4f4ea8", 32 | "n_bytes": 328268119, 33 | "name": "ethereum_contracts__v1_0_0__05000000_to_05999999.parquet" 34 | }, 35 | { 36 | "hash": "9651c6d0f349ee589b20e8bea896fdc1", 37 | "n_bytes": 
403202482, 38 | "name": "ethereum_contracts__v1_0_0__06000000_to_06999999.parquet" 39 | }, 40 | { 41 | "hash": "d2ae7c208b51a09537a1cf41d36dc4ed", 42 | "n_bytes": 354711950, 43 | "name": "ethereum_contracts__v1_0_0__07000000_to_07999999.parquet" 44 | }, 45 | { 46 | "hash": "45dfc78c8223c71ae530f8bea494d92b", 47 | "n_bytes": 334208521, 48 | "name": "ethereum_contracts__v1_0_0__08000000_to_08999999.parquet" 49 | }, 50 | { 51 | "hash": "c4a2509578d51135eec6cd0d604d7657", 52 | "n_bytes": 420620123, 53 | "name": "ethereum_contracts__v1_0_0__09000000_to_09999999.parquet" 54 | }, 55 | { 56 | "hash": "50da1c9a328f4152b768c9ac1e3b5c99", 57 | "n_bytes": 508399240, 58 | "name": "ethereum_contracts__v1_0_0__10000000_to_10999999.parquet" 59 | }, 60 | { 61 | "hash": "8013ba9f27c2844ba5341a44e70eee74", 62 | "n_bytes": 555497504, 63 | "name": "ethereum_contracts__v1_0_0__11000000_to_11999999.parquet" 64 | }, 65 | { 66 | "hash": "c9ad9a784ee11d3d1e44a63b3be7a25d", 67 | "n_bytes": 648691436, 68 | "name": "ethereum_contracts__v1_0_0__12000000_to_12999999.parquet" 69 | }, 70 | { 71 | "hash": "49aca9eb469fb435fc11c09037434ff8", 72 | "n_bytes": 594615935, 73 | "name": "ethereum_contracts__v1_0_0__13000000_to_13999999.parquet" 74 | }, 75 | { 76 | "hash": "4ce044cc6f6255d3b2fab72a333e5ed8", 77 | "n_bytes": 798261906, 78 | "name": "ethereum_contracts__v1_0_0__14000000_to_14999999.parquet" 79 | }, 80 | { 81 | "hash": "26f63f9ec8023dac8a716dcde6ab9078", 82 | "n_bytes": 954142634, 83 | "name": "ethereum_contracts__v1_0_0__15000000_to_15999999.parquet" 84 | }, 85 | { 86 | "hash": "b71b9887d256daf9c89a973cb10af6d7", 87 | "n_bytes": 710840143, 88 | "name": "ethereum_contracts__v1_0_0__16000000_to_16799999.parquet" 89 | } 90 | ], 91 | "name": "ethereum_contracts", 92 | "network": "ethereum", 93 | "schema": { 94 | "description": "all historical contract deployments", 95 | "tables": { 96 | "contracts": { 97 | "columns": [ 98 | { 99 | "description": "block number when contract was created", 100 | "name": "block_number", 101 | "type": "INTEGER" 102 | }, 103 | { 104 | "description": "increased by 1 for each contract created in block", 105 | "name": "create_index", 106 | "type": "INTEGER" 107 | }, 108 | { 109 | "description": "hash of transaction that created contract", 110 | "name": "transaction_hash", 111 | "type": "BINARY" 112 | }, 113 | { 114 | "description": "address of deployed contract", 115 | "name": "contract_address", 116 | "type": "BINARY" 117 | }, 118 | { 119 | "description": "EOA that deployed the contract", 120 | "name": "deployer", 121 | "type": "BINARY" 122 | }, 123 | { 124 | "description": "the `from` field in the creation trace", 125 | "name": "factory", 126 | "type": "BINARY" 127 | }, 128 | { 129 | "description": "initialization bytecode of contract", 130 | "name": "init_code", 131 | "type": "BINARY" 132 | }, 133 | { 134 | "description": "bytecode of contract", 135 | "name": "code", 136 | "type": "BINARY" 137 | }, 138 | { 139 | "description": "keccak hash of contract initialization code", 140 | "name": "init_code_hash", 141 | "type": "BINARY" 142 | }, 143 | { 144 | "description": "keccak hash of contract bytecode", 145 | "name": "code_hash", 146 | "type": "BINARY" 147 | } 148 | ], 149 | "description": "each row corresponds to a contract create trace" 150 | } 151 | } 152 | }, 153 | "version": "1.0.0" 154 | } -------------------------------------------------------------------------------- /datasets/ethereum_native_transfers/LICENSE-CC0: 
-------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. 
To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. 
Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /datasets/ethereum_native_transfers/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Ethereum Native Transfers Dataset v1.0.0 3 | 4 | This is a dataset of all native transfers in similar format to ERC20 Transfers (excluding tx fees) 5 | 6 | The dataset was created by using [this script](https://github.com/paradigmxyz/paradigm-data-portal/blob/main/pdp/datasets/native_transfers/native_transfers_collect.py) 7 | 8 | Data is distributed as [parquet](https://data.paradigm.xyz/about) files and released into the public domain under a [CC0 license](https://creativecommons.org/share-your-work/public-domain/cc0/) 9 | 10 | ## Usage 11 | 12 | Some example uses of this dataset include: 13 | - look up all inbound transfers to an address 14 | - analyze transfer size distributions 15 | - analyze transfer frequency distributions 16 | 17 | An example query sketch for this dataset is included at the end of this README 18 | 19 | ## Schema 20 | 21 | #### `native_transfers` table 22 | each row corresponds to a trace that transfers native token 23 | | column | type | description | 24 | | - | - | - | 25 | | block_number | INTEGER | block number where native token was transferred | 26 | | transfer_index | INTEGER | increased by 1 for each native transfer in block | 27 | | transaction_hash | BINARY | hash of transaction that contains transfer | 28 | | to_address | BINARY | address that native token is transferred to | 29 | | from_address | BINARY | address that native token is transferred from | 30 | | value | BINARY | amount of native token transferred | 31 | 32 | ## Download 33 | 34 | This dataset can be downloaded using either the `pdp` cli tool or the urls below 35 | 36 | The total dataset size is **61.00GB** 37 | 38 | ### Use `pdp` 39 | 40 | The command `pdp download ethereum_native_transfers` will download all files in this dataset 41 | 42 | See `pdp download -h` for available options 43 | 44 | ### Use URLs 45 | 46 | | | file | size | 47 | | - | - | - | 48 | | 1 | [ethereum_native_transfers__v1_0_0__00000000_to_00199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__00000000_to_00199999.parquet) | 5.43MB | 49 | | 2 | [ethereum_native_transfers__v1_0_0__00200000_to_00399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__00200000_to_00399999.parquet) | 10.81MB | 50 | | 3 | [ethereum_native_transfers__v1_0_0__00400000_to_00599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__00400000_to_00599999.parquet) | 12.15MB | 51 | | 4 | [ethereum_native_transfers__v1_0_0__00600000_to_00799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__00600000_to_00799999.parquet) | 15.24MB | 52 | | 5 | 
[ethereum_native_transfers__v1_0_0__00800000_to_00999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__00800000_to_00999999.parquet) | 23.57MB | 53 | | 6 | [ethereum_native_transfers__v1_0_0__01000000_to_01199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__01000000_to_01199999.parquet) | 43.13MB | 54 | | 7 | [ethereum_native_transfers__v1_0_0__01200000_to_01399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__01200000_to_01399999.parquet) | 52.66MB | 55 | | 8 | [ethereum_native_transfers__v1_0_0__01400000_to_01599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__01400000_to_01599999.parquet) | 68.87MB | 56 | | 9 | [ethereum_native_transfers__v1_0_0__01600000_to_01799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__01600000_to_01799999.parquet) | 71.94MB | 57 | | 10 | [ethereum_native_transfers__v1_0_0__01800000_to_01999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__01800000_to_01999999.parquet) | 75.75MB | 58 | | 11 | [ethereum_native_transfers__v1_0_0__02000000_to_02199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__02000000_to_02199999.parquet) | 78.88MB | 59 | | 12 | [ethereum_native_transfers__v1_0_0__02200000_to_02399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__02200000_to_02399999.parquet) | 119.10MB | 60 | | 13 | [ethereum_native_transfers__v1_0_0__02400000_to_02599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__02400000_to_02599999.parquet) | 66.45MB | 61 | | 14 | [ethereum_native_transfers__v1_0_0__02600000_to_02799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__02600000_to_02799999.parquet) | 66.38MB | 62 | | 15 | [ethereum_native_transfers__v1_0_0__02800000_to_02999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__02800000_to_02999999.parquet) | 65.67MB | 63 | | 16 | [ethereum_native_transfers__v1_0_0__03000000_to_03199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__03000000_to_03199999.parquet) | 70.41MB | 64 | | 17 | [ethereum_native_transfers__v1_0_0__03200000_to_03399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__03200000_to_03399999.parquet) | 106.03MB | 65 | | 18 | [ethereum_native_transfers__v1_0_0__03400000_to_03599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__03400000_to_03599999.parquet) | 136.80MB | 66 | | 19 | [ethereum_native_transfers__v1_0_0__03600000_to_03799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__03600000_to_03799999.parquet) | 233.24MB | 67 | | 20 | [ethereum_native_transfers__v1_0_0__03800000_to_03999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__03800000_to_03999999.parquet) | 469.12MB | 68 | | 21 | 
[ethereum_native_transfers__v1_0_0__04000000_to_04199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__04000000_to_04199999.parquet) | 592.62MB | 69 | | 22 | [ethereum_native_transfers__v1_0_0__04200000_to_04399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__04200000_to_04399999.parquet) | 804.41MB | 70 | | 23 | [ethereum_native_transfers__v1_0_0__04400000_to_04599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__04400000_to_04599999.parquet) | 516.11MB | 71 | | 24 | [ethereum_native_transfers__v1_0_0__04600000_to_04799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__04600000_to_04799999.parquet) | 1.09GB | 72 | | 25 | [ethereum_native_transfers__v1_0_0__04800000_to_04999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__04800000_to_04999999.parquet) | 1.71GB | 73 | | 26 | [ethereum_native_transfers__v1_0_0__05000000_to_05199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__05000000_to_05199999.parquet) | 1020.42MB | 74 | | 27 | [ethereum_native_transfers__v1_0_0__05200000_to_05399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__05200000_to_05399999.parquet) | 769.68MB | 75 | | 28 | [ethereum_native_transfers__v1_0_0__05400000_to_05599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__05400000_to_05599999.parquet) | 878.05MB | 76 | | 29 | [ethereum_native_transfers__v1_0_0__05600000_to_05799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__05600000_to_05799999.parquet) | 893.71MB | 77 | | 30 | [ethereum_native_transfers__v1_0_0__05800000_to_05999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__05800000_to_05999999.parquet) | 705.26MB | 78 | | 31 | [ethereum_native_transfers__v1_0_0__06000000_to_06199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__06000000_to_06199999.parquet) | 745.70MB | 79 | | 32 | [ethereum_native_transfers__v1_0_0__06200000_to_06399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__06200000_to_06399999.parquet) | 643.32MB | 80 | | 33 | [ethereum_native_transfers__v1_0_0__06400000_to_06599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__06400000_to_06599999.parquet) | 588.53MB | 81 | | 34 | [ethereum_native_transfers__v1_0_0__06600000_to_06799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__06600000_to_06799999.parquet) | 607.73MB | 82 | | 35 | [ethereum_native_transfers__v1_0_0__06800000_to_06999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__06800000_to_06999999.parquet) | 587.19MB | 83 | | 36 | [ethereum_native_transfers__v1_0_0__07000000_to_07199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__07000000_to_07199999.parquet) | 598.46MB | 84 | | 37 | 
[ethereum_native_transfers__v1_0_0__07200000_to_07399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__07200000_to_07399999.parquet) | 588.88MB | 85 | | 38 | [ethereum_native_transfers__v1_0_0__07400000_to_07599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__07400000_to_07599999.parquet) | 629.00MB | 86 | | 39 | [ethereum_native_transfers__v1_0_0__07600000_to_07799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__07600000_to_07799999.parquet) | 663.01MB | 87 | | 40 | [ethereum_native_transfers__v1_0_0__07800000_to_07999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__07800000_to_07999999.parquet) | 729.28MB | 88 | | 41 | [ethereum_native_transfers__v1_0_0__08000000_to_08199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__08000000_to_08199999.parquet) | 627.52MB | 89 | | 42 | [ethereum_native_transfers__v1_0_0__08200000_to_08399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__08200000_to_08399999.parquet) | 560.94MB | 90 | | 43 | [ethereum_native_transfers__v1_0_0__08400000_to_08599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__08400000_to_08599999.parquet) | 522.91MB | 91 | | 44 | [ethereum_native_transfers__v1_0_0__08600000_to_08799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__08600000_to_08799999.parquet) | 475.54MB | 92 | | 45 | [ethereum_native_transfers__v1_0_0__08800000_to_08999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__08800000_to_08999999.parquet) | 496.36MB | 93 | | 46 | [ethereum_native_transfers__v1_0_0__09000000_to_09199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__09000000_to_09199999.parquet) | 509.63MB | 94 | | 47 | [ethereum_native_transfers__v1_0_0__09200000_to_09399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__09200000_to_09399999.parquet) | 437.48MB | 95 | | 48 | [ethereum_native_transfers__v1_0_0__09400000_to_09599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__09400000_to_09599999.parquet) | 531.73MB | 96 | | 49 | [ethereum_native_transfers__v1_0_0__09600000_to_09799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__09600000_to_09799999.parquet) | 559.07MB | 97 | | 50 | [ethereum_native_transfers__v1_0_0__09800000_to_09999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__09800000_to_09999999.parquet) | 669.82MB | 98 | | 51 | [ethereum_native_transfers__v1_0_0__10000000_to_10199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__10000000_to_10199999.parquet) | 794.10MB | 99 | | 52 | [ethereum_native_transfers__v1_0_0__10200000_to_10399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__10200000_to_10399999.parquet) | 856.74MB | 100 | | 53 | 
[ethereum_native_transfers__v1_0_0__10400000_to_10599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__10400000_to_10599999.parquet) | 1.02GB | 101 | | 54 | [ethereum_native_transfers__v1_0_0__10600000_to_10799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__10600000_to_10799999.parquet) | 1016.86MB | 102 | | 55 | [ethereum_native_transfers__v1_0_0__10800000_to_10999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__10800000_to_10999999.parquet) | 901.55MB | 103 | | 56 | [ethereum_native_transfers__v1_0_0__11000000_to_11199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__11000000_to_11199999.parquet) | 911.41MB | 104 | | 57 | [ethereum_native_transfers__v1_0_0__11200000_to_11399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__11200000_to_11399999.parquet) | 950.09MB | 105 | | 58 | [ethereum_native_transfers__v1_0_0__11400000_to_11599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__11400000_to_11599999.parquet) | 1020.27MB | 106 | | 59 | [ethereum_native_transfers__v1_0_0__11600000_to_11799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__11600000_to_11799999.parquet) | 1.12GB | 107 | | 60 | [ethereum_native_transfers__v1_0_0__11800000_to_11999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__11800000_to_11999999.parquet) | 1.20GB | 108 | | 61 | [ethereum_native_transfers__v1_0_0__12000000_to_12199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__12000000_to_12199999.parquet) | 1.28GB | 109 | | 62 | [ethereum_native_transfers__v1_0_0__12200000_to_12399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__12200000_to_12399999.parquet) | 1.50GB | 110 | | 63 | [ethereum_native_transfers__v1_0_0__12400000_to_12599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__12400000_to_12599999.parquet) | 1.42GB | 111 | | 64 | [ethereum_native_transfers__v1_0_0__12600000_to_12799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__12600000_to_12799999.parquet) | 1.18GB | 112 | | 65 | [ethereum_native_transfers__v1_0_0__12800000_to_12999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__12800000_to_12999999.parquet) | 1.23GB | 113 | | 66 | [ethereum_native_transfers__v1_0_0__13000000_to_13199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__13000000_to_13199999.parquet) | 1.24GB | 114 | | 67 | [ethereum_native_transfers__v1_0_0__13200000_to_13399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__13200000_to_13399999.parquet) | 1.28GB | 115 | | 68 | [ethereum_native_transfers__v1_0_0__13400000_to_13599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__13400000_to_13599999.parquet) | 1.43GB | 116 | | 69 | 
[ethereum_native_transfers__v1_0_0__13600000_to_13799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__13600000_to_13799999.parquet) | 1.39GB | 117 | | 70 | [ethereum_native_transfers__v1_0_0__13800000_to_13999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__13800000_to_13999999.parquet) | 1.33GB | 118 | | 71 | [ethereum_native_transfers__v1_0_0__14000000_to_14199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__14000000_to_14199999.parquet) | 1.33GB | 119 | | 72 | [ethereum_native_transfers__v1_0_0__14200000_to_14399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__14200000_to_14399999.parquet) | 1.29GB | 120 | | 73 | [ethereum_native_transfers__v1_0_0__14400000_to_14599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__14400000_to_14599999.parquet) | 1.29GB | 121 | | 74 | [ethereum_native_transfers__v1_0_0__14600000_to_14799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__14600000_to_14799999.parquet) | 1.30GB | 122 | | 75 | [ethereum_native_transfers__v1_0_0__14800000_to_14999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__14800000_to_14999999.parquet) | 1.20GB | 123 | | 76 | [ethereum_native_transfers__v1_0_0__15000000_to_15199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__15000000_to_15199999.parquet) | 1.23GB | 124 | | 77 | [ethereum_native_transfers__v1_0_0__15200000_to_15399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__15200000_to_15399999.parquet) | 1.30GB | 125 | | 78 | [ethereum_native_transfers__v1_0_0__15400000_to_15599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__15400000_to_15599999.parquet) | 1.20GB | 126 | | 79 | [ethereum_native_transfers__v1_0_0__15600000_to_15799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__15600000_to_15799999.parquet) | 1.03GB | 127 | | 80 | [ethereum_native_transfers__v1_0_0__15800000_to_15999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__15800000_to_15999999.parquet) | 975.74MB | 128 | | 81 | [ethereum_native_transfers__v1_0_0__16000000_to_16199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__16000000_to_16199999.parquet) | 1009.98MB | 129 | | 82 | [ethereum_native_transfers__v1_0_0__16200000_to_16399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__16200000_to_16399999.parquet) | 958.26MB | 130 | | 83 | [ethereum_native_transfers__v1_0_0__16400000_to_16599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__16400000_to_16599999.parquet) | 972.90MB | 131 | | 84 | [ethereum_native_transfers__v1_0_0__16600000_to_16799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__16600000_to_16799999.parquet) | 1.00GB | 132 | 
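For quick exploration, here is a minimal sketch of loading one of the chunks listed above — assuming `polars` is installed and the file has already been fetched, e.g. with `pdp download ethereum_native_transfers` or via its URL in the table:

```python
import polars as pl

# any chunk from the table above works the same way; this filename is just an example
path = "ethereum_native_transfers__v1_0_0__16600000_to_16799999.parquet"

df = pl.read_parquet(path)

# columns follow the dataset schema: block_number, transfer_index,
# transaction_hash, to_address, from_address, value
# (hashes, addresses, and values are stored as raw binary)
print(df.columns)
print(df.shape)
print(df["block_number"].min(), df["block_number"].max())
```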
-------------------------------------------------------------------------------- /datasets/ethereum_native_transfers/dataset_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "datatype": "native_transfers", 3 | "description": "all native transfers in similar format to ERC20 Transfers (excluding tx fees)", 4 | "files": [ 5 | { 6 | "hash": "48cc4472dae3afe90f2a2cdffcfdae0a", 7 | "n_bytes": 5690786, 8 | "name": "ethereum_native_transfers__v1_0_0__00000000_to_00199999.parquet" 9 | }, 10 | { 11 | "hash": "de4c0e0aaad27071dd42ceb5d972dfbb", 12 | "n_bytes": 11331350, 13 | "name": "ethereum_native_transfers__v1_0_0__00200000_to_00399999.parquet" 14 | }, 15 | { 16 | "hash": "eac20eaec5cf632d171a00775529386d", 17 | "n_bytes": 12736409, 18 | "name": "ethereum_native_transfers__v1_0_0__00400000_to_00599999.parquet" 19 | }, 20 | { 21 | "hash": "07effd621987733a5ff51e247bb26dd3", 22 | "n_bytes": 15982188, 23 | "name": "ethereum_native_transfers__v1_0_0__00600000_to_00799999.parquet" 24 | }, 25 | { 26 | "hash": "b03398e16152ee692d4c1197cd2db915", 27 | "n_bytes": 24712485, 28 | "name": "ethereum_native_transfers__v1_0_0__00800000_to_00999999.parquet" 29 | }, 30 | { 31 | "hash": "8b571782b37a3f1478e3ceebf75895cb", 32 | "n_bytes": 45226256, 33 | "name": "ethereum_native_transfers__v1_0_0__01000000_to_01199999.parquet" 34 | }, 35 | { 36 | "hash": "9eac0d7cd3625b9c3718ec9241bc0ee1", 37 | "n_bytes": 55216367, 38 | "name": "ethereum_native_transfers__v1_0_0__01200000_to_01399999.parquet" 39 | }, 40 | { 41 | "hash": "3a55f7070511cfbc561832e21df7ac4b", 42 | "n_bytes": 72211149, 43 | "name": "ethereum_native_transfers__v1_0_0__01400000_to_01599999.parquet" 44 | }, 45 | { 46 | "hash": "4195473710d5a27a24cbce960a6f353d", 47 | "n_bytes": 75430132, 48 | "name": "ethereum_native_transfers__v1_0_0__01600000_to_01799999.parquet" 49 | }, 50 | { 51 | "hash": "e99b0c8dbd2110566f3759b593c559b9", 52 | "n_bytes": 79432031, 53 | "name": "ethereum_native_transfers__v1_0_0__01800000_to_01999999.parquet" 54 | }, 55 | { 56 | "hash": "1878bc4b8c6b750d3c3102064ff31980", 57 | "n_bytes": 82707629, 58 | "name": "ethereum_native_transfers__v1_0_0__02000000_to_02199999.parquet" 59 | }, 60 | { 61 | "hash": "c707da6438970db942fc984d9ee3e577", 62 | "n_bytes": 124886832, 63 | "name": "ethereum_native_transfers__v1_0_0__02200000_to_02399999.parquet" 64 | }, 65 | { 66 | "hash": "c183e61f0704585d37454ca16ec5a15d", 67 | "n_bytes": 69677040, 68 | "name": "ethereum_native_transfers__v1_0_0__02400000_to_02599999.parquet" 69 | }, 70 | { 71 | "hash": "f4b9c196453e68490c44f555399c0847", 72 | "n_bytes": 69608858, 73 | "name": "ethereum_native_transfers__v1_0_0__02600000_to_02799999.parquet" 74 | }, 75 | { 76 | "hash": "6c91ee04199ce9bc8415adbb972fcadc", 77 | "n_bytes": 68858529, 78 | "name": "ethereum_native_transfers__v1_0_0__02800000_to_02999999.parquet" 79 | }, 80 | { 81 | "hash": "f653d38c01959e5a6ebd92790c728226", 82 | "n_bytes": 73834555, 83 | "name": "ethereum_native_transfers__v1_0_0__03000000_to_03199999.parquet" 84 | }, 85 | { 86 | "hash": "bdba4b5171bbfc480003425e5e6f1fa3", 87 | "n_bytes": 111175493, 88 | "name": "ethereum_native_transfers__v1_0_0__03200000_to_03399999.parquet" 89 | }, 90 | { 91 | "hash": "b3a189d018880ad8be30a8f3b29b0fd3", 92 | "n_bytes": 143441746, 93 | "name": "ethereum_native_transfers__v1_0_0__03400000_to_03599999.parquet" 94 | }, 95 | { 96 | "hash": "1fba8d7018a320a5fa0dda2dd99d47b1", 97 | "n_bytes": 244566467, 98 | "name": 
"ethereum_native_transfers__v1_0_0__03600000_to_03799999.parquet" 99 | }, 100 | { 101 | "hash": "644c4fecfdb7e240abed67c8dd299e09", 102 | "n_bytes": 491904504, 103 | "name": "ethereum_native_transfers__v1_0_0__03800000_to_03999999.parquet" 104 | }, 105 | { 106 | "hash": "747797caf009a871a39d1e8488ef1bba", 107 | "n_bytes": 621409562, 108 | "name": "ethereum_native_transfers__v1_0_0__04000000_to_04199999.parquet" 109 | }, 110 | { 111 | "hash": "90538dc4957d907e97b8736d15a5e98a", 112 | "n_bytes": 843481817, 113 | "name": "ethereum_native_transfers__v1_0_0__04200000_to_04399999.parquet" 114 | }, 115 | { 116 | "hash": "c4afb09f7500af27f75d25f215ed7d6d", 117 | "n_bytes": 541175455, 118 | "name": "ethereum_native_transfers__v1_0_0__04400000_to_04599999.parquet" 119 | }, 120 | { 121 | "hash": "d57db6ec0071750304112c534687306c", 122 | "n_bytes": 1172756317, 123 | "name": "ethereum_native_transfers__v1_0_0__04600000_to_04799999.parquet" 124 | }, 125 | { 126 | "hash": "56c9f2fdbd9111b591e338c46117d6e7", 127 | "n_bytes": 1834495623, 128 | "name": "ethereum_native_transfers__v1_0_0__04800000_to_04999999.parquet" 129 | }, 130 | { 131 | "hash": "dd7578adca464d60d6017b6aa5533149", 132 | "n_bytes": 1069985060, 133 | "name": "ethereum_native_transfers__v1_0_0__05000000_to_05199999.parquet" 134 | }, 135 | { 136 | "hash": "f0b4975e9167f6be5c5f8a3a44c38daa", 137 | "n_bytes": 807066352, 138 | "name": "ethereum_native_transfers__v1_0_0__05200000_to_05399999.parquet" 139 | }, 140 | { 141 | "hash": "01fa37f1e9f4889583af7dd48c6af413", 142 | "n_bytes": 920702621, 143 | "name": "ethereum_native_transfers__v1_0_0__05400000_to_05599999.parquet" 144 | }, 145 | { 146 | "hash": "7df8235ac272f0061679aaea9d956aba", 147 | "n_bytes": 937126477, 148 | "name": "ethereum_native_transfers__v1_0_0__05600000_to_05799999.parquet" 149 | }, 150 | { 151 | "hash": "49f3491190048339aa2e5d56f82f1582", 152 | "n_bytes": 739518170, 153 | "name": "ethereum_native_transfers__v1_0_0__05800000_to_05999999.parquet" 154 | }, 155 | { 156 | "hash": "879cecaae2077ae3f047e416f4218556", 157 | "n_bytes": 781918979, 158 | "name": "ethereum_native_transfers__v1_0_0__06000000_to_06199999.parquet" 159 | }, 160 | { 161 | "hash": "daa82a38b90b633c330ced9463e4ba3e", 162 | "n_bytes": 674568646, 163 | "name": "ethereum_native_transfers__v1_0_0__06200000_to_06399999.parquet" 164 | }, 165 | { 166 | "hash": "716a3c88a5a948a44d4d8d8a8f94084e", 167 | "n_bytes": 617115529, 168 | "name": "ethereum_native_transfers__v1_0_0__06400000_to_06599999.parquet" 169 | }, 170 | { 171 | "hash": "7263265adf1c75686655dd8bf22b54f3", 172 | "n_bytes": 637247843, 173 | "name": "ethereum_native_transfers__v1_0_0__06600000_to_06799999.parquet" 174 | }, 175 | { 176 | "hash": "f28a4ea84fdd1b2a76155f80bd3a20c4", 177 | "n_bytes": 615714596, 178 | "name": "ethereum_native_transfers__v1_0_0__06800000_to_06999999.parquet" 179 | }, 180 | { 181 | "hash": "5d4d19ec322b8058ebee1e56e48c9efc", 182 | "n_bytes": 627526606, 183 | "name": "ethereum_native_transfers__v1_0_0__07000000_to_07199999.parquet" 184 | }, 185 | { 186 | "hash": "e65df6724a58a57b4b2dfb360eb9b4c2", 187 | "n_bytes": 617488778, 188 | "name": "ethereum_native_transfers__v1_0_0__07200000_to_07399999.parquet" 189 | }, 190 | { 191 | "hash": "95b6fe1605afa22e70bf15dd791bb598", 192 | "n_bytes": 659556612, 193 | "name": "ethereum_native_transfers__v1_0_0__07400000_to_07599999.parquet" 194 | }, 195 | { 196 | "hash": "f8a731183326f6ae7d0269cb38e8d354", 197 | "n_bytes": 695213366, 198 | "name": 
"ethereum_native_transfers__v1_0_0__07600000_to_07799999.parquet" 199 | }, 200 | { 201 | "hash": "ff0377618f7343bc6ccf634d63e85f74", 202 | "n_bytes": 764704854, 203 | "name": "ethereum_native_transfers__v1_0_0__07800000_to_07999999.parquet" 204 | }, 205 | { 206 | "hash": "5528152d4d71d2a80b29a99600f6a2b6", 207 | "n_bytes": 658004756, 208 | "name": "ethereum_native_transfers__v1_0_0__08000000_to_08199999.parquet" 209 | }, 210 | { 211 | "hash": "fb1260145d0729301dbe3fcf98b0f926", 212 | "n_bytes": 588192142, 213 | "name": "ethereum_native_transfers__v1_0_0__08200000_to_08399999.parquet" 214 | }, 215 | { 216 | "hash": "edcacec2a0b0931eb22b341eca353bdf", 217 | "n_bytes": 548307705, 218 | "name": "ethereum_native_transfers__v1_0_0__08400000_to_08599999.parquet" 219 | }, 220 | { 221 | "hash": "b1c360c12ba515b8d24abb7ead463b67", 222 | "n_bytes": 498643017, 223 | "name": "ethereum_native_transfers__v1_0_0__08600000_to_08799999.parquet" 224 | }, 225 | { 226 | "hash": "c8045e5a30a070fd89b0d6fb53f7d62f", 227 | "n_bytes": 520471903, 228 | "name": "ethereum_native_transfers__v1_0_0__08800000_to_08999999.parquet" 229 | }, 230 | { 231 | "hash": "3eab3ac719978e273da51b6f62fca8a3", 232 | "n_bytes": 534387942, 233 | "name": "ethereum_native_transfers__v1_0_0__09000000_to_09199999.parquet" 234 | }, 235 | { 236 | "hash": "afd242d4cc1ab2731e8bdb9faf362903", 237 | "n_bytes": 458727305, 238 | "name": "ethereum_native_transfers__v1_0_0__09200000_to_09399999.parquet" 239 | }, 240 | { 241 | "hash": "9fd3aeb9cd4256dff43c24a9dc2109ec", 242 | "n_bytes": 557555799, 243 | "name": "ethereum_native_transfers__v1_0_0__09400000_to_09599999.parquet" 244 | }, 245 | { 246 | "hash": "0a8b0784fc6caf485d817f67afdfdac8", 247 | "n_bytes": 586230193, 248 | "name": "ethereum_native_transfers__v1_0_0__09600000_to_09799999.parquet" 249 | }, 250 | { 251 | "hash": "994df72a68a3b1e346a8d9683ad50f08", 252 | "n_bytes": 702357167, 253 | "name": "ethereum_native_transfers__v1_0_0__09800000_to_09999999.parquet" 254 | }, 255 | { 256 | "hash": "130e0deb357f3b831076625213847120", 257 | "n_bytes": 832679303, 258 | "name": "ethereum_native_transfers__v1_0_0__10000000_to_10199999.parquet" 259 | }, 260 | { 261 | "hash": "ef7bc9bf89360105c5c2ea49283d0d69", 262 | "n_bytes": 898361669, 263 | "name": "ethereum_native_transfers__v1_0_0__10200000_to_10399999.parquet" 264 | }, 265 | { 266 | "hash": "e00932ab8b79f669acab9a84a88db9dc", 267 | "n_bytes": 1098405632, 268 | "name": "ethereum_native_transfers__v1_0_0__10400000_to_10599999.parquet" 269 | }, 270 | { 271 | "hash": "2a116bbaecaeb4b8074048cf8f09b42a", 272 | "n_bytes": 1066257859, 273 | "name": "ethereum_native_transfers__v1_0_0__10600000_to_10799999.parquet" 274 | }, 275 | { 276 | "hash": "ae25ed040f74e86be557f1649b9e7868", 277 | "n_bytes": 945348380, 278 | "name": "ethereum_native_transfers__v1_0_0__10800000_to_10999999.parquet" 279 | }, 280 | { 281 | "hash": "8214c1b997afeafe151e51af79f58042", 282 | "n_bytes": 955687252, 283 | "name": "ethereum_native_transfers__v1_0_0__11000000_to_11199999.parquet" 284 | }, 285 | { 286 | "hash": "8ad8a297a2ff8af9a251ae9de2468e15", 287 | "n_bytes": 996245659, 288 | "name": "ethereum_native_transfers__v1_0_0__11200000_to_11399999.parquet" 289 | }, 290 | { 291 | "hash": "8781baab23d4371b8b167abbfc72e3cd", 292 | "n_bytes": 1069832245, 293 | "name": "ethereum_native_transfers__v1_0_0__11400000_to_11599999.parquet" 294 | }, 295 | { 296 | "hash": "f3bb049fdb1e7ac901bd87eacb7fc0be", 297 | "n_bytes": 1205906693, 298 | "name": 
"ethereum_native_transfers__v1_0_0__11600000_to_11799999.parquet" 299 | }, 300 | { 301 | "hash": "f6b91916959cc1c22784871e2861b673", 302 | "n_bytes": 1289157248, 303 | "name": "ethereum_native_transfers__v1_0_0__11800000_to_11999999.parquet" 304 | }, 305 | { 306 | "hash": "1d256b55145d1d3aa7827d6138c71c41", 307 | "n_bytes": 1371643335, 308 | "name": "ethereum_native_transfers__v1_0_0__12000000_to_12199999.parquet" 309 | }, 310 | { 311 | "hash": "41b9d5eea89cf9690e1693bc7072d9a7", 312 | "n_bytes": 1608330834, 313 | "name": "ethereum_native_transfers__v1_0_0__12200000_to_12399999.parquet" 314 | }, 315 | { 316 | "hash": "545735c48a6b3ffe2a4c37577e3cadee", 317 | "n_bytes": 1524857332, 318 | "name": "ethereum_native_transfers__v1_0_0__12400000_to_12599999.parquet" 319 | }, 320 | { 321 | "hash": "1b247ca25341ccd7a3c156ae50f34c7a", 322 | "n_bytes": 1266490013, 323 | "name": "ethereum_native_transfers__v1_0_0__12600000_to_12799999.parquet" 324 | }, 325 | { 326 | "hash": "b61ee91e56277855a79f02f59372e5c5", 327 | "n_bytes": 1319524141, 328 | "name": "ethereum_native_transfers__v1_0_0__12800000_to_12999999.parquet" 329 | }, 330 | { 331 | "hash": "29b03078218311bb9b991760a866eb64", 332 | "n_bytes": 1333697482, 333 | "name": "ethereum_native_transfers__v1_0_0__13000000_to_13199999.parquet" 334 | }, 335 | { 336 | "hash": "93d5aaa4484f4133e87d67f57bbcead0", 337 | "n_bytes": 1373377723, 338 | "name": "ethereum_native_transfers__v1_0_0__13200000_to_13399999.parquet" 339 | }, 340 | { 341 | "hash": "300021d0fe02e888a204a92de2f35d36", 342 | "n_bytes": 1536096326, 343 | "name": "ethereum_native_transfers__v1_0_0__13400000_to_13599999.parquet" 344 | }, 345 | { 346 | "hash": "d402839ed968a45b6e0113f44f433bff", 347 | "n_bytes": 1491833900, 348 | "name": "ethereum_native_transfers__v1_0_0__13600000_to_13799999.parquet" 349 | }, 350 | { 351 | "hash": "e0e72aa8516aa88139e7d61907544c1d", 352 | "n_bytes": 1427555638, 353 | "name": "ethereum_native_transfers__v1_0_0__13800000_to_13999999.parquet" 354 | }, 355 | { 356 | "hash": "9c969d5798f70fd632371116c885f44d", 357 | "n_bytes": 1433320799, 358 | "name": "ethereum_native_transfers__v1_0_0__14000000_to_14199999.parquet" 359 | }, 360 | { 361 | "hash": "004bca3751e8a1e280bf478087ab0759", 362 | "n_bytes": 1385189844, 363 | "name": "ethereum_native_transfers__v1_0_0__14200000_to_14399999.parquet" 364 | }, 365 | { 366 | "hash": "6d48c40d2f8a9798b1cd19a960cc9bf1", 367 | "n_bytes": 1385243142, 368 | "name": "ethereum_native_transfers__v1_0_0__14400000_to_14599999.parquet" 369 | }, 370 | { 371 | "hash": "4ebe41010502841324893c461cde65bf", 372 | "n_bytes": 1397068928, 373 | "name": "ethereum_native_transfers__v1_0_0__14600000_to_14799999.parquet" 374 | }, 375 | { 376 | "hash": "a4c5e82f9fecf54d273a6fd8777f9217", 377 | "n_bytes": 1292891099, 378 | "name": "ethereum_native_transfers__v1_0_0__14800000_to_14999999.parquet" 379 | }, 380 | { 381 | "hash": "ef2f477dfba75fca6ee49f8cf979dbcb", 382 | "n_bytes": 1319237305, 383 | "name": "ethereum_native_transfers__v1_0_0__15000000_to_15199999.parquet" 384 | }, 385 | { 386 | "hash": "4cf025c9923535c2975f6e5eaa39662f", 387 | "n_bytes": 1398570730, 388 | "name": "ethereum_native_transfers__v1_0_0__15200000_to_15399999.parquet" 389 | }, 390 | { 391 | "hash": "82bf15b457879cb12b6f96d6ab969aa0", 392 | "n_bytes": 1285511893, 393 | "name": "ethereum_native_transfers__v1_0_0__15400000_to_15599999.parquet" 394 | }, 395 | { 396 | "hash": "3405eff80cdbd50f6fae9f12c3c1651c", 397 | "n_bytes": 1102072040, 398 | "name": 
"ethereum_native_transfers__v1_0_0__15600000_to_15799999.parquet" 399 | }, 400 | { 401 | "hash": "308f47604aae7728fc6d8419c42967de", 402 | "n_bytes": 1023139324, 403 | "name": "ethereum_native_transfers__v1_0_0__15800000_to_15999999.parquet" 404 | }, 405 | { 406 | "hash": "847821dbbc7d6a4c71cf980f87b16029", 407 | "n_bytes": 1059039821, 408 | "name": "ethereum_native_transfers__v1_0_0__16000000_to_16199999.parquet" 409 | }, 410 | { 411 | "hash": "f8cbbbbaaf51a94f816c43880236984c", 412 | "n_bytes": 1004807721, 413 | "name": "ethereum_native_transfers__v1_0_0__16200000_to_16399999.parquet" 414 | }, 415 | { 416 | "hash": "3c81ab319fdebd8da11e92d995d9e37d", 417 | "n_bytes": 1020162220, 418 | "name": "ethereum_native_transfers__v1_0_0__16400000_to_16599999.parquet" 419 | }, 420 | { 421 | "hash": "54fbd2256b2fb5ce89b8dc9021b80c47", 422 | "n_bytes": 1073940594, 423 | "name": "ethereum_native_transfers__v1_0_0__16600000_to_16799999.parquet" 424 | } 425 | ], 426 | "name": "ethereum_native_transfers", 427 | "network": "ethereum", 428 | "schema": { 429 | "description": "all native transfers in similar format to ERC20 Transfers (excluding tx fees)", 430 | "tables": { 431 | "native_transfers": { 432 | "columns": [ 433 | { 434 | "description": "block number where native token was transfered", 435 | "name": "block_number", 436 | "type": "INTEGER" 437 | }, 438 | { 439 | "description": "increased by 1 for each native transfer in block", 440 | "name": "transfer_index", 441 | "type": "INTEGER" 442 | }, 443 | { 444 | "description": "hash of transaction that contains transfer", 445 | "name": "transaction_hash", 446 | "type": "BINARY" 447 | }, 448 | { 449 | "description": "address that native token is transferred to", 450 | "name": "to_address", 451 | "type": "BINARY" 452 | }, 453 | { 454 | "description": "address that native token is transferred from", 455 | "name": "from_address", 456 | "type": "BINARY" 457 | }, 458 | { 459 | "description": "amount of native token transferred", 460 | "name": "value", 461 | "type": "BINARY" 462 | } 463 | ], 464 | "description": "each row corresponds to a trace that transfers native token" 465 | } 466 | } 467 | }, 468 | "version": "1.0.0" 469 | } -------------------------------------------------------------------------------- /datasets/ethereum_slots/LICENSE-CC0: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 
20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 
122 | -------------------------------------------------------------------------------- /datasets/ethereum_slots/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Ethereum Slots Dataset v1.0.0 3 | 4 | This is a dataset of all slots of each contract, including historical usage metadata 5 | 6 | The dataset was created by using [this script](https://github.com/paradigmxyz/paradigm-data-portal/blob/main/pdp/datasets/slots/slots_collect.py) 7 | 8 | Data is distributed as [parquet](https://data.paradigm.xyz/about) files and released into the public domain under a [CC0 license](https://creativecommons.org/share-your-work/public-domain/cc0/) 9 | 10 | ## Usage 11 | 12 | Some example uses of this dataset include: 13 | - look up how much storage space is used by a given contract 14 | - look up which slots are used by a given contract 15 | - look up which slots change most frequently for a given contract 16 | 17 | 18 | 19 | ## Schema 20 | 21 | #### `slots` table 22 | each row corresponds to a slot of a contract 23 | | column | type | description | 24 | | - | - | - | 25 | | contract_address | BINARY | contract of slot | 26 | | slot | BINARY | address of slot | 27 | | value | BINARY | last data stored in slot | 28 | | first_updated_block | INTEGER | first block where slot was used | 29 | | last_updated_block | INTEGER | last block where slot was updated | 30 | | n_tx_updates | INTEGER | number of transactions that updated slot | 31 | 32 | ## Download 33 | 34 | This dataset can be downloaded using either the `pdp` cli tool or the urls below 35 | 36 | The total dataset size is **38.38GB** 37 | 38 | ### Use `pdp` 39 | 40 | The command `pdp download ethereum_slots` will download all files in this dataset 41 | 42 | See `pdp download -h` for available options 43 | 44 | ### Use URLs 45 | 46 | | | file | size | 47 | | - | - | - | 48 | | 1 | [ethereum_slots__v1.0.0__0x00_to_0x0f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x00_to_0x0f.parquet) | 3.80GB | 49 | | 2 | [ethereum_slots__v1.0.0__0x10_to_0x1f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x10_to_0x1f.parquet) | 1.92GB | 50 | | 3 | [ethereum_slots__v1.0.0__0x20_to_0x2f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x20_to_0x2f.parquet) | 2.23GB | 51 | | 4 | [ethereum_slots__v1.0.0__0x30_to_0x3f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x30_to_0x3f.parquet) | 2.00GB | 52 | | 5 | [ethereum_slots__v1.0.0__0x40_to_0x4f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x40_to_0x4f.parquet) | 2.15GB | 53 | | 6 | [ethereum_slots__v1.0.0__0x50_to_0x5f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x50_to_0x5f.parquet) | 3.09GB | 54 | | 7 | [ethereum_slots__v1.0.0__0x60_to_0x6f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x60_to_0x6f.parquet) | 1.86GB | 55 | | 8 | [ethereum_slots__v1.0.0__0x70_to_0x7f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x70_to_0x7f.parquet) | 3.21GB | 56 | | 9 | [ethereum_slots__v1.0.0__0x80_to_0x8f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x80_to_0x8f.parquet) | 2.51GB | 57 | | 10 | 
[ethereum_slots__v1.0.0__0x90_to_0x9f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x90_to_0x9f.parquet) | 1.98GB | 58 | | 11 | [ethereum_slots__v1.0.0__0xa0_to_0xaf.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0xa0_to_0xaf.parquet) | 2.72GB | 59 | | 12 | [ethereum_slots__v1.0.0__0xb0_to_0xbf.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0xb0_to_0xbf.parquet) | 2.01GB | 60 | | 13 | [ethereum_slots__v1.0.0__0xc0_to_0xcf.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0xc0_to_0xcf.parquet) | 2.12GB | 61 | | 14 | [ethereum_slots__v1.0.0__0xd0_to_0xdf.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0xd0_to_0xdf.parquet) | 3.32GB | 62 | | 15 | [ethereum_slots__v1.0.0__0xe0_to_0xef.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0xe0_to_0xef.parquet) | 1.50GB | 63 | | 16 | [ethereum_slots__v1.0.0__0xf0_to_0xff.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0xf0_to_0xff.parquet) | 1.97GB | 64 | -------------------------------------------------------------------------------- /datasets/ethereum_slots/dataset_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "datatype": "slots", 3 | "description": "all slots of each contract, including historical usage metadata", 4 | "files": [ 5 | { 6 | "hash": "0fe90a691805d4e6ecadef3326701ead", 7 | "n_bytes": 4082875020, 8 | "name": "ethereum_slots__v1.0.0__0x00_to_0x0f.parquet" 9 | }, 10 | { 11 | "hash": "ee5bbec43eb1c31f5217237211dc725f", 12 | "n_bytes": 2058888792, 13 | "name": "ethereum_slots__v1.0.0__0x10_to_0x1f.parquet" 14 | }, 15 | { 16 | "hash": "d6002aadaf62df7806c502e8ee5cb21d", 17 | "n_bytes": 2394828784, 18 | "name": "ethereum_slots__v1.0.0__0x20_to_0x2f.parquet" 19 | }, 20 | { 21 | "hash": "9acff8c6635d857a43b1cdc2ddf59574", 22 | "n_bytes": 2147275687, 23 | "name": "ethereum_slots__v1.0.0__0x30_to_0x3f.parquet" 24 | }, 25 | { 26 | "hash": "15d7ab03c7efad6988f9a4340bd6b65e", 27 | "n_bytes": 2310360858, 28 | "name": "ethereum_slots__v1.0.0__0x40_to_0x4f.parquet" 29 | }, 30 | { 31 | "hash": "4fb3e9b0a420e93a338988752f69d407", 32 | "n_bytes": 3315134125, 33 | "name": "ethereum_slots__v1.0.0__0x50_to_0x5f.parquet" 34 | }, 35 | { 36 | "hash": "690aa995ef17c326f1a1e7fcdd12de02", 37 | "n_bytes": 1999353262, 38 | "name": "ethereum_slots__v1.0.0__0x60_to_0x6f.parquet" 39 | }, 40 | { 41 | "hash": "0d72d635f47828739f6c9abc1e7146b3", 42 | "n_bytes": 3451637035, 43 | "name": "ethereum_slots__v1.0.0__0x70_to_0x7f.parquet" 44 | }, 45 | { 46 | "hash": "a2d663f62f9aba90607de8538f46b141", 47 | "n_bytes": 2692255804, 48 | "name": "ethereum_slots__v1.0.0__0x80_to_0x8f.parquet" 49 | }, 50 | { 51 | "hash": "ea374784b0f541e5744990e6a5663dee", 52 | "n_bytes": 2126238030, 53 | "name": "ethereum_slots__v1.0.0__0x90_to_0x9f.parquet" 54 | }, 55 | { 56 | "hash": "ebba9199de2373fd2875852f86f8f1b8", 57 | "n_bytes": 2920825856, 58 | "name": "ethereum_slots__v1.0.0__0xa0_to_0xaf.parquet" 59 | }, 60 | { 61 | "hash": "effc1b8a42cd412d0d6976924261a15e", 62 | "n_bytes": 2153804795, 63 | "name": "ethereum_slots__v1.0.0__0xb0_to_0xbf.parquet" 64 | }, 65 | { 66 | "hash": "988cf08e232daf2ddfa2a28316fee31e", 67 | "n_bytes": 2271461320, 68 | "name": "ethereum_slots__v1.0.0__0xc0_to_0xcf.parquet" 69 | }, 70 | { 71 | "hash": "13509d1fe410d3d0877f03f395fbff6d", 72 
| "n_bytes": 3562804550, 73 | "name": "ethereum_slots__v1.0.0__0xd0_to_0xdf.parquet" 74 | }, 75 | { 76 | "hash": "26fa7fcaf6cd0d7781cc9fb21143e936", 77 | "n_bytes": 1610498317, 78 | "name": "ethereum_slots__v1.0.0__0xe0_to_0xef.parquet" 79 | }, 80 | { 81 | "hash": "94344f169fd8fb9f1d7fa8c35e9cc427", 82 | "n_bytes": 2114482565, 83 | "name": "ethereum_slots__v1.0.0__0xf0_to_0xff.parquet" 84 | } 85 | ], 86 | "name": "ethereum_slots", 87 | "network": "ethereum", 88 | "schema": { 89 | "description": "all slots of each contract, including historical usage metadata", 90 | "tables": { 91 | "slots": { 92 | "columns": [ 93 | { 94 | "description": "contract of slot", 95 | "name": "contract_address", 96 | "type": "BINARY" 97 | }, 98 | { 99 | "description": "address of slot", 100 | "name": "slot", 101 | "type": "BINARY" 102 | }, 103 | { 104 | "description": "last data stored in slot", 105 | "name": "value", 106 | "type": "BINARY" 107 | }, 108 | { 109 | "description": "first block where slot was used", 110 | "name": "first_updated_block", 111 | "type": "INTEGER" 112 | }, 113 | { 114 | "description": "last block where slot was updated", 115 | "name": "last_updated_block", 116 | "type": "INTEGER" 117 | }, 118 | { 119 | "description": "number of transactions that updated slot", 120 | "name": "n_tx_updates", 121 | "type": "INTEGER" 122 | } 123 | ], 124 | "description": "each row corresponds to a slot of a contract" 125 | } 126 | } 127 | }, 128 | "version": "1.0.0" 129 | } -------------------------------------------------------------------------------- /datasets/global_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "datasets": { 3 | "ethereum_contracts": { 4 | "datatype": "contracts", 5 | "description": "all historical contract deployments", 6 | "n_bytes": 7051921809, 7 | "n_files": 17, 8 | "name": "ethereum_contracts", 9 | "network": "ethereum", 10 | "schema": { 11 | "description": "all historical contract deployments", 12 | "tables": { 13 | "contracts": { 14 | "columns": [ 15 | { 16 | "description": "block number when contract was created", 17 | "name": "block_number", 18 | "type": "INTEGER" 19 | }, 20 | { 21 | "description": "increased by 1 for each contract created in block", 22 | "name": "create_index", 23 | "type": "INTEGER" 24 | }, 25 | { 26 | "description": "hash of transaction that created contract", 27 | "name": "transaction_hash", 28 | "type": "BINARY" 29 | }, 30 | { 31 | "description": "address of deployed contract", 32 | "name": "contract_address", 33 | "type": "BINARY" 34 | }, 35 | { 36 | "description": "EOA that deployed the contract", 37 | "name": "deployer", 38 | "type": "BINARY" 39 | }, 40 | { 41 | "description": "the `from` field in the creation trace", 42 | "name": "factory", 43 | "type": "BINARY" 44 | }, 45 | { 46 | "description": "initialization bytecode of contract", 47 | "name": "init_code", 48 | "type": "BINARY" 49 | }, 50 | { 51 | "description": "bytecode of contract", 52 | "name": "code", 53 | "type": "BINARY" 54 | }, 55 | { 56 | "description": "keccak hash of contract initialization code", 57 | "name": "init_code_hash", 58 | "type": "BINARY" 59 | }, 60 | { 61 | "description": "keccak hash of contract bytecode", 62 | "name": "code_hash", 63 | "type": "BINARY" 64 | } 65 | ], 66 | "description": "each row corresponds to a contract create trace" 67 | } 68 | } 69 | }, 70 | "version": "1.0.0" 71 | }, 72 | "ethereum_native_transfers": { 73 | "datatype": "native_transfers", 74 | "description": "all native transfers in similar 
format to ERC20 Transfers (excluding tx fees)", 75 | "n_bytes": 65501766122, 76 | "n_files": 84, 77 | "name": "ethereum_native_transfers", 78 | "network": "ethereum", 79 | "schema": { 80 | "description": "all native transfers in similar format to ERC20 Transfers (excluding tx fees)", 81 | "tables": { 82 | "native_transfers": { 83 | "columns": [ 84 | { 85 | "description": "block number where native token was transfered", 86 | "name": "block_number", 87 | "type": "INTEGER" 88 | }, 89 | { 90 | "description": "increased by 1 for each native transfer in block", 91 | "name": "transfer_index", 92 | "type": "INTEGER" 93 | }, 94 | { 95 | "description": "hash of transaction that contains transfer", 96 | "name": "transaction_hash", 97 | "type": "BINARY" 98 | }, 99 | { 100 | "description": "address that native token is transferred to", 101 | "name": "to_address", 102 | "type": "BINARY" 103 | }, 104 | { 105 | "description": "address that native token is transferred from", 106 | "name": "from_address", 107 | "type": "BINARY" 108 | }, 109 | { 110 | "description": "amount of native token transferred", 111 | "name": "value", 112 | "type": "BINARY" 113 | } 114 | ], 115 | "description": "each row corresponds to a trace that transfers native token" 116 | } 117 | } 118 | }, 119 | "version": "1.0.0" 120 | }, 121 | "ethereum_slots": { 122 | "datatype": "slots", 123 | "description": "all slots of each contract, including historical usage metadata", 124 | "n_bytes": 41212724800, 125 | "n_files": 16, 126 | "name": "ethereum_slots", 127 | "network": "ethereum", 128 | "schema": { 129 | "description": "all slots of each contract, including historical usage metadata", 130 | "tables": { 131 | "slots": { 132 | "columns": [ 133 | { 134 | "description": "contract of slot", 135 | "name": "contract_address", 136 | "type": "BINARY" 137 | }, 138 | { 139 | "description": "address of slot", 140 | "name": "slot", 141 | "type": "BINARY" 142 | }, 143 | { 144 | "description": "last data stored in slot", 145 | "name": "value", 146 | "type": "BINARY" 147 | }, 148 | { 149 | "description": "first block where slot was used", 150 | "name": "first_updated_block", 151 | "type": "INTEGER" 152 | }, 153 | { 154 | "description": "last block where slot was updated", 155 | "name": "last_updated_block", 156 | "type": "INTEGER" 157 | }, 158 | { 159 | "description": "number of transactions that updated slot", 160 | "name": "n_tx_updates", 161 | "type": "INTEGER" 162 | } 163 | ], 164 | "description": "each row corresponds to a slot of a contract" 165 | } 166 | } 167 | }, 168 | "version": "1.0.0" 169 | } 170 | }, 171 | "version": "1.0.0" 172 | } -------------------------------------------------------------------------------- /pdp/__init__.py: -------------------------------------------------------------------------------- 1 | """pdp downloads and manages datasets from the Paradigm Data Portal""" 2 | 3 | from .config_utils import * 4 | from .data_utils import * 5 | from .spec import * 6 | 7 | 8 | __version__ = '0.2.2' 9 | 10 | -------------------------------------------------------------------------------- /pdp/__main__.py: -------------------------------------------------------------------------------- 1 | 2 | if __name__ == '__main__': 3 | import pdp.cli.cli_run 4 | 5 | pdp.cli.cli_run.run_cli() 6 | 7 | -------------------------------------------------------------------------------- /pdp/cli/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/paradigmxyz/paradigm-data-portal/335f0c8c932bdf1295c67028b11de73a8d92384e/pdp/cli/__init__.py -------------------------------------------------------------------------------- /pdp/cli/cli_run.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import typing 5 | 6 | import toolcli 7 | 8 | import pdp 9 | 10 | 11 | def cd_dir_help() -> typing.Mapping[str, str]: 12 | dir_dict = { 13 | '\[data_root]': 'directory where pdp datasets are stored', 14 | } 15 | local_datasets = pdp.get_local_datasets() 16 | for dataset in local_datasets: 17 | manifest = pdp.get_dataset_manifest(dataset, source='local') 18 | dir_dict[dataset] = manifest['description'] 19 | return dir_dict 20 | 21 | 22 | def cd_dir_getter(dirname: str) -> str: 23 | data_root = pdp.get_data_root(require=False) 24 | if data_root is None: 25 | raise toolcli.CDException('must set PDP_DATA_ROOT env var') 26 | 27 | if dirname in ['', 'data_root']: 28 | return data_root 29 | else: 30 | dirpath = os.path.join(data_root, dirname) 31 | if not os.path.isdir(dirpath): 32 | raise Exception('not a directory: ' + str(dirname)) 33 | if not os.path.isfile( 34 | os.path.join(dirpath, pdp.dataset_manifest_filename) 35 | ): 36 | import toolstr 37 | 38 | toolstr.print( 39 | 'no manifest file ' 40 | + pdp.dataset_manifest_filename 41 | + ' detected for dataset', 42 | style='red', 43 | ) 44 | 45 | return dirpath 46 | 47 | 48 | def run_cli(raw_command: str | None = None) -> None: 49 | import tempfile 50 | 51 | help_cache_dir = os.path.join(tempfile.gettempdir(), 'pdp', 'help_cache') 52 | 53 | command_index: toolcli.CommandIndex = { 54 | ('',): 'pdp.cli.commands.root_command', 55 | ('collect',): 'pdp.cli.commands.collect_command', 56 | ('dataset',): 'pdp.cli.commands.dataset_command', 57 | ('download',): 'pdp.cli.commands.download_command', 58 | ('help',): 'toolcli.command_utils.standard_subcommands.help_command', 59 | ('ls',): 'pdp.cli.commands.ls_command', 60 | ('package',): 'pdp.cli.commands.package_command', 61 | ('update',): 'pdp.cli.commands.update_command', 62 | ('upload',): 'pdp.cli.commands.upload_command', 63 | ('validate',): 'pdp.cli.commands.validate_command', 64 | ( 65 | 'version', 66 | ): 'toolcli.command_utils.standard_subcommands.version_command', 67 | } 68 | 69 | config: toolcli.CLIConfig = { 70 | 'base_command': 'pdp', 71 | 'description': pdp.__doc__, 72 | 'version': pdp.__version__, 73 | 'include_standard_subcommands': [('cd',)], 74 | 'default_command_sequence': ('help',), 75 | 'include_debug_arg': True, 76 | 'help_cache_dir': help_cache_dir, 77 | 'style_theme': pdp.styles, 78 | 'cd_dir_help': cd_dir_help, 79 | 'cd_dir_getter': cd_dir_getter, 80 | } 81 | 82 | toolcli.run_cli( 83 | command_index=command_index, 84 | config=config, 85 | ) 86 | 87 | -------------------------------------------------------------------------------- /pdp/cli/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paradigmxyz/paradigm-data-portal/335f0c8c932bdf1295c67028b11de73a8d92384e/pdp/cli/commands/__init__.py -------------------------------------------------------------------------------- /pdp/cli/commands/collect_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import typing 5 | 6 | import toolcli 7 | 8 | import pdp 9 | 10 | if typing.TYPE_CHECKING: 11 
| import ctc.spec 12 | 13 | class StandardCollectKwargs(typing.TypedDict): 14 | start_block: int 15 | end_block: int 16 | output_dir: str 17 | network: str 18 | chunk_size: int | None 19 | output_filetype: str | None 20 | executor: typing.Literal['parallel', 'serial'] 21 | verbose: bool 22 | 23 | 24 | help_message = """collect a dataset from RPC nodes or other sources 25 | 26 | collecting on-chain datasets requires ctc to be installed and configured""" 27 | 28 | 29 | def get_command_spec() -> toolcli.CommandSpec: 30 | return { 31 | 'f': collect_command, 32 | 'help': help_message, 33 | 'args': [ 34 | {'name': 'dataset', 'help': 'name of dataset to collect'}, 35 | { 36 | 'name': 'output-dir', 37 | 'nargs': '?', 38 | 'help': 'output directory, omit to use PDP_DATA_ROOT', 39 | }, 40 | { 41 | 'name': ('-b', '--blocks'), 42 | 'help': 'block range, as start_block:end_block:chunk_size', 43 | }, 44 | { 45 | 'name': ('-r', '--rpc'), 46 | 'help': 'rpc node url, omit to use ctc configuration', 47 | }, 48 | { 49 | 'name': ('-f', '--format'), 50 | 'dest': 'output_format', 51 | 'help': 'format of output (parquet or csv)', 52 | }, 53 | { 54 | 'name': ('-s', '--serial'), 55 | 'help': 'use serial execution instead of parallel', 56 | 'action': 'store_true', 57 | }, 58 | { 59 | 'name': ('-v', '--verbose'), 60 | 'help': 'output additional information', 61 | 'action': 'store_true', 62 | }, 63 | { 64 | 'name': ('-e', '--extension'), 65 | 'help': 'extension module.function for dataset collection', 66 | 'hidden': True, 67 | }, 68 | { 69 | 'name': ('-p', '--parameters'), 70 | 'help': 'extra parameters given to collection function', 71 | 'hidden': True, 72 | }, 73 | ], 74 | 'examples': [ 75 | 'ethereum_contracts', 76 | 'ethereum_native_transfers', 77 | 'ethereum_slots --blocks 14_000_000:14_100_000', 78 | ], 79 | } 80 | 81 | 82 | def collect_command( 83 | dataset: str, 84 | blocks: str | None, 85 | rpc: str | None, 86 | output_dir: str | None, 87 | output_format: str | None, 88 | serial: bool, 89 | verbose: bool, 90 | extension: str | None, 91 | parameters: str | None, 92 | ) -> None: 93 | # get context 94 | parsed = pdp.parse_dataset_name(dataset) 95 | datatype = parsed['datatype'] 96 | network = parsed['network'] 97 | if rpc is None: 98 | context: ctc.spec.Context = {'network': network} 99 | else: 100 | context = {'network': network, 'provider': rpc} 101 | 102 | # get block range 103 | if blocks is None: 104 | pdp.ensure_ctc() 105 | import ctc.rpc 106 | 107 | start_block = 0 108 | end_block = ctc.rpc.sync_eth_block_number(context=context) 109 | chunk_size_int = None 110 | else: 111 | pdp.ensure_ctc() 112 | import ctc.cli.cli_utils 113 | 114 | ( 115 | start_block, 116 | end_block, 117 | chunk_size_int, 118 | ) = ctc.cli.cli_utils.sync_parse_block_chunks(blocks) 119 | 120 | # parse output parameters 121 | if output_dir is None: 122 | data_root = pdp.get_data_root(require=False) 123 | if data_root is None or data_root == '': 124 | raise Exception( 125 | 'must specify output_dir or set PDP_DATA_ROOT env var' 126 | ) 127 | else: 128 | output_dir = os.path.join(data_root, dataset) 129 | 130 | if serial: 131 | executor: typing.Literal['parallel', 'serial'] = 'serial' 132 | else: 133 | executor = 'parallel' 134 | 135 | # collect parameters 136 | if parameters is not None: 137 | import ast 138 | 139 | extra_kwargs = ast.literal_eval(parameters) 140 | if not isinstance(extra_kwargs, dict): 141 | raise Exception( 142 | 'extra parameters should be specified with dict syntax' 143 | ) 144 | else: 145 | extra_kwargs = {} 146 | 
standard_kwargs: StandardCollectKwargs = { 147 | 'start_block': start_block, 148 | 'end_block': end_block, 149 | 'output_dir': output_dir, 150 | 'network': network, 151 | 'chunk_size': chunk_size_int, 152 | 'output_filetype': output_format, 153 | 'executor': executor, 154 | 'verbose': verbose, 155 | } 156 | 157 | # 158 | # # perform collection 159 | # 160 | 161 | if datatype == 'contracts': 162 | from pdp.datasets import contracts 163 | 164 | contracts.collect_contracts_dataset(**standard_kwargs, **extra_kwargs) 165 | 166 | elif datatype == 'native_transfers': 167 | from pdp.datasets import native_transfers 168 | 169 | native_transfers.collect_native_transfers_dataset( 170 | **standard_kwargs, **extra_kwargs 171 | ) 172 | 173 | elif datatype == 'slots': 174 | from pdp.datasets import slots 175 | 176 | slots.collect_slots_dataset(**standard_kwargs, **extra_kwargs) 177 | 178 | elif extension is not None: 179 | import importlib 180 | 181 | try: 182 | module_path = ( 183 | extension 184 | + '.datasets.' 185 | + datatype 186 | + '.' 187 | + datatype 188 | + '_collect' 189 | ) 190 | module = importlib.import_module(module_path) 191 | function_name = 'collect_' + datatype + '_dataset' 192 | function = getattr(module, function_name) 193 | except (ValueError, ImportError, AttributeError) as e: 194 | print('invalid extension, could not get extension function: ' + str(e.args[0])) 195 | return 196 | 197 | function(**standard_kwargs, **extra_kwargs) 198 | 199 | else: 200 | raise Exception('invalid datatype: ' + str(datatype)) 201 | 202 | -------------------------------------------------------------------------------- /pdp/cli/commands/dataset_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import toolcli 4 | import toolstr 5 | 6 | import pdp 7 | 8 | 9 | def get_command_spec() -> toolcli.CommandSpec: 10 | return { 11 | 'f': dataset_command, 12 | 'help': 'show info about a dataset', 13 | 'args': [ 14 | {'name': 'dataset', 'help': 'name of dataset'}, 15 | ], 16 | 'examples': [ 17 | 'ethereum_contracts', 18 | 'ethereum_native_transfers', 19 | ], 20 | } 21 | 22 | 23 | def dataset_command(dataset: str) -> None: 24 | manifest = pdp.get_dataset_manifest(dataset) 25 | toolstr.print_text_box( 26 | toolstr.add_style(manifest['name'] + ' dataset', pdp.styles['metavar']), 27 | style=pdp.styles['title'], 28 | ) 29 | toolstr.print_bullet( 30 | key='description', 31 | value=manifest['description'], 32 | styles=pdp.styles, 33 | ) 34 | toolstr.print_bullet( 35 | key='version', 36 | value=manifest['version'], 37 | styles=pdp.styles, 38 | ) 39 | toolstr.print_bullet( 40 | key='n_files', 41 | value=len(manifest['files']), 42 | styles=pdp.styles, 43 | ) 44 | total_size = sum(file['n_bytes'] for file in manifest['files']) 45 | toolstr.print_bullet( 46 | key='total_size', 47 | value=toolstr.format_nbytes(total_size, decimals=1), 48 | styles=pdp.styles, 49 | ) 50 | 51 | print() 52 | for table_name, table in manifest['schema']['tables'].items(): 53 | if len(manifest['schema']['tables']) > 1: 54 | toolstr.print( 55 | table_name + ' table', style=pdp.styles['content'], indent=4 56 | ) 57 | rows = [] 58 | for column in table['columns']: 59 | row = [column['name'], column['type'], column['description']] 60 | rows.append(row) 61 | labels = ['column', 'type', 'description'] 62 | toolstr.print_table( 63 | rows, 64 | labels=labels, 65 | column_styles={ 66 | 'column': pdp.styles['metavar'], 67 | 'type': pdp.styles['content'], 68 | 'description': 
pdp.styles['description'], 69 | }, 70 | # style=pdp.styles['metavar'], 71 | border=pdp.styles['content'], 72 | label_style=pdp.styles['metavar'], 73 | indent=4, 74 | ) 75 | 76 | -------------------------------------------------------------------------------- /pdp/cli/commands/download_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | import toolcli 6 | 7 | import pdp 8 | 9 | 10 | def get_command_spec() -> toolcli.CommandSpec: 11 | return { 12 | 'f': download_command, 13 | 'help': 'download dataset directly from Paradigm data portal', 14 | 'args': [ 15 | {'name': 'dataset', 'help': 'dataset to list info of'}, 16 | {'name': '--output-dir', 'help': 'output directory path'}, 17 | {'name': '--portal-root', 'help': 'root url of data portal'}, 18 | ], 19 | 'examples': [ 20 | 'ethereum_contracts', 21 | 'ethereum_contracts --output-dir /path/to/some/dir', 22 | ], 23 | } 24 | 25 | 26 | def download_command( 27 | dataset: str, 28 | output_dir: str | None, 29 | portal_root: str | None, 30 | ) -> None: 31 | 32 | if output_dir is None: 33 | output_dir = os.path.abspath('.') 34 | 35 | pdp.download_dataset( 36 | dataset=dataset, 37 | output_dir=output_dir, 38 | portal_root=portal_root, 39 | ) 40 | 41 | -------------------------------------------------------------------------------- /pdp/cli/commands/ls_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | 5 | import toolcli 6 | import toolstr 7 | 8 | import pdp 9 | 10 | 11 | def get_command_spec() -> toolcli.CommandSpec: 12 | return { 13 | 'f': ls_command, 14 | 'help': 'list datasets or dataset files', 15 | 'args': [ 16 | { 17 | 'name': 'dataset', 18 | 'help': 'dataset to list info of, omit for global manifest', 19 | 'nargs': '?', 20 | }, 21 | { 22 | 'name': '--hashes', 23 | 'help': 'show md5 hashes of each file', 24 | 'action': 'store_true', 25 | }, 26 | { 27 | 'name': '--urls', 28 | 'help': 'show full urls of each file', 29 | 'action': 'store_true', 30 | }, 31 | {'name': '--portal-root', 'help': 'root url of data portal'}, 32 | ], 33 | 'examples': { 34 | '': 'show all datasets', 35 | 'ethereum_contracts': 'show files of dataset', 36 | 'ethereum_contracts --hashes': 'show files of dataset with hashes', 37 | 'ethereum_contracts --urls': 'show urls of files in dataset', 38 | }, 39 | } 40 | 41 | 42 | def ls_command( 43 | dataset: str | None, 44 | hashes: bool, 45 | urls: bool, 46 | portal_root: str | None, 47 | ) -> None: 48 | 49 | if dataset is None: 50 | 51 | toolstr.print( 52 | 'fetching global manifest...', style=pdp.styles['comment'], end='\r' 53 | ) 54 | global_manifest = pdp.get_global_manifest() 55 | sys.stdout.write("\033[K") 56 | toolstr.print('Datasets', style=pdp.styles['title']) 57 | for dataset_name, dataset_manifest in global_manifest[ 58 | 'datasets' 59 | ].items(): 60 | toolstr.print_bullet( 61 | key=dataset_name, 62 | value=dataset_manifest['description'], 63 | styles=pdp.styles, 64 | ) 65 | print() 66 | toolstr.print( 67 | '(use ' 68 | + toolstr.add_style( 69 | 'pdp dataset ', pdp.styles['description'] 70 | ) 71 | + ' for info or ' 72 | + toolstr.add_style('pdp ls ', pdp.styles['description']) 73 | + ' for file list)', 74 | style=pdp.styles['comment'], 75 | ) 76 | 77 | else: 78 | 79 | # get dataset manifest 80 | manifest = pdp.get_dataset_manifest(dataset=dataset) 81 | 82 | # get id of each file, either filename or url 83 | if urls: 
84 | file_ids = pdp.get_dataset_file_urls( 85 | dataset=dataset, 86 | portal_root=portal_root, 87 | manifest=manifest, 88 | ) 89 | else: 90 | file_ids = [file['name'] for file in manifest['files']] 91 | 92 | # print either with or without hashes 93 | if hashes: 94 | rows = [ 95 | [file_id, file['hash']] 96 | for file_id, file in zip(file_ids, manifest['files']) 97 | ] 98 | toolstr.print_table(rows, compact=True) 99 | else: 100 | for file_id in file_ids: 101 | print(file_id) 102 | 103 | -------------------------------------------------------------------------------- /pdp/cli/commands/package_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | import toolcli 6 | 7 | import pdp 8 | 9 | 10 | def get_command_spec() -> toolcli.CommandSpec: 11 | return { 12 | 'f': package_command, 13 | 'help': 'package a dataset manifest or a global manifest', 14 | 'args': [ 15 | { 16 | 'name': 'directory', 17 | 'help': 'data directory to use for manifest', 18 | 'nargs': '?', 19 | }, 20 | { 21 | 'name': '--global', 22 | 'help': 'create a global manifest instead of a dataset manifest', 23 | 'action': 'store_true', 24 | 'dest': 'global_manifest', 25 | }, 26 | { 27 | 'name': '--output-path', 28 | 'help': 'output path of manifest', 29 | }, 30 | { 31 | 'name': '--confirm', 32 | 'help': 'confirm overwriting ', 33 | 'action': 'store_true', 34 | }, 35 | { 36 | 'name': '--reuse-hashes', 37 | 'help': '', 38 | 'action': 'store_true', 39 | }, 40 | ], 41 | 'examples': [ 42 | '', 43 | 'path/to/some/dataset', 44 | '--global', 45 | '--confirm', 46 | ], 47 | } 48 | 49 | 50 | def package_command( 51 | *, 52 | global_manifest: bool, 53 | directory: str | None, 54 | output_path: str | bool | None, 55 | confirm: bool, 56 | reuse_hashes: bool, 57 | ) -> None: 58 | 59 | if directory is None: 60 | directory = '.' 
61 | directory = os.path.expanduser(directory) 62 | if output_path is None: 63 | output_path = True 64 | if isinstance(output_path, str): 65 | output_path = os.path.expanduser(output_path) 66 | 67 | if global_manifest: 68 | 69 | pdp.create_global_manifest( 70 | data_root=directory, 71 | version=pdp.global_version, 72 | output_path=output_path, 73 | confirm=confirm, 74 | ) 75 | 76 | else: 77 | 78 | # check no subdatasets contained 79 | for item in os.listdir(directory): 80 | subpath = os.path.join(directory, item) 81 | if os.path.isdir( 82 | subpath 83 | ) and pdp.dataset_manifest_filename in os.listdir(subpath): 84 | raise Exception('use --global to package a global manifest') 85 | 86 | dataset_manifest = pdp.create_dataset_manifest( 87 | dataset_dir=directory, 88 | output_path=output_path, 89 | confirm=confirm, 90 | reuse_hashes=reuse_hashes, 91 | ) 92 | 93 | # get readme path 94 | if isinstance(output_path, bool): 95 | readme_path: str | bool = True 96 | elif isinstance(output_path, str): 97 | readme_path = os.path.join( 98 | os.path.dirname(output_path), 99 | pdp.dataset_readme_filename, 100 | ) 101 | else: 102 | raise Exception('unknown output_path type: ' + str(output_path)) 103 | 104 | # create readme 105 | pdp.create_dataset_readme( 106 | dataset_manifest=dataset_manifest, 107 | output_path=readme_path, 108 | confirm=confirm, 109 | ) 110 | 111 | -------------------------------------------------------------------------------- /pdp/cli/commands/root_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import toolcli 4 | 5 | 6 | def get_command_spec() -> toolcli.CommandSpec: 7 | return { 8 | 'f': root_command, 9 | 'help': 'display help message', 10 | 'hidden': True, 11 | 'extra_data': ['parse_spec'], 12 | } 13 | 14 | 15 | def root_command(parse_spec: toolcli.ParseSpec) -> None: 16 | toolcli.command_utils.execution.execute_other_command_sequence( 17 | ('help',), 18 | args={'parse_spec': parse_spec}, 19 | parse_spec=parse_spec, 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /pdp/cli/commands/update_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | import toolcli 6 | 7 | import pdp 8 | 9 | 10 | def get_command_spec() -> toolcli.CommandSpec: 11 | return { 12 | 'f': update_command, 13 | 'help': 'update a dataset or datasets to their latest versions', 14 | 'hidden': True, 15 | 'args': [ 16 | { 17 | 'name': 'datasets', 18 | 'help': 'space-separated list of datasets to update', 19 | 'nargs': '+', 20 | }, 21 | { 22 | 'name': ['-a', '--all'], 23 | 'help': 'update all datasets', 24 | 'dest': 'all_datasets', 25 | 'action': 'store_true', 26 | }, 27 | { 28 | 'name': ['-m', '--method'], 29 | 'help': 'method used for syncing ("download" or "collect")', 30 | }, 31 | ], 32 | 'examples': [ 33 | 'ethereum_contracts', 34 | 'ethereum_contracts ethereum_slots', 35 | '--all', 36 | ], 37 | } 38 | 39 | 40 | def update_command( 41 | datasets: typing.Sequence[str], 42 | all_datasets: bool, 43 | method: typing.Literal['download', 'collect'], 44 | ) -> None: 45 | if all_datasets: 46 | datasets = pdp.get_local_datasets() 47 | 48 | for dataset in datasets: 49 | pdp.update(dataset=dataset, method=method) 50 | 51 | -------------------------------------------------------------------------------- /pdp/cli/commands/upload_command.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import os 5 | 6 | import toolcli 7 | 8 | import pdp 9 | 10 | 11 | help_message = """upload files to bucket 12 | 13 | If local-path not specified, upload current directory 14 | 15 | If bucket-path not specified, upload relative to global manifest location""" 16 | 17 | 18 | def get_command_spec() -> toolcli.CommandSpec: 19 | return { 20 | 'f': upload_command, 21 | 'help': help_message, 22 | 'args': [ 23 | { 24 | 'name': 'local-path', 25 | 'help': 'local path to upload from', 26 | 'nargs': '?', 27 | }, 28 | { 29 | 'name': 'bucket-path', 30 | 'help': 'bucket path to upload to', 31 | 'nargs': '?', 32 | }, 33 | { 34 | 'name': '--all', 35 | 'help': 'upload all files in directory instead of just manifest files', 36 | 'action': 'store_true', 37 | 'dest': 'all_files', 38 | }, 39 | ], 40 | 'examples': { 41 | '': 'upload current directory', 42 | '/path/to/some/dir': 'upload some other directory', 43 | }, 44 | 'hidden': True, 45 | } 46 | 47 | 48 | def upload_command( 49 | *, local_path: str, bucket_path: str, all_files: bool 50 | ) -> None: 51 | 52 | if local_path is None: 53 | local_path = '.' 54 | local_path = os.path.abspath(os.path.expanduser(local_path)) 55 | if os.path.isdir(local_path): 56 | local_dir = local_path 57 | elif os.path.isfile(local_path): 58 | local_dir = os.path.dirname(local_path) 59 | else: 60 | raise Exception() 61 | 62 | # if local_path is a dataset directory, upload according to manifest 63 | dir_files = None 64 | if os.path.isdir(local_path): 65 | dir_contents = os.listdir(local_path) 66 | 67 | if all_files: 68 | dir_files = None 69 | 70 | elif pdp.global_manifest_filename in dir_contents: 71 | print('uploading global manifest:', pdp.global_manifest_filename) 72 | dir_files = [pdp.global_manifest_filename] 73 | 74 | elif pdp.dataset_manifest_filename in dir_contents: 75 | print( 76 | 'uploading files in dataset manifest:', 77 | pdp.dataset_manifest_filename, 78 | ) 79 | manifest_path = os.path.join( 80 | local_path, pdp.dataset_manifest_filename 81 | ) 82 | with open(manifest_path) as f: 83 | dataset_manifest = json.load(f) 84 | dir_files = [pdp.dataset_manifest_filename] 85 | for file in dataset_manifest['files']: 86 | dir_files.append(file['name']) 87 | 88 | else: 89 | raise Exception( 90 | 'no manifest file found, use --all to upload all files' 91 | ) 92 | 93 | if bucket_path is None: 94 | 95 | if pdp.global_manifest_filename in os.listdir(local_dir): 96 | # assume local_dir is global root, 97 | bucket_path = pdp.bucket_root_path 98 | 99 | else: 100 | 101 | # find root dir 102 | data_root = local_dir 103 | while pdp.global_manifest_filename not in os.listdir(data_root): 104 | next_data_root = os.path.dirname(data_root) 105 | if next_data_root == data_root: 106 | raise Exception( 107 | 'could not find global data root, must specify bucket-path manually' 108 | ) 109 | data_root = next_data_root 110 | 111 | # get path relative to root dir 112 | local_relpath = os.path.relpath(local_path, data_root) 113 | bucket_path = os.path.join(pdp.bucket_root_path, local_relpath) 114 | 115 | if os.path.isfile(local_path): 116 | pdp.upload_file( 117 | local_path=local_path, 118 | bucket_path=bucket_path, 119 | ) 120 | else: 121 | pdp.upload_directory( 122 | local_path=local_path, 123 | dir_files=dir_files, 124 | bucket_path=bucket_path, 125 | ) 126 | 127 | -------------------------------------------------------------------------------- 
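The CLI commands above are thin wrappers around functions exported at the top level of the `pdp` package (for example, download_command calls pdp.download_dataset and ls_command calls pdp.get_global_manifest). The short sketch below is not a file from this repository; it is a minimal illustration, under those assumptions, of driving the same list/download/validate flow directly from Python. The dataset name and output directory are placeholder values.

# illustrative sketch only, not part of the repository
import pdp

# list the datasets described by the remote global manifest
global_manifest = pdp.get_global_manifest()
for dataset_name, dataset_manifest in global_manifest['datasets'].items():
    print(dataset_name, '-', dataset_manifest['description'])

# download one dataset into a local directory (placeholder name and path)
dataset = 'ethereum_contracts'
output_dir = '/tmp/ethereum_contracts'
pdp.download_dataset(dataset=dataset, output_dir=output_dir)

# check the downloaded files against the dataset manifest (md5 hashes included)
files_ok = pdp.validate_dataset_directory(output_dir)
print('all files valid:', files_ok)

--------------------------------------------------------------------------------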
/pdp/cli/commands/validate_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | import toolcli 6 | 7 | import pdp 8 | 9 | 10 | def get_command_spec() -> toolcli.CommandSpec: 11 | return { 12 | 'f': validate_command, 13 | 'help': 'validate files in dataset manifest', 14 | 'args': [ 15 | { 16 | 'name': 'dataset_directory', 17 | 'help': 'dataset directory, default is current directory', 18 | 'nargs': '?', 19 | }, 20 | {'name': '--no-hashes', 'help': 'skip hashing each file', 'action': 'store_true'}, 21 | ], 22 | 'examples': { 23 | '': 'validate dataset in current directory', 24 | '/path/to/some/dir': 'validate dataset in some other directory', 25 | }, 26 | } 27 | 28 | 29 | def validate_command(dataset_directory: str | None, no_hashes: bool) -> None: 30 | if dataset_directory is None: 31 | path = '.' 32 | else: 33 | path = dataset_directory 34 | path = os.path.abspath(os.path.expanduser(path)) 35 | 36 | pdp.validate_dataset_directory(path, no_hashes=no_hashes) 37 | 38 | -------------------------------------------------------------------------------- /pdp/config_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import typing 5 | 6 | from . import data_utils 7 | from . import spec 8 | 9 | 10 | @typing.overload 11 | def get_data_root(*, require: typing.Literal[True]) -> str: 12 | ... 13 | 14 | 15 | @typing.overload 16 | def get_data_root(*, require: bool) -> str | None: 17 | ... 18 | 19 | 20 | @typing.overload 21 | def get_data_root() -> str: 22 | ... 23 | 24 | 25 | def get_data_root(*, require: bool = True) -> str | None: 26 | data_root = os.environ.get('PDP_DATA_ROOT') 27 | if (data_root is None or data_root == '') and require: 28 | raise Exception('PDP_DATA_ROOT not set') 29 | return data_root 30 | 31 | 32 | def get_dataset_glob( 33 | dataset: str | None = None, 34 | table: str | None = None, 35 | *, 36 | network: str | int | None = None, 37 | datatype: str | None = None, 38 | ) -> str: 39 | dataset_path = get_dataset_local_path( 40 | dataset=dataset, 41 | network=network, 42 | datatype=datatype, 43 | ) 44 | 45 | if table is None: 46 | filename = '*.parquet' 47 | else: 48 | filename = table + '_*.parquet' 49 | 50 | return os.path.join(dataset_path, filename) 51 | 52 | 53 | def get_dataset_local_path( 54 | dataset: str | None = None, 55 | *, 56 | network: str | int | None = None, 57 | datatype: str | None = None, 58 | ) -> str: 59 | if dataset is None: 60 | if network is None or datatype is None: 61 | raise Exception( 62 | 'must specify datatype and network to get dataset name' 63 | ) 64 | dataset = data_utils.get_dataset_name( 65 | datatype=datatype, network=network 66 | ) 67 | 68 | return os.path.join(get_data_root(), dataset) 69 | 70 | 71 | def get_local_datasets( 72 | data_root: str | None = None, 73 | ) -> typing.Sequence[str]: 74 | if data_root is None: 75 | data_root = get_data_root(require=True) 76 | 77 | data_dirs = [] 78 | for subdir in os.listdir(data_root): 79 | subpath = os.path.join(data_root, subdir) 80 | if os.path.isdir(subpath): 81 | manifest_path = os.path.join( 82 | subpath, spec.dataset_manifest_filename 83 | ) 84 | if os.path.isfile(manifest_path): 85 | data_dirs.append(subdir) 86 | 87 | return data_dirs 88 | 89 | -------------------------------------------------------------------------------- /pdp/data_utils/__init__.py: 
-------------------------------------------------------------------------------- 1 | 2 | from .collect_utils import * 3 | from .download_utils import * 4 | from .file_utils import * 5 | from .job_utils import * 6 | from .manifest_utils import * 7 | from .query_utils import * 8 | from .readme_utils import * 9 | from .schema_utils import * 10 | from .update_utils import * 11 | 12 | -------------------------------------------------------------------------------- /pdp/data_utils/collect_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | if typing.TYPE_CHECKING: 6 | import types 7 | 8 | 9 | def ensure_ctc() -> types.ModuleType: 10 | try: 11 | import ctc 12 | 13 | return ctc 14 | except ImportError: 15 | raise Exception('must install ctc to use this functionality') 16 | 17 | -------------------------------------------------------------------------------- /pdp/data_utils/download_utils.py: -------------------------------------------------------------------------------- 1 | """functions for downloading datasets""" 2 | 3 | from __future__ import annotations 4 | 5 | import os 6 | import typing 7 | 8 | from .. import spec 9 | from . import file_utils 10 | from . import manifest_utils 11 | from . import schema_utils 12 | 13 | 14 | def download_dataset( 15 | dataset: str, 16 | *, 17 | output_dir: str, 18 | portal_root: str | None = None, 19 | skip_existing: bool = True, 20 | ) -> None: 21 | """download files of a dataset""" 22 | 23 | print('Downloading dataset:', dataset) 24 | 25 | urls = get_dataset_file_urls(dataset, portal_root=portal_root) 26 | 27 | base_url = os.path.dirname(urls[0]) 28 | readme_url = os.path.join(base_url, 'README.md') 29 | manifest_url = os.path.join(base_url, 'dataset_manifest.json') 30 | file_utils.download_files( 31 | urls=[readme_url, manifest_url], 32 | output_dir=output_dir, 33 | skip_existing=skip_existing, 34 | ) 35 | 36 | # download files 37 | file_utils.download_files( 38 | urls=urls, 39 | output_dir=output_dir, 40 | skip_existing=skip_existing, 41 | ) 42 | 43 | 44 | def get_dataset_file_urls( 45 | dataset: str, 46 | *, 47 | portal_root: str | None, 48 | manifest: spec.DatasetManifest | None = None, 49 | ) -> typing.Sequence[str]: 50 | """get file urls of a dataset""" 51 | 52 | parsed = schema_utils.parse_dataset_name(dataset) 53 | 54 | if portal_root is None: 55 | portal_root = spec.portal_root 56 | if manifest is None: 57 | manifest = manifest_utils.get_dataset_manifest( 58 | dataset=dataset, portal_root=portal_root 59 | ) 60 | urls = [] 61 | for file in manifest['files']: 62 | url = spec.urls['dataset_file'].format( 63 | portal_root=portal_root, 64 | datatype=parsed['datatype'], 65 | network=parsed['network'], 66 | filename=file['name'], 67 | ) 68 | urls.append(url) 69 | return urls 70 | 71 | 72 | def get_dataset_file_url( 73 | datatype: str, 74 | network: str, 75 | filename: str, 76 | portal_root: str | None = None, 77 | ) -> str: 78 | if portal_root is None: 79 | portal_root = spec.portal_root 80 | return spec.urls['dataset_file'].format( 81 | portal_root=portal_root, 82 | datatype=datatype, 83 | network=network, 84 | filename=filename, 85 | ) 86 | 87 | 88 | def validate_dataset_directory(path: str, *, no_hashes: bool = False) -> bool: 89 | """validate the files in a dataset directory""" 90 | 91 | import json 92 | 93 | # load manifest 94 | manifest_path = os.path.join(path, spec.dataset_manifest_filename) 95 | if not os.path.isfile(manifest_path): 96 | 
raise Exception( 97 | 'no ' + spec.dataset_manifest_filename + ' found in directory' 98 | ) 99 | with open(manifest_path, 'r') as f: 100 | manifest = json.load(f) 101 | manifest_files = {file['name'] for file in manifest['files']} 102 | 103 | print('validating data of', manifest['name'], 'in', path) 104 | 105 | # gather files present 106 | present_files = set(os.listdir(path)) 107 | 108 | # check for missing files 109 | missing_files = manifest_files - present_files 110 | 111 | # check for extra files 112 | extra_files = (present_files - manifest_files) - { 113 | spec.dataset_manifest_filename 114 | } 115 | 116 | # check file hashes 117 | if no_hashes: 118 | bad_hashes = None 119 | else: 120 | bad_hashes = [] 121 | for file in manifest['files']: 122 | if file['name'] in present_files: 123 | hash = file_utils.get_file_hash( 124 | os.path.join(path, file['name']) 125 | ) 126 | target_hash = file['hash'] 127 | if hash != target_hash: 128 | bad_hashes.append(file['name']) 129 | 130 | # print errors 131 | print() 132 | errors_found = 0 133 | skipping = False 134 | for error_name, errors in { 135 | 'missing files': missing_files, 136 | 'extra files': extra_files, 137 | 'bad hashes': bad_hashes, 138 | }.items(): 139 | if errors is None: 140 | print('SKIPPED checking for ' + error_name) 141 | skipping = True 142 | continue 143 | if len(errors) > 0: 144 | errors_found += len(errors) 145 | print(error_name + ':') 146 | for file in sorted(errors)[:10]: 147 | print('-', file) 148 | if len(errors) > 10: 149 | print('- ...') 150 | 151 | # print summary 152 | if errors_found > 0 or skipping: 153 | print() 154 | print( 155 | len(missing_files), 156 | 'missing files,', 157 | len(extra_files), 158 | 'extra files, and', 159 | len(bad_hashes) if bad_hashes is not None else 0, 160 | 'bad hashes', 161 | ) 162 | 163 | return not errors_found 164 | 165 | -------------------------------------------------------------------------------- /pdp/data_utils/file_utils.py: -------------------------------------------------------------------------------- 1 | """functions for generic file operations""" 2 | 3 | from __future__ import annotations 4 | 5 | import os 6 | import typing 7 | 8 | 9 | def download_files( 10 | urls: typing.Sequence[str], 11 | *, 12 | output_dir: str, 13 | skip_existing: bool = True, 14 | ) -> None: 15 | """download a list of files""" 16 | 17 | # get output dir 18 | if output_dir is None: 19 | output_dir = '.' 
20 |     output_dir = os.path.abspath(os.path.expanduser(output_dir))
21 |     os.makedirs(output_dir, exist_ok=True)  # ensure output dir exists before listing or downloading
22 |     print('downloading', len(urls), 'files')
23 |     print()
24 |     print('using output_dir', output_dir)
25 | 
26 |     # skip existing files
27 |     if skip_existing:
28 |         url_filenames = [os.path.basename(url) for url in urls]
29 |         skip_urls = set()
30 |         for url, filename in zip(urls, url_filenames):
31 |             if filename in os.listdir(output_dir):
32 |                 skip_urls.add(url)
33 |         if len(skip_urls) > 0:
34 |             print()
35 |             print('skipping', len(skip_urls), 'files that already exist')
36 |     else:
37 |         skip_urls = set()
38 | 
39 |     # download each remaining file into output_dir
40 |     for url in urls:
41 |         if url not in skip_urls:
42 |             download_file(url, os.path.join(output_dir, os.path.basename(url)))
43 | 
44 |     print()
45 |     print('done')
46 | 
47 | 
48 | def download_file(url: str, output_path: str | None = None) -> None:
49 |     """download a file"""
50 |     import subprocess
51 | 
52 |     print()
53 |     print('downloading', url)
54 |     if output_path is None:
55 |         output_path = os.path.basename(url)
56 |     os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
57 |     subprocess.call(['curl', url, '--output', output_path])
58 | 
59 | 
60 | def get_file_hash(path: str) -> str:
61 |     """get hash of file"""
62 | 
63 |     import hashlib
64 | 
65 |     with open(path, 'rb') as f:
66 |         hashed = hashlib.md5(f.read())
67 | 
68 |     return hashed.hexdigest()
69 | 
70 | 
71 | def get_file_hashes(paths: typing.Sequence[str]) -> typing.Sequence[str]:
72 |     """get hashes of multiple files"""
73 | 
74 |     return [get_file_hash(path) for path in paths]
75 | 
76 | 
77 | def upload_file(local_path: str, bucket_path: str) -> None:
78 |     """upload single file to s3 bucket"""
79 | 
80 |     import subprocess
81 | 
82 |     command = [
83 |         'rclone',
84 |         'copyto',
85 |         local_path,
86 |         'paradigm-data-portal:' + bucket_path,
87 |         '-v',
88 |     ]
89 | 
90 |     subprocess.call(command)
91 | 
92 | 
93 | def upload_directory(
94 |     local_path: str,
95 |     bucket_path: str,
96 |     *,
97 |     dir_files: typing.Sequence[str] | None,
98 |     remove_deleted_files: bool = False,
99 | ) -> None:
100 |     """upload nested directory of files to s3 bucket"""
101 | 
102 |     import subprocess
103 | 
104 |     print('uploading directory:', local_path)
105 |     print('to bucket path:', bucket_path)
106 |     print()
107 | 
108 |     if remove_deleted_files:
109 |         action = 'sync'
110 |     else:
111 |         action = 'copy'
112 | 
113 |     command = [
114 |         'rclone',
115 |         action,
116 |         local_path,
117 |         'paradigm-data-portal:' + bucket_path,
118 |         '-v',
119 |     ]
120 | 
121 |     if dir_files is not None:
122 |         # create tempfile with list of files to upload
123 |         import tempfile
124 | 
125 |         temp_dir = tempfile.mkdtemp()
126 |         temp_path = os.path.join(temp_dir, 'file_list.txt')
127 |         with open(temp_path, 'w') as f:
128 |             f.write('\n'.join(dir_files))
129 |         command.extend(['--files-from', temp_path])
130 | 
131 |     else:
132 |         command.extend(['--exclude', '.*'])  # pass the glob unquoted so rclone sees the bare pattern
133 | 
134 |     subprocess.call(command)
135 | 
136 | 
--------------------------------------------------------------------------------
/pdp/data_utils/job_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import typing
4 | 
5 | import tooljob
6 | 
7 | if typing.TYPE_CHECKING:
8 |     import ctc.spec
9 |     import polars as pl
10 | 
11 | 
12 | class BlockChunkJobs(tooljob.Batch):
13 |     """create jobs by splitting a block interval into chunks, one job for each chunk"""
14 | 
15 |     start_block: int
16 |     end_block: int
17 |     chunk_size: int
18 |     context: ctc.spec.Context | None = None
19 |     tracker: tooljob.trackers.file_tracker.FileTracker | 
tooljob.trackers.multifile_tracker.MultifileTracker 20 | 21 | def __init__( 22 | self, 23 | start_block: int, 24 | end_block: int, 25 | chunk_size: int, 26 | context: ctc.spec.Context | None = None, 27 | **kwargs: typing.Any, 28 | ) -> None: 29 | if end_block < start_block: 30 | raise Exception('start_block must be less than end_block') 31 | self.start_block = start_block 32 | self.end_block = end_block 33 | self.chunk_size = chunk_size 34 | self.context = context 35 | super().__init__(**kwargs) 36 | 37 | # 38 | # # jobs 39 | # 40 | 41 | def get_n_jobs(self) -> int: 42 | import math 43 | 44 | n_blocks = self.end_block - self.start_block + 1 45 | return math.floor(n_blocks / self.chunk_size) 46 | 47 | def get_job_data(self, i: int) -> tooljob.JobData: 48 | n_jobs = self.get_n_jobs() 49 | if i < 0 or i >= n_jobs: 50 | raise Exception('job index too high, max is ' + str(n_jobs - 1)) 51 | 52 | start_block = i * self.chunk_size + self.start_block 53 | end_block = (i + 1) * self.chunk_size - 1 + self.start_block 54 | if end_block > self.end_block: 55 | end_block = self.end_block 56 | 57 | return {'start_block': start_block, 'end_block': end_block} 58 | 59 | # 60 | # # names 61 | # 62 | 63 | def get_job_name( 64 | self, 65 | i: int | None = None, 66 | *, 67 | job_data: tooljob.JobData | None = None, 68 | parameters: typing.Mapping[str, str] | None = None, 69 | ) -> str: 70 | from tooljob.trackers import multifile_tracker 71 | 72 | # get job data 73 | if job_data is None: 74 | if i is None: 75 | raise Exception('must specify job_data or i') 76 | job_data = self.get_job_data(i) 77 | 78 | # get output_name 79 | if parameters is not None: 80 | output_name = parameters.get('output_name') 81 | else: 82 | output_name = None 83 | 84 | # create name 85 | if ( 86 | isinstance(self.tracker, multifile_tracker.MultifileTracker) 87 | and output_name is not None 88 | and parameters is not None 89 | and parameters.get('output_name') is not None 90 | ): 91 | # get ctc 92 | from . 
import collect_utils 93 | 94 | collect_utils.ensure_ctc() 95 | import ctc 96 | import ctc.config 97 | 98 | # handle parameters for multi-output job name 99 | network: str = ctc.config.get_context_network_name(self.context) 100 | block_range = self.get_block_range_str(i) 101 | return network + '_' + output_name + '__' + block_range 102 | 103 | else: 104 | # use vanilla job name 105 | return self.get_job_list_name() + '__' + self.get_block_range_str(i) 106 | 107 | def parse_job_name(self, name: str) -> typing.Mapping[str, typing.Any]: 108 | block_range = name.split('__')[-1] 109 | start_str, end_str = block_range.split('_to_') 110 | return {'start_block': int(start_str), 'end_block': int(end_str)} 111 | 112 | def get_block_range_str( 113 | self, 114 | i: int | None = None, 115 | *, 116 | start_block: int | None = None, 117 | end_block: int | None = None, 118 | ) -> str: 119 | if i is not None and (start_block is not None or end_block is not None): 120 | raise Exception('specify either job or start_block and end_block') 121 | elif i is not None: 122 | job = self.get_job_data(i) 123 | start = job['start_block'] 124 | end = job['end_block'] 125 | elif start_block is not None and end_block is not None: 126 | start = start_block 127 | end = end_block 128 | else: 129 | raise Exception('specify either job or start_block and end_block') 130 | 131 | return '{start_block:08d}_to_{end_block:08d}'.format( 132 | start_block=start, 133 | end_block=end, 134 | ) 135 | 136 | # 137 | # # summary 138 | # 139 | 140 | def get_attribute_list(self) -> typing.Sequence[str]: 141 | attributes = super().get_attribute_list() 142 | attributes = list(attributes) 143 | end_block_index = attributes.index('end_block') 144 | attributes.insert(end_block_index + 1, 'n_blocks') 145 | return attributes 146 | 147 | def get_formatted_attribute(self, key: str) -> str | None: 148 | import toolstr 149 | 150 | if key == 'end_block': 151 | return ' ' + str(self.end_block) 152 | elif key == 'n_blocks': 153 | return toolstr.format(self.end_block - self.start_block) 154 | elif key == 'context': 155 | return None 156 | else: 157 | return super().get_formatted_attribute(key) 158 | 159 | def print_additional_conclusion( 160 | self, 161 | start_time: int | float, 162 | end_time: int | float, 163 | jobs: typing.Sequence[int], 164 | ) -> None: 165 | import toolstr 166 | 167 | duration = end_time - start_time 168 | n_blocks = len(jobs) * self.chunk_size 169 | bps = n_blocks / duration 170 | self.print_bullet(key='blocks covered', value=toolstr.format(n_blocks)) 171 | toolstr.print_bullet( 172 | key='blocks per second', 173 | value=toolstr.format(bps, decimals=2), 174 | ) 175 | toolstr.print_bullet( 176 | key='blocks per minute', 177 | value=toolstr.format(bps * 60, decimals=2), 178 | ) 179 | toolstr.print_bullet( 180 | key='blocks per hour', 181 | value=toolstr.format(bps * 60 * 60, decimals=2), 182 | ) 183 | toolstr.print_bullet( 184 | key='blocks per day', 185 | value=toolstr.format(bps * 86400, decimals=2), 186 | ) 187 | 188 | def summarize_blocks_per_second( 189 | self, sample_time: int = 60 190 | ) -> pl.DataFrame: 191 | import polars as pl 192 | 193 | jobs_per_second = self.summarize_jobs_per_second( 194 | sample_time=sample_time 195 | ) 196 | start_blocks = [ 197 | self.get_job_data(i)['start_block'] 198 | for i in range(self.get_n_jobs()) 199 | ] 200 | columns: typing.Sequence[pl.type_aliases.IntoExpr] = [ 201 | pl.Series(start_blocks).alias('start_block'), 202 | (pl.col('jobs_per_second') * self.chunk_size).alias( 203 | 
'blocks_per_second' 204 | ), 205 | ] 206 | return jobs_per_second.with_columns(columns) 207 | 208 | def plot_blocks_per_second(self, sample_time: int = 60) -> None: 209 | import matplotlib.pyplot as plt # type: ignore 210 | import toolplot 211 | 212 | df = self.summarize_blocks_per_second(sample_time=sample_time) 213 | 214 | plt.plot(df['start_block'], df['blocks_per_second']) 215 | toolplot.add_tick_grid() 216 | toolplot.format_yticks() 217 | toolplot.format_xticks() 218 | plt.ylabel('blocks per second') 219 | plt.title('Tracing speed at different points in history') 220 | plt.xlabel('block number') 221 | 222 | -------------------------------------------------------------------------------- /pdp/data_utils/manifest_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import typing 5 | 6 | from .. import config_utils 7 | from .. import spec 8 | from . import file_utils 9 | from . import schema_utils 10 | 11 | if typing.TYPE_CHECKING: 12 | import toolsql 13 | 14 | 15 | def get_global_manifest( 16 | *, 17 | portal_root: str | None = None, 18 | source: typing.Literal['remote', 'local'] = 'remote', 19 | ) -> spec.GlobalManifest: 20 | """get global manifest of all datasets""" 21 | 22 | if source == 'remote': 23 | 24 | import requests 25 | 26 | # build url 27 | if portal_root is None: 28 | portal_root = spec.portal_root 29 | url = spec.urls['global_manifest'].format(portal_root=portal_root) 30 | 31 | # get manifest 32 | response = requests.get(url) 33 | if response.status_code != 200: 34 | raise Exception('could not obtain global manifest at url: ' + str(url)) 35 | manifest: spec.GlobalManifest = response.json() 36 | 37 | return manifest 38 | 39 | elif source == 'local': 40 | import json 41 | 42 | if portal_root is not None: 43 | data_root = portal_root 44 | else: 45 | data_root = config_utils.get_data_root(require=True) 46 | path = os.path.join(data_root, spec.global_manifest_filename) 47 | with open(path, 'r') as f: 48 | result: spec.GlobalManifest = json.load(f) 49 | return result 50 | 51 | else: 52 | raise Exception('invalid source: ' + str(source)) 53 | 54 | 55 | def get_dataset_manifest( 56 | dataset: str, 57 | *, 58 | portal_root: str | None = None, 59 | source: typing.Literal['remote', 'local'] = 'remote', 60 | ) -> spec.DatasetManifest: 61 | """get manifest of a particular dataset""" 62 | 63 | if source == 'remote': 64 | import requests 65 | 66 | parsed = schema_utils.parse_dataset_name(dataset) 67 | 68 | # build url 69 | if portal_root is None: 70 | portal_root = spec.portal_root 71 | url = spec.urls['dataset_manifest'].format( 72 | portal_root=portal_root, 73 | datatype=parsed['datatype'], 74 | network=parsed['network'], 75 | ) 76 | 77 | # get manifest 78 | response = requests.get(url) 79 | if response.status_code != 200: 80 | raise Exception( 81 | 'could not obtain dataset manifest at url: ' + str(url) 82 | ) 83 | manifest: spec.DatasetManifest = response.json() 84 | 85 | return manifest 86 | 87 | elif source == 'local': 88 | import json 89 | 90 | if portal_root is not None: 91 | data_root = portal_root 92 | else: 93 | data_root = config_utils.get_data_root(require=True) 94 | path = os.path.join(data_root, dataset, spec.dataset_manifest_filename) 95 | with open(path, 'r') as f: 96 | result: spec.DatasetManifest = json.load(f) 97 | return result 98 | 99 | else: 100 | raise Exception('invalid source: ' + str(source)) 101 | 102 | 103 | def create_global_manifest( 104 | *, 105 | data_root: 
str | None = None, 106 | datasets: typing.Mapping[str, spec.DatasetManifestSlim] | None = None, 107 | version: str | None, 108 | output_path: str | bool | None = None, 109 | confirm: bool = False, 110 | ) -> spec.GlobalManifest: 111 | """create global manifest describing all datasets""" 112 | 113 | import json 114 | 115 | # versions 116 | if version is None: 117 | version = spec.global_version 118 | 119 | print('creating global manifest', version) 120 | print() 121 | 122 | # gather dataset manifests 123 | if datasets is None: 124 | if data_root is None: 125 | raise Exception('must specify data_root or datasets') 126 | found_datasets: typing.MutableMapping[ 127 | str, spec.DatasetManifestSlim 128 | ] = {} 129 | for item in os.listdir(data_root): 130 | path = os.path.join(data_root, item) 131 | if os.path.isdir(path): 132 | if spec.dataset_manifest_filename in os.listdir(path): 133 | manifest_path = os.path.join( 134 | path, spec.dataset_manifest_filename 135 | ) 136 | with open(manifest_path) as f: 137 | dataset_manifest = json.load(f) 138 | name = dataset_manifest['name'] 139 | found_datasets[name] = _reduce_dataset_manifest( 140 | dataset_manifest 141 | ) 142 | print('gathered:', item) 143 | else: 144 | print('no dataset manifest found for: ' + item) 145 | datasets = found_datasets 146 | 147 | # create global manifest 148 | global_manifest: spec.GlobalManifest = { 149 | 'version': version, 150 | 'datasets': datasets, 151 | } 152 | 153 | if len(datasets) == 0: 154 | print('no datasets detected for global manifest') 155 | return global_manifest 156 | 157 | # write output file 158 | if output_path is not None and output_path: 159 | import shutil 160 | 161 | if isinstance(output_path, bool): 162 | if data_root is not None: 163 | output_dir = data_root 164 | else: 165 | output_dir = '.' 
166 | output_path = os.path.join( 167 | output_dir, spec.global_manifest_filename 168 | ) 169 | 170 | if os.path.exists(output_path) and not confirm: 171 | raise Exception('use --confirm to overwrite existing file') 172 | 173 | with open(output_path + '_tmp', 'w') as f: 174 | json.dump(global_manifest, f, indent=4, sort_keys=True) 175 | shutil.move(output_path + '_tmp', output_path) 176 | if data_root is not None: 177 | print_dir = os.path.relpath(output_path, data_root) 178 | else: 179 | print_dir = output_path 180 | print() 181 | print('wrote global manifest to:', print_dir) 182 | 183 | return global_manifest 184 | 185 | 186 | def _reduce_dataset_manifest( 187 | manifest: spec.DatasetManifest, 188 | ) -> spec.DatasetManifestSlim: 189 | return { 190 | 'name': manifest['name'], 191 | 'version': manifest['version'], 192 | 'description': manifest['description'], 193 | 'datatype': manifest['datatype'], 194 | 'network': manifest['network'], 195 | 'n_files': len(manifest['files']), 196 | 'n_bytes': sum(file['n_bytes'] for file in manifest['files']), 197 | 'schema': manifest['schema'], 198 | } 199 | 200 | 201 | def create_dataset_manifest( 202 | *, 203 | dataset_dir: str | None = None, 204 | name: str | None = None, 205 | version: str | None = None, 206 | datatype: str | None = None, 207 | network: str | None = None, 208 | description: str | None = None, 209 | data_root: str | None = None, 210 | schema: toolsql.DBSchemaShorthand | None = None, 211 | paths: typing.Sequence[str] | None = None, 212 | reuse_hashes: bool = False, 213 | output_path: str | bool | None = None, 214 | confirm: bool = False, 215 | ) -> spec.DatasetManifest: 216 | """describe dataset manifest describing dataset contents""" 217 | 218 | import json 219 | 220 | # ensure valid output path 221 | if output_path is not None and output_path: 222 | if isinstance(output_path, bool): 223 | if data_root is not None: 224 | output_dir = data_root 225 | else: 226 | output_dir = '.' 
227 | output_path = os.path.join( 228 | output_dir, spec.dataset_manifest_filename 229 | ) 230 | if os.path.exists(output_path) and not confirm: 231 | raise Exception('use --confirm to overwrite existing file') 232 | 233 | # gather metadata 234 | if dataset_dir is not None: 235 | dataset_dir = os.path.abspath(os.path.expanduser(dataset_dir)) 236 | if name is None: 237 | if dataset_dir is None: 238 | raise Exception('must specify dataset_dir or name') 239 | name = os.path.basename(dataset_dir) 240 | if network is None or datatype is None: 241 | parsed = schema_utils.parse_dataset_name(name) 242 | parsed_network = parsed['network'] 243 | parsed_datatype = parsed['datatype'] 244 | if network is None: 245 | network = parsed_network 246 | elif network != parsed_network: 247 | raise Exception('parsed network does not equal input network') 248 | if datatype is None: 249 | datatype = parsed_datatype 250 | elif datatype != parsed_datatype: 251 | raise Exception('parsed datatype does not equal input datatype') 252 | try: 253 | module = schema_utils._get_datatype_module(datatype) 254 | except Exception: 255 | module = None 256 | if version is None: 257 | if module is not None: 258 | version = module.version 259 | else: 260 | raise Exception('unknown version for dataset') 261 | if description is None: 262 | if module is not None: 263 | description = module.schema['description'] 264 | else: 265 | raise Exception('could not find description for dataset') 266 | if schema is None: 267 | if module is not None: 268 | schema = module.schema 269 | else: 270 | raise Exception('could not find schema for dataset') 271 | schema_normalized = toolsql.normalize_shorthand_db_schema(schema) 272 | print('creating manifest for', name, version) 273 | 274 | # gather files 275 | if paths is None: 276 | if dataset_dir is None: 277 | raise Exception('must specify paths or dataset_dir') 278 | exclude = [ 279 | spec.dataset_manifest_filename, 280 | spec.dataset_readme_filename, 281 | ] + spec.dataset_license_filenames 282 | paths = [ 283 | os.path.join(dataset_dir, filename) 284 | for filename in sorted(os.listdir(dataset_dir)) 285 | if filename not in exclude 286 | ] 287 | 288 | # gather hashes 289 | print('gathering hashes of', len(paths), 'files') 290 | if reuse_hashes: 291 | if output_path is None: 292 | raise Exception('must specify output_path when reuse_hashes=True') 293 | with open(output_path, 'r') as f: 294 | old_data = json.load(f) 295 | old_hashes = {file['name']: file['hash'] for file in old_data['files']} 296 | file_hashes = [] 297 | for path in paths: 298 | if reuse_hashes and os.path.basename(path) in old_hashes: 299 | file_hashes.append(old_hashes[os.path.basename(path)]) 300 | else: 301 | file_hashes.append(file_utils.get_file_hash(path)) 302 | 303 | # assemble files 304 | files = [] 305 | for path, file_hash in zip(paths, file_hashes): 306 | file: spec.FileMetadata = { 307 | 'name': os.path.basename(path), 308 | 'hash': file_hash, 309 | 'n_bytes': os.path.getsize(path), 310 | } 311 | files.append(file) 312 | 313 | # build manifest 314 | manifest: spec.DatasetManifest = { 315 | 'name': name, 316 | 'version': version, 317 | 'description': description or '', 318 | 'datatype': datatype, 319 | 'network': network, 320 | 'files': files, 321 | 'schema': schema_normalized, 322 | } 323 | 324 | # save manifest 325 | if output_path is not None and output_path: 326 | import shutil 327 | 328 | with open(output_path + '_tmp', 'w') as f: 329 | json.dump(manifest, f, indent=4, sort_keys=True) 330 | 331 | 
shutil.move(output_path + '_tmp', output_path) 332 | if data_root is not None: 333 | print_dir = os.path.relpath(output_path, data_root) 334 | else: 335 | print_dir = output_path 336 | print('wrote dataset manifest to:', print_dir) 337 | 338 | return manifest 339 | 340 | -------------------------------------------------------------------------------- /pdp/data_utils/query_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import typing 5 | 6 | from .. import config_utils 7 | from .. import spec 8 | 9 | if typing.TYPE_CHECKING: 10 | import polars as pl 11 | 12 | 13 | def query( 14 | filters: spec.PolarsExpression, 15 | # outputs 16 | columns: spec.PolarsExpression | None = None, 17 | group_by: spec.PolarsExpression | None = None, 18 | output_binary: bool = True, 19 | sort: spec.PolarsExpression | None = None, 20 | descending: bool = False, 21 | unique_sort: spec.PolarsExpression | None = None, 22 | unique_descending: bool = False, 23 | unique_columns: typing.Sequence[str] | None = None, 24 | unique_keep: typing.Literal['last', 'first', 'any'] | None = None, 25 | # inputs 26 | source_path: str | None = None, 27 | dataset: str | None = None, 28 | network: str | int | None = None, 29 | datatype: str | None = None, 30 | table: str | None = None, 31 | scan_kwargs: typing.Any = None, 32 | # outputs 33 | collect: bool = True, 34 | streaming: bool = True, 35 | output_path: str | None = None, 36 | output_kwargs: typing.Any = None, 37 | ) -> pl.DataFrame: 38 | import polars as pl 39 | 40 | # determine data source 41 | if source_path is None: 42 | if network is None: 43 | raise Exception('must specify network (e.g. network=\'ethereum\')') 44 | source_path = config_utils.get_dataset_glob( 45 | network=network, 46 | datatype=datatype, 47 | dataset=dataset, 48 | table=table, 49 | ) 50 | elif os.path.isdir(source_path): 51 | source_path = os.path.join(source_path, '*.parquet') 52 | 53 | # initiate scan 54 | if scan_kwargs is None: 55 | scan_kwargs = {} 56 | lf = pl.scan_parquet(source_path, **scan_kwargs) 57 | 58 | # add filters 59 | if filters is not None: 60 | if isinstance(filters, (list, tuple)): 61 | filters_list = filters 62 | else: 63 | filters_list = [filters] 64 | 65 | if len(filters_list) > 0: 66 | filter = filters_list[0] 67 | for other_filter in filters_list[1:]: 68 | filter &= other_filter 69 | lf = lf.filter(filter) 70 | 71 | # filter unique 72 | if unique_columns is not None: 73 | if unique_keep is None: 74 | raise Exception( 75 | "must specify unique_keep (e.g. 
'first', 'last', or 'any')"
76 |             )
77 | 
78 |         # maintain order if unique_sort equals output sort
79 |         if unique_sort is not None:
80 |             lf = lf.sort(unique_sort, descending=unique_descending)
81 |             already_sorted: bool = _polars_exprs_equal(sort, unique_sort) and (
82 |                 descending == unique_descending
83 |             )
84 |         else:
85 |             already_sorted = False
86 | 
87 |         # keep unique
88 |         lf = lf.unique(
89 |             maintain_order=already_sorted,
90 |             subset=unique_columns,
91 |             keep=unique_keep,
92 |         )
93 |     else:
94 |         already_sorted = False
95 | 
96 |     # group by
97 |     if group_by is not None:
98 |         # group and aggregate
99 |         if columns is None:
100 |             raise Exception('must specify columns for agg when using groupby')
101 |         lf = lf.groupby(group_by).agg(columns)
102 | 
103 |         # sort
104 |         if sort:
105 |             lf = lf.sort(sort, descending=descending)
106 | 
107 |     else:
108 |         # sort
109 |         if sort and not already_sorted:
110 |             lf = lf.sort(sort, descending=descending)
111 | 
112 |         # select columns
113 |         if columns is not None:
114 |             lf = lf.select(columns)
115 | 
116 |     # encode binary as hex
117 |     if not output_binary:
118 |         encode_columns = [
119 |             ('0x' + pl.col(column_name).bin.encode('hex')).alias(column_name)
120 |             for column_name, column_type in lf.schema.items()
121 |             if column_type == pl.Binary
122 |         ]
123 |         lf = lf.with_columns(encode_columns)
124 | 
125 |     # return output
126 |     if output_kwargs is None:
127 |         output_kwargs = {}
128 |     if output_path:
129 |         return lf.sink_parquet(output_path, **output_kwargs)
130 |     elif collect and streaming:
131 |         return lf.collect(streaming=True, **output_kwargs)
132 |     elif collect:
133 |         return lf.collect()
134 |     else:
135 |         return lf  # type: ignore
136 | 
137 | 
138 | def create_query_filters(
139 |     *,
140 |     simple_filters: typing.Mapping[str, typing.Any] | None = None,
141 |     block_filters: typing.Mapping[str, int | None] | None = None,
142 |     binary_filters: typing.Mapping[str, str | bytes | None] | None = None,
143 |     binary_is_in_filters: typing.Mapping[
144 |         str, typing.Sequence[str | bytes] | None
145 |     ]
146 |     | None = None,
147 | ) -> typing.MutableSequence[pl.type_aliases.IntoExpr]:
148 |     import polars as pl
149 | 
150 |     filters: typing.MutableSequence[pl.type_aliases.IntoExpr] = []
151 | 
152 |     # block filters
153 |     if block_filters is not None:
154 |         start_block = block_filters.get('start_block')
155 |         end_block = block_filters.get('end_block')
156 |         block_number = block_filters.get('block_number')
157 |         if start_block is not None:
158 |             filters.append(pl.col('block_number') >= start_block)
159 |         if end_block is not None:
160 |             filters.append(pl.col('block_number') <= end_block)
161 |         if block_number is not None:
162 |             filters.append(pl.col('block_number') == block_number)
163 | 
164 |     # binary filters
165 |     if binary_filters is not None:
166 |         for column, value in binary_filters.items():
167 |             if value is not None:
168 |                 filters.append(pl.col(column) == spec.to_binary(value))
169 | 
170 |     # binary is_in filters
171 |     if binary_is_in_filters is not None:
172 |         for key, list_value in binary_is_in_filters.items():
173 |             if list_value is not None:
174 |                 binary_values = [
175 |                     spec.to_binary(subvalue) for subvalue in list_value
176 |                 ]
177 |                 filters.append(pl.col(key).is_in(binary_values))
178 | 
179 |     return filters
180 | 
181 | 
182 | def _polars_exprs_equal(
183 |     expr1: spec.PolarsExpression,
184 |     expr2: spec.PolarsExpression,
185 | ) -> bool:
186 |     import polars as pl  # imported lazily here, like elsewhere in this module
187 | 
188 |     if isinstance(expr1, str):
189 |         expr1 = pl.col(expr1)
190 |     if isinstance(expr2, str):
191 |         expr2 = pl.col(expr2)
192 |     return
str(expr1) == str(expr2)
193 | 
194 | 
--------------------------------------------------------------------------------
/pdp/data_utils/readme_utils.py:
--------------------------------------------------------------------------------
1 | """functions for creating dataset READMEs"""
2 | 
3 | from __future__ import annotations
4 | 
5 | import os
6 | 
7 | from .. import spec
8 | from . import download_utils
9 | from . import schema_utils
10 | 
11 | 
12 | readme_template = """
13 | # {network} {datatype} Dataset v{version}
14 | 
15 | This is a dataset of {description}
16 | 
17 | The dataset was created by using [this script]({script_url})
18 | 
19 | Data is distributed as [parquet](https://data.paradigm.xyz/about) files and released into the public domain under a [CC0 license](https://creativecommons.org/share-your-work/public-domain/cc0/)
20 | 
21 | ## Usage
22 | 
23 | Some example uses of this dataset include:
24 | {example_usage}
25 | 
26 | {notebook_str}
27 | 
28 | ## Schema
29 | 
30 | {schema}
31 | 
32 | ## Download
33 | 
34 | This dataset can be downloaded using either the `pdp` cli tool or the urls below
35 | 
36 | The total dataset size is **{dataset_size}**
37 | 
38 | ### Use `pdp`
39 | 
40 | The command `pdp download {dataset_name}` will download all files in this dataset
41 | 
42 | See `pdp download -h` for available options
43 | 
44 | ### Use URLs
45 | 
46 | | | file | size |
47 | | - | - | - |
48 | {file_urls}
49 | """
50 | 
51 | script_url_template = 'https://github.com/paradigmxyz/paradigm-data-portal/blob/main/pdp/datasets/{dataset}/{dataset}_collect.py'
52 | 
53 | notebook_url_template = 'https://github.com/paradigmxyz/paradigm-data-portal/blob/main/notebooks/explore_{dataset_name}.ipynb'
54 | 
55 | table_schema_template = """#### `{table_name}` table
56 | {table_description}
57 | | column | type | description |
58 | | - | - | - |
59 | {table_rows}"""
60 | 
61 | 
62 | def create_dataset_readme(
63 |     dataset_manifest: spec.DatasetManifest,
64 |     output_path: str | bool = False,
65 |     confirm: bool = False,
66 | ) -> str:
67 | 
68 |     import shutil
69 | 
70 |     readme_str = _create_readme_str(dataset_manifest)
71 | 
72 |     if output_path is not None and output_path:
73 |         if isinstance(output_path, bool):
74 |             output_path = spec.dataset_readme_filename
75 |         if os.path.exists(output_path) and not confirm:
76 |             raise Exception('use --confirm to overwrite existing README')
77 |         with open(output_path + '_tmp', 'w') as f:
78 |             f.write(readme_str)
79 |         shutil.move(output_path + '_tmp', output_path)
80 |         print('wrote README to ' + output_path)
81 | 
82 |     return readme_str
83 | 
84 | 
85 | def _create_readme_str(dataset_manifest: spec.DatasetManifest) -> str:
86 | 
87 |     import toolsql
88 |     import toolstr
89 | 
90 |     module = schema_utils._get_datatype_module(dataset_manifest['datatype'])
91 |     example_usage_pieces = ['- ' + example for example in module.example_usage]
92 |     example_usage_str = '\n'.join(example_usage_pieces)
93 | 
94 |     schema_pieces = []
95 |     db_schema = toolsql.normalize_shorthand_db_schema(
96 |         dataset_manifest['schema']
97 |     )
98 |     for table_name, table in db_schema['tables'].items():
99 |         table_table_pieces = [
100 |             '| '
101 |             + column['name']
102 |             + ' | '
103 |             + column['type']
104 |             + ' | '
105 |             + (column['description'] or '')
106 |             + ' |'
107 |             for column in table['columns']
108 |         ]
109 |         table_table = '\n'.join(table_table_pieces)
110 |         table_schema_str = table_schema_template.format(
111 |             table_name=table['name'],
112 |             table_description=table['description'],
113 |             table_rows=table_table,
114 |         )
115 | 
schema_pieces.append(table_schema_str) 116 | schema_str = '\n'.join(schema_pieces) 117 | 118 | url_pieces: list[str] = [] 119 | for file in dataset_manifest['files']: 120 | if file['name'] == spec.dataset_readme_filename: 121 | continue 122 | file_url = download_utils.get_dataset_file_url( 123 | datatype=dataset_manifest['datatype'], 124 | network=dataset_manifest['network'], 125 | filename=file['name'], 126 | ) 127 | url_piece = ( 128 | '| ' 129 | + str(len(url_pieces) + 1) 130 | + ' | ' 131 | + ('[' + file['name'] + '](' + file_url + ')') 132 | + ' | ' 133 | + toolstr.format_nbytes(file['n_bytes']) 134 | + ' |' 135 | ) 136 | url_pieces.append(url_piece) 137 | url_str = '\n'.join(url_pieces) 138 | 139 | dataset_nbytes = sum(file['n_bytes'] for file in dataset_manifest['files']) 140 | 141 | if dataset_manifest['datatype'] == 'contracts': 142 | notebook_url = notebook_url_template.format( 143 | dataset_name=dataset_manifest['name'] 144 | ) 145 | notebook_str = """An example notebook exploring this dataset can be found [here]({notebook_url})""".format( 146 | notebook_url=notebook_url 147 | ) 148 | else: 149 | notebook_str = '' 150 | 151 | return readme_template.format( 152 | dataset_name=dataset_manifest['name'], 153 | network=dataset_manifest['network'].replace('_', ' ').title(), 154 | datatype=dataset_manifest['datatype'].replace('_', ' ').title(), 155 | version=dataset_manifest['version'], 156 | description=dataset_manifest['description'], 157 | example_usage=example_usage_str, 158 | schema=schema_str, 159 | dataset_size=toolstr.format_nbytes(dataset_nbytes), 160 | script_url=script_url_template.format(dataset=dataset_manifest['name']), 161 | notebook_str=notebook_str, 162 | file_urls=url_str, 163 | ) 164 | 165 | -------------------------------------------------------------------------------- /pdp/data_utils/schema_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | from .. 
import spec 6 | 7 | if typing.TYPE_CHECKING: 8 | import types 9 | import toolsql 10 | 11 | 12 | def get_dataset_name(*, datatype: str, network: str | int) -> str: 13 | """create dataset name based on metadata""" 14 | if isinstance(network, int): 15 | network = spec.networks[network] 16 | return network + '_' + datatype 17 | 18 | 19 | def get_versioned_dataset_name(*, datatype: str, network: str | int, version: str) -> str: 20 | """create versioned dataset name for use in file names""" 21 | dataset_name = get_dataset_name(datatype=datatype, network=network) 22 | version_str = 'v' + version.replace('.', '_') 23 | return dataset_name + '__' + version_str 24 | 25 | 26 | def parse_dataset_name(dataset: str) -> typing.Mapping[str, str]: 27 | """parse metadata from a dataset name""" 28 | network, datatype = dataset.split('_', maxsplit=1) 29 | return { 30 | 'network': network, 31 | 'datatype': datatype, 32 | } 33 | 34 | 35 | def get_datatype_schema(datatype: str) -> toolsql.DBSchema: 36 | import toolsql 37 | 38 | module = _get_datatype_module(datatype) 39 | schema: toolsql.DBSchema = toolsql.normalize_shorthand_db_schema( 40 | module.schema 41 | ) 42 | return schema 43 | 44 | 45 | def get_dataset_schema( 46 | dataset: str, multichain_tables: bool = False 47 | ) -> toolsql.DBSchema: 48 | import copy 49 | import toolsql 50 | 51 | # parse dataset name 52 | parsed = parse_dataset_name(dataset) 53 | network = parsed['network'] 54 | 55 | # load schema 56 | datatype_schema = get_datatype_schema(parsed['datatype']) 57 | datatype_schema = copy.deepcopy(datatype_schema) 58 | datatype_schema['name'] = dataset 59 | 60 | # make schema names dataset-specific 61 | if multichain_tables: 62 | # if a multichain table, add chain_id to each table 63 | for table_schema in datatype_schema['tables'].values(): 64 | 65 | # make chain_id primary only if there exist other primary keys 66 | is_primary = any( 67 | other_column.get('primary') 68 | for other_column in table_schema['columns'] 69 | ) 70 | raw_column: toolsql.ColumnSchemaShorthand = { 71 | 'name': 'chain_id', 72 | 'type': 'INTEGER', 73 | 'index': True, 74 | 'primary': is_primary, 75 | } 76 | column = toolsql.normalize_shorthand_column_schema(raw_column) 77 | table_schema['columns'].append(column) # type: ignore 78 | 79 | else: 80 | # if a single chain table, add network to table names 81 | datatype_schema['tables'] = { 82 | network + '_' + k: v for k, v in datatype_schema['tables'].items() 83 | } 84 | 85 | return datatype_schema 86 | 87 | 88 | def _get_datatype_module(datatype: str) -> types.ModuleType: 89 | import importlib 90 | 91 | try: 92 | return importlib.import_module('pdp.datasets.' 
+ datatype) 93 | except Exception: 94 | raise Exception('could not get module for dataset' + str(datatype)) 95 | 96 | -------------------------------------------------------------------------------- /pdp/data_utils/update_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | 6 | def update( 7 | dataset: str, 8 | *, 9 | method: typing.Literal['download', 'collect'] = 'download', 10 | ) -> None: 11 | raise NotImplementedError() 12 | 13 | -------------------------------------------------------------------------------- /pdp/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paradigmxyz/paradigm-data-portal/335f0c8c932bdf1295c67028b11de73a8d92384e/pdp/datasets/__init__.py -------------------------------------------------------------------------------- /pdp/datasets/contracts/__init__.py: -------------------------------------------------------------------------------- 1 | from .contracts_collect import * 2 | from .contracts_queries import * 3 | from .contracts_spec import * 4 | -------------------------------------------------------------------------------- /pdp/datasets/contracts/contracts_collect.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import typing 5 | 6 | import pdp 7 | from . import contracts_spec 8 | 9 | if typing.TYPE_CHECKING: 10 | import ctc.spec 11 | import tooljob.trackers.file_tracker 12 | 13 | 14 | def collect_contracts_dataset( 15 | *, 16 | start_block: int, 17 | end_block: int, 18 | output_dir: str, 19 | network: ctc.spec.NetworkReference, 20 | chunk_size: int | None = None, 21 | output_filetype: str | None = None, 22 | executor: typing.Literal['serial', 'parallel'] = 'parallel', 23 | verbose: bool = False, 24 | ) -> None: 25 | if chunk_size is None: 26 | chunk_size = 1000 27 | if output_filetype is None: 28 | output_filetype = 'csv' 29 | 30 | dataset_name = pdp.get_versioned_dataset_name( 31 | datatype='contracts', 32 | network=network, 33 | version=contracts_spec.version, 34 | ) 35 | 36 | extractor = _ExtractContracts( 37 | start_block=start_block, 38 | end_block=end_block, 39 | chunk_size=chunk_size, 40 | output_dir=output_dir, 41 | tracker='file', 42 | output_filetype=output_filetype, 43 | name=dataset_name, 44 | context={'network': network}, 45 | styles=pdp.styles, 46 | verbose=verbose, 47 | ) 48 | 49 | extractor.orchestrate_jobs(executor=executor) 50 | 51 | 52 | class _ExtractContracts(pdp.BlockChunkJobs): 53 | tracker: tooljob.trackers.file_tracker.FileTracker 54 | 55 | def execute_job(self, i: int) -> typing.Any: 56 | job_data = self.get_job_data(i) 57 | job_name = self.get_job_name(i) 58 | path = self.tracker.get_job_output_path(i) 59 | _sync_trace_blocks( 60 | start_block=job_data['start_block'], 61 | end_block=job_data['end_block'], 62 | job_name=job_name, 63 | path=path, 64 | context=self.context, 65 | ) 66 | 67 | 68 | async def _async_trace_blocks( 69 | *, 70 | start_block: int, 71 | end_block: ctc.spec.BlockNumberReference, 72 | path: str, 73 | context: ctc.spec.Context, 74 | ) -> None: 75 | import polars as pl 76 | 77 | pdp.ensure_ctc() 78 | import ctc 79 | import ctc.rpc 80 | from ctc.toolbox import pl_utils 81 | 82 | create_traces = await ctc.async_trace_contract_creations( 83 | start_block=start_block, 84 | end_block=end_block, 85 | context=context, 86 | ) 87 | 
await ctc.rpc.async_close_http_session() 88 | 89 | df = pl.DataFrame(create_traces) 90 | pl_utils.write_df(df=df, path=path, create_dir=True) 91 | 92 | return None 93 | 94 | 95 | def _sync_trace_blocks( 96 | *, 97 | start_block: int, 98 | end_block: ctc.spec.BlockNumberReference, 99 | job_name: str, 100 | path: str, 101 | context: ctc.spec.Context, 102 | ) -> None: 103 | try: 104 | asyncio.run( 105 | _async_trace_blocks( 106 | start_block=start_block, 107 | end_block=end_block, 108 | path=path, 109 | context=context, 110 | ) 111 | ) 112 | except Exception as e: 113 | print('job', job_name, 'failed:' + str(e)) 114 | raise e 115 | 116 | -------------------------------------------------------------------------------- /pdp/datasets/contracts/contracts_queries.py: -------------------------------------------------------------------------------- 1 | """ 2 | TODO 3 | - create block_number predicate pushdown 4 | """ 5 | 6 | from __future__ import annotations 7 | 8 | import typing 9 | 10 | import ctc 11 | 12 | import pdp 13 | from . import contracts_spec 14 | 15 | if typing.TYPE_CHECKING: 16 | import polars as pl 17 | 18 | 19 | def query_contract( 20 | contract_address: str | bytes, **kwargs: typing.Any 21 | ) -> contracts_spec.Contract | None: 22 | """return most recent deployment of contract""" 23 | 24 | # query data 25 | result = query_contracts(contract_address=contract_address, collect=True, **kwargs) 26 | 27 | # convert to dict 28 | if len(result) == 0: 29 | return None 30 | else: 31 | return result.to_dicts()[0] # type: ignore 32 | 33 | 34 | def query_contracts( 35 | *, 36 | # filters 37 | contract_address: str | bytes | None = None, 38 | contract_addresses: typing.Sequence[str | bytes] | None = None, 39 | deployer: str | bytes | None = None, 40 | factory: str | bytes | None = None, 41 | start_block: int | None = None, 42 | end_block: int | None = None, 43 | block_number: int | None = None, 44 | code: str | bytes | None = None, 45 | code_hash: str | bytes | None = None, 46 | init_code: str | bytes | None = None, 47 | init_code_hash: str | bytes | None = None, 48 | # outputs 49 | sort: bool | pdp.PolarsExpression = False, 50 | descending: bool = False, 51 | unique: bool = False, 52 | unique_keep: typing.Literal['last', 'first', 'any'] | None = 'last', 53 | columns: pdp.PolarsExpression | None = None, 54 | output_binary: bool = True, 55 | # inputs 56 | source_path: str | None = None, 57 | network: str | int | None = None, 58 | scan_kwargs: typing.Any = None, 59 | collect: bool = True, 60 | streaming: bool = True, 61 | ) -> pl.DataFrame: 62 | 63 | # convert to hashes 64 | if code is not None: 65 | code_hash = ctc.keccak(code) 66 | if init_code is not None: 67 | init_code_hash = ctc.keccak(init_code) 68 | 69 | # collect filters 70 | block_filters = { 71 | 'start_block': start_block, 72 | 'end_block': end_block, 73 | 'block_number': block_number, 74 | } 75 | binary_filters = { 76 | 'factory': factory, 77 | 'deployer': deployer, 78 | 'contract_address': contract_address, 79 | 'code_hash': code_hash, 80 | 'init_code_hash': init_code_hash, 81 | } 82 | filters = pdp.create_query_filters( 83 | binary_filters=binary_filters, 84 | block_filters=block_filters, 85 | binary_is_in_filters={'contract_address': contract_addresses}, 86 | ) 87 | 88 | # keep unique contracts 89 | if unique: 90 | unique_sort = ['block_number', 'create_index'] 91 | unique_descending = False 92 | unique_columns = ['contract_address'] 93 | else: 94 | unique_sort = None 95 | unique_descending = False 96 | unique_columns = None 97 | 98 
| # create sort expression 99 | if sort and isinstance(sort, bool): 100 | sort = ['block_number', 'create_index'] 101 | 102 | return pdp.query( 103 | datatype='contracts', 104 | filters=filters, 105 | sort=sort, 106 | descending=descending, 107 | columns=columns, 108 | output_binary=output_binary, 109 | source_path=source_path, 110 | network=network, 111 | unique_columns=unique_columns, 112 | unique_sort=unique_sort, 113 | unique_descending=unique_descending, 114 | unique_keep=unique_keep, 115 | scan_kwargs=scan_kwargs, 116 | collect=collect, 117 | streaming=streaming, 118 | ) 119 | 120 | -------------------------------------------------------------------------------- /pdp/datasets/contracts/contracts_spec.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | if typing.TYPE_CHECKING: 6 | import toolsql 7 | 8 | class Contract(typing.TypedDict, total=False): 9 | block_number: int 10 | create_index: int 11 | transaction_hash: str 12 | contract_address: str 13 | deployer: str 14 | factory: str 15 | init_code: str 16 | code: str 17 | init_code_hash: str 18 | code_hash: str 19 | 20 | class ContractBinary(typing.TypedDict, total=False): 21 | block_number: int 22 | create_index: int 23 | transaction_hash: bytes 24 | contract_address: bytes 25 | deployer: bytes 26 | factory: bytes 27 | init_code: bytes 28 | code: bytes 29 | init_code_hash: bytes 30 | code_hash: bytes 31 | 32 | 33 | version = '1.1.0' 34 | 35 | example_usage = [ 36 | 'look up all contracts deployed by an address', 37 | 'look up all contracts that have a given bytecode', 38 | 'analyze distribution of contract bytecode motifs', 39 | ] 40 | 41 | schema: toolsql.DBSchemaShorthand = { 42 | 'name': 'contracts', 43 | 'description': 'all historical contract deployments', 44 | 'tables': { 45 | 'contracts': { 46 | 'description': 'each row corresponds to a contract create trace', 47 | 'columns': [ 48 | { 49 | 'name': 'block_number', 50 | 'type': 'INTEGER', 51 | 'description': 'block number when contract was created', 52 | 'primary': True, 53 | }, 54 | { 55 | 'name': 'create_index', 56 | 'type': 'INTEGER', 57 | 'description': 'increased by 1 for each contract created in block', 58 | }, 59 | { 60 | 'name': 'transaction_hash', 61 | 'type': 'BINARY', 62 | 'description': 'hash of transaction that created contract', 63 | 'index': True, 64 | }, 65 | { 66 | 'name': 'contract_address', 67 | 'type': 'BINARY', 68 | 'description': 'address of deployed contract', 69 | 'primary': True, 70 | }, 71 | { 72 | 'name': 'deployer', 73 | 'type': 'BINARY', 74 | 'description': 'EOA that deployed the contract', 75 | 'index': True, 76 | }, 77 | { 78 | 'name': 'factory', 79 | 'type': 'BINARY', 80 | 'description': 'the `from` field in the creation trace', 81 | 'index': True, 82 | }, 83 | { 84 | 'name': 'init_code', 85 | 'type': 'BINARY', 86 | 'description': 'initialization bytecode of contract', 87 | }, 88 | { 89 | 'name': 'code', 90 | 'type': 'BINARY', 91 | 'description': 'bytecode of contract', 92 | }, 93 | { 94 | 'name': 'init_code_hash', 95 | 'type': 'BINARY', 96 | 'description': 'keccak hash of contract initialization code', 97 | }, 98 | { 99 | 'name': 'code_hash', 100 | 'type': 'BINARY', 101 | 'description': 'keccak hash of contract bytecode', 102 | 'index': True, 103 | }, 104 | ], 105 | }, 106 | }, 107 | } 108 | 109 | -------------------------------------------------------------------------------- /pdp/datasets/native_transfers/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .native_transfers_collect import * 2 | from .native_transfers_queries import * 3 | from .native_transfers_spec import * 4 | -------------------------------------------------------------------------------- /pdp/datasets/native_transfers/native_transfers_collect.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import typing 5 | 6 | import polars as pl 7 | 8 | import pdp 9 | from . import native_transfers_spec 10 | 11 | if typing.TYPE_CHECKING: 12 | import ctc.spec 13 | import tooljob.trackers.file_tracker 14 | 15 | 16 | def collect_native_transfers_dataset( 17 | *, 18 | start_block: int, 19 | end_block: int, 20 | output_dir: str, 21 | network: ctc.spec.NetworkReference, 22 | chunk_size: int | None = None, 23 | output_filetype: str | None = None, 24 | executor: typing.Literal['serial', 'parallel'] = 'parallel', 25 | verbose: bool = False, 26 | ) -> None: 27 | if chunk_size is None: 28 | chunk_size = 1000 29 | if output_filetype is None: 30 | output_filetype = 'csv' 31 | 32 | dataset_name = pdp.get_versioned_dataset_name( 33 | datatype='native_transfers', 34 | network=network, 35 | version=native_transfers_spec.version, 36 | ) 37 | 38 | extractor = _ExtractNativeTransfers( 39 | start_block=start_block, 40 | end_block=end_block, 41 | chunk_size=chunk_size, 42 | output_dir=output_dir, 43 | tracker='file', 44 | output_filetype=output_filetype, 45 | name=dataset_name, 46 | context={'network': network}, 47 | styles=pdp.styles, 48 | verbose=verbose, 49 | ) 50 | 51 | extractor.orchestrate_jobs(executor=executor) 52 | 53 | 54 | class _ExtractNativeTransfers(pdp.BlockChunkJobs): 55 | tracker: tooljob.trackers.file_tracker.FileTracker 56 | 57 | def execute_job(self, i: int) -> typing.Any: 58 | job_data = self.get_job_data(i) 59 | job_name = self.get_job_name(i) 60 | path = self.tracker.get_job_output_path(i) 61 | _sync_extract_native_transfers( 62 | start_block=job_data['start_block'], 63 | end_block=job_data['end_block'], 64 | job_name=job_name, 65 | path=path, 66 | context=self.context, 67 | ) 68 | 69 | 70 | def _sync_extract_native_transfers( 71 | *, 72 | start_block: int, 73 | end_block: ctc.spec.BlockNumberReference, 74 | job_name: str, 75 | path: str, 76 | context: ctc.spec.Context, 77 | ) -> None: 78 | try: 79 | asyncio.run( 80 | _async_extract_native_transfers( 81 | start_block=start_block, 82 | end_block=end_block, 83 | path=path, 84 | context=context, 85 | ) 86 | ) 87 | except Exception as e: 88 | print('job', job_name, 'failed:' + str(e)) 89 | raise e 90 | 91 | 92 | async def _async_extract_native_transfers( 93 | *, 94 | start_block: int, 95 | end_block: ctc.spec.BlockNumberReference, 96 | path: str, 97 | context: ctc.spec.Context, 98 | ) -> None: 99 | pdp.ensure_ctc() 100 | import ctc 101 | import ctc.rpc 102 | from ctc.toolbox import pl_utils 103 | 104 | transfers = await ctc.async_trace_native_transfers( 105 | start_block=start_block, 106 | end_block=end_block, 107 | context=context, 108 | ) 109 | 110 | await ctc.rpc.async_close_http_session(context=context) 111 | 112 | # load data into output file 113 | df = pl.DataFrame( 114 | transfers, 115 | orient='row', 116 | schema=[ 117 | ('block_number', pl.datatypes.Int32), 118 | ('transfer_index', pl.datatypes.Int32), 119 | ('transaction_hash', pl.datatypes.Utf8), 120 | ('from_address', pl.datatypes.Utf8), 121 | ('to_address', pl.datatypes.Utf8), 122 | 
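            # 'value' is kept as a string rather than an integer because wei
            # amounts routinely exceed the 64-bit integer range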
('value', pl.datatypes.Utf8), 123 | ], 124 | ) 125 | pl_utils.write_df(df=df, path=path, create_dir=True) 126 | 127 | -------------------------------------------------------------------------------- /pdp/datasets/native_transfers/native_transfers_queries.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | import pdp 6 | 7 | if typing.TYPE_CHECKING: 8 | import polars as pl 9 | 10 | 11 | def query_native_transfers( 12 | # filters 13 | from_address: str | bytes | None = None, 14 | to_address: str | bytes | None = None, 15 | from_addresses: typing.Sequence[str | bytes] | None = None, 16 | to_addresses: typing.Sequence[str | bytes] | None = None, 17 | start_block: int | None = None, 18 | end_block: int | None = None, 19 | block_number: int | None = None, 20 | # outputs 21 | sort: bool | pdp.PolarsExpression = True, 22 | descending: bool = False, 23 | unique_keep: typing.Literal['last', 'first', 'any'] = 'last', 24 | columns: pdp.PolarsExpression | None = None, 25 | output_binary: bool = True, 26 | # inputs 27 | source_path: str | None = None, 28 | network: str | int | None = None, 29 | scan_kwargs: typing.Any = None, 30 | collect: bool = True, 31 | streaming: bool = True, 32 | ) -> pl.DataFrame: 33 | 34 | # collect filters 35 | block_filters = { 36 | 'start_block': start_block, 37 | 'end_block': end_block, 38 | 'block_number': block_number, 39 | } 40 | binary_filters = { 41 | 'from_address': from_address, 42 | 'to_address': to_address, 43 | } 44 | binary_is_in_filters = { 45 | 'from_addresses': from_addresses, 46 | 'to_addresses': to_addresses, 47 | } 48 | filters = pdp.create_query_filters( 49 | binary_filters=binary_filters, 50 | block_filters=block_filters, 51 | binary_is_in_filters=binary_is_in_filters, 52 | ) 53 | 54 | if sort and isinstance(sort, bool): 55 | sort = ['block_number', 'transfer_index'] 56 | 57 | return pdp.query( 58 | datatype='native_transfers', 59 | filters=filters, 60 | sort=sort, 61 | descending=descending, 62 | columns=columns, 63 | output_binary=output_binary, 64 | source_path=source_path, 65 | network=network, 66 | unique_keep=unique_keep, 67 | scan_kwargs=scan_kwargs, 68 | collect=collect, 69 | streaming=streaming, 70 | ) 71 | 72 | -------------------------------------------------------------------------------- /pdp/datasets/native_transfers/native_transfers_spec.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | if typing.TYPE_CHECKING: 6 | import toolsql 7 | 8 | 9 | version = '1.1.0' 10 | 11 | example_usage = [ 12 | 'look up all inbound transfers to an address', 13 | 'analyze transfer size distributions', 14 | 'analyze transfer frequency distributions', 15 | ] 16 | 17 | schema: toolsql.DBSchemaShorthand = { 18 | 'name': 'native_transfers', 19 | 'description': 'all native transfers in similar format to ERC20 Transfers (excluding tx fees)', 20 | 'tables': { 21 | 'native_transfers': { 22 | 'description': 'each row corresponds to a trace that transfers native token', 23 | 'columns': [ 24 | { 25 | 'name': 'block_number', 26 | 'type': 'INTEGER', 27 | 'description': 'block number where native token was transfered', 28 | }, 29 | { 30 | 'name': 'transfer_index', 31 | 'type': 'INTEGER', 32 | 'description': 'increased by 1 for each native transfer in block', 33 | }, 34 | { 35 | 'name': 'transaction_hash', 36 | 'type': 'BINARY', 37 | 'description': 'hash of transaction that 
contains transfer', 38 | }, 39 | { 40 | 'name': 'to_address', 41 | 'type': 'BINARY', 42 | 'description': 'address that native token is transferred to', 43 | }, 44 | { 45 | 'name': 'from_address', 46 | 'type': 'BINARY', 47 | 'description': 'address that native token is transferred from', 48 | }, 49 | { 50 | 'name': 'value', 51 | 'type': 'BINARY', 52 | 'description': 'amount of native token transferred', 53 | }, 54 | ], 55 | }, 56 | }, 57 | } 58 | 59 | -------------------------------------------------------------------------------- /pdp/datasets/slots/__init__.py: -------------------------------------------------------------------------------- 1 | from .slots_collect import * 2 | from .slots_queries import * 3 | from .slots_spec import * 4 | -------------------------------------------------------------------------------- /pdp/datasets/slots/slots_collect.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import typing 5 | 6 | import pdp 7 | from . import slots_spec 8 | 9 | if typing.TYPE_CHECKING: 10 | import ctc.spec 11 | import tooljob.trackers.file_tracker 12 | 13 | 14 | def collect_slots_dataset( 15 | *, 16 | start_block: int, 17 | end_block: int, 18 | output_dir: str, 19 | network: ctc.spec.NetworkReference, 20 | chunk_size: int | None = None, 21 | output_filetype: str | None = None, 22 | executor: typing.Literal['serial', 'parallel'] = 'parallel', 23 | verbose: bool = False, 24 | ) -> None: 25 | if chunk_size is None: 26 | chunk_size = 1000 27 | if output_filetype is None: 28 | output_filetype = 'parquet' 29 | 30 | dataset_name = pdp.get_versioned_dataset_name( 31 | datatype='slots', 32 | network=network, 33 | version=slots_spec.version, 34 | ) 35 | 36 | extractor = _ExtractSlots( 37 | start_block=start_block, 38 | end_block=end_block, 39 | chunk_size=chunk_size, 40 | output_dir=output_dir, 41 | tracker='file', 42 | output_filetype=output_filetype, 43 | name=dataset_name, 44 | context={'network': network}, 45 | styles=pdp.styles, 46 | verbose=verbose, 47 | ) 48 | 49 | extractor.orchestrate_jobs(executor=executor) 50 | 51 | 52 | class _ExtractSlots(pdp.BlockChunkJobs): 53 | tracker: tooljob.trackers.file_tracker.FileTracker 54 | 55 | def execute_job(self, i: int) -> typing.Any: 56 | job_data = self.get_job_data(i) 57 | job_name = self.get_job_name(i) 58 | path = self.tracker.get_job_output_path(i) 59 | _sync_extract_slots( 60 | start_block=job_data['start_block'], 61 | end_block=job_data['end_block'], 62 | job_name=job_name, 63 | path=path, 64 | context=self.context, 65 | ) 66 | 67 | 68 | def _sync_extract_slots( 69 | *, 70 | start_block: int, 71 | end_block: ctc.spec.BlockNumberReference, 72 | job_name: str, 73 | path: str, 74 | context: ctc.spec.Context, 75 | ) -> None: 76 | try: 77 | asyncio.run( 78 | _async_extract_slots( 79 | start_block=start_block, 80 | end_block=end_block, 81 | path=path, 82 | context=context, 83 | ) 84 | ) 85 | except Exception as e: 86 | print('job', job_name, 'failed:' + str(e)) 87 | raise e 88 | 89 | 90 | async def _async_extract_slots( 91 | *, 92 | start_block: int, 93 | end_block: ctc.spec.BlockNumberReference, 94 | path: str, 95 | context: ctc.spec.Context, 96 | ) -> None: 97 | pdp.ensure_ctc() 98 | import ctc 99 | import ctc.rpc 100 | from ctc.toolbox import pl_utils 101 | 102 | df = await ctc.async_trace_slot_stats( 103 | start_block=start_block, 104 | end_block=end_block, 105 | context=context, 106 | ) 107 | 108 | await ctc.rpc.async_close_http_session() 
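    # close the RPC http session before the per-job event loop created by
    # asyncio.run() shuts down, then write the chunk to disk below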
109 | 110 | pl_utils.write_df(df=df, path=path, create_dir=True) 111 | 112 | -------------------------------------------------------------------------------- /pdp/datasets/slots/slots_queries.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | import pdp 6 | from . import slots_spec 7 | 8 | if typing.TYPE_CHECKING: 9 | import polars as pl 10 | 11 | 12 | def query_slots_of_contract( 13 | contract_address: str | bytes, 14 | network: str | int | None = None, 15 | **query_kwargs: typing.Any 16 | ) -> pl.DataFrame: 17 | return query_slots( 18 | contract_address=contract_address, 19 | network=network, 20 | **query_kwargs, 21 | ) 22 | 23 | 24 | def query_contract_slot_counts( 25 | network: str | int | None = None, 26 | **query_kwargs: typing.Any, 27 | ) -> pl.DataFrame: 28 | lf: pl.LazyFrame = ( 29 | query_slots(collect=False, **query_kwargs) # type: ignore 30 | .groupby('contract_address') 31 | .agg(pl.count()) 32 | .sort('counts', descending=True) 33 | ) 34 | 35 | return lf.collect(streaming=True) 36 | 37 | 38 | def query_slot( 39 | contract_address: str | bytes, 40 | slot: str | bytes, 41 | network: str | int | None = None, 42 | ) -> slots_spec.Slot | None: 43 | result = query_slots( 44 | contract_address=contract_address, 45 | slot=slot, 46 | ) 47 | if len(result) == 1: 48 | return result.to_dicts()[0] # type: ignore 49 | else: 50 | return None 51 | 52 | 53 | def query_slots( 54 | # filters 55 | contract_address: str | bytes | None = None, 56 | contract_addresses: typing.Sequence[str | bytes] | None = None, 57 | slot: str | bytes | None = None, 58 | slots: typing.Sequence[str | bytes] | None = None, 59 | # outputs 60 | sort: bool | pdp.PolarsExpression = True, 61 | unique_keep: typing.Literal['last', 'first', 'all'] = 'last', 62 | columns: pdp.PolarsExpression | None = None, 63 | output_binary: bool = True, 64 | # inputs 65 | source_path: str | None = None, 66 | network: str | int | None = None, 67 | scan_kwargs: typing.Any = None, 68 | collect: bool = True, 69 | streaming: bool = True, 70 | ) -> pl.DataFrame: 71 | 72 | # filters 73 | binary_filters = { 74 | 'contract_address': contract_address, 75 | 'slot': slot, 76 | } 77 | binary_is_in_filters = { 78 | 'contract_addresses': contract_addresses, 79 | 'slots': slots, 80 | } 81 | filters = pdp.create_query_filters( 82 | binary_filters=binary_filters, 83 | binary_is_in_filters=binary_is_in_filters, 84 | ) 85 | 86 | return pdp.query( 87 | filters=filters, 88 | sort=sort, 89 | columns=columns, 90 | output_binary=output_binary, 91 | source_path=source_path, 92 | network=network, 93 | datatype='slots', 94 | scan_kwargs=scan_kwargs, 95 | collect=collect, 96 | streaming=streaming, 97 | ) 98 | 99 | -------------------------------------------------------------------------------- /pdp/datasets/slots/slots_spec.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | if typing.TYPE_CHECKING: 6 | import toolsql 7 | 8 | class Slot(typing.TypedDict): 9 | contract_address: str 10 | slot: str 11 | value: bytes 12 | first_updated_block: int 13 | last_updated_block: int 14 | n_tx_updates: int 15 | 16 | 17 | version = '1.1.0' 18 | 19 | example_usage = [ 20 | 'look up how much storage space is used by a given contract', 21 | 'look up which slots are used by a given contract', 22 | 'look up which slots change most frequently for a given contract', 23 | ] 24 | 25 | 
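# a minimal usage sketch for the examples above (hypothetical address value;
# the query helpers are defined in slots_queries.py and re-exported from
# pdp.datasets.slots):
#
#     from pdp.datasets.slots import query_slots, query_contract_slot_counts
#
#     contract_slots = query_slots(contract_address='0x...')
#     slot_counts = query_contract_slot_counts()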
schema: toolsql.DBSchemaShorthand = { 26 | 'name': 'slots', 27 | 'description': 'all slots of each contract, including historical usage metadata', 28 | 'tables': { 29 | 'slots': { 30 | 'description': 'each row corresponds to a slot of a contract', 31 | 'columns': [ 32 | { 33 | 'name': 'contract_address', 34 | 'type': 'BINARY', 35 | 'description': 'contract of slot', 36 | }, 37 | { 38 | 'name': 'slot', 39 | 'type': 'BINARY', 40 | 'description': 'address of slot', 41 | }, 42 | { 43 | 'name': 'value', 44 | 'type': 'BINARY', 45 | 'description': 'last data stored in slot', 46 | }, 47 | { 48 | 'name': 'first_updated_block', 49 | 'type': 'INTEGER', 50 | 'description': 'first block where slot was used', 51 | }, 52 | { 53 | 'name': 'last_updated_block', 54 | 'type': 'INTEGER', 55 | 'description': 'last block where slot was updated', 56 | }, 57 | { 58 | 'name': 'n_tx_updates', 59 | 'type': 'INTEGER', 60 | 'description': 'number of transactions that updated slot', 61 | }, 62 | ], 63 | }, 64 | }, 65 | } 66 | 67 | -------------------------------------------------------------------------------- /pdp/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paradigmxyz/paradigm-data-portal/335f0c8c932bdf1295c67028b11de73a8d92384e/pdp/py.typed -------------------------------------------------------------------------------- /pdp/spec.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | 6 | if typing.TYPE_CHECKING: 7 | 8 | import polars as pl 9 | import toolcli 10 | import toolsql 11 | 12 | class GlobalManifest(typing.TypedDict): 13 | """manifest of all datasets""" 14 | 15 | version: str 16 | datasets: typing.Mapping[str, DatasetManifestSlim] 17 | 18 | class DatasetManifest(typing.TypedDict): 19 | """manifest of a particular dataset""" 20 | 21 | name: str 22 | version: str 23 | description: str 24 | datatype: str 25 | network: str 26 | files: typing.Sequence[FileMetadata] 27 | schema: toolsql.DBSchema 28 | 29 | class DatasetManifestSlim(typing.TypedDict): 30 | """manifest of a particular dataset""" 31 | 32 | name: str 33 | version: str 34 | description: str 35 | datatype: str 36 | network: str 37 | n_files: int 38 | n_bytes: int 39 | schema: toolsql.DBSchema 40 | 41 | class FileMetadata(typing.TypedDict): 42 | """metadata of a dataset file""" 43 | 44 | name: str 45 | hash: str 46 | n_bytes: int 47 | 48 | PolarsExpression = typing.Union[ 49 | pl.type_aliases.IntoExpr, 50 | typing.Sequence[pl.type_aliases.IntoExpr], 51 | ] 52 | 53 | # 54 | # # datasets 55 | # 56 | 57 | global_version = '1.0.0' 58 | 59 | networks = { 60 | 1: 'ethereum', 61 | } 62 | 63 | 64 | # 65 | # # urls and paths 66 | # 67 | 68 | # default portal root 69 | portal_root = 'https://datasets.paradigm.xyz/datasets' 70 | bucket_root_path = 'datasets' 71 | 72 | # schema for various portal urls 73 | urls = { 74 | 'global_manifest': '{portal_root}/global_manifest.json', 75 | 'dataset_manifest': '{portal_root}/{network}_{datatype}/dataset_manifest.json', 76 | 'dataset_file': '{portal_root}/{network}_{datatype}/{filename}', 77 | 'old_global_manifests': '{portal_root}/old_global_manifests/v{version}.json', 78 | 'old_dataset_manifest': '{portal_root}/old_dataset_manifests/{dataset}__v{version}.json', 79 | } 80 | 81 | global_manifest_filename = 'global_manifest.json' 82 | dataset_manifest_filename = 'dataset_manifest.json' 83 | dataset_readme_filename = 'README.md' 84 | 
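# the `urls` entries above and the filename template below are str.format
# patterns, presumably filled in by the manifest/download utilities, e.g.
#   urls['dataset_manifest'].format(
#       portal_root=portal_root, network='ethereum', datatype='contracts')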
dataset_filename_template = '{dataset}__v{version}__{file_id}.{filetype}' 85 | dataset_license_filenames = ['LICENSE-CC0'] 86 | 87 | 88 | # 89 | # # cli behavior 90 | # 91 | 92 | styles: toolcli.StyleTheme = { 93 | 'title': 'bold #00e100', 94 | 'metavar': 'bold #e5e9f0', 95 | 'description': '#aaaaaa', 96 | 'content': '#00B400', 97 | 'option': 'bold #e5e9f0', 98 | 'comment': '#888888', 99 | } 100 | 101 | 102 | # 103 | # # formats 104 | # 105 | 106 | def to_binary(value: str | bytes) -> bytes: 107 | if isinstance(value, bytes): 108 | return value 109 | elif isinstance(value, str): 110 | if value.startswith('0x'): 111 | return bytes.fromhex(value[2:]) 112 | else: 113 | return bytes.fromhex(value) 114 | else: 115 | raise Exception('invalid format: ' + str(value)) 116 | 117 | 118 | def to_hex( 119 | value: str | bytes, *, prefix: bool = True, validate: bool = True 120 | ) -> str: 121 | 122 | if isinstance(value, str): 123 | if value.startswith('0x'): 124 | if validate: 125 | bytes.fromhex(value[2:]) 126 | 127 | if prefix: 128 | return value 129 | else: 130 | return value[2:] 131 | else: 132 | if validate: 133 | bytes.fromhex(value) 134 | 135 | if prefix: 136 | return '0x' + value 137 | else: 138 | return value 139 | 140 | elif isinstance(value, bytes): 141 | if prefix: 142 | return '0x' + value.hex() 143 | else: 144 | return value.hex() 145 | 146 | else: 147 | raise Exception('invalid value format') 148 | 149 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | [build-system] 3 | requires = ["flit_core >=3.2, <4"] 4 | build-backend = "flit_core.buildapi" 5 | 6 | [project] 7 | name = "paradigm-data-portal" 8 | readme = "README.md" 9 | requires-python = ">=3.7" 10 | dynamic = ["version", "description"] 11 | license = {text = "MIT OR Apache-2.0"} 12 | classifiers = [ 13 | "Development Status :: 4 - Beta", 14 | "Intended Audience :: Developers", 15 | "Intended Audience :: Financial and Insurance Industry", 16 | "Intended Audience :: Science/Research", 17 | "License :: OSI Approved :: Apache Software License", 18 | "License :: OSI Approved :: MIT License", 19 | "Natural Language :: English", 20 | "Operating System :: MacOS", 21 | "Operating System :: Microsoft :: Windows", 22 | "Operating System :: POSIX :: Linux", 23 | "Programming Language :: Python :: 3.7", 24 | "Programming Language :: Python :: 3.8", 25 | "Programming Language :: Python :: 3.9", 26 | "Programming Language :: Python :: 3.10", 27 | "Programming Language :: Python :: 3.11", 28 | "Typing :: Typed", 29 | ] 30 | dependencies = [ 31 | 'typing-extensions >=4.2.0, <5', 32 | 'requests >=2.20.0, <3', 33 | 'toolcli >=0.6.13, <0.7', 34 | 'toolstr >=0.9.3, <0.10', 35 | 'tooljob >=0.1.6, <0.2', 36 | ] 37 | 38 | [project.optional-dependencies] 39 | test = [ 40 | 'mypy ==1.2.0', 41 | 'mypy_extensions >= 1.0.0, <1.1.0', 42 | 'pytest >=6, <7', 43 | ] 44 | 45 | [project.scripts] 46 | pdp = "pdp.cli.cli_run:run_cli" 47 | 48 | [tool.flit.module] 49 | name = "pdp" 50 | 51 | [tool.mypy] 52 | python_version = "3.9" 53 | strict = true 54 | implicit_reexport = true 55 | files = ["pdp"] 56 | 57 | [tool.pytest.ini_options] 58 | testpaths = [ 59 | "tests", 60 | ] 61 | asyncio_mode = 'auto' 62 | 63 | -------------------------------------------------------------------------------- /tests/remote_tests/test_manifests.py: -------------------------------------------------------------------------------- 1 | import pdp 2 | 3 | 4 | 
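# these tests fetch the live manifests (source='remote') and therefore
# require network access to the data portal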
def test_get_global_manifest(): 5 | global_manifest = pdp.get_global_manifest(source='remote') 6 | 7 | 8 | def test_get_dataset_manifests(): 9 | global_manifest = pdp.get_global_manifest(source='remote') 10 | for dataset_name in global_manifest['datasets'].keys(): 11 | dataset_manifest = pdp.get_dataset_manifest( 12 | dataset_name, source='remote' 13 | ) 14 | 15 | -------------------------------------------------------------------------------- /tests/test_collect.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import pytest 5 | 6 | from pdp.datasets import contracts 7 | from pdp.datasets import slots 8 | from pdp.datasets import native_transfers 9 | 10 | 11 | dataset_collectors = [ 12 | contracts.collect_contracts_dataset, 13 | slots.collect_slots_dataset, 14 | native_transfers.collect_native_transfers_dataset, 15 | ] 16 | 17 | 18 | collect_kwargs_sets = [ 19 | { 20 | 'start_block': 14_000_000, 21 | 'end_block': 14_000_100, 22 | 'chunk_size': 20, 23 | 'network': 'ethereum', 24 | 'executor': 'parallel', 25 | 'verbose': True, 26 | }, 27 | ] 28 | 29 | 30 | @pytest.mark.parametrize('dataset_collector', dataset_collectors) 31 | @pytest.mark.parametrize('collect_kwargs', collect_kwargs_sets) 32 | @pytest.mark.parametrize('output_filetype', ['parquet', 'csv']) 33 | def test(dataset_collector, collect_kwargs, output_filetype): 34 | output_dir = tempfile.mkdtemp() 35 | 36 | dataset_collector( 37 | output_dir=output_dir, 38 | output_filetype=output_filetype, 39 | **collect_kwargs, 40 | ) 41 | 42 | output_files = os.listdir(output_dir) 43 | assert ( 44 | len(output_files) 45 | == (collect_kwargs['end_block'] - collect_kwargs['start_block']) 46 | / collect_kwargs['chunk_size'] 47 | ) 48 | 49 | -------------------------------------------------------------------------------- /tests/test_validate.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | import pdp 6 | 7 | 8 | data_root = pdp.get_data_root(require=False) 9 | ethereum_contracts_present = ( 10 | data_root is not None 11 | and os.path.isdir(data_root) 12 | and 'ethereum_contracts' in os.listdir(data_root) 13 | ) 14 | 15 | 16 | @pytest.mark.skipif( 17 | not ethereum_contracts_present, 18 | reason='ethereum_contracts dataset not present', 19 | ) 20 | def test_validate_dataset(): 21 | path = pdp.get_dataset_local_path('ethereum_contracts') 22 | pdp.validate_dataset_directory(path) 23 | 24 | --------------------------------------------------------------------------------