├── .gitignore ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── datasets ├── README.md ├── ethereum_contracts │ ├── LICENSE-CC0 │ ├── README.md │ └── dataset_manifest.json ├── ethereum_native_transfers │ ├── LICENSE-CC0 │ ├── README.md │ └── dataset_manifest.json ├── ethereum_slots │ ├── LICENSE-CC0 │ ├── README.md │ └── dataset_manifest.json └── global_manifest.json ├── notebooks └── explore_ethereum_contracts.ipynb ├── pdp ├── __init__.py ├── __main__.py ├── cli │ ├── __init__.py │ ├── cli_run.py │ └── commands │ │ ├── __init__.py │ │ ├── collect_command.py │ │ ├── dataset_command.py │ │ ├── download_command.py │ │ ├── ls_command.py │ │ ├── package_command.py │ │ ├── root_command.py │ │ ├── update_command.py │ │ ├── upload_command.py │ │ └── validate_command.py ├── config_utils.py ├── data_utils │ ├── __init__.py │ ├── collect_utils.py │ ├── download_utils.py │ ├── file_utils.py │ ├── job_utils.py │ ├── manifest_utils.py │ ├── query_utils.py │ ├── readme_utils.py │ ├── schema_utils.py │ └── update_utils.py ├── datasets │ ├── __init__.py │ ├── contracts │ │ ├── __init__.py │ │ ├── contracts_collect.py │ │ ├── contracts_queries.py │ │ └── contracts_spec.py │ ├── native_transfers │ │ ├── __init__.py │ │ ├── native_transfers_collect.py │ │ ├── native_transfers_queries.py │ │ └── native_transfers_spec.py │ └── slots │ │ ├── __init__.py │ │ ├── slots_collect.py │ │ ├── slots_queries.py │ │ └── slots_spec.py ├── py.typed └── spec.py ├── pyproject.toml └── tests ├── remote_tests └── test_manifests.py ├── test_collect.py └── test_validate.py /.gitignore: -------------------------------------------------------------------------------- 1 | # specific files 2 | CHANGELOG.md 3 | ROADMAP.md 4 | TODO.md 5 | pytestdebug.log 6 | tuna.log 7 | .DS_STORE 8 | 9 | # filetypes 10 | *.egg-info 11 | *.pyc 12 | *.csv 13 | *.parquet 14 | 15 | # folders 16 | */.hypothesis 17 | .coverage_html/* 18 | .hypothesis/ 19 | .tox/* 20 | __pycache__/* 21 | build/* 22 | dist/* 23 | roadmap/* 24 | data/* 25 | extras_ignore/* 26 | *.ipynb 27 | *.ipynb_checkpoints 28 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2022 Paradigm 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Paradigm Data Portal 3 | 4 | The Paradigm Data Portal is a collection of open source crypto datasets for researchers and tool builders 5 | 6 | ## Datasets 7 | - [`ethereum_contracts`](https://github.com/paradigmxyz/paradigm-data-portal/tree/main/datasets/ethereum_contracts): all historical contract deployments 8 | - [`ethereum_native_transfers`](https://github.com/paradigmxyz/paradigm-data-portal/tree/main/datasets/ethereum_native_transfers): all native transfers in similar format to ERC20 Transfers (excluding tx fees) 9 | - [`ethereum_slots`](https://github.com/paradigmxyz/paradigm-data-portal/tree/main/datasets/ethereum_slots): all slots of each contract, including historical usage metadata 10 | 11 | All datasets are released under a [CC0](https://creativecommons.org/share-your-work/public-domain/cc0/) license into the public domain unless otherwise noted. 12 | 13 | ## `pdp` 14 | 15 | `pdp` is a CLI tool that can be used to obtain and manage PDP datasets 16 | 17 | To install: `pip install paradigm-data-portal` 18 | 19 | 20 | #### Example Usage 21 | 22 | - List available datasets `pdp ls` 23 | - List dataset files `pdp ls <dataset>` 24 | - Download a dataset `pdp download <dataset>` 25 | 26 | Each command has multiple options, view help with `pdp -h` 27 | 28 | 29 | ## Dataset Versioning 30 | 31 | Every dataset has a version in `<major>.<minor>.<patch>` format, e.g. 
`1.2.8` 32 | - when a schema is updated, the major version is increased 33 | - when rows are added, removed, or modified, the minor version is increased 34 | - when rows are added due to new blocks, the patch is increased 35 | 36 | Updates will be documented in dataset changelogs 37 | 38 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Datasets 3 | - [`ethereum_contracts`](https://github.com/paradigmxyz/paradigm-data-portal/tree/main/datasets/ethereum_contracts): all historical contract deployments 4 | - [`ethereum_native_transfers`](https://github.com/paradigmxyz/paradigm-data-portal/tree/main/datasets/ethereum_native_transfers): all native transfers in similar format to ERC20 Transfers (excluding tx fees) 5 | - [`ethereum_slots`](https://github.com/paradigmxyz/paradigm-data-portal/tree/main/datasets/ethereum_slots): all slots of each contract, including historical usage metadata 6 | 7 | -------------------------------------------------------------------------------- /datasets/ethereum_contracts/LICENSE-CC0: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. 
A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. 
Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 
122 | -------------------------------------------------------------------------------- /datasets/ethereum_contracts/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Ethereum Contracts Dataset v1.0.0 3 | 4 | This is a dataset of all historical contract deployments 5 | 6 | The dataset was created by using [this script](https://github.com/paradigmxyz/paradigm-data-portal/blob/main/pdp/datasets/contracts/contracts_collect.py) 7 | 8 | Data is distributed as [parquet](https://data.paradigm.xyz/about) files and released into the public domain under a [CC0 license](https://creativecommons.org/share-your-work/public-domain/cc0/) 9 | 10 | ## Usage 11 | 12 | Some example uses of this dataset include: 13 | - look up all contracts deployed by an address 14 | - look up all contracts that have a given bytecode 15 | - analyze distribution of contract bytecode motifs 16 | 17 | An example notebook exploring this dataset can be found [here](https://github.com/paradigmxyz/paradigm-data-portal/blob/main/notebooks/explore_ethereum_contracts.ipynb) 18 | 19 | ## Schema 20 | 21 | #### `contracts` table 22 | each row corresponds to a contract create trace 23 | | column | type | description | 24 | | - | - | - | 25 | | block_number | INTEGER | block number when contract was created | 26 | | create_index | INTEGER | increased by 1 for each contract created in block | 27 | | transaction_hash | BINARY | hash of transaction that created contract | 28 | | contract_address | BINARY | address of deployed contract | 29 | | deployer | BINARY | EOA that deployed the contract | 30 | | factory | BINARY | the `from` field in the creation trace | 31 | | init_code | BINARY | initialization bytecode of contract | 32 | | code | BINARY | bytecode of contract | 33 | | init_code_hash | BINARY | keccak hash of contract initialization code | 34 | | code_hash | BINARY | keccak hash of contract bytecode | 35 | 36 | ## Download 37 | 38 | This dataset can be downloaded using either the `pdp` cli tool or the urls below 39 | 40 | The total dataset size is **6.57GB** 41 | 42 | ### Use `pdp` 43 | 44 | The command `pdp download ethereum_contracts` will download all files in this dataset 45 | 46 | See `pdp download -h` for available options 47 | 48 | ### Use URLs 49 | 50 | | | file | size | 51 | | - | - | - | 52 | | 1 | [ethereum_contracts__v1_0_0__00000000_to_00999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__00000000_to_00999999.parquet) | 2.96MB | 53 | | 2 | [ethereum_contracts__v1_0_0__01000000_to_01999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__01000000_to_01999999.parquet) | 13.08MB | 54 | | 3 | [ethereum_contracts__v1_0_0__02000000_to_02999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__02000000_to_02999999.parquet) | 24.86MB | 55 | | 4 | [ethereum_contracts__v1_0_0__03000000_to_03999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__03000000_to_03999999.parquet) | 83.30MB | 56 | | 5 | [ethereum_contracts__v1_0_0__04000000_to_04999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__04000000_to_04999999.parquet) | 295.85MB | 57 | | 6 | [ethereum_contracts__v1_0_0__05000000_to_05999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__05000000_to_05999999.parquet) | 313.06MB | 58 | | 7 | 
[ethereum_contracts__v1_0_0__06000000_to_06999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__06000000_to_06999999.parquet) | 384.52MB | 59 | | 8 | [ethereum_contracts__v1_0_0__07000000_to_07999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__07000000_to_07999999.parquet) | 338.28MB | 60 | | 9 | [ethereum_contracts__v1_0_0__08000000_to_08999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__08000000_to_08999999.parquet) | 318.73MB | 61 | | 10 | [ethereum_contracts__v1_0_0__09000000_to_09999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__09000000_to_09999999.parquet) | 401.13MB | 62 | | 11 | [ethereum_contracts__v1_0_0__10000000_to_10999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__10000000_to_10999999.parquet) | 484.85MB | 63 | | 12 | [ethereum_contracts__v1_0_0__11000000_to_11999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__11000000_to_11999999.parquet) | 529.76MB | 64 | | 13 | [ethereum_contracts__v1_0_0__12000000_to_12999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__12000000_to_12999999.parquet) | 618.64MB | 65 | | 14 | [ethereum_contracts__v1_0_0__13000000_to_13999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__13000000_to_13999999.parquet) | 567.07MB | 66 | | 15 | [ethereum_contracts__v1_0_0__14000000_to_14999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__14000000_to_14999999.parquet) | 761.28MB | 67 | | 16 | [ethereum_contracts__v1_0_0__15000000_to_15999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__15000000_to_15999999.parquet) | 909.94MB | 68 | | 17 | [ethereum_contracts__v1_0_0__16000000_to_16799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_contracts/ethereum_contracts__v1_0_0__16000000_to_16799999.parquet) | 677.91MB | 69 | -------------------------------------------------------------------------------- /datasets/ethereum_contracts/dataset_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "datatype": "contracts", 3 | "description": "all historical contract deployments", 4 | "files": [ 5 | { 6 | "hash": "755f40506dccfee0849f00d79e4336db", 7 | "n_bytes": 3107682, 8 | "name": "ethereum_contracts__v1_0_0__00000000_to_00999999.parquet" 9 | }, 10 | { 11 | "hash": "c77f1732bc2619e6814e4d2fe723b0c7", 12 | "n_bytes": 13713368, 13 | "name": "ethereum_contracts__v1_0_0__01000000_to_01999999.parquet" 14 | }, 15 | { 16 | "hash": "10e8590e91f44bdcb39b9fa7b0433ce9", 17 | "n_bytes": 26068725, 18 | "name": "ethereum_contracts__v1_0_0__02000000_to_02999999.parquet" 19 | }, 20 | { 21 | "hash": "0852e9c89d13e3af3c1defb28e6eb85c", 22 | "n_bytes": 87348694, 23 | "name": "ethereum_contracts__v1_0_0__03000000_to_03999999.parquet" 24 | }, 25 | { 26 | "hash": "c07dec107c820804e48ed613454d0095", 27 | "n_bytes": 310223347, 28 | "name": "ethereum_contracts__v1_0_0__04000000_to_04999999.parquet" 29 | }, 30 | { 31 | "hash": "e406e069f13e3b6c53bc7a634b4f4ea8", 32 | "n_bytes": 328268119, 33 | "name": "ethereum_contracts__v1_0_0__05000000_to_05999999.parquet" 34 | }, 35 | { 36 | "hash": "9651c6d0f349ee589b20e8bea896fdc1", 37 | "n_bytes": 
403202482, 38 | "name": "ethereum_contracts__v1_0_0__06000000_to_06999999.parquet" 39 | }, 40 | { 41 | "hash": "d2ae7c208b51a09537a1cf41d36dc4ed", 42 | "n_bytes": 354711950, 43 | "name": "ethereum_contracts__v1_0_0__07000000_to_07999999.parquet" 44 | }, 45 | { 46 | "hash": "45dfc78c8223c71ae530f8bea494d92b", 47 | "n_bytes": 334208521, 48 | "name": "ethereum_contracts__v1_0_0__08000000_to_08999999.parquet" 49 | }, 50 | { 51 | "hash": "c4a2509578d51135eec6cd0d604d7657", 52 | "n_bytes": 420620123, 53 | "name": "ethereum_contracts__v1_0_0__09000000_to_09999999.parquet" 54 | }, 55 | { 56 | "hash": "50da1c9a328f4152b768c9ac1e3b5c99", 57 | "n_bytes": 508399240, 58 | "name": "ethereum_contracts__v1_0_0__10000000_to_10999999.parquet" 59 | }, 60 | { 61 | "hash": "8013ba9f27c2844ba5341a44e70eee74", 62 | "n_bytes": 555497504, 63 | "name": "ethereum_contracts__v1_0_0__11000000_to_11999999.parquet" 64 | }, 65 | { 66 | "hash": "c9ad9a784ee11d3d1e44a63b3be7a25d", 67 | "n_bytes": 648691436, 68 | "name": "ethereum_contracts__v1_0_0__12000000_to_12999999.parquet" 69 | }, 70 | { 71 | "hash": "49aca9eb469fb435fc11c09037434ff8", 72 | "n_bytes": 594615935, 73 | "name": "ethereum_contracts__v1_0_0__13000000_to_13999999.parquet" 74 | }, 75 | { 76 | "hash": "4ce044cc6f6255d3b2fab72a333e5ed8", 77 | "n_bytes": 798261906, 78 | "name": "ethereum_contracts__v1_0_0__14000000_to_14999999.parquet" 79 | }, 80 | { 81 | "hash": "26f63f9ec8023dac8a716dcde6ab9078", 82 | "n_bytes": 954142634, 83 | "name": "ethereum_contracts__v1_0_0__15000000_to_15999999.parquet" 84 | }, 85 | { 86 | "hash": "b71b9887d256daf9c89a973cb10af6d7", 87 | "n_bytes": 710840143, 88 | "name": "ethereum_contracts__v1_0_0__16000000_to_16799999.parquet" 89 | } 90 | ], 91 | "name": "ethereum_contracts", 92 | "network": "ethereum", 93 | "schema": { 94 | "description": "all historical contract deployments", 95 | "tables": { 96 | "contracts": { 97 | "columns": [ 98 | { 99 | "description": "block number when contract was created", 100 | "name": "block_number", 101 | "type": "INTEGER" 102 | }, 103 | { 104 | "description": "increased by 1 for each contract created in block", 105 | "name": "create_index", 106 | "type": "INTEGER" 107 | }, 108 | { 109 | "description": "hash of transaction that created contract", 110 | "name": "transaction_hash", 111 | "type": "BINARY" 112 | }, 113 | { 114 | "description": "address of deployed contract", 115 | "name": "contract_address", 116 | "type": "BINARY" 117 | }, 118 | { 119 | "description": "EOA that deployed the contract", 120 | "name": "deployer", 121 | "type": "BINARY" 122 | }, 123 | { 124 | "description": "the `from` field in the creation trace", 125 | "name": "factory", 126 | "type": "BINARY" 127 | }, 128 | { 129 | "description": "initialization bytecode of contract", 130 | "name": "init_code", 131 | "type": "BINARY" 132 | }, 133 | { 134 | "description": "bytecode of contract", 135 | "name": "code", 136 | "type": "BINARY" 137 | }, 138 | { 139 | "description": "keccak hash of contract initialization code", 140 | "name": "init_code_hash", 141 | "type": "BINARY" 142 | }, 143 | { 144 | "description": "keccak hash of contract bytecode", 145 | "name": "code_hash", 146 | "type": "BINARY" 147 | } 148 | ], 149 | "description": "each row corresponds to a contract create trace" 150 | } 151 | } 152 | }, 153 | "version": "1.0.0" 154 | } -------------------------------------------------------------------------------- /datasets/ethereum_native_transfers/LICENSE-CC0: 
-------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. 
To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. 
Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /datasets/ethereum_native_transfers/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Ethereum Native Transfers Dataset v1.0.0 3 | 4 | This is a dataset of all native transfers in similar format to ERC20 Transfers (excluding tx fees) 5 | 6 | The dataset was created by using [this script](https://github.com/paradigmxyz/paradigm-data-portal/blob/main/pdp/datasets/native_transfers/native_transfers_collect.py) 7 | 8 | Data is distributed as [parquet](https://data.paradigm.xyz/about) files and released into the public domain under a [CC0 license](https://creativecommons.org/share-your-work/public-domain/cc0/) 9 | 10 | ## Usage 11 | 12 | Some example uses of this dataset include: 13 | - look up all inbound transfers to an address 14 | - analyze transfer size distributions 15 | - analyze transfer frequency distributions 16 | 17 | An example query sketch for this dataset is included at the end of this README 18 | 19 | ## Schema 20 | 21 | #### `native_transfers` table 22 | each row corresponds to a trace that transfers native token 23 | | column | type | description | 24 | | - | - | - | 25 | | block_number | INTEGER | block number where native token was transferred | 26 | | transfer_index | INTEGER | increased by 1 for each native transfer in block | 27 | | transaction_hash | BINARY | hash of transaction that contains transfer | 28 | | to_address | BINARY | address that native token is transferred to | 29 | | from_address | BINARY | address that native token is transferred from | 30 | | value | BINARY | amount of native token transferred | 31 | 32 | ## Download 33 | 34 | This dataset can be downloaded using either the `pdp` cli tool or the urls below 35 | 36 | The total dataset size is **61.00GB** 37 | 38 | ### Use `pdp` 39 | 40 | The command `pdp download ethereum_native_transfers` will download all files in this dataset 41 | 42 | See `pdp download -h` for available options 43 | 44 | ### Use URLs 45 | 46 | | | file | size | 47 | | - | - | - | 48 | | 1 | [ethereum_native_transfers__v1_0_0__00000000_to_00199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__00000000_to_00199999.parquet) | 5.43MB | 49 | | 2 | [ethereum_native_transfers__v1_0_0__00200000_to_00399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__00200000_to_00399999.parquet) | 10.81MB | 50 | | 3 | [ethereum_native_transfers__v1_0_0__00400000_to_00599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__00400000_to_00599999.parquet) | 12.15MB | 51 | | 4 | [ethereum_native_transfers__v1_0_0__00600000_to_00799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__00600000_to_00799999.parquet) | 15.24MB | 52 | | 5 | 
[ethereum_native_transfers__v1_0_0__00800000_to_00999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__00800000_to_00999999.parquet) | 23.57MB | 53 | | 6 | [ethereum_native_transfers__v1_0_0__01000000_to_01199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__01000000_to_01199999.parquet) | 43.13MB | 54 | | 7 | [ethereum_native_transfers__v1_0_0__01200000_to_01399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__01200000_to_01399999.parquet) | 52.66MB | 55 | | 8 | [ethereum_native_transfers__v1_0_0__01400000_to_01599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__01400000_to_01599999.parquet) | 68.87MB | 56 | | 9 | [ethereum_native_transfers__v1_0_0__01600000_to_01799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__01600000_to_01799999.parquet) | 71.94MB | 57 | | 10 | [ethereum_native_transfers__v1_0_0__01800000_to_01999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__01800000_to_01999999.parquet) | 75.75MB | 58 | | 11 | [ethereum_native_transfers__v1_0_0__02000000_to_02199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__02000000_to_02199999.parquet) | 78.88MB | 59 | | 12 | [ethereum_native_transfers__v1_0_0__02200000_to_02399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__02200000_to_02399999.parquet) | 119.10MB | 60 | | 13 | [ethereum_native_transfers__v1_0_0__02400000_to_02599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__02400000_to_02599999.parquet) | 66.45MB | 61 | | 14 | [ethereum_native_transfers__v1_0_0__02600000_to_02799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__02600000_to_02799999.parquet) | 66.38MB | 62 | | 15 | [ethereum_native_transfers__v1_0_0__02800000_to_02999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__02800000_to_02999999.parquet) | 65.67MB | 63 | | 16 | [ethereum_native_transfers__v1_0_0__03000000_to_03199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__03000000_to_03199999.parquet) | 70.41MB | 64 | | 17 | [ethereum_native_transfers__v1_0_0__03200000_to_03399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__03200000_to_03399999.parquet) | 106.03MB | 65 | | 18 | [ethereum_native_transfers__v1_0_0__03400000_to_03599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__03400000_to_03599999.parquet) | 136.80MB | 66 | | 19 | [ethereum_native_transfers__v1_0_0__03600000_to_03799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__03600000_to_03799999.parquet) | 233.24MB | 67 | | 20 | [ethereum_native_transfers__v1_0_0__03800000_to_03999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__03800000_to_03999999.parquet) | 469.12MB | 68 | | 21 | 
[ethereum_native_transfers__v1_0_0__04000000_to_04199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__04000000_to_04199999.parquet) | 592.62MB | 69 | | 22 | [ethereum_native_transfers__v1_0_0__04200000_to_04399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__04200000_to_04399999.parquet) | 804.41MB | 70 | | 23 | [ethereum_native_transfers__v1_0_0__04400000_to_04599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__04400000_to_04599999.parquet) | 516.11MB | 71 | | 24 | [ethereum_native_transfers__v1_0_0__04600000_to_04799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__04600000_to_04799999.parquet) | 1.09GB | 72 | | 25 | [ethereum_native_transfers__v1_0_0__04800000_to_04999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__04800000_to_04999999.parquet) | 1.71GB | 73 | | 26 | [ethereum_native_transfers__v1_0_0__05000000_to_05199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__05000000_to_05199999.parquet) | 1020.42MB | 74 | | 27 | [ethereum_native_transfers__v1_0_0__05200000_to_05399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__05200000_to_05399999.parquet) | 769.68MB | 75 | | 28 | [ethereum_native_transfers__v1_0_0__05400000_to_05599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__05400000_to_05599999.parquet) | 878.05MB | 76 | | 29 | [ethereum_native_transfers__v1_0_0__05600000_to_05799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__05600000_to_05799999.parquet) | 893.71MB | 77 | | 30 | [ethereum_native_transfers__v1_0_0__05800000_to_05999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__05800000_to_05999999.parquet) | 705.26MB | 78 | | 31 | [ethereum_native_transfers__v1_0_0__06000000_to_06199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__06000000_to_06199999.parquet) | 745.70MB | 79 | | 32 | [ethereum_native_transfers__v1_0_0__06200000_to_06399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__06200000_to_06399999.parquet) | 643.32MB | 80 | | 33 | [ethereum_native_transfers__v1_0_0__06400000_to_06599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__06400000_to_06599999.parquet) | 588.53MB | 81 | | 34 | [ethereum_native_transfers__v1_0_0__06600000_to_06799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__06600000_to_06799999.parquet) | 607.73MB | 82 | | 35 | [ethereum_native_transfers__v1_0_0__06800000_to_06999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__06800000_to_06999999.parquet) | 587.19MB | 83 | | 36 | [ethereum_native_transfers__v1_0_0__07000000_to_07199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__07000000_to_07199999.parquet) | 598.46MB | 84 | | 37 | 
[ethereum_native_transfers__v1_0_0__07200000_to_07399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__07200000_to_07399999.parquet) | 588.88MB | 85 | | 38 | [ethereum_native_transfers__v1_0_0__07400000_to_07599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__07400000_to_07599999.parquet) | 629.00MB | 86 | | 39 | [ethereum_native_transfers__v1_0_0__07600000_to_07799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__07600000_to_07799999.parquet) | 663.01MB | 87 | | 40 | [ethereum_native_transfers__v1_0_0__07800000_to_07999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__07800000_to_07999999.parquet) | 729.28MB | 88 | | 41 | [ethereum_native_transfers__v1_0_0__08000000_to_08199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__08000000_to_08199999.parquet) | 627.52MB | 89 | | 42 | [ethereum_native_transfers__v1_0_0__08200000_to_08399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__08200000_to_08399999.parquet) | 560.94MB | 90 | | 43 | [ethereum_native_transfers__v1_0_0__08400000_to_08599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__08400000_to_08599999.parquet) | 522.91MB | 91 | | 44 | [ethereum_native_transfers__v1_0_0__08600000_to_08799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__08600000_to_08799999.parquet) | 475.54MB | 92 | | 45 | [ethereum_native_transfers__v1_0_0__08800000_to_08999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__08800000_to_08999999.parquet) | 496.36MB | 93 | | 46 | [ethereum_native_transfers__v1_0_0__09000000_to_09199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__09000000_to_09199999.parquet) | 509.63MB | 94 | | 47 | [ethereum_native_transfers__v1_0_0__09200000_to_09399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__09200000_to_09399999.parquet) | 437.48MB | 95 | | 48 | [ethereum_native_transfers__v1_0_0__09400000_to_09599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__09400000_to_09599999.parquet) | 531.73MB | 96 | | 49 | [ethereum_native_transfers__v1_0_0__09600000_to_09799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__09600000_to_09799999.parquet) | 559.07MB | 97 | | 50 | [ethereum_native_transfers__v1_0_0__09800000_to_09999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__09800000_to_09999999.parquet) | 669.82MB | 98 | | 51 | [ethereum_native_transfers__v1_0_0__10000000_to_10199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__10000000_to_10199999.parquet) | 794.10MB | 99 | | 52 | [ethereum_native_transfers__v1_0_0__10200000_to_10399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__10200000_to_10399999.parquet) | 856.74MB | 100 | | 53 | 
[ethereum_native_transfers__v1_0_0__10400000_to_10599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__10400000_to_10599999.parquet) | 1.02GB | 101 | | 54 | [ethereum_native_transfers__v1_0_0__10600000_to_10799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__10600000_to_10799999.parquet) | 1016.86MB | 102 | | 55 | [ethereum_native_transfers__v1_0_0__10800000_to_10999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__10800000_to_10999999.parquet) | 901.55MB | 103 | | 56 | [ethereum_native_transfers__v1_0_0__11000000_to_11199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__11000000_to_11199999.parquet) | 911.41MB | 104 | | 57 | [ethereum_native_transfers__v1_0_0__11200000_to_11399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__11200000_to_11399999.parquet) | 950.09MB | 105 | | 58 | [ethereum_native_transfers__v1_0_0__11400000_to_11599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__11400000_to_11599999.parquet) | 1020.27MB | 106 | | 59 | [ethereum_native_transfers__v1_0_0__11600000_to_11799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__11600000_to_11799999.parquet) | 1.12GB | 107 | | 60 | [ethereum_native_transfers__v1_0_0__11800000_to_11999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__11800000_to_11999999.parquet) | 1.20GB | 108 | | 61 | [ethereum_native_transfers__v1_0_0__12000000_to_12199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__12000000_to_12199999.parquet) | 1.28GB | 109 | | 62 | [ethereum_native_transfers__v1_0_0__12200000_to_12399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__12200000_to_12399999.parquet) | 1.50GB | 110 | | 63 | [ethereum_native_transfers__v1_0_0__12400000_to_12599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__12400000_to_12599999.parquet) | 1.42GB | 111 | | 64 | [ethereum_native_transfers__v1_0_0__12600000_to_12799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__12600000_to_12799999.parquet) | 1.18GB | 112 | | 65 | [ethereum_native_transfers__v1_0_0__12800000_to_12999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__12800000_to_12999999.parquet) | 1.23GB | 113 | | 66 | [ethereum_native_transfers__v1_0_0__13000000_to_13199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__13000000_to_13199999.parquet) | 1.24GB | 114 | | 67 | [ethereum_native_transfers__v1_0_0__13200000_to_13399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__13200000_to_13399999.parquet) | 1.28GB | 115 | | 68 | [ethereum_native_transfers__v1_0_0__13400000_to_13599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__13400000_to_13599999.parquet) | 1.43GB | 116 | | 69 | 
[ethereum_native_transfers__v1_0_0__13600000_to_13799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__13600000_to_13799999.parquet) | 1.39GB | 117 | | 70 | [ethereum_native_transfers__v1_0_0__13800000_to_13999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__13800000_to_13999999.parquet) | 1.33GB | 118 | | 71 | [ethereum_native_transfers__v1_0_0__14000000_to_14199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__14000000_to_14199999.parquet) | 1.33GB | 119 | | 72 | [ethereum_native_transfers__v1_0_0__14200000_to_14399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__14200000_to_14399999.parquet) | 1.29GB | 120 | | 73 | [ethereum_native_transfers__v1_0_0__14400000_to_14599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__14400000_to_14599999.parquet) | 1.29GB | 121 | | 74 | [ethereum_native_transfers__v1_0_0__14600000_to_14799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__14600000_to_14799999.parquet) | 1.30GB | 122 | | 75 | [ethereum_native_transfers__v1_0_0__14800000_to_14999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__14800000_to_14999999.parquet) | 1.20GB | 123 | | 76 | [ethereum_native_transfers__v1_0_0__15000000_to_15199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__15000000_to_15199999.parquet) | 1.23GB | 124 | | 77 | [ethereum_native_transfers__v1_0_0__15200000_to_15399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__15200000_to_15399999.parquet) | 1.30GB | 125 | | 78 | [ethereum_native_transfers__v1_0_0__15400000_to_15599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__15400000_to_15599999.parquet) | 1.20GB | 126 | | 79 | [ethereum_native_transfers__v1_0_0__15600000_to_15799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__15600000_to_15799999.parquet) | 1.03GB | 127 | | 80 | [ethereum_native_transfers__v1_0_0__15800000_to_15999999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__15800000_to_15999999.parquet) | 975.74MB | 128 | | 81 | [ethereum_native_transfers__v1_0_0__16000000_to_16199999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__16000000_to_16199999.parquet) | 1009.98MB | 129 | | 82 | [ethereum_native_transfers__v1_0_0__16200000_to_16399999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__16200000_to_16399999.parquet) | 958.26MB | 130 | | 83 | [ethereum_native_transfers__v1_0_0__16400000_to_16599999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__16400000_to_16599999.parquet) | 972.90MB | 131 | | 84 | [ethereum_native_transfers__v1_0_0__16600000_to_16799999.parquet](https://datasets.paradigm.xyz/datasets/ethereum_native_transfers/ethereum_native_transfers__v1_0_0__16600000_to_16799999.parquet) | 1.00GB | 132 | 
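For quick exploration, here is a minimal sketch of loading one of the chunks listed above — assuming `polars` is installed and the file has already been fetched, e.g. with `pdp download ethereum_native_transfers` or via its URL in the table:

```python
import polars as pl

# any chunk from the table above works the same way; this filename is just an example
path = "ethereum_native_transfers__v1_0_0__16600000_to_16799999.parquet"

df = pl.read_parquet(path)

# columns follow the dataset schema: block_number, transfer_index,
# transaction_hash, to_address, from_address, value
# (hashes, addresses, and values are stored as raw binary)
print(df.columns)
print(df.shape)
print(df["block_number"].min(), df["block_number"].max())
```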
-------------------------------------------------------------------------------- /datasets/ethereum_native_transfers/dataset_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "datatype": "native_transfers", 3 | "description": "all native transfers in similar format to ERC20 Transfers (excluding tx fees)", 4 | "files": [ 5 | { 6 | "hash": "48cc4472dae3afe90f2a2cdffcfdae0a", 7 | "n_bytes": 5690786, 8 | "name": "ethereum_native_transfers__v1_0_0__00000000_to_00199999.parquet" 9 | }, 10 | { 11 | "hash": "de4c0e0aaad27071dd42ceb5d972dfbb", 12 | "n_bytes": 11331350, 13 | "name": "ethereum_native_transfers__v1_0_0__00200000_to_00399999.parquet" 14 | }, 15 | { 16 | "hash": "eac20eaec5cf632d171a00775529386d", 17 | "n_bytes": 12736409, 18 | "name": "ethereum_native_transfers__v1_0_0__00400000_to_00599999.parquet" 19 | }, 20 | { 21 | "hash": "07effd621987733a5ff51e247bb26dd3", 22 | "n_bytes": 15982188, 23 | "name": "ethereum_native_transfers__v1_0_0__00600000_to_00799999.parquet" 24 | }, 25 | { 26 | "hash": "b03398e16152ee692d4c1197cd2db915", 27 | "n_bytes": 24712485, 28 | "name": "ethereum_native_transfers__v1_0_0__00800000_to_00999999.parquet" 29 | }, 30 | { 31 | "hash": "8b571782b37a3f1478e3ceebf75895cb", 32 | "n_bytes": 45226256, 33 | "name": "ethereum_native_transfers__v1_0_0__01000000_to_01199999.parquet" 34 | }, 35 | { 36 | "hash": "9eac0d7cd3625b9c3718ec9241bc0ee1", 37 | "n_bytes": 55216367, 38 | "name": "ethereum_native_transfers__v1_0_0__01200000_to_01399999.parquet" 39 | }, 40 | { 41 | "hash": "3a55f7070511cfbc561832e21df7ac4b", 42 | "n_bytes": 72211149, 43 | "name": "ethereum_native_transfers__v1_0_0__01400000_to_01599999.parquet" 44 | }, 45 | { 46 | "hash": "4195473710d5a27a24cbce960a6f353d", 47 | "n_bytes": 75430132, 48 | "name": "ethereum_native_transfers__v1_0_0__01600000_to_01799999.parquet" 49 | }, 50 | { 51 | "hash": "e99b0c8dbd2110566f3759b593c559b9", 52 | "n_bytes": 79432031, 53 | "name": "ethereum_native_transfers__v1_0_0__01800000_to_01999999.parquet" 54 | }, 55 | { 56 | "hash": "1878bc4b8c6b750d3c3102064ff31980", 57 | "n_bytes": 82707629, 58 | "name": "ethereum_native_transfers__v1_0_0__02000000_to_02199999.parquet" 59 | }, 60 | { 61 | "hash": "c707da6438970db942fc984d9ee3e577", 62 | "n_bytes": 124886832, 63 | "name": "ethereum_native_transfers__v1_0_0__02200000_to_02399999.parquet" 64 | }, 65 | { 66 | "hash": "c183e61f0704585d37454ca16ec5a15d", 67 | "n_bytes": 69677040, 68 | "name": "ethereum_native_transfers__v1_0_0__02400000_to_02599999.parquet" 69 | }, 70 | { 71 | "hash": "f4b9c196453e68490c44f555399c0847", 72 | "n_bytes": 69608858, 73 | "name": "ethereum_native_transfers__v1_0_0__02600000_to_02799999.parquet" 74 | }, 75 | { 76 | "hash": "6c91ee04199ce9bc8415adbb972fcadc", 77 | "n_bytes": 68858529, 78 | "name": "ethereum_native_transfers__v1_0_0__02800000_to_02999999.parquet" 79 | }, 80 | { 81 | "hash": "f653d38c01959e5a6ebd92790c728226", 82 | "n_bytes": 73834555, 83 | "name": "ethereum_native_transfers__v1_0_0__03000000_to_03199999.parquet" 84 | }, 85 | { 86 | "hash": "bdba4b5171bbfc480003425e5e6f1fa3", 87 | "n_bytes": 111175493, 88 | "name": "ethereum_native_transfers__v1_0_0__03200000_to_03399999.parquet" 89 | }, 90 | { 91 | "hash": "b3a189d018880ad8be30a8f3b29b0fd3", 92 | "n_bytes": 143441746, 93 | "name": "ethereum_native_transfers__v1_0_0__03400000_to_03599999.parquet" 94 | }, 95 | { 96 | "hash": "1fba8d7018a320a5fa0dda2dd99d47b1", 97 | "n_bytes": 244566467, 98 | "name": 
"ethereum_native_transfers__v1_0_0__03600000_to_03799999.parquet" 99 | }, 100 | { 101 | "hash": "644c4fecfdb7e240abed67c8dd299e09", 102 | "n_bytes": 491904504, 103 | "name": "ethereum_native_transfers__v1_0_0__03800000_to_03999999.parquet" 104 | }, 105 | { 106 | "hash": "747797caf009a871a39d1e8488ef1bba", 107 | "n_bytes": 621409562, 108 | "name": "ethereum_native_transfers__v1_0_0__04000000_to_04199999.parquet" 109 | }, 110 | { 111 | "hash": "90538dc4957d907e97b8736d15a5e98a", 112 | "n_bytes": 843481817, 113 | "name": "ethereum_native_transfers__v1_0_0__04200000_to_04399999.parquet" 114 | }, 115 | { 116 | "hash": "c4afb09f7500af27f75d25f215ed7d6d", 117 | "n_bytes": 541175455, 118 | "name": "ethereum_native_transfers__v1_0_0__04400000_to_04599999.parquet" 119 | }, 120 | { 121 | "hash": "d57db6ec0071750304112c534687306c", 122 | "n_bytes": 1172756317, 123 | "name": "ethereum_native_transfers__v1_0_0__04600000_to_04799999.parquet" 124 | }, 125 | { 126 | "hash": "56c9f2fdbd9111b591e338c46117d6e7", 127 | "n_bytes": 1834495623, 128 | "name": "ethereum_native_transfers__v1_0_0__04800000_to_04999999.parquet" 129 | }, 130 | { 131 | "hash": "dd7578adca464d60d6017b6aa5533149", 132 | "n_bytes": 1069985060, 133 | "name": "ethereum_native_transfers__v1_0_0__05000000_to_05199999.parquet" 134 | }, 135 | { 136 | "hash": "f0b4975e9167f6be5c5f8a3a44c38daa", 137 | "n_bytes": 807066352, 138 | "name": "ethereum_native_transfers__v1_0_0__05200000_to_05399999.parquet" 139 | }, 140 | { 141 | "hash": "01fa37f1e9f4889583af7dd48c6af413", 142 | "n_bytes": 920702621, 143 | "name": "ethereum_native_transfers__v1_0_0__05400000_to_05599999.parquet" 144 | }, 145 | { 146 | "hash": "7df8235ac272f0061679aaea9d956aba", 147 | "n_bytes": 937126477, 148 | "name": "ethereum_native_transfers__v1_0_0__05600000_to_05799999.parquet" 149 | }, 150 | { 151 | "hash": "49f3491190048339aa2e5d56f82f1582", 152 | "n_bytes": 739518170, 153 | "name": "ethereum_native_transfers__v1_0_0__05800000_to_05999999.parquet" 154 | }, 155 | { 156 | "hash": "879cecaae2077ae3f047e416f4218556", 157 | "n_bytes": 781918979, 158 | "name": "ethereum_native_transfers__v1_0_0__06000000_to_06199999.parquet" 159 | }, 160 | { 161 | "hash": "daa82a38b90b633c330ced9463e4ba3e", 162 | "n_bytes": 674568646, 163 | "name": "ethereum_native_transfers__v1_0_0__06200000_to_06399999.parquet" 164 | }, 165 | { 166 | "hash": "716a3c88a5a948a44d4d8d8a8f94084e", 167 | "n_bytes": 617115529, 168 | "name": "ethereum_native_transfers__v1_0_0__06400000_to_06599999.parquet" 169 | }, 170 | { 171 | "hash": "7263265adf1c75686655dd8bf22b54f3", 172 | "n_bytes": 637247843, 173 | "name": "ethereum_native_transfers__v1_0_0__06600000_to_06799999.parquet" 174 | }, 175 | { 176 | "hash": "f28a4ea84fdd1b2a76155f80bd3a20c4", 177 | "n_bytes": 615714596, 178 | "name": "ethereum_native_transfers__v1_0_0__06800000_to_06999999.parquet" 179 | }, 180 | { 181 | "hash": "5d4d19ec322b8058ebee1e56e48c9efc", 182 | "n_bytes": 627526606, 183 | "name": "ethereum_native_transfers__v1_0_0__07000000_to_07199999.parquet" 184 | }, 185 | { 186 | "hash": "e65df6724a58a57b4b2dfb360eb9b4c2", 187 | "n_bytes": 617488778, 188 | "name": "ethereum_native_transfers__v1_0_0__07200000_to_07399999.parquet" 189 | }, 190 | { 191 | "hash": "95b6fe1605afa22e70bf15dd791bb598", 192 | "n_bytes": 659556612, 193 | "name": "ethereum_native_transfers__v1_0_0__07400000_to_07599999.parquet" 194 | }, 195 | { 196 | "hash": "f8a731183326f6ae7d0269cb38e8d354", 197 | "n_bytes": 695213366, 198 | "name": 
"ethereum_native_transfers__v1_0_0__07600000_to_07799999.parquet" 199 | }, 200 | { 201 | "hash": "ff0377618f7343bc6ccf634d63e85f74", 202 | "n_bytes": 764704854, 203 | "name": "ethereum_native_transfers__v1_0_0__07800000_to_07999999.parquet" 204 | }, 205 | { 206 | "hash": "5528152d4d71d2a80b29a99600f6a2b6", 207 | "n_bytes": 658004756, 208 | "name": "ethereum_native_transfers__v1_0_0__08000000_to_08199999.parquet" 209 | }, 210 | { 211 | "hash": "fb1260145d0729301dbe3fcf98b0f926", 212 | "n_bytes": 588192142, 213 | "name": "ethereum_native_transfers__v1_0_0__08200000_to_08399999.parquet" 214 | }, 215 | { 216 | "hash": "edcacec2a0b0931eb22b341eca353bdf", 217 | "n_bytes": 548307705, 218 | "name": "ethereum_native_transfers__v1_0_0__08400000_to_08599999.parquet" 219 | }, 220 | { 221 | "hash": "b1c360c12ba515b8d24abb7ead463b67", 222 | "n_bytes": 498643017, 223 | "name": "ethereum_native_transfers__v1_0_0__08600000_to_08799999.parquet" 224 | }, 225 | { 226 | "hash": "c8045e5a30a070fd89b0d6fb53f7d62f", 227 | "n_bytes": 520471903, 228 | "name": "ethereum_native_transfers__v1_0_0__08800000_to_08999999.parquet" 229 | }, 230 | { 231 | "hash": "3eab3ac719978e273da51b6f62fca8a3", 232 | "n_bytes": 534387942, 233 | "name": "ethereum_native_transfers__v1_0_0__09000000_to_09199999.parquet" 234 | }, 235 | { 236 | "hash": "afd242d4cc1ab2731e8bdb9faf362903", 237 | "n_bytes": 458727305, 238 | "name": "ethereum_native_transfers__v1_0_0__09200000_to_09399999.parquet" 239 | }, 240 | { 241 | "hash": "9fd3aeb9cd4256dff43c24a9dc2109ec", 242 | "n_bytes": 557555799, 243 | "name": "ethereum_native_transfers__v1_0_0__09400000_to_09599999.parquet" 244 | }, 245 | { 246 | "hash": "0a8b0784fc6caf485d817f67afdfdac8", 247 | "n_bytes": 586230193, 248 | "name": "ethereum_native_transfers__v1_0_0__09600000_to_09799999.parquet" 249 | }, 250 | { 251 | "hash": "994df72a68a3b1e346a8d9683ad50f08", 252 | "n_bytes": 702357167, 253 | "name": "ethereum_native_transfers__v1_0_0__09800000_to_09999999.parquet" 254 | }, 255 | { 256 | "hash": "130e0deb357f3b831076625213847120", 257 | "n_bytes": 832679303, 258 | "name": "ethereum_native_transfers__v1_0_0__10000000_to_10199999.parquet" 259 | }, 260 | { 261 | "hash": "ef7bc9bf89360105c5c2ea49283d0d69", 262 | "n_bytes": 898361669, 263 | "name": "ethereum_native_transfers__v1_0_0__10200000_to_10399999.parquet" 264 | }, 265 | { 266 | "hash": "e00932ab8b79f669acab9a84a88db9dc", 267 | "n_bytes": 1098405632, 268 | "name": "ethereum_native_transfers__v1_0_0__10400000_to_10599999.parquet" 269 | }, 270 | { 271 | "hash": "2a116bbaecaeb4b8074048cf8f09b42a", 272 | "n_bytes": 1066257859, 273 | "name": "ethereum_native_transfers__v1_0_0__10600000_to_10799999.parquet" 274 | }, 275 | { 276 | "hash": "ae25ed040f74e86be557f1649b9e7868", 277 | "n_bytes": 945348380, 278 | "name": "ethereum_native_transfers__v1_0_0__10800000_to_10999999.parquet" 279 | }, 280 | { 281 | "hash": "8214c1b997afeafe151e51af79f58042", 282 | "n_bytes": 955687252, 283 | "name": "ethereum_native_transfers__v1_0_0__11000000_to_11199999.parquet" 284 | }, 285 | { 286 | "hash": "8ad8a297a2ff8af9a251ae9de2468e15", 287 | "n_bytes": 996245659, 288 | "name": "ethereum_native_transfers__v1_0_0__11200000_to_11399999.parquet" 289 | }, 290 | { 291 | "hash": "8781baab23d4371b8b167abbfc72e3cd", 292 | "n_bytes": 1069832245, 293 | "name": "ethereum_native_transfers__v1_0_0__11400000_to_11599999.parquet" 294 | }, 295 | { 296 | "hash": "f3bb049fdb1e7ac901bd87eacb7fc0be", 297 | "n_bytes": 1205906693, 298 | "name": 
"ethereum_native_transfers__v1_0_0__11600000_to_11799999.parquet" 299 | }, 300 | { 301 | "hash": "f6b91916959cc1c22784871e2861b673", 302 | "n_bytes": 1289157248, 303 | "name": "ethereum_native_transfers__v1_0_0__11800000_to_11999999.parquet" 304 | }, 305 | { 306 | "hash": "1d256b55145d1d3aa7827d6138c71c41", 307 | "n_bytes": 1371643335, 308 | "name": "ethereum_native_transfers__v1_0_0__12000000_to_12199999.parquet" 309 | }, 310 | { 311 | "hash": "41b9d5eea89cf9690e1693bc7072d9a7", 312 | "n_bytes": 1608330834, 313 | "name": "ethereum_native_transfers__v1_0_0__12200000_to_12399999.parquet" 314 | }, 315 | { 316 | "hash": "545735c48a6b3ffe2a4c37577e3cadee", 317 | "n_bytes": 1524857332, 318 | "name": "ethereum_native_transfers__v1_0_0__12400000_to_12599999.parquet" 319 | }, 320 | { 321 | "hash": "1b247ca25341ccd7a3c156ae50f34c7a", 322 | "n_bytes": 1266490013, 323 | "name": "ethereum_native_transfers__v1_0_0__12600000_to_12799999.parquet" 324 | }, 325 | { 326 | "hash": "b61ee91e56277855a79f02f59372e5c5", 327 | "n_bytes": 1319524141, 328 | "name": "ethereum_native_transfers__v1_0_0__12800000_to_12999999.parquet" 329 | }, 330 | { 331 | "hash": "29b03078218311bb9b991760a866eb64", 332 | "n_bytes": 1333697482, 333 | "name": "ethereum_native_transfers__v1_0_0__13000000_to_13199999.parquet" 334 | }, 335 | { 336 | "hash": "93d5aaa4484f4133e87d67f57bbcead0", 337 | "n_bytes": 1373377723, 338 | "name": "ethereum_native_transfers__v1_0_0__13200000_to_13399999.parquet" 339 | }, 340 | { 341 | "hash": "300021d0fe02e888a204a92de2f35d36", 342 | "n_bytes": 1536096326, 343 | "name": "ethereum_native_transfers__v1_0_0__13400000_to_13599999.parquet" 344 | }, 345 | { 346 | "hash": "d402839ed968a45b6e0113f44f433bff", 347 | "n_bytes": 1491833900, 348 | "name": "ethereum_native_transfers__v1_0_0__13600000_to_13799999.parquet" 349 | }, 350 | { 351 | "hash": "e0e72aa8516aa88139e7d61907544c1d", 352 | "n_bytes": 1427555638, 353 | "name": "ethereum_native_transfers__v1_0_0__13800000_to_13999999.parquet" 354 | }, 355 | { 356 | "hash": "9c969d5798f70fd632371116c885f44d", 357 | "n_bytes": 1433320799, 358 | "name": "ethereum_native_transfers__v1_0_0__14000000_to_14199999.parquet" 359 | }, 360 | { 361 | "hash": "004bca3751e8a1e280bf478087ab0759", 362 | "n_bytes": 1385189844, 363 | "name": "ethereum_native_transfers__v1_0_0__14200000_to_14399999.parquet" 364 | }, 365 | { 366 | "hash": "6d48c40d2f8a9798b1cd19a960cc9bf1", 367 | "n_bytes": 1385243142, 368 | "name": "ethereum_native_transfers__v1_0_0__14400000_to_14599999.parquet" 369 | }, 370 | { 371 | "hash": "4ebe41010502841324893c461cde65bf", 372 | "n_bytes": 1397068928, 373 | "name": "ethereum_native_transfers__v1_0_0__14600000_to_14799999.parquet" 374 | }, 375 | { 376 | "hash": "a4c5e82f9fecf54d273a6fd8777f9217", 377 | "n_bytes": 1292891099, 378 | "name": "ethereum_native_transfers__v1_0_0__14800000_to_14999999.parquet" 379 | }, 380 | { 381 | "hash": "ef2f477dfba75fca6ee49f8cf979dbcb", 382 | "n_bytes": 1319237305, 383 | "name": "ethereum_native_transfers__v1_0_0__15000000_to_15199999.parquet" 384 | }, 385 | { 386 | "hash": "4cf025c9923535c2975f6e5eaa39662f", 387 | "n_bytes": 1398570730, 388 | "name": "ethereum_native_transfers__v1_0_0__15200000_to_15399999.parquet" 389 | }, 390 | { 391 | "hash": "82bf15b457879cb12b6f96d6ab969aa0", 392 | "n_bytes": 1285511893, 393 | "name": "ethereum_native_transfers__v1_0_0__15400000_to_15599999.parquet" 394 | }, 395 | { 396 | "hash": "3405eff80cdbd50f6fae9f12c3c1651c", 397 | "n_bytes": 1102072040, 398 | "name": 
"ethereum_native_transfers__v1_0_0__15600000_to_15799999.parquet" 399 | }, 400 | { 401 | "hash": "308f47604aae7728fc6d8419c42967de", 402 | "n_bytes": 1023139324, 403 | "name": "ethereum_native_transfers__v1_0_0__15800000_to_15999999.parquet" 404 | }, 405 | { 406 | "hash": "847821dbbc7d6a4c71cf980f87b16029", 407 | "n_bytes": 1059039821, 408 | "name": "ethereum_native_transfers__v1_0_0__16000000_to_16199999.parquet" 409 | }, 410 | { 411 | "hash": "f8cbbbbaaf51a94f816c43880236984c", 412 | "n_bytes": 1004807721, 413 | "name": "ethereum_native_transfers__v1_0_0__16200000_to_16399999.parquet" 414 | }, 415 | { 416 | "hash": "3c81ab319fdebd8da11e92d995d9e37d", 417 | "n_bytes": 1020162220, 418 | "name": "ethereum_native_transfers__v1_0_0__16400000_to_16599999.parquet" 419 | }, 420 | { 421 | "hash": "54fbd2256b2fb5ce89b8dc9021b80c47", 422 | "n_bytes": 1073940594, 423 | "name": "ethereum_native_transfers__v1_0_0__16600000_to_16799999.parquet" 424 | } 425 | ], 426 | "name": "ethereum_native_transfers", 427 | "network": "ethereum", 428 | "schema": { 429 | "description": "all native transfers in similar format to ERC20 Transfers (excluding tx fees)", 430 | "tables": { 431 | "native_transfers": { 432 | "columns": [ 433 | { 434 | "description": "block number where native token was transfered", 435 | "name": "block_number", 436 | "type": "INTEGER" 437 | }, 438 | { 439 | "description": "increased by 1 for each native transfer in block", 440 | "name": "transfer_index", 441 | "type": "INTEGER" 442 | }, 443 | { 444 | "description": "hash of transaction that contains transfer", 445 | "name": "transaction_hash", 446 | "type": "BINARY" 447 | }, 448 | { 449 | "description": "address that native token is transferred to", 450 | "name": "to_address", 451 | "type": "BINARY" 452 | }, 453 | { 454 | "description": "address that native token is transferred from", 455 | "name": "from_address", 456 | "type": "BINARY" 457 | }, 458 | { 459 | "description": "amount of native token transferred", 460 | "name": "value", 461 | "type": "BINARY" 462 | } 463 | ], 464 | "description": "each row corresponds to a trace that transfers native token" 465 | } 466 | } 467 | }, 468 | "version": "1.0.0" 469 | } -------------------------------------------------------------------------------- /datasets/ethereum_slots/LICENSE-CC0: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 
20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 
122 | -------------------------------------------------------------------------------- /datasets/ethereum_slots/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Ethereum Slots Dataset v1.0.0 3 | 4 | This is a dataset of all slots of each contract, including historical usage metadata 5 | 6 | The dataset was created by using [this script](https://github.com/paradigmxyz/paradigm-data-portal/blob/main/pdp/datasets/slots/slots_collect.py) 7 | 8 | Data is distributed as [parquet](https://data.paradigm.xyz/about) files and released into the public domain under a [CC0 license](https://creativecommons.org/share-your-work/public-domain/cc0/) 9 | 10 | ## Usage 11 | 12 | Some example uses of this dataset include: 13 | - look up how much storage space is used by a given contract 14 | - look up which slots are used by a given contract 15 | - look up which slots change most frequently for a given contract 16 | 17 | 18 | 19 | ## Schema 20 | 21 | #### `slots` table 22 | each row corresponds to a slot of a contract 23 | | column | type | description | 24 | | - | - | - | 25 | | contract_address | BINARY | contract of slot | 26 | | slot | BINARY | address of slot | 27 | | value | BINARY | last data stored in slot | 28 | | first_updated_block | INTEGER | first block where slot was used | 29 | | last_updated_block | INTEGER | last block where slot was updated | 30 | | n_tx_updates | INTEGER | number of transactions that updated slot | 31 | 32 | ## Download 33 | 34 | This dataset can be downloaded using either the `pdp` cli tool or the urls below 35 | 36 | The total dataset size is **38.38GB** 37 | 38 | ### Use `pdp` 39 | 40 | The command `pdp download ethereum_slots` will download all files in this dataset 41 | 42 | See `pdp download -h` for available options 43 | 44 | ### Use URLs 45 | 46 | | | file | size | 47 | | - | - | - | 48 | | 1 | [ethereum_slots__v1.0.0__0x00_to_0x0f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x00_to_0x0f.parquet) | 3.80GB | 49 | | 2 | [ethereum_slots__v1.0.0__0x10_to_0x1f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x10_to_0x1f.parquet) | 1.92GB | 50 | | 3 | [ethereum_slots__v1.0.0__0x20_to_0x2f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x20_to_0x2f.parquet) | 2.23GB | 51 | | 4 | [ethereum_slots__v1.0.0__0x30_to_0x3f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x30_to_0x3f.parquet) | 2.00GB | 52 | | 5 | [ethereum_slots__v1.0.0__0x40_to_0x4f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x40_to_0x4f.parquet) | 2.15GB | 53 | | 6 | [ethereum_slots__v1.0.0__0x50_to_0x5f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x50_to_0x5f.parquet) | 3.09GB | 54 | | 7 | [ethereum_slots__v1.0.0__0x60_to_0x6f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x60_to_0x6f.parquet) | 1.86GB | 55 | | 8 | [ethereum_slots__v1.0.0__0x70_to_0x7f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x70_to_0x7f.parquet) | 3.21GB | 56 | | 9 | [ethereum_slots__v1.0.0__0x80_to_0x8f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x80_to_0x8f.parquet) | 2.51GB | 57 | | 10 | 
[ethereum_slots__v1.0.0__0x90_to_0x9f.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0x90_to_0x9f.parquet) | 1.98GB | 58 | | 11 | [ethereum_slots__v1.0.0__0xa0_to_0xaf.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0xa0_to_0xaf.parquet) | 2.72GB | 59 | | 12 | [ethereum_slots__v1.0.0__0xb0_to_0xbf.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0xb0_to_0xbf.parquet) | 2.01GB | 60 | | 13 | [ethereum_slots__v1.0.0__0xc0_to_0xcf.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0xc0_to_0xcf.parquet) | 2.12GB | 61 | | 14 | [ethereum_slots__v1.0.0__0xd0_to_0xdf.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0xd0_to_0xdf.parquet) | 3.32GB | 62 | | 15 | [ethereum_slots__v1.0.0__0xe0_to_0xef.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0xe0_to_0xef.parquet) | 1.50GB | 63 | | 16 | [ethereum_slots__v1.0.0__0xf0_to_0xff.parquet](https://datasets.paradigm.xyz/datasets/ethereum_slots/ethereum_slots__v1.0.0__0xf0_to_0xff.parquet) | 1.97GB | 64 | -------------------------------------------------------------------------------- /datasets/ethereum_slots/dataset_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "datatype": "slots", 3 | "description": "all slots of each contract, including historical usage metadata", 4 | "files": [ 5 | { 6 | "hash": "0fe90a691805d4e6ecadef3326701ead", 7 | "n_bytes": 4082875020, 8 | "name": "ethereum_slots__v1.0.0__0x00_to_0x0f.parquet" 9 | }, 10 | { 11 | "hash": "ee5bbec43eb1c31f5217237211dc725f", 12 | "n_bytes": 2058888792, 13 | "name": "ethereum_slots__v1.0.0__0x10_to_0x1f.parquet" 14 | }, 15 | { 16 | "hash": "d6002aadaf62df7806c502e8ee5cb21d", 17 | "n_bytes": 2394828784, 18 | "name": "ethereum_slots__v1.0.0__0x20_to_0x2f.parquet" 19 | }, 20 | { 21 | "hash": "9acff8c6635d857a43b1cdc2ddf59574", 22 | "n_bytes": 2147275687, 23 | "name": "ethereum_slots__v1.0.0__0x30_to_0x3f.parquet" 24 | }, 25 | { 26 | "hash": "15d7ab03c7efad6988f9a4340bd6b65e", 27 | "n_bytes": 2310360858, 28 | "name": "ethereum_slots__v1.0.0__0x40_to_0x4f.parquet" 29 | }, 30 | { 31 | "hash": "4fb3e9b0a420e93a338988752f69d407", 32 | "n_bytes": 3315134125, 33 | "name": "ethereum_slots__v1.0.0__0x50_to_0x5f.parquet" 34 | }, 35 | { 36 | "hash": "690aa995ef17c326f1a1e7fcdd12de02", 37 | "n_bytes": 1999353262, 38 | "name": "ethereum_slots__v1.0.0__0x60_to_0x6f.parquet" 39 | }, 40 | { 41 | "hash": "0d72d635f47828739f6c9abc1e7146b3", 42 | "n_bytes": 3451637035, 43 | "name": "ethereum_slots__v1.0.0__0x70_to_0x7f.parquet" 44 | }, 45 | { 46 | "hash": "a2d663f62f9aba90607de8538f46b141", 47 | "n_bytes": 2692255804, 48 | "name": "ethereum_slots__v1.0.0__0x80_to_0x8f.parquet" 49 | }, 50 | { 51 | "hash": "ea374784b0f541e5744990e6a5663dee", 52 | "n_bytes": 2126238030, 53 | "name": "ethereum_slots__v1.0.0__0x90_to_0x9f.parquet" 54 | }, 55 | { 56 | "hash": "ebba9199de2373fd2875852f86f8f1b8", 57 | "n_bytes": 2920825856, 58 | "name": "ethereum_slots__v1.0.0__0xa0_to_0xaf.parquet" 59 | }, 60 | { 61 | "hash": "effc1b8a42cd412d0d6976924261a15e", 62 | "n_bytes": 2153804795, 63 | "name": "ethereum_slots__v1.0.0__0xb0_to_0xbf.parquet" 64 | }, 65 | { 66 | "hash": "988cf08e232daf2ddfa2a28316fee31e", 67 | "n_bytes": 2271461320, 68 | "name": "ethereum_slots__v1.0.0__0xc0_to_0xcf.parquet" 69 | }, 70 | { 71 | "hash": "13509d1fe410d3d0877f03f395fbff6d", 72 
| "n_bytes": 3562804550, 73 | "name": "ethereum_slots__v1.0.0__0xd0_to_0xdf.parquet" 74 | }, 75 | { 76 | "hash": "26fa7fcaf6cd0d7781cc9fb21143e936", 77 | "n_bytes": 1610498317, 78 | "name": "ethereum_slots__v1.0.0__0xe0_to_0xef.parquet" 79 | }, 80 | { 81 | "hash": "94344f169fd8fb9f1d7fa8c35e9cc427", 82 | "n_bytes": 2114482565, 83 | "name": "ethereum_slots__v1.0.0__0xf0_to_0xff.parquet" 84 | } 85 | ], 86 | "name": "ethereum_slots", 87 | "network": "ethereum", 88 | "schema": { 89 | "description": "all slots of each contract, including historical usage metadata", 90 | "tables": { 91 | "slots": { 92 | "columns": [ 93 | { 94 | "description": "contract of slot", 95 | "name": "contract_address", 96 | "type": "BINARY" 97 | }, 98 | { 99 | "description": "address of slot", 100 | "name": "slot", 101 | "type": "BINARY" 102 | }, 103 | { 104 | "description": "last data stored in slot", 105 | "name": "value", 106 | "type": "BINARY" 107 | }, 108 | { 109 | "description": "first block where slot was used", 110 | "name": "first_updated_block", 111 | "type": "INTEGER" 112 | }, 113 | { 114 | "description": "last block where slot was updated", 115 | "name": "last_updated_block", 116 | "type": "INTEGER" 117 | }, 118 | { 119 | "description": "number of transactions that updated slot", 120 | "name": "n_tx_updates", 121 | "type": "INTEGER" 122 | } 123 | ], 124 | "description": "each row corresponds to a slot of a contract" 125 | } 126 | } 127 | }, 128 | "version": "1.0.0" 129 | } -------------------------------------------------------------------------------- /datasets/global_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "datasets": { 3 | "ethereum_contracts": { 4 | "datatype": "contracts", 5 | "description": "all historical contract deployments", 6 | "n_bytes": 7051921809, 7 | "n_files": 17, 8 | "name": "ethereum_contracts", 9 | "network": "ethereum", 10 | "schema": { 11 | "description": "all historical contract deployments", 12 | "tables": { 13 | "contracts": { 14 | "columns": [ 15 | { 16 | "description": "block number when contract was created", 17 | "name": "block_number", 18 | "type": "INTEGER" 19 | }, 20 | { 21 | "description": "increased by 1 for each contract created in block", 22 | "name": "create_index", 23 | "type": "INTEGER" 24 | }, 25 | { 26 | "description": "hash of transaction that created contract", 27 | "name": "transaction_hash", 28 | "type": "BINARY" 29 | }, 30 | { 31 | "description": "address of deployed contract", 32 | "name": "contract_address", 33 | "type": "BINARY" 34 | }, 35 | { 36 | "description": "EOA that deployed the contract", 37 | "name": "deployer", 38 | "type": "BINARY" 39 | }, 40 | { 41 | "description": "the `from` field in the creation trace", 42 | "name": "factory", 43 | "type": "BINARY" 44 | }, 45 | { 46 | "description": "initialization bytecode of contract", 47 | "name": "init_code", 48 | "type": "BINARY" 49 | }, 50 | { 51 | "description": "bytecode of contract", 52 | "name": "code", 53 | "type": "BINARY" 54 | }, 55 | { 56 | "description": "keccak hash of contract initialization code", 57 | "name": "init_code_hash", 58 | "type": "BINARY" 59 | }, 60 | { 61 | "description": "keccak hash of contract bytecode", 62 | "name": "code_hash", 63 | "type": "BINARY" 64 | } 65 | ], 66 | "description": "each row corresponds to a contract create trace" 67 | } 68 | } 69 | }, 70 | "version": "1.0.0" 71 | }, 72 | "ethereum_native_transfers": { 73 | "datatype": "native_transfers", 74 | "description": "all native transfers in similar 
format to ERC20 Transfers (excluding tx fees)", 75 | "n_bytes": 65501766122, 76 | "n_files": 84, 77 | "name": "ethereum_native_transfers", 78 | "network": "ethereum", 79 | "schema": { 80 | "description": "all native transfers in similar format to ERC20 Transfers (excluding tx fees)", 81 | "tables": { 82 | "native_transfers": { 83 | "columns": [ 84 | { 85 | "description": "block number where native token was transfered", 86 | "name": "block_number", 87 | "type": "INTEGER" 88 | }, 89 | { 90 | "description": "increased by 1 for each native transfer in block", 91 | "name": "transfer_index", 92 | "type": "INTEGER" 93 | }, 94 | { 95 | "description": "hash of transaction that contains transfer", 96 | "name": "transaction_hash", 97 | "type": "BINARY" 98 | }, 99 | { 100 | "description": "address that native token is transferred to", 101 | "name": "to_address", 102 | "type": "BINARY" 103 | }, 104 | { 105 | "description": "address that native token is transferred from", 106 | "name": "from_address", 107 | "type": "BINARY" 108 | }, 109 | { 110 | "description": "amount of native token transferred", 111 | "name": "value", 112 | "type": "BINARY" 113 | } 114 | ], 115 | "description": "each row corresponds to a trace that transfers native token" 116 | } 117 | } 118 | }, 119 | "version": "1.0.0" 120 | }, 121 | "ethereum_slots": { 122 | "datatype": "slots", 123 | "description": "all slots of each contract, including historical usage metadata", 124 | "n_bytes": 41212724800, 125 | "n_files": 16, 126 | "name": "ethereum_slots", 127 | "network": "ethereum", 128 | "schema": { 129 | "description": "all slots of each contract, including historical usage metadata", 130 | "tables": { 131 | "slots": { 132 | "columns": [ 133 | { 134 | "description": "contract of slot", 135 | "name": "contract_address", 136 | "type": "BINARY" 137 | }, 138 | { 139 | "description": "address of slot", 140 | "name": "slot", 141 | "type": "BINARY" 142 | }, 143 | { 144 | "description": "last data stored in slot", 145 | "name": "value", 146 | "type": "BINARY" 147 | }, 148 | { 149 | "description": "first block where slot was used", 150 | "name": "first_updated_block", 151 | "type": "INTEGER" 152 | }, 153 | { 154 | "description": "last block where slot was updated", 155 | "name": "last_updated_block", 156 | "type": "INTEGER" 157 | }, 158 | { 159 | "description": "number of transactions that updated slot", 160 | "name": "n_tx_updates", 161 | "type": "INTEGER" 162 | } 163 | ], 164 | "description": "each row corresponds to a slot of a contract" 165 | } 166 | } 167 | }, 168 | "version": "1.0.0" 169 | } 170 | }, 171 | "version": "1.0.0" 172 | } -------------------------------------------------------------------------------- /pdp/__init__.py: -------------------------------------------------------------------------------- 1 | """pdp downloads and manages datasets from the Paradigm Data Portal""" 2 | 3 | from .config_utils import * 4 | from .data_utils import * 5 | from .spec import * 6 | 7 | 8 | __version__ = '0.2.2' 9 | 10 | -------------------------------------------------------------------------------- /pdp/__main__.py: -------------------------------------------------------------------------------- 1 | 2 | if __name__ == '__main__': 3 | import pdp.cli.cli_run 4 | 5 | pdp.cli.cli_run.run_cli() 6 | 7 | -------------------------------------------------------------------------------- /pdp/cli/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/paradigmxyz/paradigm-data-portal/335f0c8c932bdf1295c67028b11de73a8d92384e/pdp/cli/__init__.py -------------------------------------------------------------------------------- /pdp/cli/cli_run.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import typing 5 | 6 | import toolcli 7 | 8 | import pdp 9 | 10 | 11 | def cd_dir_help() -> typing.Mapping[str, str]: 12 | dir_dict = { 13 | '\[data_root]': 'directory where pdp datasets are stored', 14 | } 15 | local_datasets = pdp.get_local_datasets() 16 | for dataset in local_datasets: 17 | manifest = pdp.get_dataset_manifest(dataset, source='local') 18 | dir_dict[dataset] = manifest['description'] 19 | return dir_dict 20 | 21 | 22 | def cd_dir_getter(dirname: str) -> str: 23 | data_root = pdp.get_data_root(require=False) 24 | if data_root is None: 25 | raise toolcli.CDException('must set PDP_DATA_ROOT env var') 26 | 27 | if dirname in ['', 'data_root']: 28 | return data_root 29 | else: 30 | dirpath = os.path.join(data_root, dirname) 31 | if not os.path.isdir(dirpath): 32 | raise Exception('not a directory: ' + str(dirname)) 33 | if not os.path.isfile( 34 | os.path.join(dirpath, pdp.dataset_manifest_filename) 35 | ): 36 | import toolstr 37 | 38 | toolstr.print( 39 | 'no manifest file ' 40 | + pdp.dataset_manifest_filename 41 | + ' detected for dataset', 42 | style='red', 43 | ) 44 | 45 | return dirpath 46 | 47 | 48 | def run_cli(raw_command: str | None = None) -> None: 49 | import tempfile 50 | 51 | help_cache_dir = os.path.join(tempfile.gettempdir(), 'pdp', 'help_cache') 52 | 53 | command_index: toolcli.CommandIndex = { 54 | ('',): 'pdp.cli.commands.root_command', 55 | ('collect',): 'pdp.cli.commands.collect_command', 56 | ('dataset',): 'pdp.cli.commands.dataset_command', 57 | ('download',): 'pdp.cli.commands.download_command', 58 | ('help',): 'toolcli.command_utils.standard_subcommands.help_command', 59 | ('ls',): 'pdp.cli.commands.ls_command', 60 | ('package',): 'pdp.cli.commands.package_command', 61 | ('update',): 'pdp.cli.commands.update_command', 62 | ('upload',): 'pdp.cli.commands.upload_command', 63 | ('validate',): 'pdp.cli.commands.validate_command', 64 | ( 65 | 'version', 66 | ): 'toolcli.command_utils.standard_subcommands.version_command', 67 | } 68 | 69 | config: toolcli.CLIConfig = { 70 | 'base_command': 'pdp', 71 | 'description': pdp.__doc__, 72 | 'version': pdp.__version__, 73 | 'include_standard_subcommands': [('cd',)], 74 | 'default_command_sequence': ('help',), 75 | 'include_debug_arg': True, 76 | 'help_cache_dir': help_cache_dir, 77 | 'style_theme': pdp.styles, 78 | 'cd_dir_help': cd_dir_help, 79 | 'cd_dir_getter': cd_dir_getter, 80 | } 81 | 82 | toolcli.run_cli( 83 | command_index=command_index, 84 | config=config, 85 | ) 86 | 87 | -------------------------------------------------------------------------------- /pdp/cli/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paradigmxyz/paradigm-data-portal/335f0c8c932bdf1295c67028b11de73a8d92384e/pdp/cli/commands/__init__.py -------------------------------------------------------------------------------- /pdp/cli/commands/collect_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import typing 5 | 6 | import toolcli 7 | 8 | import pdp 9 | 10 | if typing.TYPE_CHECKING: 11 
| import ctc.spec 12 | 13 | class StandardCollectKwargs(typing.TypedDict): 14 | start_block: int 15 | end_block: int 16 | output_dir: str 17 | network: str 18 | chunk_size: int | None 19 | output_filetype: str | None 20 | executor: typing.Literal['parallel', 'serial'] 21 | verbose: bool 22 | 23 | 24 | help_message = """collect a dataset from RPC nodes or other sources 25 | 26 | collecting on-chain datasets requires ctc to be installed and configured""" 27 | 28 | 29 | def get_command_spec() -> toolcli.CommandSpec: 30 | return { 31 | 'f': collect_command, 32 | 'help': help_message, 33 | 'args': [ 34 | {'name': 'dataset', 'help': 'name of dataset to collect'}, 35 | { 36 | 'name': 'output-dir', 37 | 'nargs': '?', 38 | 'help': 'output directory, omit to use PDP_DATA_ROOT', 39 | }, 40 | { 41 | 'name': ('-b', '--blocks'), 42 | 'help': 'block range, as start_block:end_block:chunk_size', 43 | }, 44 | { 45 | 'name': ('-r', '--rpc'), 46 | 'help': 'rpc node url, omit to use ctc configuration', 47 | }, 48 | { 49 | 'name': ('-f', '--format'), 50 | 'dest': 'output_format', 51 | 'help': 'format of output (parquet or csv)', 52 | }, 53 | { 54 | 'name': ('-s', '--serial'), 55 | 'help': 'use serial execution instead of parallel', 56 | 'action': 'store_true', 57 | }, 58 | { 59 | 'name': ('-v', '--verbose'), 60 | 'help': 'output additional information', 61 | 'action': 'store_true', 62 | }, 63 | { 64 | 'name': ('-e', '--extension'), 65 | 'help': 'extension module.function for dataset collection', 66 | 'hidden': True, 67 | }, 68 | { 69 | 'name': ('-p', '--parameters'), 70 | 'help': 'extra parameters given to collection function', 71 | 'hidden': True, 72 | }, 73 | ], 74 | 'examples': [ 75 | 'ethereum_contracts', 76 | 'ethereum_native_transfers', 77 | 'ethereum_slots --blocks 14_000_000:14_100_000', 78 | ], 79 | } 80 | 81 | 82 | def collect_command( 83 | dataset: str, 84 | blocks: str | None, 85 | rpc: str | None, 86 | output_dir: str | None, 87 | output_format: str | None, 88 | serial: bool, 89 | verbose: bool, 90 | extension: str | None, 91 | parameters: str | None, 92 | ) -> None: 93 | # get context 94 | parsed = pdp.parse_dataset_name(dataset) 95 | datatype = parsed['datatype'] 96 | network = parsed['network'] 97 | if rpc is None: 98 | context: ctc.spec.Context = {'network': network} 99 | else: 100 | context = {'network': network, 'provider': rpc} 101 | 102 | # get block range 103 | if blocks is None: 104 | pdp.ensure_ctc() 105 | import ctc.rpc 106 | 107 | start_block = 0 108 | end_block = ctc.rpc.sync_eth_block_number(context=context) 109 | chunk_size_int = None 110 | else: 111 | pdp.ensure_ctc() 112 | import ctc.cli.cli_utils 113 | 114 | ( 115 | start_block, 116 | end_block, 117 | chunk_size_int, 118 | ) = ctc.cli.cli_utils.sync_parse_block_chunks(blocks) 119 | 120 | # parse output parameters 121 | if output_dir is None: 122 | data_root = pdp.get_data_root(require=False) 123 | if data_root is None or data_root == '': 124 | raise Exception( 125 | 'must specify output_dir or set PDP_DATA_ROOT env var' 126 | ) 127 | else: 128 | output_dir = os.path.join(data_root, dataset) 129 | 130 | if serial: 131 | executor: typing.Literal['parallel', 'serial'] = 'serial' 132 | else: 133 | executor = 'parallel' 134 | 135 | # collect parameters 136 | if parameters is not None: 137 | import ast 138 | 139 | extra_kwargs = ast.literal_eval(parameters) 140 | if not isinstance(extra_kwargs, dict): 141 | raise Exception( 142 | 'extra parameters should be specified with dict syntax' 143 | ) 144 | else: 145 | extra_kwargs = {} 146 | 
standard_kwargs: StandardCollectKwargs = { 147 | 'start_block': start_block, 148 | 'end_block': end_block, 149 | 'output_dir': output_dir, 150 | 'network': network, 151 | 'chunk_size': chunk_size_int, 152 | 'output_filetype': output_format, 153 | 'executor': executor, 154 | 'verbose': verbose, 155 | } 156 | 157 | # 158 | # # perform collection 159 | # 160 | 161 | if datatype == 'contracts': 162 | from pdp.datasets import contracts 163 | 164 | contracts.collect_contracts_dataset(**standard_kwargs, **extra_kwargs) 165 | 166 | elif datatype == 'native_transfers': 167 | from pdp.datasets import native_transfers 168 | 169 | native_transfers.collect_native_transfers_dataset( 170 | **standard_kwargs, **extra_kwargs 171 | ) 172 | 173 | elif datatype == 'slots': 174 | from pdp.datasets import slots 175 | 176 | slots.collect_slots_dataset(**standard_kwargs, **extra_kwargs) 177 | 178 | elif extension is not None: 179 | import importlib 180 | 181 | try: 182 | module_path = ( 183 | extension 184 | + '.datasets.' 185 | + datatype 186 | + '.' 187 | + datatype 188 | + '_collect' 189 | ) 190 | module = importlib.import_module(module_path) 191 | function_name = 'collect_' + datatype + '_dataset' 192 | function = getattr(module, function_name) 193 | except (ValueError, ImportError, AttributeError) as e: 194 | print('invalid extension, could not get extension function: ' + str(e.args[0])) 195 | return 196 | 197 | function(**standard_kwargs, **extra_kwargs) 198 | 199 | else: 200 | raise Exception('invalid datatype: ' + str(datatype)) 201 | 202 | -------------------------------------------------------------------------------- /pdp/cli/commands/dataset_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import toolcli 4 | import toolstr 5 | 6 | import pdp 7 | 8 | 9 | def get_command_spec() -> toolcli.CommandSpec: 10 | return { 11 | 'f': dataset_command, 12 | 'help': 'show info about a dataset', 13 | 'args': [ 14 | {'name': 'dataset', 'help': 'name of dataset'}, 15 | ], 16 | 'examples': [ 17 | 'ethereum_contracts', 18 | 'ethereum_native_transfers', 19 | ], 20 | } 21 | 22 | 23 | def dataset_command(dataset: str) -> None: 24 | manifest = pdp.get_dataset_manifest(dataset) 25 | toolstr.print_text_box( 26 | toolstr.add_style(manifest['name'] + ' dataset', pdp.styles['metavar']), 27 | style=pdp.styles['title'], 28 | ) 29 | toolstr.print_bullet( 30 | key='description', 31 | value=manifest['description'], 32 | styles=pdp.styles, 33 | ) 34 | toolstr.print_bullet( 35 | key='version', 36 | value=manifest['version'], 37 | styles=pdp.styles, 38 | ) 39 | toolstr.print_bullet( 40 | key='n_files', 41 | value=len(manifest['files']), 42 | styles=pdp.styles, 43 | ) 44 | total_size = sum(file['n_bytes'] for file in manifest['files']) 45 | toolstr.print_bullet( 46 | key='total_size', 47 | value=toolstr.format_nbytes(total_size, decimals=1), 48 | styles=pdp.styles, 49 | ) 50 | 51 | print() 52 | for table_name, table in manifest['schema']['tables'].items(): 53 | if len(manifest['schema']['tables']) > 1: 54 | toolstr.print( 55 | table_name + ' table', style=pdp.styles['content'], indent=4 56 | ) 57 | rows = [] 58 | for column in table['columns']: 59 | row = [column['name'], column['type'], column['description']] 60 | rows.append(row) 61 | labels = ['column', 'type', 'description'] 62 | toolstr.print_table( 63 | rows, 64 | labels=labels, 65 | column_styles={ 66 | 'column': pdp.styles['metavar'], 67 | 'type': pdp.styles['content'], 68 | 'description': 
pdp.styles['description'], 69 | }, 70 | # style=pdp.styles['metavar'], 71 | border=pdp.styles['content'], 72 | label_style=pdp.styles['metavar'], 73 | indent=4, 74 | ) 75 | 76 | -------------------------------------------------------------------------------- /pdp/cli/commands/download_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | import toolcli 6 | 7 | import pdp 8 | 9 | 10 | def get_command_spec() -> toolcli.CommandSpec: 11 | return { 12 | 'f': download_command, 13 | 'help': 'download dataset directly from Paradigm data portal', 14 | 'args': [ 15 | {'name': 'dataset', 'help': 'dataset to list info of'}, 16 | {'name': '--output-dir', 'help': 'output directory path'}, 17 | {'name': '--portal-root', 'help': 'root url of data portal'}, 18 | ], 19 | 'examples': [ 20 | 'ethereum_contracts', 21 | 'ethereum_contracts --output-dir /path/to/some/dir', 22 | ], 23 | } 24 | 25 | 26 | def download_command( 27 | dataset: str, 28 | output_dir: str | None, 29 | portal_root: str | None, 30 | ) -> None: 31 | 32 | if output_dir is None: 33 | output_dir = os.path.abspath('.') 34 | 35 | pdp.download_dataset( 36 | dataset=dataset, 37 | output_dir=output_dir, 38 | portal_root=portal_root, 39 | ) 40 | 41 | -------------------------------------------------------------------------------- /pdp/cli/commands/ls_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | 5 | import toolcli 6 | import toolstr 7 | 8 | import pdp 9 | 10 | 11 | def get_command_spec() -> toolcli.CommandSpec: 12 | return { 13 | 'f': ls_command, 14 | 'help': 'list datasets or dataset files', 15 | 'args': [ 16 | { 17 | 'name': 'dataset', 18 | 'help': 'dataset to list info of, omit for global manifest', 19 | 'nargs': '?', 20 | }, 21 | { 22 | 'name': '--hashes', 23 | 'help': 'show md5 hashes of each file', 24 | 'action': 'store_true', 25 | }, 26 | { 27 | 'name': '--urls', 28 | 'help': 'show full urls of each file', 29 | 'action': 'store_true', 30 | }, 31 | {'name': '--portal-root', 'help': 'root url of data portal'}, 32 | ], 33 | 'examples': { 34 | '': 'show all datasets', 35 | 'ethereum_contracts': 'show files of dataset', 36 | 'ethereum_contracts --hashes': 'show files of dataset with hashes', 37 | 'ethereum_contracts --urls': 'show urls of files in dataset', 38 | }, 39 | } 40 | 41 | 42 | def ls_command( 43 | dataset: str | None, 44 | hashes: bool, 45 | urls: bool, 46 | portal_root: str | None, 47 | ) -> None: 48 | 49 | if dataset is None: 50 | 51 | toolstr.print( 52 | 'fetching global manifest...', style=pdp.styles['comment'], end='\r' 53 | ) 54 | global_manifest = pdp.get_global_manifest() 55 | sys.stdout.write("\033[K") 56 | toolstr.print('Datasets', style=pdp.styles['title']) 57 | for dataset_name, dataset_manifest in global_manifest[ 58 | 'datasets' 59 | ].items(): 60 | toolstr.print_bullet( 61 | key=dataset_name, 62 | value=dataset_manifest['description'], 63 | styles=pdp.styles, 64 | ) 65 | print() 66 | toolstr.print( 67 | '(use ' 68 | + toolstr.add_style( 69 | 'pdp dataset ', pdp.styles['description'] 70 | ) 71 | + ' for info or ' 72 | + toolstr.add_style('pdp ls ', pdp.styles['description']) 73 | + ' for file list)', 74 | style=pdp.styles['comment'], 75 | ) 76 | 77 | else: 78 | 79 | # get dataset manifest 80 | manifest = pdp.get_dataset_manifest(dataset=dataset) 81 | 82 | # get id of each file, either filename or url 83 | if urls: 
84 | file_ids = pdp.get_dataset_file_urls( 85 | dataset=dataset, 86 | portal_root=portal_root, 87 | manifest=manifest, 88 | ) 89 | else: 90 | file_ids = [file['name'] for file in manifest['files']] 91 | 92 | # print either with or without hashes 93 | if hashes: 94 | rows = [ 95 | [file_id, file['hash']] 96 | for file_id, file in zip(file_ids, manifest['files']) 97 | ] 98 | toolstr.print_table(rows, compact=True) 99 | else: 100 | for file_id in file_ids: 101 | print(file_id) 102 | 103 | -------------------------------------------------------------------------------- /pdp/cli/commands/package_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | import toolcli 6 | 7 | import pdp 8 | 9 | 10 | def get_command_spec() -> toolcli.CommandSpec: 11 | return { 12 | 'f': package_command, 13 | 'help': 'package a dataset manifest or a global manifest', 14 | 'args': [ 15 | { 16 | 'name': 'directory', 17 | 'help': 'data directory to use for manifest', 18 | 'nargs': '?', 19 | }, 20 | { 21 | 'name': '--global', 22 | 'help': 'create a global manifest instead of a dataset manifest', 23 | 'action': 'store_true', 24 | 'dest': 'global_manifest', 25 | }, 26 | { 27 | 'name': '--output-path', 28 | 'help': 'output path of manifest', 29 | }, 30 | { 31 | 'name': '--confirm', 32 | 'help': 'confirm overwriting ', 33 | 'action': 'store_true', 34 | }, 35 | { 36 | 'name': '--reuse-hashes', 37 | 'help': '', 38 | 'action': 'store_true', 39 | }, 40 | ], 41 | 'examples': [ 42 | '', 43 | 'path/to/some/dataset', 44 | '--global', 45 | '--confirm', 46 | ], 47 | } 48 | 49 | 50 | def package_command( 51 | *, 52 | global_manifest: bool, 53 | directory: str | None, 54 | output_path: str | bool | None, 55 | confirm: bool, 56 | reuse_hashes: bool, 57 | ) -> None: 58 | 59 | if directory is None: 60 | directory = '.' 
61 | directory = os.path.expanduser(directory) 62 | if output_path is None: 63 | output_path = True 64 | if isinstance(output_path, str): 65 | output_path = os.path.expanduser(output_path) 66 | 67 | if global_manifest: 68 | 69 | pdp.create_global_manifest( 70 | data_root=directory, 71 | version=pdp.global_version, 72 | output_path=output_path, 73 | confirm=confirm, 74 | ) 75 | 76 | else: 77 | 78 | # check no subdatasets contained 79 | for item in os.listdir(directory): 80 | subpath = os.path.join(directory, item) 81 | if os.path.isdir( 82 | subpath 83 | ) and pdp.dataset_manifest_filename in os.listdir(subpath): 84 | raise Exception('use --global to package a global manifest') 85 | 86 | dataset_manifest = pdp.create_dataset_manifest( 87 | dataset_dir=directory, 88 | output_path=output_path, 89 | confirm=confirm, 90 | reuse_hashes=reuse_hashes, 91 | ) 92 | 93 | # get readme path 94 | if isinstance(output_path, bool): 95 | readme_path: str | bool = True 96 | elif isinstance(output_path, str): 97 | readme_path = os.path.join( 98 | os.path.dirname(output_path), 99 | pdp.dataset_readme_filename, 100 | ) 101 | else: 102 | raise Exception('unknown output_path type: ' + str(output_path)) 103 | 104 | # create readme 105 | pdp.create_dataset_readme( 106 | dataset_manifest=dataset_manifest, 107 | output_path=readme_path, 108 | confirm=confirm, 109 | ) 110 | 111 | -------------------------------------------------------------------------------- /pdp/cli/commands/root_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import toolcli 4 | 5 | 6 | def get_command_spec() -> toolcli.CommandSpec: 7 | return { 8 | 'f': root_command, 9 | 'help': 'display help message', 10 | 'hidden': True, 11 | 'extra_data': ['parse_spec'], 12 | } 13 | 14 | 15 | def root_command(parse_spec: toolcli.ParseSpec) -> None: 16 | toolcli.command_utils.execution.execute_other_command_sequence( 17 | ('help',), 18 | args={'parse_spec': parse_spec}, 19 | parse_spec=parse_spec, 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /pdp/cli/commands/update_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | import toolcli 6 | 7 | import pdp 8 | 9 | 10 | def get_command_spec() -> toolcli.CommandSpec: 11 | return { 12 | 'f': update_command, 13 | 'help': 'update a dataset or datasets to their latest versions', 14 | 'hidden': True, 15 | 'args': [ 16 | { 17 | 'name': 'datasets', 18 | 'help': 'space-separated list of datasets to update', 19 | 'nargs': '+', 20 | }, 21 | { 22 | 'name': ['-a', '--all'], 23 | 'help': 'update all datasets', 24 | 'dest': 'all_datasets', 25 | 'action': 'store_true', 26 | }, 27 | { 28 | 'name': ['-m', '--method'], 29 | 'help': 'method used for syncing ("download" or "collect")', 30 | }, 31 | ], 32 | 'examples': [ 33 | 'ethereum_contracts', 34 | 'ethereum_contracts ethereum_slots', 35 | '--all', 36 | ], 37 | } 38 | 39 | 40 | def update_command( 41 | datasets: typing.Sequence[str], 42 | all_datasets: bool, 43 | method: typing.Literal['download', 'collect'], 44 | ) -> None: 45 | if all_datasets: 46 | datasets = pdp.get_local_datasets() 47 | 48 | for dataset in datasets: 49 | pdp.update(dataset=dataset, method=method) 50 | 51 | -------------------------------------------------------------------------------- /pdp/cli/commands/upload_command.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import os 5 | 6 | import toolcli 7 | 8 | import pdp 9 | 10 | 11 | help_message = """upload files to bucket 12 | 13 | If local-path not specified, upload current directory 14 | 15 | If bucket-path not specified, upload relative to global manifest location""" 16 | 17 | 18 | def get_command_spec() -> toolcli.CommandSpec: 19 | return { 20 | 'f': upload_command, 21 | 'help': help_message, 22 | 'args': [ 23 | { 24 | 'name': 'local-path', 25 | 'help': 'local path to upload from', 26 | 'nargs': '?', 27 | }, 28 | { 29 | 'name': 'bucket-path', 30 | 'help': 'bucket path to upload to', 31 | 'nargs': '?', 32 | }, 33 | { 34 | 'name': '--all', 35 | 'help': 'upload all files in directory instead of just manifest files', 36 | 'action': 'store_true', 37 | 'dest': 'all_files', 38 | }, 39 | ], 40 | 'examples': { 41 | '': 'upload current directory', 42 | '/path/to/some/dir': 'upload some other directory', 43 | }, 44 | 'hidden': True, 45 | } 46 | 47 | 48 | def upload_command( 49 | *, local_path: str, bucket_path: str, all_files: bool 50 | ) -> None: 51 | 52 | if local_path is None: 53 | local_path = '.' 54 | local_path = os.path.abspath(os.path.expanduser(local_path)) 55 | if os.path.isdir(local_path): 56 | local_dir = local_path 57 | elif os.path.isfile(local_path): 58 | local_dir = os.path.dirname(local_path) 59 | else: 60 | raise Exception() 61 | 62 | # if local_path is a dataset directory, upload according to manifest 63 | dir_files = None 64 | if os.path.isdir(local_path): 65 | dir_contents = os.listdir(local_path) 66 | 67 | if all_files: 68 | dir_files = None 69 | 70 | elif pdp.global_manifest_filename in dir_contents: 71 | print('uploading global manifest:', pdp.global_manifest_filename) 72 | dir_files = [pdp.global_manifest_filename] 73 | 74 | elif pdp.dataset_manifest_filename in dir_contents: 75 | print( 76 | 'uploading files in dataset manifest:', 77 | pdp.dataset_manifest_filename, 78 | ) 79 | manifest_path = os.path.join( 80 | local_path, pdp.dataset_manifest_filename 81 | ) 82 | with open(manifest_path) as f: 83 | dataset_manifest = json.load(f) 84 | dir_files = [pdp.dataset_manifest_filename] 85 | for file in dataset_manifest['files']: 86 | dir_files.append(file['name']) 87 | 88 | else: 89 | raise Exception( 90 | 'no manifest file found, use --all to upload all files' 91 | ) 92 | 93 | if bucket_path is None: 94 | 95 | if pdp.global_manifest_filename in os.listdir(local_dir): 96 | # assume local_dir is global root, 97 | bucket_path = pdp.bucket_root_path 98 | 99 | else: 100 | 101 | # find root dir 102 | data_root = local_dir 103 | while pdp.global_manifest_filename not in os.listdir(data_root): 104 | next_data_root = os.path.dirname(data_root) 105 | if next_data_root == data_root: 106 | raise Exception( 107 | 'could not find global data root, must specify bucket-path manually' 108 | ) 109 | data_root = next_data_root 110 | 111 | # get path relative to root dir 112 | local_relpath = os.path.relpath(local_path, data_root) 113 | bucket_path = os.path.join(pdp.bucket_root_path, local_relpath) 114 | 115 | if os.path.isfile(local_path): 116 | pdp.upload_file( 117 | local_path=local_path, 118 | bucket_path=bucket_path, 119 | ) 120 | else: 121 | pdp.upload_directory( 122 | local_path=local_path, 123 | dir_files=dir_files, 124 | bucket_path=bucket_path, 125 | ) 126 | 127 | -------------------------------------------------------------------------------- 
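The CLI commands above are thin wrappers around functions exported at the top level of the `pdp` package (for example, download_command calls pdp.download_dataset and ls_command calls pdp.get_global_manifest). The short sketch below is not a file from this repository; it is a minimal illustration, under those assumptions, of driving the same list/download/validate flow directly from Python. The dataset name and output directory are placeholder values.

# illustrative sketch only, not part of the repository
import pdp

# list the datasets described by the remote global manifest
global_manifest = pdp.get_global_manifest()
for dataset_name, dataset_manifest in global_manifest['datasets'].items():
    print(dataset_name, '-', dataset_manifest['description'])

# download one dataset into a local directory (placeholder name and path)
dataset = 'ethereum_contracts'
output_dir = '/tmp/ethereum_contracts'
pdp.download_dataset(dataset=dataset, output_dir=output_dir)

# check the downloaded files against the dataset manifest (md5 hashes included)
files_ok = pdp.validate_dataset_directory(output_dir)
print('all files valid:', files_ok)

--------------------------------------------------------------------------------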
/pdp/cli/commands/validate_command.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | import toolcli 6 | 7 | import pdp 8 | 9 | 10 | def get_command_spec() -> toolcli.CommandSpec: 11 | return { 12 | 'f': validate_command, 13 | 'help': 'validate files in dataset manifest', 14 | 'args': [ 15 | { 16 | 'name': 'dataset_directory', 17 | 'help': 'dataset directory, default is current directory', 18 | 'nargs': '?', 19 | }, 20 | {'name': '--no-hashes', 'help': 'skip hashing each file', 'action': 'store_true'}, 21 | ], 22 | 'examples': { 23 | '': 'validate dataset in current directory', 24 | '/path/to/some/dir': 'validate dataset in some other directory', 25 | }, 26 | } 27 | 28 | 29 | def validate_command(dataset_directory: str | None, no_hashes: bool) -> None: 30 | if dataset_directory is None: 31 | path = '.' 32 | else: 33 | path = dataset_directory 34 | path = os.path.abspath(os.path.expanduser(path)) 35 | 36 | pdp.validate_dataset_directory(path, no_hashes=no_hashes) 37 | 38 | -------------------------------------------------------------------------------- /pdp/config_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import typing 5 | 6 | from . import data_utils 7 | from . import spec 8 | 9 | 10 | @typing.overload 11 | def get_data_root(*, require: typing.Literal[True]) -> str: 12 | ... 13 | 14 | 15 | @typing.overload 16 | def get_data_root(*, require: bool) -> str | None: 17 | ... 18 | 19 | 20 | @typing.overload 21 | def get_data_root() -> str: 22 | ... 23 | 24 | 25 | def get_data_root(*, require: bool = True) -> str | None: 26 | data_root = os.environ.get('PDP_DATA_ROOT') 27 | if (data_root is None or data_root == '') and require: 28 | raise Exception('PDP_DATA_ROOT not set') 29 | return data_root 30 | 31 | 32 | def get_dataset_glob( 33 | dataset: str | None = None, 34 | table: str | None = None, 35 | *, 36 | network: str | int | None = None, 37 | datatype: str | None = None, 38 | ) -> str: 39 | dataset_path = get_dataset_local_path( 40 | dataset=dataset, 41 | network=network, 42 | datatype=datatype, 43 | ) 44 | 45 | if table is None: 46 | filename = '*.parquet' 47 | else: 48 | filename = table + '_*.parquet' 49 | 50 | return os.path.join(dataset_path, filename) 51 | 52 | 53 | def get_dataset_local_path( 54 | dataset: str | None = None, 55 | *, 56 | network: str | int | None = None, 57 | datatype: str | None = None, 58 | ) -> str: 59 | if dataset is None: 60 | if network is None or datatype is None: 61 | raise Exception( 62 | 'must specify datatype and network to get dataset name' 63 | ) 64 | dataset = data_utils.get_dataset_name( 65 | datatype=datatype, network=network 66 | ) 67 | 68 | return os.path.join(get_data_root(), dataset) 69 | 70 | 71 | def get_local_datasets( 72 | data_root: str | None = None, 73 | ) -> typing.Sequence[str]: 74 | if data_root is None: 75 | data_root = get_data_root(require=True) 76 | 77 | data_dirs = [] 78 | for subdir in os.listdir(data_root): 79 | subpath = os.path.join(data_root, subdir) 80 | if os.path.isdir(subpath): 81 | manifest_path = os.path.join( 82 | subpath, spec.dataset_manifest_filename 83 | ) 84 | if os.path.isfile(manifest_path): 85 | data_dirs.append(subdir) 86 | 87 | return data_dirs 88 | 89 | -------------------------------------------------------------------------------- /pdp/data_utils/__init__.py: 
-------------------------------------------------------------------------------- 1 | 2 | from .collect_utils import * 3 | from .download_utils import * 4 | from .file_utils import * 5 | from .job_utils import * 6 | from .manifest_utils import * 7 | from .query_utils import * 8 | from .readme_utils import * 9 | from .schema_utils import * 10 | from .update_utils import * 11 | 12 | -------------------------------------------------------------------------------- /pdp/data_utils/collect_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | if typing.TYPE_CHECKING: 6 | import types 7 | 8 | 9 | def ensure_ctc() -> types.ModuleType: 10 | try: 11 | import ctc 12 | 13 | return ctc 14 | except ImportError: 15 | raise Exception('must install ctc to use this functionality') 16 | 17 | -------------------------------------------------------------------------------- /pdp/data_utils/download_utils.py: -------------------------------------------------------------------------------- 1 | """functions for downloading datasets""" 2 | 3 | from __future__ import annotations 4 | 5 | import os 6 | import typing 7 | 8 | from .. import spec 9 | from . import file_utils 10 | from . import manifest_utils 11 | from . import schema_utils 12 | 13 | 14 | def download_dataset( 15 | dataset: str, 16 | *, 17 | output_dir: str, 18 | portal_root: str | None = None, 19 | skip_existing: bool = True, 20 | ) -> None: 21 | """download files of a dataset""" 22 | 23 | print('Downloading dataset:', dataset) 24 | 25 | urls = get_dataset_file_urls(dataset, portal_root=portal_root) 26 | 27 | base_url = os.path.dirname(urls[0]) 28 | readme_url = os.path.join(base_url, 'README.md') 29 | manifest_url = os.path.join(base_url, 'dataset_manifest.json') 30 | file_utils.download_files( 31 | urls=[readme_url, manifest_url], 32 | output_dir=output_dir, 33 | skip_existing=skip_existing, 34 | ) 35 | 36 | # download files 37 | file_utils.download_files( 38 | urls=urls, 39 | output_dir=output_dir, 40 | skip_existing=skip_existing, 41 | ) 42 | 43 | 44 | def get_dataset_file_urls( 45 | dataset: str, 46 | *, 47 | portal_root: str | None, 48 | manifest: spec.DatasetManifest | None = None, 49 | ) -> typing.Sequence[str]: 50 | """get file urls of a dataset""" 51 | 52 | parsed = schema_utils.parse_dataset_name(dataset) 53 | 54 | if portal_root is None: 55 | portal_root = spec.portal_root 56 | if manifest is None: 57 | manifest = manifest_utils.get_dataset_manifest( 58 | dataset=dataset, portal_root=portal_root 59 | ) 60 | urls = [] 61 | for file in manifest['files']: 62 | url = spec.urls['dataset_file'].format( 63 | portal_root=portal_root, 64 | datatype=parsed['datatype'], 65 | network=parsed['network'], 66 | filename=file['name'], 67 | ) 68 | urls.append(url) 69 | return urls 70 | 71 | 72 | def get_dataset_file_url( 73 | datatype: str, 74 | network: str, 75 | filename: str, 76 | portal_root: str | None = None, 77 | ) -> str: 78 | if portal_root is None: 79 | portal_root = spec.portal_root 80 | return spec.urls['dataset_file'].format( 81 | portal_root=portal_root, 82 | datatype=datatype, 83 | network=network, 84 | filename=filename, 85 | ) 86 | 87 | 88 | def validate_dataset_directory(path: str, *, no_hashes: bool = False) -> bool: 89 | """validate the files in a dataset directory""" 90 | 91 | import json 92 | 93 | # load manifest 94 | manifest_path = os.path.join(path, spec.dataset_manifest_filename) 95 | if not os.path.isfile(manifest_path): 96 | 
raise Exception( 97 | 'no ' + spec.dataset_manifest_filename + ' found in directory' 98 | ) 99 | with open(manifest_path, 'r') as f: 100 | manifest = json.load(f) 101 | manifest_files = {file['name'] for file in manifest['files']} 102 | 103 | print('validating data of', manifest['name'], 'in', path) 104 | 105 | # gather files present 106 | present_files = set(os.listdir(path)) 107 | 108 | # check for missing files 109 | missing_files = manifest_files - present_files 110 | 111 | # check for extra files 112 | extra_files = (present_files - manifest_files) - { 113 | spec.dataset_manifest_filename 114 | } 115 | 116 | # check file hashes 117 | if no_hashes: 118 | bad_hashes = None 119 | else: 120 | bad_hashes = [] 121 | for file in manifest['files']: 122 | if file['name'] in present_files: 123 | hash = file_utils.get_file_hash( 124 | os.path.join(path, file['name']) 125 | ) 126 | target_hash = file['hash'] 127 | if hash != target_hash: 128 | bad_hashes.append(file['name']) 129 | 130 | # print errors 131 | print() 132 | errors_found = 0 133 | skipping = False 134 | for error_name, errors in { 135 | 'missing files': missing_files, 136 | 'extra files': extra_files, 137 | 'bad hashes': bad_hashes, 138 | }.items(): 139 | if errors is None: 140 | print('SKIPPED checking for ' + error_name) 141 | skipping = True 142 | continue 143 | if len(errors) > 0: 144 | errors_found += len(errors) 145 | print(error_name + ':') 146 | for file in sorted(errors)[:10]: 147 | print('-', file) 148 | if len(errors) > 10: 149 | print('- ...') 150 | 151 | # print summary 152 | if errors_found > 0 or skipping: 153 | print() 154 | print( 155 | len(missing_files), 156 | 'missing files,', 157 | len(extra_files), 158 | 'extra files, and', 159 | len(bad_hashes) if bad_hashes is not None else 0, 160 | 'bad hashes', 161 | ) 162 | 163 | return not errors_found 164 | 165 | -------------------------------------------------------------------------------- /pdp/data_utils/file_utils.py: -------------------------------------------------------------------------------- 1 | """functions for generic file operations""" 2 | 3 | from __future__ import annotations 4 | 5 | import os 6 | import typing 7 | 8 | 9 | def download_files( 10 | urls: typing.Sequence[str], 11 | *, 12 | output_dir: str, 13 | skip_existing: bool = True, 14 | ) -> None: 15 | """download a list of files""" 16 | 17 | # get output dir 18 | if output_dir is None: 19 | output_dir = '.' 
20 |     output_dir = os.path.abspath(os.path.expanduser(output_dir))
21 |     os.makedirs(output_dir, exist_ok=True)  # ensure output dir exists before listing or downloading
22 |     print('downloading', len(urls), 'files')
23 |     print()
24 |     print('using output_dir', output_dir)
25 | 
26 |     # skip existing files
27 |     if skip_existing:
28 |         url_filenames = [os.path.basename(url) for url in urls]
29 |         skip_urls = set()
30 |         for url, filename in zip(urls, url_filenames):
31 |             if filename in os.listdir(output_dir):
32 |                 skip_urls.add(url)
33 |         if len(skip_urls) > 0:
34 |             print()
35 |             print('skipping', len(skip_urls), 'files that already exist')
36 |     else:
37 |         skip_urls = set()
38 | 
39 |     # download each remaining file into output_dir
40 |     for url in urls:
41 |         if url not in skip_urls:
42 |             download_file(url, os.path.join(output_dir, os.path.basename(url)))
43 | 
44 |     print()
45 |     print('done')
46 | 
47 | 
48 | def download_file(url: str, output_path: str | None = None) -> None:
49 |     """download a file"""
50 |     import subprocess
51 | 
52 |     print()
53 |     print('downloading', url)
54 |     if output_path is None:
55 |         output_path = os.path.basename(url)
56 |     os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
57 |     subprocess.call(['curl', url, '--output', output_path])
58 | 
59 | 
60 | def get_file_hash(path: str) -> str:
61 |     """get hash of file"""
62 | 
63 |     import hashlib
64 | 
65 |     with open(path, 'rb') as f:
66 |         hashed = hashlib.md5(f.read())
67 | 
68 |     return hashed.hexdigest()
69 | 
70 | 
71 | def get_file_hashes(paths: typing.Sequence[str]) -> typing.Sequence[str]:
72 |     """get hashes of multiple files"""
73 | 
74 |     return [get_file_hash(path) for path in paths]
75 | 
76 | 
77 | def upload_file(local_path: str, bucket_path: str) -> None:
78 |     """upload single file to s3 bucket"""
79 | 
80 |     import subprocess
81 | 
82 |     command = [
83 |         'rclone',
84 |         'copyto',
85 |         local_path,
86 |         'paradigm-data-portal:' + bucket_path,
87 |         '-v',
88 |     ]
89 | 
90 |     subprocess.call(command)
91 | 
92 | 
93 | def upload_directory(
94 |     local_path: str,
95 |     bucket_path: str,
96 |     *,
97 |     dir_files: typing.Sequence[str] | None,
98 |     remove_deleted_files: bool = False,
99 | ) -> None:
100 |     """upload nested directory of files to s3 bucket"""
101 | 
102 |     import subprocess
103 | 
104 |     print('uploading directory:', local_path)
105 |     print('to bucket path:', bucket_path)
106 |     print()
107 | 
108 |     if remove_deleted_files:
109 |         action = 'sync'
110 |     else:
111 |         action = 'copy'
112 | 
113 |     command = [
114 |         'rclone',
115 |         action,
116 |         local_path,
117 |         'paradigm-data-portal:' + bucket_path,
118 |         '-v',
119 |     ]
120 | 
121 |     if dir_files is not None:
122 |         # create tempfile with list of files to upload
123 |         import tempfile
124 | 
125 |         temp_dir = tempfile.mkdtemp()
126 |         temp_path = os.path.join(temp_dir, 'file_list.txt')
127 |         with open(temp_path, 'w') as f:
128 |             f.write('\n'.join(dir_files))
129 |         command.extend(['--files-from', temp_path])
130 | 
131 |     else:
132 |         command.extend(['--exclude', '.*'])  # pass the glob unquoted so rclone sees the bare pattern
133 | 
134 |     subprocess.call(command)
135 | 
136 | 
--------------------------------------------------------------------------------
/pdp/data_utils/job_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import typing
4 | 
5 | import tooljob
6 | 
7 | if typing.TYPE_CHECKING:
8 |     import ctc.spec
9 |     import polars as pl
10 | 
11 | 
12 | class BlockChunkJobs(tooljob.Batch):
13 |     """create jobs by splitting a block interval into chunks, one job for each chunk"""
14 | 
15 |     start_block: int
16 |     end_block: int
17 |     chunk_size: int
18 |     context: ctc.spec.Context | None = None
19 |     tracker: tooljob.trackers.file_tracker.FileTracker | 
tooljob.trackers.multifile_tracker.MultifileTracker 20 | 21 | def __init__( 22 | self, 23 | start_block: int, 24 | end_block: int, 25 | chunk_size: int, 26 | context: ctc.spec.Context | None = None, 27 | **kwargs: typing.Any, 28 | ) -> None: 29 | if end_block < start_block: 30 | raise Exception('start_block must be less than end_block') 31 | self.start_block = start_block 32 | self.end_block = end_block 33 | self.chunk_size = chunk_size 34 | self.context = context 35 | super().__init__(**kwargs) 36 | 37 | # 38 | # # jobs 39 | # 40 | 41 | def get_n_jobs(self) -> int: 42 | import math 43 | 44 | n_blocks = self.end_block - self.start_block + 1 45 | return math.floor(n_blocks / self.chunk_size) 46 | 47 | def get_job_data(self, i: int) -> tooljob.JobData: 48 | n_jobs = self.get_n_jobs() 49 | if i < 0 or i >= n_jobs: 50 | raise Exception('job index too high, max is ' + str(n_jobs - 1)) 51 | 52 | start_block = i * self.chunk_size + self.start_block 53 | end_block = (i + 1) * self.chunk_size - 1 + self.start_block 54 | if end_block > self.end_block: 55 | end_block = self.end_block 56 | 57 | return {'start_block': start_block, 'end_block': end_block} 58 | 59 | # 60 | # # names 61 | # 62 | 63 | def get_job_name( 64 | self, 65 | i: int | None = None, 66 | *, 67 | job_data: tooljob.JobData | None = None, 68 | parameters: typing.Mapping[str, str] | None = None, 69 | ) -> str: 70 | from tooljob.trackers import multifile_tracker 71 | 72 | # get job data 73 | if job_data is None: 74 | if i is None: 75 | raise Exception('must specify job_data or i') 76 | job_data = self.get_job_data(i) 77 | 78 | # get output_name 79 | if parameters is not None: 80 | output_name = parameters.get('output_name') 81 | else: 82 | output_name = None 83 | 84 | # create name 85 | if ( 86 | isinstance(self.tracker, multifile_tracker.MultifileTracker) 87 | and output_name is not None 88 | and parameters is not None 89 | and parameters.get('output_name') is not None 90 | ): 91 | # get ctc 92 | from . 
import collect_utils 93 | 94 | collect_utils.ensure_ctc() 95 | import ctc 96 | import ctc.config 97 | 98 | # handle parameters for multi-output job name 99 | network: str = ctc.config.get_context_network_name(self.context) 100 | block_range = self.get_block_range_str(i) 101 | return network + '_' + output_name + '__' + block_range 102 | 103 | else: 104 | # use vanilla job name 105 | return self.get_job_list_name() + '__' + self.get_block_range_str(i) 106 | 107 | def parse_job_name(self, name: str) -> typing.Mapping[str, typing.Any]: 108 | block_range = name.split('__')[-1] 109 | start_str, end_str = block_range.split('_to_') 110 | return {'start_block': int(start_str), 'end_block': int(end_str)} 111 | 112 | def get_block_range_str( 113 | self, 114 | i: int | None = None, 115 | *, 116 | start_block: int | None = None, 117 | end_block: int | None = None, 118 | ) -> str: 119 | if i is not None and (start_block is not None or end_block is not None): 120 | raise Exception('specify either job or start_block and end_block') 121 | elif i is not None: 122 | job = self.get_job_data(i) 123 | start = job['start_block'] 124 | end = job['end_block'] 125 | elif start_block is not None and end_block is not None: 126 | start = start_block 127 | end = end_block 128 | else: 129 | raise Exception('specify either job or start_block and end_block') 130 | 131 | return '{start_block:08d}_to_{end_block:08d}'.format( 132 | start_block=start, 133 | end_block=end, 134 | ) 135 | 136 | # 137 | # # summary 138 | # 139 | 140 | def get_attribute_list(self) -> typing.Sequence[str]: 141 | attributes = super().get_attribute_list() 142 | attributes = list(attributes) 143 | end_block_index = attributes.index('end_block') 144 | attributes.insert(end_block_index + 1, 'n_blocks') 145 | return attributes 146 | 147 | def get_formatted_attribute(self, key: str) -> str | None: 148 | import toolstr 149 | 150 | if key == 'end_block': 151 | return ' ' + str(self.end_block) 152 | elif key == 'n_blocks': 153 | return toolstr.format(self.end_block - self.start_block) 154 | elif key == 'context': 155 | return None 156 | else: 157 | return super().get_formatted_attribute(key) 158 | 159 | def print_additional_conclusion( 160 | self, 161 | start_time: int | float, 162 | end_time: int | float, 163 | jobs: typing.Sequence[int], 164 | ) -> None: 165 | import toolstr 166 | 167 | duration = end_time - start_time 168 | n_blocks = len(jobs) * self.chunk_size 169 | bps = n_blocks / duration 170 | self.print_bullet(key='blocks covered', value=toolstr.format(n_blocks)) 171 | toolstr.print_bullet( 172 | key='blocks per second', 173 | value=toolstr.format(bps, decimals=2), 174 | ) 175 | toolstr.print_bullet( 176 | key='blocks per minute', 177 | value=toolstr.format(bps * 60, decimals=2), 178 | ) 179 | toolstr.print_bullet( 180 | key='blocks per hour', 181 | value=toolstr.format(bps * 60 * 60, decimals=2), 182 | ) 183 | toolstr.print_bullet( 184 | key='blocks per day', 185 | value=toolstr.format(bps * 86400, decimals=2), 186 | ) 187 | 188 | def summarize_blocks_per_second( 189 | self, sample_time: int = 60 190 | ) -> pl.DataFrame: 191 | import polars as pl 192 | 193 | jobs_per_second = self.summarize_jobs_per_second( 194 | sample_time=sample_time 195 | ) 196 | start_blocks = [ 197 | self.get_job_data(i)['start_block'] 198 | for i in range(self.get_n_jobs()) 199 | ] 200 | columns: typing.Sequence[pl.type_aliases.IntoExpr] = [ 201 | pl.Series(start_blocks).alias('start_block'), 202 | (pl.col('jobs_per_second') * self.chunk_size).alias( 203 | 
'blocks_per_second' 204 | ), 205 | ] 206 | return jobs_per_second.with_columns(columns) 207 | 208 | def plot_blocks_per_second(self, sample_time: int = 60) -> None: 209 | import matplotlib.pyplot as plt # type: ignore 210 | import toolplot 211 | 212 | df = self.summarize_blocks_per_second(sample_time=sample_time) 213 | 214 | plt.plot(df['start_block'], df['blocks_per_second']) 215 | toolplot.add_tick_grid() 216 | toolplot.format_yticks() 217 | toolplot.format_xticks() 218 | plt.ylabel('blocks per second') 219 | plt.title('Tracing speed at different points in history') 220 | plt.xlabel('block number') 221 | 222 | -------------------------------------------------------------------------------- /pdp/data_utils/manifest_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import typing 5 | 6 | from .. import config_utils 7 | from .. import spec 8 | from . import file_utils 9 | from . import schema_utils 10 | 11 | if typing.TYPE_CHECKING: 12 | import toolsql 13 | 14 | 15 | def get_global_manifest( 16 | *, 17 | portal_root: str | None = None, 18 | source: typing.Literal['remote', 'local'] = 'remote', 19 | ) -> spec.GlobalManifest: 20 | """get global manifest of all datasets""" 21 | 22 | if source == 'remote': 23 | 24 | import requests 25 | 26 | # build url 27 | if portal_root is None: 28 | portal_root = spec.portal_root 29 | url = spec.urls['global_manifest'].format(portal_root=portal_root) 30 | 31 | # get manifest 32 | response = requests.get(url) 33 | if response.status_code != 200: 34 | raise Exception('could not obtain global manifest at url: ' + str(url)) 35 | manifest: spec.GlobalManifest = response.json() 36 | 37 | return manifest 38 | 39 | elif source == 'local': 40 | import json 41 | 42 | if portal_root is not None: 43 | data_root = portal_root 44 | else: 45 | data_root = config_utils.get_data_root(require=True) 46 | path = os.path.join(data_root, spec.global_manifest_filename) 47 | with open(path, 'r') as f: 48 | result: spec.GlobalManifest = json.load(f) 49 | return result 50 | 51 | else: 52 | raise Exception('invalid source: ' + str(source)) 53 | 54 | 55 | def get_dataset_manifest( 56 | dataset: str, 57 | *, 58 | portal_root: str | None = None, 59 | source: typing.Literal['remote', 'local'] = 'remote', 60 | ) -> spec.DatasetManifest: 61 | """get manifest of a particular dataset""" 62 | 63 | if source == 'remote': 64 | import requests 65 | 66 | parsed = schema_utils.parse_dataset_name(dataset) 67 | 68 | # build url 69 | if portal_root is None: 70 | portal_root = spec.portal_root 71 | url = spec.urls['dataset_manifest'].format( 72 | portal_root=portal_root, 73 | datatype=parsed['datatype'], 74 | network=parsed['network'], 75 | ) 76 | 77 | # get manifest 78 | response = requests.get(url) 79 | if response.status_code != 200: 80 | raise Exception( 81 | 'could not obtain dataset manifest at url: ' + str(url) 82 | ) 83 | manifest: spec.DatasetManifest = response.json() 84 | 85 | return manifest 86 | 87 | elif source == 'local': 88 | import json 89 | 90 | if portal_root is not None: 91 | data_root = portal_root 92 | else: 93 | data_root = config_utils.get_data_root(require=True) 94 | path = os.path.join(data_root, dataset, spec.dataset_manifest_filename) 95 | with open(path, 'r') as f: 96 | result: spec.DatasetManifest = json.load(f) 97 | return result 98 | 99 | else: 100 | raise Exception('invalid source: ' + str(source)) 101 | 102 | 103 | def create_global_manifest( 104 | *, 105 | data_root: 
str | None = None, 106 | datasets: typing.Mapping[str, spec.DatasetManifestSlim] | None = None, 107 | version: str | None, 108 | output_path: str | bool | None = None, 109 | confirm: bool = False, 110 | ) -> spec.GlobalManifest: 111 | """create global manifest describing all datasets""" 112 | 113 | import json 114 | 115 | # versions 116 | if version is None: 117 | version = spec.global_version 118 | 119 | print('creating global manifest', version) 120 | print() 121 | 122 | # gather dataset manifests 123 | if datasets is None: 124 | if data_root is None: 125 | raise Exception('must specify data_root or datasets') 126 | found_datasets: typing.MutableMapping[ 127 | str, spec.DatasetManifestSlim 128 | ] = {} 129 | for item in os.listdir(data_root): 130 | path = os.path.join(data_root, item) 131 | if os.path.isdir(path): 132 | if spec.dataset_manifest_filename in os.listdir(path): 133 | manifest_path = os.path.join( 134 | path, spec.dataset_manifest_filename 135 | ) 136 | with open(manifest_path) as f: 137 | dataset_manifest = json.load(f) 138 | name = dataset_manifest['name'] 139 | found_datasets[name] = _reduce_dataset_manifest( 140 | dataset_manifest 141 | ) 142 | print('gathered:', item) 143 | else: 144 | print('no dataset manifest found for: ' + item) 145 | datasets = found_datasets 146 | 147 | # create global manifest 148 | global_manifest: spec.GlobalManifest = { 149 | 'version': version, 150 | 'datasets': datasets, 151 | } 152 | 153 | if len(datasets) == 0: 154 | print('no datasets detected for global manifest') 155 | return global_manifest 156 | 157 | # write output file 158 | if output_path is not None and output_path: 159 | import shutil 160 | 161 | if isinstance(output_path, bool): 162 | if data_root is not None: 163 | output_dir = data_root 164 | else: 165 | output_dir = '.' 
166 | output_path = os.path.join( 167 | output_dir, spec.global_manifest_filename 168 | ) 169 | 170 | if os.path.exists(output_path) and not confirm: 171 | raise Exception('use --confirm to overwrite existing file') 172 | 173 | with open(output_path + '_tmp', 'w') as f: 174 | json.dump(global_manifest, f, indent=4, sort_keys=True) 175 | shutil.move(output_path + '_tmp', output_path) 176 | if data_root is not None: 177 | print_dir = os.path.relpath(output_path, data_root) 178 | else: 179 | print_dir = output_path 180 | print() 181 | print('wrote global manifest to:', print_dir) 182 | 183 | return global_manifest 184 | 185 | 186 | def _reduce_dataset_manifest( 187 | manifest: spec.DatasetManifest, 188 | ) -> spec.DatasetManifestSlim: 189 | return { 190 | 'name': manifest['name'], 191 | 'version': manifest['version'], 192 | 'description': manifest['description'], 193 | 'datatype': manifest['datatype'], 194 | 'network': manifest['network'], 195 | 'n_files': len(manifest['files']), 196 | 'n_bytes': sum(file['n_bytes'] for file in manifest['files']), 197 | 'schema': manifest['schema'], 198 | } 199 | 200 | 201 | def create_dataset_manifest( 202 | *, 203 | dataset_dir: str | None = None, 204 | name: str | None = None, 205 | version: str | None = None, 206 | datatype: str | None = None, 207 | network: str | None = None, 208 | description: str | None = None, 209 | data_root: str | None = None, 210 | schema: toolsql.DBSchemaShorthand | None = None, 211 | paths: typing.Sequence[str] | None = None, 212 | reuse_hashes: bool = False, 213 | output_path: str | bool | None = None, 214 | confirm: bool = False, 215 | ) -> spec.DatasetManifest: 216 | """describe dataset manifest describing dataset contents""" 217 | 218 | import json 219 | 220 | # ensure valid output path 221 | if output_path is not None and output_path: 222 | if isinstance(output_path, bool): 223 | if data_root is not None: 224 | output_dir = data_root 225 | else: 226 | output_dir = '.' 
227 | output_path = os.path.join( 228 | output_dir, spec.dataset_manifest_filename 229 | ) 230 | if os.path.exists(output_path) and not confirm: 231 | raise Exception('use --confirm to overwrite existing file') 232 | 233 | # gather metadata 234 | if dataset_dir is not None: 235 | dataset_dir = os.path.abspath(os.path.expanduser(dataset_dir)) 236 | if name is None: 237 | if dataset_dir is None: 238 | raise Exception('must specify dataset_dir or name') 239 | name = os.path.basename(dataset_dir) 240 | if network is None or datatype is None: 241 | parsed = schema_utils.parse_dataset_name(name) 242 | parsed_network = parsed['network'] 243 | parsed_datatype = parsed['datatype'] 244 | if network is None: 245 | network = parsed_network 246 | elif network != parsed_network: 247 | raise Exception('parsed network does not equal input network') 248 | if datatype is None: 249 | datatype = parsed_datatype 250 | elif datatype != parsed_datatype: 251 | raise Exception('parsed datatype does not equal input datatype') 252 | try: 253 | module = schema_utils._get_datatype_module(datatype) 254 | except Exception: 255 | module = None 256 | if version is None: 257 | if module is not None: 258 | version = module.version 259 | else: 260 | raise Exception('unknown version for dataset') 261 | if description is None: 262 | if module is not None: 263 | description = module.schema['description'] 264 | else: 265 | raise Exception('could not find description for dataset') 266 | if schema is None: 267 | if module is not None: 268 | schema = module.schema 269 | else: 270 | raise Exception('could not find schema for dataset') 271 | schema_normalized = toolsql.normalize_shorthand_db_schema(schema) 272 | print('creating manifest for', name, version) 273 | 274 | # gather files 275 | if paths is None: 276 | if dataset_dir is None: 277 | raise Exception('must specify paths or dataset_dir') 278 | exclude = [ 279 | spec.dataset_manifest_filename, 280 | spec.dataset_readme_filename, 281 | ] + spec.dataset_license_filenames 282 | paths = [ 283 | os.path.join(dataset_dir, filename) 284 | for filename in sorted(os.listdir(dataset_dir)) 285 | if filename not in exclude 286 | ] 287 | 288 | # gather hashes 289 | print('gathering hashes of', len(paths), 'files') 290 | if reuse_hashes: 291 | if output_path is None: 292 | raise Exception('must specify output_path when reuse_hashes=True') 293 | with open(output_path, 'r') as f: 294 | old_data = json.load(f) 295 | old_hashes = {file['name']: file['hash'] for file in old_data['files']} 296 | file_hashes = [] 297 | for path in paths: 298 | if reuse_hashes and os.path.basename(path) in old_hashes: 299 | file_hashes.append(old_hashes[os.path.basename(path)]) 300 | else: 301 | file_hashes.append(file_utils.get_file_hash(path)) 302 | 303 | # assemble files 304 | files = [] 305 | for path, file_hash in zip(paths, file_hashes): 306 | file: spec.FileMetadata = { 307 | 'name': os.path.basename(path), 308 | 'hash': file_hash, 309 | 'n_bytes': os.path.getsize(path), 310 | } 311 | files.append(file) 312 | 313 | # build manifest 314 | manifest: spec.DatasetManifest = { 315 | 'name': name, 316 | 'version': version, 317 | 'description': description or '', 318 | 'datatype': datatype, 319 | 'network': network, 320 | 'files': files, 321 | 'schema': schema_normalized, 322 | } 323 | 324 | # save manifest 325 | if output_path is not None and output_path: 326 | import shutil 327 | 328 | with open(output_path + '_tmp', 'w') as f: 329 | json.dump(manifest, f, indent=4, sort_keys=True) 330 | 331 | 
shutil.move(output_path + '_tmp', output_path) 332 | if data_root is not None: 333 | print_dir = os.path.relpath(output_path, data_root) 334 | else: 335 | print_dir = output_path 336 | print('wrote dataset manifest to:', print_dir) 337 | 338 | return manifest 339 | 340 | -------------------------------------------------------------------------------- /pdp/data_utils/query_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import typing 5 | 6 | from .. import config_utils 7 | from .. import spec 8 | 9 | if typing.TYPE_CHECKING: 10 | import polars as pl 11 | 12 | 13 | def query( 14 | filters: spec.PolarsExpression, 15 | # outputs 16 | columns: spec.PolarsExpression | None = None, 17 | group_by: spec.PolarsExpression | None = None, 18 | output_binary: bool = True, 19 | sort: spec.PolarsExpression | None = None, 20 | descending: bool = False, 21 | unique_sort: spec.PolarsExpression | None = None, 22 | unique_descending: bool = False, 23 | unique_columns: typing.Sequence[str] | None = None, 24 | unique_keep: typing.Literal['last', 'first', 'any'] | None = None, 25 | # inputs 26 | source_path: str | None = None, 27 | dataset: str | None = None, 28 | network: str | int | None = None, 29 | datatype: str | None = None, 30 | table: str | None = None, 31 | scan_kwargs: typing.Any = None, 32 | # outputs 33 | collect: bool = True, 34 | streaming: bool = True, 35 | output_path: str | None = None, 36 | output_kwargs: typing.Any = None, 37 | ) -> pl.DataFrame: 38 | import polars as pl 39 | 40 | # determine data source 41 | if source_path is None: 42 | if network is None: 43 | raise Exception('must specify network (e.g. network=\'ethereum\')') 44 | source_path = config_utils.get_dataset_glob( 45 | network=network, 46 | datatype=datatype, 47 | dataset=dataset, 48 | table=table, 49 | ) 50 | elif os.path.isdir(source_path): 51 | source_path = os.path.join(source_path, '*.parquet') 52 | 53 | # initiate scan 54 | if scan_kwargs is None: 55 | scan_kwargs = {} 56 | lf = pl.scan_parquet(source_path, **scan_kwargs) 57 | 58 | # add filters 59 | if filters is not None: 60 | if isinstance(filters, (list, tuple)): 61 | filters_list = filters 62 | else: 63 | filters_list = [filters] 64 | 65 | if len(filters_list) > 0: 66 | filter = filters_list[0] 67 | for other_filter in filters_list[1:]: 68 | filter &= other_filter 69 | lf = lf.filter(filter) 70 | 71 | # filter unique 72 | if unique_columns is not None: 73 | if unique_keep is None: 74 | raise Exception( 75 | "must specify unique_keep (e.g. 
'first', 'last', or 'any')"
76 |             )
77 | 
78 |         # maintain order if unique_sort equals output sort
79 |         if unique_sort is not None:
80 |             lf = lf.sort(unique_sort, descending=unique_descending)
81 |             already_sorted: bool = _polars_exprs_equal(sort, unique_sort) and (
82 |                 descending == unique_descending
83 |             )
84 |         else:
85 |             already_sorted = False
86 | 
87 |         # keep unique
88 |         lf = lf.unique(
89 |             maintain_order=already_sorted,
90 |             subset=unique_columns,
91 |             keep=unique_keep,
92 |         )
93 |     else:
94 |         already_sorted = False
95 | 
96 |     # group by
97 |     if group_by is not None:
98 |         # group and aggregate
99 |         if columns is None:
100 |             raise Exception('must specify columns for agg when using groupby')
101 |         lf = lf.groupby(group_by).agg(columns)
102 | 
103 |         # sort
104 |         if sort:
105 |             lf = lf.sort(sort, descending=descending)
106 | 
107 |     else:
108 |         # sort
109 |         if sort and not already_sorted:
110 |             lf = lf.sort(sort, descending=descending)
111 | 
112 |         # select columns
113 |         if columns is not None:
114 |             lf = lf.select(columns)
115 | 
116 |     # encode binary as hex
117 |     if not output_binary:
118 |         encode_columns = [
119 |             ('0x' + pl.col(column_name).bin.encode('hex')).alias(column_name)
120 |             for column_name, column_type in lf.schema.items()
121 |             if column_type == pl.Binary
122 |         ]
123 |         lf = lf.with_columns(encode_columns)
124 | 
125 |     # return output
126 |     if output_kwargs is None:
127 |         output_kwargs = {}
128 |     if output_path:
129 |         return lf.sink_parquet(output_path, **output_kwargs)
130 |     elif collect and streaming:
131 |         return lf.collect(streaming=True, **output_kwargs)
132 |     elif collect:
133 |         return lf.collect()
134 |     else:
135 |         return lf  # type: ignore
136 | 
137 | 
138 | def create_query_filters(
139 |     *,
140 |     simple_filters: typing.Mapping[str, typing.Any] | None = None,
141 |     block_filters: typing.Mapping[str, int | None] | None = None,
142 |     binary_filters: typing.Mapping[str, str | bytes | None] | None = None,
143 |     binary_is_in_filters: typing.Mapping[
144 |         str, typing.Sequence[str | bytes] | None
145 |     ]
146 |     | None = None,
147 | ) -> typing.MutableSequence[pl.type_aliases.IntoExpr]:
148 |     import polars as pl
149 | 
150 |     filters: typing.MutableSequence[pl.type_aliases.IntoExpr] = []
151 | 
152 |     # block filters
153 |     if block_filters is not None:
154 |         start_block = block_filters.get('start_block')
155 |         end_block = block_filters.get('end_block')
156 |         block_number = block_filters.get('block_number')
157 |         if start_block is not None:
158 |             filters.append(pl.col('block_number') >= start_block)
159 |         if end_block is not None:
160 |             filters.append(pl.col('block_number') <= end_block)
161 |         if block_number is not None:
162 |             filters.append(pl.col('block_number') == block_number)
163 | 
164 |     # binary filters
165 |     if binary_filters is not None:
166 |         for column, value in binary_filters.items():
167 |             if value is not None:
168 |                 filters.append(pl.col(column) == spec.to_binary(value))
169 | 
170 |     # binary is_in filters
171 |     if binary_is_in_filters is not None:
172 |         for key, list_value in binary_is_in_filters.items():
173 |             if list_value is not None:
174 |                 binary_values = [
175 |                     spec.to_binary(subvalue) for subvalue in list_value
176 |                 ]
177 |                 filters.append(pl.col(key).is_in(binary_values))
178 | 
179 |     return filters
180 | 
181 | 
182 | def _polars_exprs_equal(
183 |     expr1: spec.PolarsExpression,
184 |     expr2: spec.PolarsExpression,
185 | ) -> bool:
186 |     import polars as pl  # imported lazily here, like elsewhere in this module
187 | 
188 |     if isinstance(expr1, str):
189 |         expr1 = pl.col(expr1)
190 |     if isinstance(expr2, str):
191 |         expr2 = pl.col(expr2)
192 |     return
str(expr1) == str(expr2)
193 | 
194 | 
--------------------------------------------------------------------------------
/pdp/data_utils/readme_utils.py:
--------------------------------------------------------------------------------
1 | """functions for creating dataset READMEs"""
2 | 
3 | from __future__ import annotations
4 | 
5 | import os
6 | 
7 | from .. import spec
8 | from . import download_utils
9 | from . import schema_utils
10 | 
11 | 
12 | readme_template = """
13 | # {network} {datatype} Dataset v{version}
14 | 
15 | This is a dataset of {description}
16 | 
17 | The dataset was created by using [this script]({script_url})
18 | 
19 | Data is distributed as [parquet](https://data.paradigm.xyz/about) files and released into the public domain under a [CC0 license](https://creativecommons.org/share-your-work/public-domain/cc0/)
20 | 
21 | ## Usage
22 | 
23 | Some example uses of this dataset include:
24 | {example_usage}
25 | 
26 | {notebook_str}
27 | 
28 | ## Schema
29 | 
30 | {schema}
31 | 
32 | ## Download
33 | 
34 | This dataset can be downloaded using either the `pdp` cli tool or the urls below
35 | 
36 | The total dataset size is **{dataset_size}**
37 | 
38 | ### Use `pdp`
39 | 
40 | The command `pdp download {dataset_name}` will download all files in this dataset
41 | 
42 | See `pdp download -h` for available options
43 | 
44 | ### Use URLs
45 | 
46 | | | file | size |
47 | | - | - | - |
48 | {file_urls}
49 | """
50 | 
51 | script_url_template = 'https://github.com/paradigmxyz/paradigm-data-portal/blob/main/pdp/datasets/{dataset}/{dataset}_collect.py'
52 | 
53 | notebook_url_template = 'https://github.com/paradigmxyz/paradigm-data-portal/blob/main/notebooks/explore_{dataset_name}.ipynb'
54 | 
55 | table_schema_template = """#### `{table_name}` table
56 | {table_description}
57 | | column | type | description |
58 | | - | - | - |
59 | {table_rows}"""
60 | 
61 | 
62 | def create_dataset_readme(
63 |     dataset_manifest: spec.DatasetManifest,
64 |     output_path: str | bool = False,
65 |     confirm: bool = False,
66 | ) -> str:
67 | 
68 |     import shutil
69 | 
70 |     readme_str = _create_readme_str(dataset_manifest)
71 | 
72 |     if output_path is not None and output_path:
73 |         if isinstance(output_path, bool):
74 |             output_path = spec.dataset_readme_filename
75 |         if os.path.exists(output_path) and not confirm:
76 |             raise Exception('use --confirm to overwrite existing README')
77 |         with open(output_path + '_tmp', 'w') as f:
78 |             f.write(readme_str)
79 |         shutil.move(output_path + '_tmp', output_path)
80 |         print('wrote README to ' + output_path)
81 | 
82 |     return readme_str
83 | 
84 | 
85 | def _create_readme_str(dataset_manifest: spec.DatasetManifest) -> str:
86 | 
87 |     import toolsql
88 |     import toolstr
89 | 
90 |     module = schema_utils._get_datatype_module(dataset_manifest['datatype'])
91 |     example_usage_pieces = ['- ' + example for example in module.example_usage]
92 |     example_usage_str = '\n'.join(example_usage_pieces)
93 | 
94 |     schema_pieces = []
95 |     db_schema = toolsql.normalize_shorthand_db_schema(
96 |         dataset_manifest['schema']
97 |     )
98 |     for table_name, table in db_schema['tables'].items():
99 |         table_table_pieces = [
100 |             '| '
101 |             + column['name']
102 |             + ' | '
103 |             + column['type']
104 |             + ' | '
105 |             + (column['description'] or '')
106 |             + ' |'
107 |             for column in table['columns']
108 |         ]
109 |         table_table = '\n'.join(table_table_pieces)
110 |         table_schema_str = table_schema_template.format(
111 |             table_name=table['name'],
112 |             table_description=table['description'],
113 |             table_rows=table_table,
114 |         )
115 | 
schema_pieces.append(table_schema_str) 116 | schema_str = '\n'.join(schema_pieces) 117 | 118 | url_pieces: list[str] = [] 119 | for file in dataset_manifest['files']: 120 | if file['name'] == spec.dataset_readme_filename: 121 | continue 122 | file_url = download_utils.get_dataset_file_url( 123 | datatype=dataset_manifest['datatype'], 124 | network=dataset_manifest['network'], 125 | filename=file['name'], 126 | ) 127 | url_piece = ( 128 | '| ' 129 | + str(len(url_pieces) + 1) 130 | + ' | ' 131 | + ('[' + file['name'] + '](' + file_url + ')') 132 | + ' | ' 133 | + toolstr.format_nbytes(file['n_bytes']) 134 | + ' |' 135 | ) 136 | url_pieces.append(url_piece) 137 | url_str = '\n'.join(url_pieces) 138 | 139 | dataset_nbytes = sum(file['n_bytes'] for file in dataset_manifest['files']) 140 | 141 | if dataset_manifest['datatype'] == 'contracts': 142 | notebook_url = notebook_url_template.format( 143 | dataset_name=dataset_manifest['name'] 144 | ) 145 | notebook_str = """An example notebook exploring this dataset can be found [here]({notebook_url})""".format( 146 | notebook_url=notebook_url 147 | ) 148 | else: 149 | notebook_str = '' 150 | 151 | return readme_template.format( 152 | dataset_name=dataset_manifest['name'], 153 | network=dataset_manifest['network'].replace('_', ' ').title(), 154 | datatype=dataset_manifest['datatype'].replace('_', ' ').title(), 155 | version=dataset_manifest['version'], 156 | description=dataset_manifest['description'], 157 | example_usage=example_usage_str, 158 | schema=schema_str, 159 | dataset_size=toolstr.format_nbytes(dataset_nbytes), 160 | script_url=script_url_template.format(dataset=dataset_manifest['name']), 161 | notebook_str=notebook_str, 162 | file_urls=url_str, 163 | ) 164 | 165 | -------------------------------------------------------------------------------- /pdp/data_utils/schema_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | from .. 
import spec 6 | 7 | if typing.TYPE_CHECKING: 8 | import types 9 | import toolsql 10 | 11 | 12 | def get_dataset_name(*, datatype: str, network: str | int) -> str: 13 | """create dataset name based on metadata""" 14 | if isinstance(network, int): 15 | network = spec.networks[network] 16 | return network + '_' + datatype 17 | 18 | 19 | def get_versioned_dataset_name(*, datatype: str, network: str | int, version: str) -> str: 20 | """create versioned dataset name for use in file names""" 21 | dataset_name = get_dataset_name(datatype=datatype, network=network) 22 | version_str = 'v' + version.replace('.', '_') 23 | return dataset_name + '__' + version_str 24 | 25 | 26 | def parse_dataset_name(dataset: str) -> typing.Mapping[str, str]: 27 | """parse metadata from a dataset name""" 28 | network, datatype = dataset.split('_', maxsplit=1) 29 | return { 30 | 'network': network, 31 | 'datatype': datatype, 32 | } 33 | 34 | 35 | def get_datatype_schema(datatype: str) -> toolsql.DBSchema: 36 | import toolsql 37 | 38 | module = _get_datatype_module(datatype) 39 | schema: toolsql.DBSchema = toolsql.normalize_shorthand_db_schema( 40 | module.schema 41 | ) 42 | return schema 43 | 44 | 45 | def get_dataset_schema( 46 | dataset: str, multichain_tables: bool = False 47 | ) -> toolsql.DBSchema: 48 | import copy 49 | import toolsql 50 | 51 | # parse dataset name 52 | parsed = parse_dataset_name(dataset) 53 | network = parsed['network'] 54 | 55 | # load schema 56 | datatype_schema = get_datatype_schema(parsed['datatype']) 57 | datatype_schema = copy.deepcopy(datatype_schema) 58 | datatype_schema['name'] = dataset 59 | 60 | # make schema names dataset-specific 61 | if multichain_tables: 62 | # if a multichain table, add chain_id to each table 63 | for table_schema in datatype_schema['tables'].values(): 64 | 65 | # make chain_id primary only if there exist other primary keys 66 | is_primary = any( 67 | other_column.get('primary') 68 | for other_column in table_schema['columns'] 69 | ) 70 | raw_column: toolsql.ColumnSchemaShorthand = { 71 | 'name': 'chain_id', 72 | 'type': 'INTEGER', 73 | 'index': True, 74 | 'primary': is_primary, 75 | } 76 | column = toolsql.normalize_shorthand_column_schema(raw_column) 77 | table_schema['columns'].append(column) # type: ignore 78 | 79 | else: 80 | # if a single chain table, add network to table names 81 | datatype_schema['tables'] = { 82 | network + '_' + k: v for k, v in datatype_schema['tables'].items() 83 | } 84 | 85 | return datatype_schema 86 | 87 | 88 | def _get_datatype_module(datatype: str) -> types.ModuleType: 89 | import importlib 90 | 91 | try: 92 | return importlib.import_module('pdp.datasets.' 
+ datatype) 93 | except Exception: 94 | raise Exception('could not get module for dataset' + str(datatype)) 95 | 96 | -------------------------------------------------------------------------------- /pdp/data_utils/update_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | 6 | def update( 7 | dataset: str, 8 | *, 9 | method: typing.Literal['download', 'collect'] = 'download', 10 | ) -> None: 11 | raise NotImplementedError() 12 | 13 | -------------------------------------------------------------------------------- /pdp/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paradigmxyz/paradigm-data-portal/335f0c8c932bdf1295c67028b11de73a8d92384e/pdp/datasets/__init__.py -------------------------------------------------------------------------------- /pdp/datasets/contracts/__init__.py: -------------------------------------------------------------------------------- 1 | from .contracts_collect import * 2 | from .contracts_queries import * 3 | from .contracts_spec import * 4 | -------------------------------------------------------------------------------- /pdp/datasets/contracts/contracts_collect.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import typing 5 | 6 | import pdp 7 | from . import contracts_spec 8 | 9 | if typing.TYPE_CHECKING: 10 | import ctc.spec 11 | import tooljob.trackers.file_tracker 12 | 13 | 14 | def collect_contracts_dataset( 15 | *, 16 | start_block: int, 17 | end_block: int, 18 | output_dir: str, 19 | network: ctc.spec.NetworkReference, 20 | chunk_size: int | None = None, 21 | output_filetype: str | None = None, 22 | executor: typing.Literal['serial', 'parallel'] = 'parallel', 23 | verbose: bool = False, 24 | ) -> None: 25 | if chunk_size is None: 26 | chunk_size = 1000 27 | if output_filetype is None: 28 | output_filetype = 'csv' 29 | 30 | dataset_name = pdp.get_versioned_dataset_name( 31 | datatype='contracts', 32 | network=network, 33 | version=contracts_spec.version, 34 | ) 35 | 36 | extractor = _ExtractContracts( 37 | start_block=start_block, 38 | end_block=end_block, 39 | chunk_size=chunk_size, 40 | output_dir=output_dir, 41 | tracker='file', 42 | output_filetype=output_filetype, 43 | name=dataset_name, 44 | context={'network': network}, 45 | styles=pdp.styles, 46 | verbose=verbose, 47 | ) 48 | 49 | extractor.orchestrate_jobs(executor=executor) 50 | 51 | 52 | class _ExtractContracts(pdp.BlockChunkJobs): 53 | tracker: tooljob.trackers.file_tracker.FileTracker 54 | 55 | def execute_job(self, i: int) -> typing.Any: 56 | job_data = self.get_job_data(i) 57 | job_name = self.get_job_name(i) 58 | path = self.tracker.get_job_output_path(i) 59 | _sync_trace_blocks( 60 | start_block=job_data['start_block'], 61 | end_block=job_data['end_block'], 62 | job_name=job_name, 63 | path=path, 64 | context=self.context, 65 | ) 66 | 67 | 68 | async def _async_trace_blocks( 69 | *, 70 | start_block: int, 71 | end_block: ctc.spec.BlockNumberReference, 72 | path: str, 73 | context: ctc.spec.Context, 74 | ) -> None: 75 | import polars as pl 76 | 77 | pdp.ensure_ctc() 78 | import ctc 79 | import ctc.rpc 80 | from ctc.toolbox import pl_utils 81 | 82 | create_traces = await ctc.async_trace_contract_creations( 83 | start_block=start_block, 84 | end_block=end_block, 85 | context=context, 86 | ) 87 | 
await ctc.rpc.async_close_http_session() 88 | 89 | df = pl.DataFrame(create_traces) 90 | pl_utils.write_df(df=df, path=path, create_dir=True) 91 | 92 | return None 93 | 94 | 95 | def _sync_trace_blocks( 96 | *, 97 | start_block: int, 98 | end_block: ctc.spec.BlockNumberReference, 99 | job_name: str, 100 | path: str, 101 | context: ctc.spec.Context, 102 | ) -> None: 103 | try: 104 | asyncio.run( 105 | _async_trace_blocks( 106 | start_block=start_block, 107 | end_block=end_block, 108 | path=path, 109 | context=context, 110 | ) 111 | ) 112 | except Exception as e: 113 | print('job', job_name, 'failed:' + str(e)) 114 | raise e 115 | 116 | -------------------------------------------------------------------------------- /pdp/datasets/contracts/contracts_queries.py: -------------------------------------------------------------------------------- 1 | """ 2 | TODO 3 | - create block_number predicate pushdown 4 | """ 5 | 6 | from __future__ import annotations 7 | 8 | import typing 9 | 10 | import ctc 11 | 12 | import pdp 13 | from . import contracts_spec 14 | 15 | if typing.TYPE_CHECKING: 16 | import polars as pl 17 | 18 | 19 | def query_contract( 20 | contract_address: str | bytes, **kwargs: typing.Any 21 | ) -> contracts_spec.Contract | None: 22 | """return most recent deployment of contract""" 23 | 24 | # query data 25 | result = query_contracts(contract_address=contract_address, collect=True, **kwargs) 26 | 27 | # convert to dict 28 | if len(result) == 0: 29 | return None 30 | else: 31 | return result.to_dicts()[0] # type: ignore 32 | 33 | 34 | def query_contracts( 35 | *, 36 | # filters 37 | contract_address: str | bytes | None = None, 38 | contract_addresses: typing.Sequence[str | bytes] | None = None, 39 | deployer: str | bytes | None = None, 40 | factory: str | bytes | None = None, 41 | start_block: int | None = None, 42 | end_block: int | None = None, 43 | block_number: int | None = None, 44 | code: str | bytes | None = None, 45 | code_hash: str | bytes | None = None, 46 | init_code: str | bytes | None = None, 47 | init_code_hash: str | bytes | None = None, 48 | # outputs 49 | sort: bool | pdp.PolarsExpression = False, 50 | descending: bool = False, 51 | unique: bool = False, 52 | unique_keep: typing.Literal['last', 'first', 'any'] | None = 'last', 53 | columns: pdp.PolarsExpression | None = None, 54 | output_binary: bool = True, 55 | # inputs 56 | source_path: str | None = None, 57 | network: str | int | None = None, 58 | scan_kwargs: typing.Any = None, 59 | collect: bool = True, 60 | streaming: bool = True, 61 | ) -> pl.DataFrame: 62 | 63 | # convert to hashes 64 | if code is not None: 65 | code_hash = ctc.keccak(code) 66 | if init_code is not None: 67 | init_code_hash = ctc.keccak(init_code) 68 | 69 | # collect filters 70 | block_filters = { 71 | 'start_block': start_block, 72 | 'end_block': end_block, 73 | 'block_number': block_number, 74 | } 75 | binary_filters = { 76 | 'factory': factory, 77 | 'deployer': deployer, 78 | 'contract_address': contract_address, 79 | 'code_hash': code_hash, 80 | 'init_code_hash': init_code_hash, 81 | } 82 | filters = pdp.create_query_filters( 83 | binary_filters=binary_filters, 84 | block_filters=block_filters, 85 | binary_is_in_filters={'contract_address': contract_addresses}, 86 | ) 87 | 88 | # keep unique contracts 89 | if unique: 90 | unique_sort = ['block_number', 'create_index'] 91 | unique_descending = False 92 | unique_columns = ['contract_address'] 93 | else: 94 | unique_sort = None 95 | unique_descending = False 96 | unique_columns = None 97 | 98 
| # create sort expression 99 | if sort and isinstance(sort, bool): 100 | sort = ['block_number', 'create_index'] 101 | 102 | return pdp.query( 103 | datatype='contracts', 104 | filters=filters, 105 | sort=sort, 106 | descending=descending, 107 | columns=columns, 108 | output_binary=output_binary, 109 | source_path=source_path, 110 | network=network, 111 | unique_columns=unique_columns, 112 | unique_sort=unique_sort, 113 | unique_descending=unique_descending, 114 | unique_keep=unique_keep, 115 | scan_kwargs=scan_kwargs, 116 | collect=collect, 117 | streaming=streaming, 118 | ) 119 | 120 | -------------------------------------------------------------------------------- /pdp/datasets/contracts/contracts_spec.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | if typing.TYPE_CHECKING: 6 | import toolsql 7 | 8 | class Contract(typing.TypedDict, total=False): 9 | block_number: int 10 | create_index: int 11 | transaction_hash: str 12 | contract_address: str 13 | deployer: str 14 | factory: str 15 | init_code: str 16 | code: str 17 | init_code_hash: str 18 | code_hash: str 19 | 20 | class ContractBinary(typing.TypedDict, total=False): 21 | block_number: int 22 | create_index: int 23 | transaction_hash: bytes 24 | contract_address: bytes 25 | deployer: bytes 26 | factory: bytes 27 | init_code: bytes 28 | code: bytes 29 | init_code_hash: bytes 30 | code_hash: bytes 31 | 32 | 33 | version = '1.1.0' 34 | 35 | example_usage = [ 36 | 'look up all contracts deployed by an address', 37 | 'look up all contracts that have a given bytecode', 38 | 'analyze distribution of contract bytecode motifs', 39 | ] 40 | 41 | schema: toolsql.DBSchemaShorthand = { 42 | 'name': 'contracts', 43 | 'description': 'all historical contract deployments', 44 | 'tables': { 45 | 'contracts': { 46 | 'description': 'each row corresponds to a contract create trace', 47 | 'columns': [ 48 | { 49 | 'name': 'block_number', 50 | 'type': 'INTEGER', 51 | 'description': 'block number when contract was created', 52 | 'primary': True, 53 | }, 54 | { 55 | 'name': 'create_index', 56 | 'type': 'INTEGER', 57 | 'description': 'increased by 1 for each contract created in block', 58 | }, 59 | { 60 | 'name': 'transaction_hash', 61 | 'type': 'BINARY', 62 | 'description': 'hash of transaction that created contract', 63 | 'index': True, 64 | }, 65 | { 66 | 'name': 'contract_address', 67 | 'type': 'BINARY', 68 | 'description': 'address of deployed contract', 69 | 'primary': True, 70 | }, 71 | { 72 | 'name': 'deployer', 73 | 'type': 'BINARY', 74 | 'description': 'EOA that deployed the contract', 75 | 'index': True, 76 | }, 77 | { 78 | 'name': 'factory', 79 | 'type': 'BINARY', 80 | 'description': 'the `from` field in the creation trace', 81 | 'index': True, 82 | }, 83 | { 84 | 'name': 'init_code', 85 | 'type': 'BINARY', 86 | 'description': 'initialization bytecode of contract', 87 | }, 88 | { 89 | 'name': 'code', 90 | 'type': 'BINARY', 91 | 'description': 'bytecode of contract', 92 | }, 93 | { 94 | 'name': 'init_code_hash', 95 | 'type': 'BINARY', 96 | 'description': 'keccak hash of contract initialization code', 97 | }, 98 | { 99 | 'name': 'code_hash', 100 | 'type': 'BINARY', 101 | 'description': 'keccak hash of contract bytecode', 102 | 'index': True, 103 | }, 104 | ], 105 | }, 106 | }, 107 | } 108 | 109 | -------------------------------------------------------------------------------- /pdp/datasets/native_transfers/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .native_transfers_collect import * 2 | from .native_transfers_queries import * 3 | from .native_transfers_spec import * 4 | -------------------------------------------------------------------------------- /pdp/datasets/native_transfers/native_transfers_collect.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import typing 5 | 6 | import polars as pl 7 | 8 | import pdp 9 | from . import native_transfers_spec 10 | 11 | if typing.TYPE_CHECKING: 12 | import ctc.spec 13 | import tooljob.trackers.file_tracker 14 | 15 | 16 | def collect_native_transfers_dataset( 17 | *, 18 | start_block: int, 19 | end_block: int, 20 | output_dir: str, 21 | network: ctc.spec.NetworkReference, 22 | chunk_size: int | None = None, 23 | output_filetype: str | None = None, 24 | executor: typing.Literal['serial', 'parallel'] = 'parallel', 25 | verbose: bool = False, 26 | ) -> None: 27 | if chunk_size is None: 28 | chunk_size = 1000 29 | if output_filetype is None: 30 | output_filetype = 'csv' 31 | 32 | dataset_name = pdp.get_versioned_dataset_name( 33 | datatype='native_transfers', 34 | network=network, 35 | version=native_transfers_spec.version, 36 | ) 37 | 38 | extractor = _ExtractNativeTransfers( 39 | start_block=start_block, 40 | end_block=end_block, 41 | chunk_size=chunk_size, 42 | output_dir=output_dir, 43 | tracker='file', 44 | output_filetype=output_filetype, 45 | name=dataset_name, 46 | context={'network': network}, 47 | styles=pdp.styles, 48 | verbose=verbose, 49 | ) 50 | 51 | extractor.orchestrate_jobs(executor=executor) 52 | 53 | 54 | class _ExtractNativeTransfers(pdp.BlockChunkJobs): 55 | tracker: tooljob.trackers.file_tracker.FileTracker 56 | 57 | def execute_job(self, i: int) -> typing.Any: 58 | job_data = self.get_job_data(i) 59 | job_name = self.get_job_name(i) 60 | path = self.tracker.get_job_output_path(i) 61 | _sync_extract_native_transfers( 62 | start_block=job_data['start_block'], 63 | end_block=job_data['end_block'], 64 | job_name=job_name, 65 | path=path, 66 | context=self.context, 67 | ) 68 | 69 | 70 | def _sync_extract_native_transfers( 71 | *, 72 | start_block: int, 73 | end_block: ctc.spec.BlockNumberReference, 74 | job_name: str, 75 | path: str, 76 | context: ctc.spec.Context, 77 | ) -> None: 78 | try: 79 | asyncio.run( 80 | _async_extract_native_transfers( 81 | start_block=start_block, 82 | end_block=end_block, 83 | path=path, 84 | context=context, 85 | ) 86 | ) 87 | except Exception as e: 88 | print('job', job_name, 'failed:' + str(e)) 89 | raise e 90 | 91 | 92 | async def _async_extract_native_transfers( 93 | *, 94 | start_block: int, 95 | end_block: ctc.spec.BlockNumberReference, 96 | path: str, 97 | context: ctc.spec.Context, 98 | ) -> None: 99 | pdp.ensure_ctc() 100 | import ctc 101 | import ctc.rpc 102 | from ctc.toolbox import pl_utils 103 | 104 | transfers = await ctc.async_trace_native_transfers( 105 | start_block=start_block, 106 | end_block=end_block, 107 | context=context, 108 | ) 109 | 110 | await ctc.rpc.async_close_http_session(context=context) 111 | 112 | # load data into output file 113 | df = pl.DataFrame( 114 | transfers, 115 | orient='row', 116 | schema=[ 117 | ('block_number', pl.datatypes.Int32), 118 | ('transfer_index', pl.datatypes.Int32), 119 | ('transaction_hash', pl.datatypes.Utf8), 120 | ('from_address', pl.datatypes.Utf8), 121 | ('to_address', pl.datatypes.Utf8), 122 | 
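            # 'value' is kept as a string rather than an integer because wei
            # amounts routinely exceed the 64-bit integer range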
('value', pl.datatypes.Utf8), 123 | ], 124 | ) 125 | pl_utils.write_df(df=df, path=path, create_dir=True) 126 | 127 | -------------------------------------------------------------------------------- /pdp/datasets/native_transfers/native_transfers_queries.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | import pdp 6 | 7 | if typing.TYPE_CHECKING: 8 | import polars as pl 9 | 10 | 11 | def query_native_transfers( 12 | # filters 13 | from_address: str | bytes | None = None, 14 | to_address: str | bytes | None = None, 15 | from_addresses: typing.Sequence[str | bytes] | None = None, 16 | to_addresses: typing.Sequence[str | bytes] | None = None, 17 | start_block: int | None = None, 18 | end_block: int | None = None, 19 | block_number: int | None = None, 20 | # outputs 21 | sort: bool | pdp.PolarsExpression = True, 22 | descending: bool = False, 23 | unique_keep: typing.Literal['last', 'first', 'any'] = 'last', 24 | columns: pdp.PolarsExpression | None = None, 25 | output_binary: bool = True, 26 | # inputs 27 | source_path: str | None = None, 28 | network: str | int | None = None, 29 | scan_kwargs: typing.Any = None, 30 | collect: bool = True, 31 | streaming: bool = True, 32 | ) -> pl.DataFrame: 33 | 34 | # collect filters 35 | block_filters = { 36 | 'start_block': start_block, 37 | 'end_block': end_block, 38 | 'block_number': block_number, 39 | } 40 | binary_filters = { 41 | 'from_address': from_address, 42 | 'to_address': to_address, 43 | } 44 | binary_is_in_filters = { 45 | 'from_addresses': from_addresses, 46 | 'to_addresses': to_addresses, 47 | } 48 | filters = pdp.create_query_filters( 49 | binary_filters=binary_filters, 50 | block_filters=block_filters, 51 | binary_is_in_filters=binary_is_in_filters, 52 | ) 53 | 54 | if sort and isinstance(sort, bool): 55 | sort = ['block_number', 'transfer_index'] 56 | 57 | return pdp.query( 58 | datatype='native_transfers', 59 | filters=filters, 60 | sort=sort, 61 | descending=descending, 62 | columns=columns, 63 | output_binary=output_binary, 64 | source_path=source_path, 65 | network=network, 66 | unique_keep=unique_keep, 67 | scan_kwargs=scan_kwargs, 68 | collect=collect, 69 | streaming=streaming, 70 | ) 71 | 72 | -------------------------------------------------------------------------------- /pdp/datasets/native_transfers/native_transfers_spec.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | if typing.TYPE_CHECKING: 6 | import toolsql 7 | 8 | 9 | version = '1.1.0' 10 | 11 | example_usage = [ 12 | 'look up all inbound transfers to an address', 13 | 'analyze transfer size distributions', 14 | 'analyze transfer frequency distributions', 15 | ] 16 | 17 | schema: toolsql.DBSchemaShorthand = { 18 | 'name': 'native_transfers', 19 | 'description': 'all native transfers in similar format to ERC20 Transfers (excluding tx fees)', 20 | 'tables': { 21 | 'native_transfers': { 22 | 'description': 'each row corresponds to a trace that transfers native token', 23 | 'columns': [ 24 | { 25 | 'name': 'block_number', 26 | 'type': 'INTEGER', 27 | 'description': 'block number where native token was transfered', 28 | }, 29 | { 30 | 'name': 'transfer_index', 31 | 'type': 'INTEGER', 32 | 'description': 'increased by 1 for each native transfer in block', 33 | }, 34 | { 35 | 'name': 'transaction_hash', 36 | 'type': 'BINARY', 37 | 'description': 'hash of transaction that 
contains transfer', 38 | }, 39 | { 40 | 'name': 'to_address', 41 | 'type': 'BINARY', 42 | 'description': 'address that native token is transferred to', 43 | }, 44 | { 45 | 'name': 'from_address', 46 | 'type': 'BINARY', 47 | 'description': 'address that native token is transferred from', 48 | }, 49 | { 50 | 'name': 'value', 51 | 'type': 'BINARY', 52 | 'description': 'amount of native token transferred', 53 | }, 54 | ], 55 | }, 56 | }, 57 | } 58 | 59 | -------------------------------------------------------------------------------- /pdp/datasets/slots/__init__.py: -------------------------------------------------------------------------------- 1 | from .slots_collect import * 2 | from .slots_queries import * 3 | from .slots_spec import * 4 | -------------------------------------------------------------------------------- /pdp/datasets/slots/slots_collect.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import typing 5 | 6 | import pdp 7 | from . import slots_spec 8 | 9 | if typing.TYPE_CHECKING: 10 | import ctc.spec 11 | import tooljob.trackers.file_tracker 12 | 13 | 14 | def collect_slots_dataset( 15 | *, 16 | start_block: int, 17 | end_block: int, 18 | output_dir: str, 19 | network: ctc.spec.NetworkReference, 20 | chunk_size: int | None = None, 21 | output_filetype: str | None = None, 22 | executor: typing.Literal['serial', 'parallel'] = 'parallel', 23 | verbose: bool = False, 24 | ) -> None: 25 | if chunk_size is None: 26 | chunk_size = 1000 27 | if output_filetype is None: 28 | output_filetype = 'parquet' 29 | 30 | dataset_name = pdp.get_versioned_dataset_name( 31 | datatype='slots', 32 | network=network, 33 | version=slots_spec.version, 34 | ) 35 | 36 | extractor = _ExtractSlots( 37 | start_block=start_block, 38 | end_block=end_block, 39 | chunk_size=chunk_size, 40 | output_dir=output_dir, 41 | tracker='file', 42 | output_filetype=output_filetype, 43 | name=dataset_name, 44 | context={'network': network}, 45 | styles=pdp.styles, 46 | verbose=verbose, 47 | ) 48 | 49 | extractor.orchestrate_jobs(executor=executor) 50 | 51 | 52 | class _ExtractSlots(pdp.BlockChunkJobs): 53 | tracker: tooljob.trackers.file_tracker.FileTracker 54 | 55 | def execute_job(self, i: int) -> typing.Any: 56 | job_data = self.get_job_data(i) 57 | job_name = self.get_job_name(i) 58 | path = self.tracker.get_job_output_path(i) 59 | _sync_extract_slots( 60 | start_block=job_data['start_block'], 61 | end_block=job_data['end_block'], 62 | job_name=job_name, 63 | path=path, 64 | context=self.context, 65 | ) 66 | 67 | 68 | def _sync_extract_slots( 69 | *, 70 | start_block: int, 71 | end_block: ctc.spec.BlockNumberReference, 72 | job_name: str, 73 | path: str, 74 | context: ctc.spec.Context, 75 | ) -> None: 76 | try: 77 | asyncio.run( 78 | _async_extract_slots( 79 | start_block=start_block, 80 | end_block=end_block, 81 | path=path, 82 | context=context, 83 | ) 84 | ) 85 | except Exception as e: 86 | print('job', job_name, 'failed:' + str(e)) 87 | raise e 88 | 89 | 90 | async def _async_extract_slots( 91 | *, 92 | start_block: int, 93 | end_block: ctc.spec.BlockNumberReference, 94 | path: str, 95 | context: ctc.spec.Context, 96 | ) -> None: 97 | pdp.ensure_ctc() 98 | import ctc 99 | import ctc.rpc 100 | from ctc.toolbox import pl_utils 101 | 102 | df = await ctc.async_trace_slot_stats( 103 | start_block=start_block, 104 | end_block=end_block, 105 | context=context, 106 | ) 107 | 108 | await ctc.rpc.async_close_http_session() 
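    # close the RPC http session before the per-job event loop created by
    # asyncio.run() shuts down, then write the chunk to disk below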
109 | 110 | pl_utils.write_df(df=df, path=path, create_dir=True) 111 | 112 | -------------------------------------------------------------------------------- /pdp/datasets/slots/slots_queries.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | import pdp 6 | from . import slots_spec 7 | 8 | if typing.TYPE_CHECKING: 9 | import polars as pl 10 | 11 | 12 | def query_slots_of_contract( 13 | contract_address: str | bytes, 14 | network: str | int | None = None, 15 | **query_kwargs: typing.Any 16 | ) -> pl.DataFrame: 17 | return query_slots( 18 | contract_address=contract_address, 19 | network=network, 20 | **query_kwargs, 21 | ) 22 | 23 | 24 | def query_contract_slot_counts( 25 | network: str | int | None = None, 26 | **query_kwargs: typing.Any, 27 | ) -> pl.DataFrame: 28 | lf: pl.LazyFrame = ( 29 | query_slots(collect=False, **query_kwargs) # type: ignore 30 | .groupby('contract_address') 31 | .agg(pl.count()) 32 | .sort('counts', descending=True) 33 | ) 34 | 35 | return lf.collect(streaming=True) 36 | 37 | 38 | def query_slot( 39 | contract_address: str | bytes, 40 | slot: str | bytes, 41 | network: str | int | None = None, 42 | ) -> slots_spec.Slot | None: 43 | result = query_slots( 44 | contract_address=contract_address, 45 | slot=slot, 46 | ) 47 | if len(result) == 1: 48 | return result.to_dicts()[0] # type: ignore 49 | else: 50 | return None 51 | 52 | 53 | def query_slots( 54 | # filters 55 | contract_address: str | bytes | None = None, 56 | contract_addresses: typing.Sequence[str | bytes] | None = None, 57 | slot: str | bytes | None = None, 58 | slots: typing.Sequence[str | bytes] | None = None, 59 | # outputs 60 | sort: bool | pdp.PolarsExpression = True, 61 | unique_keep: typing.Literal['last', 'first', 'all'] = 'last', 62 | columns: pdp.PolarsExpression | None = None, 63 | output_binary: bool = True, 64 | # inputs 65 | source_path: str | None = None, 66 | network: str | int | None = None, 67 | scan_kwargs: typing.Any = None, 68 | collect: bool = True, 69 | streaming: bool = True, 70 | ) -> pl.DataFrame: 71 | 72 | # filters 73 | binary_filters = { 74 | 'contract_address': contract_address, 75 | 'slot': slot, 76 | } 77 | binary_is_in_filters = { 78 | 'contract_addresses': contract_addresses, 79 | 'slots': slots, 80 | } 81 | filters = pdp.create_query_filters( 82 | binary_filters=binary_filters, 83 | binary_is_in_filters=binary_is_in_filters, 84 | ) 85 | 86 | return pdp.query( 87 | filters=filters, 88 | sort=sort, 89 | columns=columns, 90 | output_binary=output_binary, 91 | source_path=source_path, 92 | network=network, 93 | datatype='slots', 94 | scan_kwargs=scan_kwargs, 95 | collect=collect, 96 | streaming=streaming, 97 | ) 98 | 99 | -------------------------------------------------------------------------------- /pdp/datasets/slots/slots_spec.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | if typing.TYPE_CHECKING: 6 | import toolsql 7 | 8 | class Slot(typing.TypedDict): 9 | contract_address: str 10 | slot: str 11 | value: bytes 12 | first_updated_block: int 13 | last_updated_block: int 14 | n_tx_updates: int 15 | 16 | 17 | version = '1.1.0' 18 | 19 | example_usage = [ 20 | 'look up how much storage space is used by a given contract', 21 | 'look up which slots are used by a given contract', 22 | 'look up which slots change most frequently for a given contract', 23 | ] 24 | 25 | 
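# a minimal usage sketch for the examples above (hypothetical address value;
# the query helpers are defined in slots_queries.py and re-exported from
# pdp.datasets.slots):
#
#     from pdp.datasets.slots import query_slots, query_contract_slot_counts
#
#     contract_slots = query_slots(contract_address='0x...')
#     slot_counts = query_contract_slot_counts()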
schema: toolsql.DBSchemaShorthand = { 26 | 'name': 'slots', 27 | 'description': 'all slots of each contract, including historical usage metadata', 28 | 'tables': { 29 | 'slots': { 30 | 'description': 'each row corresponds to a slot of a contract', 31 | 'columns': [ 32 | { 33 | 'name': 'contract_address', 34 | 'type': 'BINARY', 35 | 'description': 'contract of slot', 36 | }, 37 | { 38 | 'name': 'slot', 39 | 'type': 'BINARY', 40 | 'description': 'address of slot', 41 | }, 42 | { 43 | 'name': 'value', 44 | 'type': 'BINARY', 45 | 'description': 'last data stored in slot', 46 | }, 47 | { 48 | 'name': 'first_updated_block', 49 | 'type': 'INTEGER', 50 | 'description': 'first block where slot was used', 51 | }, 52 | { 53 | 'name': 'last_updated_block', 54 | 'type': 'INTEGER', 55 | 'description': 'last block where slot was updated', 56 | }, 57 | { 58 | 'name': 'n_tx_updates', 59 | 'type': 'INTEGER', 60 | 'description': 'number of transactions that updated slot', 61 | }, 62 | ], 63 | }, 64 | }, 65 | } 66 | 67 | -------------------------------------------------------------------------------- /pdp/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paradigmxyz/paradigm-data-portal/335f0c8c932bdf1295c67028b11de73a8d92384e/pdp/py.typed -------------------------------------------------------------------------------- /pdp/spec.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | 6 | if typing.TYPE_CHECKING: 7 | 8 | import polars as pl 9 | import toolcli 10 | import toolsql 11 | 12 | class GlobalManifest(typing.TypedDict): 13 | """manifest of all datasets""" 14 | 15 | version: str 16 | datasets: typing.Mapping[str, DatasetManifestSlim] 17 | 18 | class DatasetManifest(typing.TypedDict): 19 | """manifest of a particular dataset""" 20 | 21 | name: str 22 | version: str 23 | description: str 24 | datatype: str 25 | network: str 26 | files: typing.Sequence[FileMetadata] 27 | schema: toolsql.DBSchema 28 | 29 | class DatasetManifestSlim(typing.TypedDict): 30 | """manifest of a particular dataset""" 31 | 32 | name: str 33 | version: str 34 | description: str 35 | datatype: str 36 | network: str 37 | n_files: int 38 | n_bytes: int 39 | schema: toolsql.DBSchema 40 | 41 | class FileMetadata(typing.TypedDict): 42 | """metadata of a dataset file""" 43 | 44 | name: str 45 | hash: str 46 | n_bytes: int 47 | 48 | PolarsExpression = typing.Union[ 49 | pl.type_aliases.IntoExpr, 50 | typing.Sequence[pl.type_aliases.IntoExpr], 51 | ] 52 | 53 | # 54 | # # datasets 55 | # 56 | 57 | global_version = '1.0.0' 58 | 59 | networks = { 60 | 1: 'ethereum', 61 | } 62 | 63 | 64 | # 65 | # # urls and paths 66 | # 67 | 68 | # default portal root 69 | portal_root = 'https://datasets.paradigm.xyz/datasets' 70 | bucket_root_path = 'datasets' 71 | 72 | # schema for various portal urls 73 | urls = { 74 | 'global_manifest': '{portal_root}/global_manifest.json', 75 | 'dataset_manifest': '{portal_root}/{network}_{datatype}/dataset_manifest.json', 76 | 'dataset_file': '{portal_root}/{network}_{datatype}/{filename}', 77 | 'old_global_manifests': '{portal_root}/old_global_manifests/v{version}.json', 78 | 'old_dataset_manifest': '{portal_root}/old_dataset_manifests/{dataset}__v{version}.json', 79 | } 80 | 81 | global_manifest_filename = 'global_manifest.json' 82 | dataset_manifest_filename = 'dataset_manifest.json' 83 | dataset_readme_filename = 'README.md' 84 | 
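# the `urls` entries above and the filename template below are str.format
# patterns, presumably filled in by the manifest/download utilities, e.g.
#   urls['dataset_manifest'].format(
#       portal_root=portal_root, network='ethereum', datatype='contracts')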
dataset_filename_template = '{dataset}__v{version}__{file_id}.{filetype}' 85 | dataset_license_filenames = ['LICENSE-CC0'] 86 | 87 | 88 | # 89 | # # cli behavior 90 | # 91 | 92 | styles: toolcli.StyleTheme = { 93 | 'title': 'bold #00e100', 94 | 'metavar': 'bold #e5e9f0', 95 | 'description': '#aaaaaa', 96 | 'content': '#00B400', 97 | 'option': 'bold #e5e9f0', 98 | 'comment': '#888888', 99 | } 100 | 101 | 102 | # 103 | # # formats 104 | # 105 | 106 | def to_binary(value: str | bytes) -> bytes: 107 | if isinstance(value, bytes): 108 | return value 109 | elif isinstance(value, str): 110 | if value.startswith('0x'): 111 | return bytes.fromhex(value[2:]) 112 | else: 113 | return bytes.fromhex(value) 114 | else: 115 | raise Exception('invalid format: ' + str(value)) 116 | 117 | 118 | def to_hex( 119 | value: str | bytes, *, prefix: bool = True, validate: bool = True 120 | ) -> str: 121 | 122 | if isinstance(value, str): 123 | if value.startswith('0x'): 124 | if validate: 125 | bytes.fromhex(value[2:]) 126 | 127 | if prefix: 128 | return value 129 | else: 130 | return value[2:] 131 | else: 132 | if validate: 133 | bytes.fromhex(value) 134 | 135 | if prefix: 136 | return '0x' + value 137 | else: 138 | return value 139 | 140 | elif isinstance(value, bytes): 141 | if prefix: 142 | return '0x' + value.hex() 143 | else: 144 | return value.hex() 145 | 146 | else: 147 | raise Exception('invalid value format') 148 | 149 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | [build-system] 3 | requires = ["flit_core >=3.2, <4"] 4 | build-backend = "flit_core.buildapi" 5 | 6 | [project] 7 | name = "paradigm-data-portal" 8 | readme = "README.md" 9 | requires-python = ">=3.7" 10 | dynamic = ["version", "description"] 11 | license = {text = "MIT OR Apache-2.0"} 12 | classifiers = [ 13 | "Development Status :: 4 - Beta", 14 | "Intended Audience :: Developers", 15 | "Intended Audience :: Financial and Insurance Industry", 16 | "Intended Audience :: Science/Research", 17 | "License :: OSI Approved :: Apache Software License", 18 | "License :: OSI Approved :: MIT License", 19 | "Natural Language :: English", 20 | "Operating System :: MacOS", 21 | "Operating System :: Microsoft :: Windows", 22 | "Operating System :: POSIX :: Linux", 23 | "Programming Language :: Python :: 3.7", 24 | "Programming Language :: Python :: 3.8", 25 | "Programming Language :: Python :: 3.9", 26 | "Programming Language :: Python :: 3.10", 27 | "Programming Language :: Python :: 3.11", 28 | "Typing :: Typed", 29 | ] 30 | dependencies = [ 31 | 'typing-extensions >=4.2.0, <5', 32 | 'requests >=2.20.0, <3', 33 | 'toolcli >=0.6.13, <0.7', 34 | 'toolstr >=0.9.3, <0.10', 35 | 'tooljob >=0.1.6, <0.2', 36 | ] 37 | 38 | [project.optional-dependencies] 39 | test = [ 40 | 'mypy ==1.2.0', 41 | 'mypy_extensions >= 1.0.0, <1.1.0', 42 | 'pytest >=6, <7', 43 | ] 44 | 45 | [project.scripts] 46 | pdp = "pdp.cli.cli_run:run_cli" 47 | 48 | [tool.flit.module] 49 | name = "pdp" 50 | 51 | [tool.mypy] 52 | python_version = "3.9" 53 | strict = true 54 | implicit_reexport = true 55 | files = ["pdp"] 56 | 57 | [tool.pytest.ini_options] 58 | testpaths = [ 59 | "tests", 60 | ] 61 | asyncio_mode = 'auto' 62 | 63 | -------------------------------------------------------------------------------- /tests/remote_tests/test_manifests.py: -------------------------------------------------------------------------------- 1 | import pdp 2 | 3 | 4 | 
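# these tests fetch the live manifests (source='remote') and therefore
# require network access to the data portal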
def test_get_global_manifest(): 5 | global_manifest = pdp.get_global_manifest(source='remote') 6 | 7 | 8 | def test_get_dataset_manifests(): 9 | global_manifest = pdp.get_global_manifest(source='remote') 10 | for dataset_name in global_manifest['datasets'].keys(): 11 | dataset_manifest = pdp.get_dataset_manifest( 12 | dataset_name, source='remote' 13 | ) 14 | 15 | -------------------------------------------------------------------------------- /tests/test_collect.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import pytest 5 | 6 | from pdp.datasets import contracts 7 | from pdp.datasets import slots 8 | from pdp.datasets import native_transfers 9 | 10 | 11 | dataset_collectors = [ 12 | contracts.collect_contracts_dataset, 13 | slots.collect_slots_dataset, 14 | native_transfers.collect_native_transfers_dataset, 15 | ] 16 | 17 | 18 | collect_kwargs_sets = [ 19 | { 20 | 'start_block': 14_000_000, 21 | 'end_block': 14_000_100, 22 | 'chunk_size': 20, 23 | 'network': 'ethereum', 24 | 'executor': 'parallel', 25 | 'verbose': True, 26 | }, 27 | ] 28 | 29 | 30 | @pytest.mark.parametrize('dataset_collector', dataset_collectors) 31 | @pytest.mark.parametrize('collect_kwargs', collect_kwargs_sets) 32 | @pytest.mark.parametrize('output_filetype', ['parquet', 'csv']) 33 | def test(dataset_collector, collect_kwargs, output_filetype): 34 | output_dir = tempfile.mkdtemp() 35 | 36 | dataset_collector( 37 | output_dir=output_dir, 38 | output_filetype=output_filetype, 39 | **collect_kwargs, 40 | ) 41 | 42 | output_files = os.listdir(output_dir) 43 | assert ( 44 | len(output_files) 45 | == (collect_kwargs['end_block'] - collect_kwargs['start_block']) 46 | / collect_kwargs['chunk_size'] 47 | ) 48 | 49 | -------------------------------------------------------------------------------- /tests/test_validate.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | import pdp 6 | 7 | 8 | data_root = pdp.get_data_root(require=False) 9 | ethereum_contracts_present = ( 10 | data_root is not None 11 | and os.path.isdir(data_root) 12 | and 'ethereum_contracts' in os.listdir(data_root) 13 | ) 14 | 15 | 16 | @pytest.mark.skipif( 17 | not ethereum_contracts_present, 18 | reason='ethereum_contracts dataset not present', 19 | ) 20 | def test_validate_dataset(): 21 | path = pdp.get_dataset_local_path('ethereum_contracts') 22 | pdp.validate_dataset_directory(path) 23 | 24 | --------------------------------------------------------------------------------