├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .vscode ├── extensions.json └── settings.json ├── LICENSE ├── README.md ├── __init__.py ├── dataflux_core ├── __init__.py ├── benchmarking │ ├── dataflux_client_bench.py │ ├── dataflux_client_parallel_bench.py │ └── dataflux_client_threaded_bench.py ├── download.py ├── fast_list.py ├── performance_tests │ ├── list_and_download.py │ └── list_only.py ├── range_splitter.py ├── tests │ ├── __init__.py │ ├── fake_gcs.py │ ├── fake_multiprocess.py │ ├── test_download.py │ ├── test_fake_gcs.py │ ├── test_fast_list.py │ ├── test_range_splitter.py │ └── test_user_agent.py └── user_agent.py ├── docs ├── code-of-conduct.md └── contributing.md ├── kokoro ├── build.sh ├── continuous.cfg ├── hourly.cfg ├── nightly.cfg ├── performance_one_shot.sh ├── performance_seg.sh ├── presubmit.cfg └── presubmit.sh ├── requirements.txt └── setup.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Enforce top level code-owners CodeOwners (this will auto-tag reviewers) 2 | * @GoogleCloudPlatform/dataflux-python 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Expected Behavior 2 | 3 | 4 | ## Actual Behavior 5 | 6 | 7 | ## Steps to Reproduce the Problem 8 | 9 | 1. 10 | 1. 11 | 1. 12 | 13 | ## Specifications 14 | 15 | - Version: 16 | - Platform: -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes # 2 | 3 | > It's a good idea to open an issue first for discussion. 4 | 5 | - [ ] Tests pass 6 | - [ ] Appropriate changes to documentation are included in the PR -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Reference: https://github.com/github/gitignore/blob/main/Python.gitignore 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
161 | #.idea/ -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "eeyore.yapf", 4 | "ms-python.flake8", 5 | "ms-python.isort", 6 | "ms-python.python", 7 | ], 8 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.formatOnSaveMode": "file", 4 | "editor.formatOnSave": true, 5 | "editor.defaultFormatter": "eeyore.yapf", 6 | "editor.formatOnType": false, 7 | "files.trimTrailingWhitespace": true, 8 | }, 9 | "python.testing.pytestArgs": [ 10 | "-s", 11 | "dataflux_core/tests", 12 | ], 13 | "python.testing.pytestEnabled": true, 14 | "python.testing.unittestEnabled": false, 15 | "editor.codeActionsOnSave": { 16 | "source.organizeImports": "always" 17 | }, 18 | "files.insertFinalNewline": true, 19 | "files.trimFinalNewlines": true, 20 | } 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dataflux for Google Cloud Storage Python client library 2 | 3 | ## Overview 4 | 5 | This is the client library backing the [Dataflux Dataset for Pytorch](https://github.com/GoogleCloudPlatform/dataflux-pytorch). The purpose of this client is to quickly list and download data stored in GCS for use in Python machine learning applications. 
The core functionalities of this client can be broken down into two key parts. 6 | 7 | ## Fast List 8 | 9 | The fast list component of this client leverages Python multiprocessing to parallelize the listing of files within a GCS bucket. It does this by implementing a work-stealing algorithm, where each worker in the list operation is able to steal work from its siblings once it has finished all currently slated listing work. This parallelization leads to real-world listing speeds up to 10 times faster than sequential listing. Note that parallelization is limited by the machine on which the client runs, and optimal performance is typically found with a worker count that is 1:1 with the available cores. Benchmarking has demonstrated that the larger the object count, the better Dataflux performs when compared to a linear listing. 10 | 11 | ### Example Code 12 | ```python 13 | from dataflux_core import fast_list 14 | 15 | number_of_workers = 20 16 | project = "MyProject" 17 | bucket = "TargetBucket" 18 | target_folder_prefix = "folder1/" 19 | 20 | print("Fast list operation starting...") 21 | list_result = fast_list.ListingController( 22 | max_parallelism=number_of_workers, 23 | project=project, 24 | bucket=bucket, 25 | prefix=target_folder_prefix, 26 | ).run() 27 | ``` 28 | 29 | #### Storage Class 30 | 31 | By default, fast list will only list objects of STANDARD class in GCS buckets. This can be overridden by passing in a string list of storage classes to include while running the Listing Controller. Note that this default behavior was chosen to avoid the cost associated with downloading non-standard GCS classes. Details on GCS Storage Classes can be further explored in the [Storage Class Documentation](https://cloud.google.com/storage/docs/storage-classes). 32 | 33 | #### API Call Count 34 | 35 | Using fast list increases the total number of calls made to the GCS bucket. The increased amount can vary greatly based on the size of the workload and the number of threads, but our benchmarking has shown an upper bound of approximately 2x the number of API calls when compared to a sequential list. API call count tracking will be displayed in logs if the log level is set to debug. To enable these logs, we recommend using the `--log-cli-level=DEBUG` flag. 36 | 37 | ### Fast List Benchmark Results 38 | |File Count|VM Core Count|List Time Without Dataflux|List Time With Dataflux| 39 | |------------|-------------|--------------------------|-----------------------| 40 | |17944239 Obj|48 Core |1630.75s |79.55s | 41 | |5000000 Obj |48 Core |289.95s |23.43s | 42 | |1999002 Obj |48 Core |117.61s |12.45s | 43 | |578411 Obj |48 Core |30.70s |9.39s | 44 | |10013 Obj |48 Core |2.35s |6.06s | 45 | 46 | ## Composed Download 47 | 48 | The composed download component of the client uses the results of the fast list to efficiently download the files necessary for a machine learning workload. When downloading files from remote stores, small file size often bottlenecks the speed at which files can be downloaded. To avoid this bottleneck, composed download leverages the [GCS Compose Objects API](https://cloud.google.com/storage/docs/composing-objects) to concatenate small files into larger composed files in GCS prior to downloading. This greatly improves download performance, particularly on datasets with very large numbers of small files. 49 | 50 | ### Example Code 51 | ```python 52 | from dataflux_core import download 53 | 54 | # The maximum size in bytes of a composite download object.
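# Objects no larger than this size are grouped into composite objects of at most 32 source objects each (the GCS compose limit).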
55 | # If this value is set to 0, no composition will occur. 56 | max_compose_bytes = 10000000 57 | project = "MyProject" 58 | bucket = "TargetBucket" 59 | 60 | download_params = download.DataFluxDownloadOptimizationParams( 61 | max_compose_bytes 62 | ) 63 | 64 | print("Download operation starting...") 65 | download_result = download.dataflux_download( 66 | project_name=project, 67 | bucket_name=bucket, 68 | # The objects parameter takes list_result, the value returned by fast list in the previous code example. 69 | objects=list_result, 70 | dataflux_download_optimization_params=download_params, 71 | ) 72 | ``` 73 | 74 | #### Multiple Download Options 75 | 76 | Looking at the [download code](dataflux_core/download.py) you will notice three distinct download functions. The default function used in the dataflux-pytorch client is `dataflux_download`. The other functions serve to improve performance for specific use cases. 77 | 78 | ###### Parallel Download 79 | 80 | The `dataflux_download_parallel` function is the most performant stand-alone download function. When using the dataflux client library in isolation, this is the recommended download function. Parallelization must be tuned based on available CPU power and network bandwidth. 81 | 82 | ###### Threaded Download 83 | 84 | The `dataflux_download_threaded` function allows for some amount of download parallelization while running within daemonic processes (e.g. a distributed ML workload leveraging [ray](https://www.ray.io/)). Daemonic processes are not permitted to spin up child processes, and thus threading must be used in these instances. Threaded download performance is similar to that of multiprocessing for most use cases, but loses ground as the thread/process count increases. Additionally, threading does not allow for signal interruption, so SIGINT cleanup triggers are disabled when running a threaded download. 85 | 86 | ### Dataflux Download Benchmark Results 87 | 88 | These benchmarks were performed on an n2-standard-48 (48 vCPU) virtual machine on files of approximately 10 KB each. 89 | 90 | |Number of Objects|Standard Linear Download|Dataflux Composed Download|Dataflux Threaded Composed Download (48 Threads)|Dataflux Parallel Composed Download (48 Processes)| 91 | |-----------------|------------------------|--------------------------|------------------------------------------------|--------------------------------------------------| 92 | |111 |18.27 Seconds |5.17 Seconds |3.94 Seconds |2.06 Seconds | 93 | |1111 |176.22 Seconds |61.78 Seconds |5.21 Seconds |3.14 Seconds | 94 | |11098 |1396.98 Seconds |392.23 Seconds |16.85 Seconds |14.88 Seconds | 95 | 96 | 97 | ## Getting Started 98 | 99 | To get started leveraging the dataflux client library, we encourage you to start from the [Dataflux Dataset for Pytorch](https://github.com/GoogleCloudPlatform/dataflux-pytorch). For an example of client-specific implementation, please see the [benchmark code](dataflux_core/benchmarking/dataflux_client_bench.py). 100 | 101 | ## Support 102 | 103 | * Please file a GitHub issue in this repository 104 | * If you need to get in touch with us, email dataflux-customer-support@google.com 105 | 106 | ## Contributing 107 | 108 | We welcome your feedback, issues, and bug fixes. We have a tight roadmap at this time, so if you have a major feature or change in functionality you'd like to contribute, please open a GitHub Issue for discussion prior to sending a pull request.
Please see [CONTRIBUTING](docs/contributing.md) for more information on how to report bugs or submit pull requests. 109 | 110 | ## Code of Conduct 111 | 112 | This project has adopted the Google Open Source Code of Conduct. Please see [code-of-conduct.md](docs/code-of-conduct.md) for more information. 113 | 114 | ## License 115 | 116 | The Dataflux Python Client has an Apache License 2.0. Please see the [LICENSE](LICENSE) file for more information. 117 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | -------------------------------------------------------------------------------- /dataflux_core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | from . import download, fast_list, user_agent 17 | -------------------------------------------------------------------------------- /dataflux_core/benchmarking/dataflux_client_bench.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | 16 | Example benchmark execution: 17 | python3 dataflux_client_bench.py --project=test-project --bucket=test-bucket --bucket-file-count=5 --bucket-file-size=172635220 --num-workers=5 --max-compose=32 18 | """ 19 | 20 | import argparse 21 | import time 22 | 23 | from dataflux_core import download, fast_list 24 | 25 | 26 | def parse_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--project", type=str) 29 | parser.add_argument("--bucket", type=str) 30 | parser.add_argument("--bucket-file-count", type=int, default=None) 31 | parser.add_argument("--bucket-file-size", type=int, default=None) 32 | parser.add_argument("--num-workers", type=int, default=10) 33 | parser.add_argument("--max-compose-bytes", type=int, default=100000000) 34 | parser.add_argument("--prefix", type=str, default="") 35 | return parser.parse_args() 36 | 37 | 38 | def main() -> None: 39 | args = parse_args() 40 | list_start_time = time.time() 41 | print(f"Listing operation started at {list_start_time}") 42 | list_result = fast_list.ListingController(args.num_workers, 43 | args.project, 44 | args.bucket, 45 | prefix=args.prefix).run() 46 | list_end_time = time.time() 47 | if args.bucket_file_count and len(list_result) != args.bucket_file_count: 48 | raise AssertionError( 49 | f"Expected {args.bucket_file_count} files, but got {len(list_result)}" 50 | ) 51 | print( 52 | f"{len(list_result)} objects listed in {list_end_time - list_start_time} seconds" 53 | ) 54 | size = sum([x[1] for x in list_result]) 55 | print(f"Starting download of: {size} bytes of data...") 56 | download_params = download.DataFluxDownloadOptimizationParams( 57 | args.max_compose_bytes) 58 | download_start_time = time.time() 59 | print(f"Download operation started at {download_start_time}") 60 | download_result = download.dataflux_download( 61 | args.project, 62 | args.bucket, 63 | list_result, 64 | dataflux_download_optimization_params=download_params, 65 | ) 66 | download_end_time = time.time() 67 | total_size = sum([len(x) for x in download_result]) 68 | if args.bucket_file_size and total_size != args.bucket_file_size: 69 | raise AssertionError( 70 | f"Expected {args.bucket_file_size} bytes but got {total_size} bytes" 71 | ) 72 | print( 73 | f"{total_size} bytes across {len(list_result)} objects downloaded in {download_end_time - download_start_time} seconds" 74 | ) 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /dataflux_core/benchmarking/dataflux_client_parallel_bench.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | 16 | Example benchmark execution: 17 | python3 dataflux_client_parallel_bench.py --project=test-project --bucket=test-bucket --bucket-file-count=5 --bucket-file-size=172635220 --num-workers=5 --parallelization=30 --max-compose=32 18 | """ 19 | 20 | import argparse 21 | import time 22 | 23 | from dataflux_core import download, fast_list 24 | 25 | 26 | def parse_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--project", type=str) 29 | parser.add_argument("--bucket", type=str) 30 | parser.add_argument("--bucket-file-count", type=int, default=None) 31 | parser.add_argument("--bucket-file-size", type=int, default=None) 32 | parser.add_argument("--num-workers", type=int, default=10) 33 | parser.add_argument("--max-compose-bytes", type=int, default=100000000) 34 | parser.add_argument("--parallelization", type=int, default=20) 35 | parser.add_argument("--prefix", type=str, default="") 36 | return parser.parse_args() 37 | 38 | 39 | def main() -> None: 40 | args = parse_args() 41 | list_start_time = time.time() 42 | print(f"Listing operation started at {list_start_time}") 43 | list_result = fast_list.ListingController(args.num_workers, 44 | args.project, 45 | args.bucket, 46 | prefix=args.prefix).run() 47 | list_end_time = time.time() 48 | if args.bucket_file_count and len(list_result) != args.bucket_file_count: 49 | raise AssertionError( 50 | f"Expected {args.bucket_file_count} files, but got {len(list_result)}" 51 | ) 52 | print( 53 | f"{len(list_result)} objects listed in {list_end_time - list_start_time} seconds" 54 | ) 55 | size = sum([x[1] for x in list_result]) 56 | print(f"Starting download of: {size} bytes of data...") 57 | download_params = download.DataFluxDownloadOptimizationParams( 58 | args.max_compose_bytes) 59 | download_start_time = time.time() 60 | print(f"Download operation started at {download_start_time}") 61 | download_result = download.dataflux_download_parallel( 62 | args.project, 63 | args.bucket, 64 | list_result, 65 | dataflux_download_optimization_params=download_params, 66 | parallelization=args.parallelization, 67 | ) 68 | download_end_time = time.time() 69 | total_size = sum([len(x) for x in download_result]) 70 | if args.bucket_file_size and total_size != args.bucket_file_size: 71 | raise AssertionError( 72 | f"Expected {args.bucket_file_size} bytes but got {total_size} bytes" 73 | ) 74 | print( 75 | f"{total_size} bytes across {len(list_result)} objects downloaded in {download_end_time - download_start_time} seconds using {args.parallelization} processes" 76 | ) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /dataflux_core/benchmarking/dataflux_client_threaded_bench.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | 16 | Example benchmark execution: 17 | python3 dataflux_client_threaded_bench.py --project=test-project --bucket=test-bucket --bucket-file-count=5 --bucket-file-size=172635220 --num-workers=5 --threads=30 --max-compose=32 18 | """ 19 | 20 | import argparse 21 | import time 22 | 23 | from dataflux_core import download, fast_list 24 | 25 | 26 | def parse_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--project", type=str) 29 | parser.add_argument("--bucket", type=str) 30 | parser.add_argument("--bucket-file-count", type=int, default=None) 31 | parser.add_argument("--bucket-file-size", type=int, default=None) 32 | parser.add_argument("--num-workers", type=int, default=10) 33 | parser.add_argument("--max-compose-bytes", type=int, default=100000000) 34 | parser.add_argument("--threads", type=int, default=20) 35 | parser.add_argument("--prefix", type=str, default="") 36 | return parser.parse_args() 37 | 38 | 39 | def main() -> None: 40 | args = parse_args() 41 | list_start_time = time.time() 42 | print(f"Listing operation started at {list_start_time}") 43 | list_result = fast_list.ListingController(args.num_workers, 44 | args.project, 45 | args.bucket, 46 | prefix=args.prefix).run() 47 | list_end_time = time.time() 48 | if args.bucket_file_count and len(list_result) != args.bucket_file_count: 49 | raise AssertionError( 50 | f"Expected {args.bucket_file_count} files, but got {len(list_result)}" 51 | ) 52 | print( 53 | f"{len(list_result)} objects listed in {list_end_time - list_start_time} seconds" 54 | ) 55 | size = sum([x[1] for x in list_result]) 56 | print(f"Starting download of: {size} bytes of data...") 57 | download_params = download.DataFluxDownloadOptimizationParams( 58 | args.max_compose_bytes) 59 | download_start_time = time.time() 60 | print(f"Download operation started at {download_start_time}") 61 | download_result = download.dataflux_download_threaded( 62 | args.project, 63 | args.bucket, 64 | list_result, 65 | dataflux_download_optimization_params=download_params, 66 | threads=args.threads, 67 | ) 68 | download_end_time = time.time() 69 | total_size = sum([len(x) for x in download_result]) 70 | if args.bucket_file_size and total_size != args.bucket_file_size: 71 | raise AssertionError( 72 | f"Expected {args.bucket_file_size} bytes but got {total_size} bytes" 73 | ) 74 | print( 75 | f"{total_size} bytes across {len(list_result)} objects downloaded in {download_end_time - download_start_time} seconds using {args.threads} threads" 76 | ) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /dataflux_core/download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | from __future__ import annotations 18 | 19 | import itertools 20 | import logging 21 | import math 22 | import multiprocessing 23 | import queue 24 | import signal 25 | import sys 26 | import threading 27 | import uuid 28 | from typing import Iterator 29 | 30 | from google.api_core.client_info import ClientInfo 31 | from google.cloud import storage 32 | from google.cloud.storage.retry import DEFAULT_RETRY 33 | 34 | from dataflux_core import user_agent 35 | 36 | # https://cloud.google.com/storage/docs/retry-strategy#python. 37 | MODIFIED_RETRY = DEFAULT_RETRY.with_deadline(300.0).with_delay(initial=1.0, 38 | multiplier=1.2, 39 | maximum=45.0) 40 | 41 | # https://cloud.google.com/storage/docs/composite-objects. 42 | MAX_NUM_OBJECTS_TO_COMPOSE = 32 43 | 44 | COMPOSED_PREFIX = "dataflux-composed-objects/" 45 | 46 | current_composed_object = None 47 | 48 | 49 | def compose( 50 | project_name: str, 51 | bucket_name: str, 52 | destination_blob_name: str, 53 | objects: list[tuple[str, int]], 54 | storage_client: object = None, 55 | retry_config: "google.api_core.retry.retry_unary.Retry" = MODIFIED_RETRY, 56 | ) -> object: 57 | """Compose the objects into a composite object, upload the composite object to the GCS bucket and returns it. 58 | 59 | Args: 60 | project_name: the name of the GCP project. 61 | bucket_name: the name of the GCS bucket that holds the objects to compose. 62 | The function uploads the the composed object to this bucket too. 63 | destination_blob_name: the name of the composite object to be created. 64 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket. 65 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 66 | storage_client: the google.cloud.storage.Client initialized with the project. 67 | If not defined, the function will initialize the client with the project_name. 68 | retry_config: The retry parameter to supply to the compose objects call. 69 | 70 | Returns: 71 | the "blob" of the composed object. 72 | """ 73 | if len(objects) > MAX_NUM_OBJECTS_TO_COMPOSE: 74 | raise ValueError( 75 | f"{MAX_NUM_OBJECTS_TO_COMPOSE} objects allowed to compose, received {len(objects)} objects." 76 | ) 77 | 78 | if storage_client is None: 79 | storage_client = storage.Client(project=project_name) 80 | user_agent.add_dataflux_user_agent(storage_client) 81 | 82 | bucket = storage_client.bucket(bucket_name) 83 | destination = bucket.blob(destination_blob_name) 84 | 85 | sources = list() 86 | for each_object in objects: 87 | blob_name = each_object[0] 88 | sources.append(bucket.blob(blob_name)) 89 | 90 | destination.compose(sources, retry=retry_config) 91 | 92 | return destination 93 | 94 | 95 | def decompose( 96 | project_name: str, 97 | bucket_name: str, 98 | composite_object_name: str, 99 | objects: list[tuple[str, int]], 100 | storage_client: object = None, 101 | retry_config: "google.api_core.retry.retry_unary.Retry" = MODIFIED_RETRY, 102 | ) -> list[bytes]: 103 | """Decompose the composite objects and return the decomposed objects contents in bytes. 104 | 105 | Args: 106 | project_name: the name of the GCP project. 107 | bucket_name: the name of the GCS bucket that holds the objects to compose. 108 | The function uploads the the composed object to this bucket too. 109 | composite_object_name: the name of the composite object to be created. 110 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket. 
111 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 112 | storage_client: the google.cloud.storage.Client initialized with the project. 113 | If not defined, the function will initialize the client with the project_name. 114 | retry_config: The retry parameter supplied to the download_as_bytes call. 115 | 116 | Returns: 117 | the contents (in bytes) of the decomposed objects. 118 | """ 119 | if storage_client is None: 120 | storage_client = storage.Client(project=project_name) 121 | user_agent.add_dataflux_user_agent(storage_client) 122 | 123 | res = [] 124 | composed_object_content = download_single( 125 | storage_client, 126 | bucket_name, 127 | composite_object_name, 128 | retry_config=retry_config, 129 | ) 130 | 131 | start = 0 132 | for each_object in objects: 133 | blob_size = each_object[1] 134 | content = composed_object_content[start:start + blob_size] 135 | res.append(content) 136 | start += blob_size 137 | 138 | if start != len(composed_object_content): 139 | logging.error( 140 | "decomposed object length = %s bytes, wanted = %s bytes.", 141 | start, 142 | len(composed_object_content), 143 | ) 144 | return res 145 | 146 | 147 | def download_single( 148 | storage_client: object, 149 | bucket_name: str, 150 | object_name: str, 151 | retry_config: "google.api_core.retry.retry_unary.Retry" = MODIFIED_RETRY, 152 | ) -> bytes: 153 | """Download the contents of this object as a bytes object and return it. 154 | 155 | Args: 156 | storage_client: the google.cloud.storage.Client initialized with the project. 157 | bucket_name: the name of the GCS bucket that holds the object. 158 | object_name: the name of the object to download. 159 | retry_config: The retry parameter supplied to the download_as_bytes call. 160 | 161 | Returns: 162 | the contents of the object in bytes. 163 | """ 164 | bucket_handle = storage_client.bucket(bucket_name) 165 | blob = bucket_handle.blob(object_name) 166 | return blob.download_as_bytes(retry=retry_config) 167 | 168 | 169 | class DataFluxDownloadOptimizationParams: 170 | """Parameters used to optimize DataFlux download performance. 171 | 172 | Attributes: 173 | max_composite_object_size: An integer indicating a cap for the maximum size of the composite object. 174 | 175 | """ 176 | 177 | def __init__(self, max_composite_object_size): 178 | self.max_composite_object_size = max_composite_object_size 179 | 180 | 181 | def df_download_thread( 182 | results_queue: queue.Queue[list[bytes]], 183 | project_name: str, 184 | bucket_name: str, 185 | objects: list[tuple[str, int]], 186 | storage_client: object = None, 187 | dataflux_download_optimization_params: 188 | DataFluxDownloadOptimizationParams = None, 189 | retry_config=MODIFIED_RETRY, 190 | ): 191 | """Threading helper that calls dataflux_download and places results onto queue. 192 | 193 | Args: 194 | results_queue: the queue on which to put all download results. 195 | project_name: the name of the GCP project. 196 | bucket_name: the name of the GCS bucket that holds the objects to compose. 197 | The function uploads the the composed object to this bucket too. 198 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket. 199 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 200 | storage_client: the google.cloud.storage.Client initialized with the project. 201 | If not defined, the function will initialize the client with the project_name. 202 | dataflux_download_optimization_params: the paramemters used to optimize the download performance. 
203 | retry_config: The retry configuration to pass to all retryable download operations 204 | """ 205 | result = dataflux_download( 206 | project_name, 207 | bucket_name, 208 | objects, 209 | storage_client, 210 | dataflux_download_optimization_params, 211 | # Always signify threading enabled so that signal handling is disabled. 212 | threading_enabled=True, 213 | retry_config=retry_config, 214 | ) 215 | results_queue.put(result) 216 | 217 | 218 | def dataflux_download_threaded( 219 | project_name: str, 220 | bucket_name: str, 221 | objects: list[tuple[str, int]], 222 | storage_client: object = None, 223 | dataflux_download_optimization_params: 224 | DataFluxDownloadOptimizationParams = None, 225 | threads: int = 1, 226 | retry_config=MODIFIED_RETRY, 227 | ) -> list[bytes]: 228 | """Perform the DataFlux download algorithm threaded to performantly download the object contents as bytes and return. 229 | 230 | Args: 231 | project_name: the name of the GCP project. 232 | bucket_name: the name of the GCS bucket that holds the objects to compose. 233 | The function uploads the composed object to this bucket too. 234 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket. 235 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 236 | storage_client: the google.cloud.storage.Client initialized with the project. 237 | If not defined, the function will initialize the client with the project_name. 238 | dataflux_download_optimization_params: the parameters used to optimize the download performance. 239 | threads: The number of threads on which to download at any given time. 240 | retry_config: The retry configuration to pass to all retryable download operations 241 | Returns: 242 | the contents of the object in bytes. 243 | """ 244 | chunk_size = math.ceil(len(objects) / threads) 245 | chunks = [] 246 | for i in range(threads): 247 | chunk = objects[i * chunk_size:(i + 1) * chunk_size] 248 | if chunk: 249 | chunks.append(chunk) 250 | results_queues = [queue.Queue() for _ in chunks] 251 | thread_list = [] 252 | for i, chunk in enumerate(chunks): 253 | thread = threading.Thread( 254 | target=df_download_thread, 255 | args=( 256 | results_queues[i], 257 | project_name, 258 | bucket_name, 259 | chunk, 260 | storage_client, 261 | dataflux_download_optimization_params, 262 | retry_config, 263 | ), 264 | ) 265 | thread_list.append(thread) 266 | thread.start() 267 | for thread in thread_list: 268 | thread.join() 269 | results = [] 270 | for q in results_queues: 271 | while not q.empty(): 272 | results.extend(q.get()) 273 | return results 274 | 275 | 276 | def dataflux_download_parallel( 277 | project_name: str, 278 | bucket_name: str, 279 | objects: list[tuple[str, int]], 280 | storage_client: object = None, 281 | dataflux_download_optimization_params: 282 | DataFluxDownloadOptimizationParams = None, 283 | parallelization: int = 1, 284 | retry_config=MODIFIED_RETRY, 285 | ) -> list[bytes]: 286 | """Perform the DataFlux download algorithm in parallel to download the object contents as bytes and return. 287 | 288 | Args: 289 | project_name: the name of the GCP project. 290 | bucket_name: the name of the GCS bucket that holds the objects to compose. 291 | The function uploads the composed object to this bucket too. 292 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket.
293 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 294 | storage_client: the google.cloud.storage.Client initialized with the project. 295 | If not defined, the function will initialize the client with the project_name. 296 | dataflux_download_optimization_params: the paramemters used to optimize the download performance. 297 | parallelization: The number of parallel processes that will simultaneously execute the download. 298 | retry_config: The retry configuration to pass to all retryable download operations 299 | Returns: 300 | the contents of the object in bytes. 301 | """ 302 | chunk_size = math.ceil(len(objects) / parallelization) 303 | chunks = [] 304 | for i in range(parallelization): 305 | chunk = objects[i * chunk_size:(i + 1) * chunk_size] 306 | if chunk: 307 | chunks.append(chunk) 308 | with multiprocessing.Pool(processes=len(chunks)) as pool: 309 | results = pool.starmap( 310 | dataflux_download, 311 | (( 312 | project_name, 313 | bucket_name, 314 | chunk, 315 | storage_client, 316 | dataflux_download_optimization_params, 317 | False, 318 | retry_config, 319 | ) for chunk in chunks), 320 | ) 321 | return list(itertools.chain.from_iterable(results)) 322 | 323 | 324 | def dataflux_download( 325 | project_name: str, 326 | bucket_name: str, 327 | objects: list[tuple[str, int]], 328 | storage_client: object = None, 329 | dataflux_download_optimization_params: 330 | DataFluxDownloadOptimizationParams = None, 331 | threading_enabled=False, 332 | retry_config=MODIFIED_RETRY, 333 | ) -> list[bytes]: 334 | """Perform the DataFlux download algorithm to download the object contents as bytes and return. 335 | 336 | Args: 337 | project_name: the name of the GCP project. 338 | bucket_name: the name of the GCS bucket that holds the objects to compose. 339 | The function uploads the the composed object to this bucket too. 340 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket. 341 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 342 | storage_client: the google.cloud.storage.Client initialized with the project. 343 | If not defined, the function will initialize the client with the project_name. 344 | dataflux_download_optimization_params: the paramemters used to optimize the download performance. 345 | retry_config: The retry configuration to pass to all retryable download operations 346 | Returns: 347 | the contents of the object in bytes. 348 | """ 349 | if storage_client is None: 350 | storage_client = storage.Client(project=project_name) 351 | user_agent.add_dataflux_user_agent(storage_client) 352 | 353 | res = [] 354 | max_composite_object_size = ( 355 | dataflux_download_optimization_params.max_composite_object_size) 356 | 357 | i = 0 358 | # Register the cleanup signal handler for SIGINT. 359 | if not threading_enabled: 360 | signal.signal(signal.SIGINT, term_signal_handler) 361 | global current_composed_object 362 | while i < len(objects): 363 | curr_object_name = objects[i][0] 364 | curr_object_size = objects[i][1] 365 | 366 | if curr_object_size > max_composite_object_size: 367 | # Download the single object. 368 | curr_object_content = download_single( 369 | storage_client=storage_client, 370 | bucket_name=bucket_name, 371 | object_name=curr_object_name, 372 | retry_config=retry_config, 373 | ) 374 | res.append(curr_object_content) 375 | i += 1 376 | else: 377 | # Dynamically compose and decompose based on the object size. 
378 | objects_slice = [] 379 | curr_size = 0 380 | 381 | while (i < len(objects) and curr_size <= max_composite_object_size 382 | and len(objects_slice) < MAX_NUM_OBJECTS_TO_COMPOSE): 383 | curr_size += objects[i][1] 384 | objects_slice.append(objects[i]) 385 | i += 1 386 | 387 | if len(objects_slice) == 1: 388 | object_name = objects_slice[0][0] 389 | curr_object_content = download_single( 390 | storage_client=storage_client, 391 | bucket_name=bucket_name, 392 | object_name=object_name, 393 | retry_config=retry_config, 394 | ) 395 | res.append(curr_object_content) 396 | else: 397 | # If the number of objects > 1, we want to compose, download, decompose and delete the composite object. 398 | # Need to create a unique composite name to avoid mutation on the same object among processes. 399 | composed_object_name = COMPOSED_PREFIX + str(uuid.uuid4()) 400 | composed_object = compose( 401 | project_name, 402 | bucket_name, 403 | composed_object_name, 404 | objects_slice, 405 | storage_client, 406 | retry_config=retry_config, 407 | ) 408 | current_composed_object = composed_object 409 | res.extend( 410 | decompose( 411 | project_name, 412 | bucket_name, 413 | composed_object_name, 414 | objects_slice, 415 | storage_client, 416 | retry_config=retry_config, 417 | )) 418 | 419 | try: 420 | composed_object.delete(retry=retry_config) 421 | current_composed_object = None 422 | except Exception as e: 423 | logging.exception( 424 | f"exception while deleting the composite object: {e}") 425 | return res 426 | 427 | 428 | def dataflux_download_lazy( 429 | project_name: str, 430 | bucket_name: str, 431 | objects: list[tuple[str, int]], 432 | storage_client: object = None, 433 | dataflux_download_optimization_params: 434 | DataFluxDownloadOptimizationParams = None, 435 | threading_enabled=False, 436 | retry_config: "google.api_core.retry.retry_unary.Retry" = MODIFIED_RETRY, 437 | ) -> Iterator[bytes]: 438 | """Perform the DataFlux download algorithm to download the object contents as bytes in a lazy fashion. 439 | 440 | Args: 441 | project_name: the name of the GCP project. 442 | bucket_name: the name of the GCS bucket that holds the objects to compose. 443 | The function uploads the composed object to this bucket too. 444 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket. 445 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 446 | storage_client: the google.cloud.storage.Client initialized with the project. 447 | If not defined, the function will initialize the client with the project_name. 448 | dataflux_download_optimization_params: the parameters used to optimize the download performance. 449 | retry_config: The retry parameter to supply to the compose objects call. 450 | Returns: 451 | An iterator of the contents of the object in bytes. 452 | """ 453 | if storage_client is None: 454 | storage_client = storage.Client(project=project_name) 455 | user_agent.add_dataflux_user_agent(storage_client) 456 | 457 | max_composite_object_size = ( 458 | dataflux_download_optimization_params.max_composite_object_size) 459 | 460 | i = 0 461 | # Register the cleanup signal handler for SIGINT. 462 | if not threading_enabled: 463 | signal.signal(signal.SIGINT, term_signal_handler) 464 | global current_composed_object 465 | while i < len(objects): 466 | curr_object_name = objects[i][0] 467 | curr_object_size = objects[i][1] 468 | 469 | if curr_object_size > max_composite_object_size: 470 | # Download the single object.
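# Oversized objects bypass composition and are downloaded and yielded one at a time.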
471 | curr_object_content = download_single( 472 | storage_client=storage_client, 473 | bucket_name=bucket_name, 474 | object_name=curr_object_name, 475 | retry_config=retry_config, 476 | ) 477 | yield from [curr_object_content] 478 | i += 1 479 | else: 480 | # Dynamically compose and decompose based on the object size. 481 | objects_slice = [] 482 | curr_size = 0 483 | 484 | while (i < len(objects) and curr_size <= max_composite_object_size 485 | and len(objects_slice) < MAX_NUM_OBJECTS_TO_COMPOSE): 486 | curr_size += objects[i][1] 487 | objects_slice.append(objects[i]) 488 | i += 1 489 | 490 | if len(objects_slice) == 1: 491 | object_name = objects_slice[0][0] 492 | curr_object_content = download_single( 493 | storage_client=storage_client, 494 | bucket_name=bucket_name, 495 | object_name=object_name, 496 | retry_config=retry_config, 497 | ) 498 | yield from [curr_object_content] 499 | else: 500 | # If the number of objects > 1, we want to compose, download, decompose and delete the composite object. 501 | # Need to create a unique composite name to avoid mutation on the same object among processes. 502 | composed_object_name = COMPOSED_PREFIX + str(uuid.uuid4()) 503 | composed_object = compose( 504 | project_name, 505 | bucket_name, 506 | composed_object_name, 507 | objects_slice, 508 | storage_client, 509 | retry_config=retry_config, 510 | ) 511 | current_composed_object = composed_object 512 | yield from (decompose( 513 | project_name, 514 | bucket_name, 515 | composed_object_name, 516 | objects_slice, 517 | storage_client, 518 | retry_config=retry_config, 519 | )) 520 | 521 | try: 522 | composed_object.delete(retry=retry_config) 523 | current_composed_object = None 524 | except Exception as e: 525 | logging.exception( 526 | f"exception while deleting the composite object: {e}") 527 | 528 | 529 | def clean_composed_object(composed_object): 530 | if composed_object: 531 | try: 532 | composed_object.delete(retry=MODIFIED_RETRY) 533 | except Exception as e: 534 | logging.exception( 535 | f"exception while deleting composite object: {e}") 536 | 537 | 538 | def term_signal_handler(signal_num, frame): 539 | print("Ctrl+C interrupt detected. Cleaning up and exiting...") 540 | clean_composed_object(current_composed_object) 541 | sys.exit(0) 542 | -------------------------------------------------------------------------------- /dataflux_core/fast_list.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """
16 |
17 | from __future__ import annotations
18 |
19 | import logging
20 | import multiprocessing
21 | import queue
22 | import time
23 |
24 | from dataflux_core import range_splitter, user_agent
25 | from dataflux_core.download import COMPOSED_PREFIX
26 | from google.api_core.client_info import ClientInfo
27 | from google.cloud import storage
28 | from google.cloud.storage.retry import DEFAULT_RETRY
29 |
30 | DEFAULT_ALLOWED_CLASS = ["STANDARD"]
31 | MODIFIED_RETRY = DEFAULT_RETRY.with_deadline(300.0).with_delay(initial=1.0,
32 | multiplier=1.2,
33 | maximum=45.0)
34 |
35 |
36 | def remove_prefix(text: str, prefix: str):
37 | """Helper function that removes prefix from a string.
38 |
39 | Args:
40 | text: String of text to trim a prefix from.
41 | prefix: String of text that will be trimmed from text.
42 |
43 | Returns:
44 | Text value with the specified prefix removed.
45 | """
46 | # Note that as of python 3.9 removeprefix is built into string.
47 | if text.startswith(prefix):
48 | return text[len(prefix):]
49 | return text
50 |
51 |
52 | class ListWorker(object):
53 | """Worker that lists a range of objects from a GCS bucket.
54 |
55 | Attributes:
56 | name: String name of the worker.
57 | gcs_project: The string name of the google cloud storage project to list from.
58 | bucket: The string name of the storage bucket to list from.
59 | send_work_stealing_needed_queue: Multiprocessing queue pushed to when a worker needs more work.
60 | heartbeat_queue: Multiprocessing queue pushed to indicating worker is running nominally.
61 | direct_work_available_queue: Multiprocessing queue to push available work stealing ranges to.
62 | idle_queue: Multiprocessing queue pushed to when worker is waiting for new work to steal.
63 | unidle_queue: Multiprocessing queue pushed to when the worker has successfully stolen work.
64 | results_queue: Multiprocessing queue onto which the worker pushes its listing results.
65 | metadata_queue: Multiprocessing queue on which the worker pushes tracking metadata.
66 | start_range: String start range worker will begin listing from.
67 | end_range: String end range worker will list until.
68 | retry_config: The retry parameter to supply to list_blob.
69 |
70 | results: Set storing aggregate results prior to pushing onto results_queue.
71 | client: The GCS client through which all GCS list operations are executed.
72 | skip_compose: When true, skip listing files with the composed object prefix.
73 | list_directory_objects: When true, include files with names ending in "/" in the listing. Default false.
74 | prefix: When provided, only list objects under this prefix.
75 | allowed_storage_classes: The set of GCS Storage Class types fast list will include.
76 | max_results: The maximum results per list call (set to max page size of 5000).
77 | splitter: The range_splitter object used by this worker to divide work.
78 | default_alph: The baseline alphabet used to initialize the range_splitter.
79 | api_call_count: Variable tracking the number of GCS list calls made by the worker.
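error_queue: Multiprocessing queue onto which the worker pushes a fatal exception before exiting.
max_retries: The maximum number of consecutive list failures before the worker reports the error on error_queue and exits.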
80 | """ 81 | 82 | def __init__( 83 | self, 84 | name: str, 85 | gcs_project: str, 86 | bucket: str, 87 | send_work_stealing_needed_queue: "multiprocessing.Queue[str]", 88 | heartbeat_queue: "multiprocessing.Queue[str]", 89 | direct_work_available_queue: "multiprocessing.Queue[tuple[str, str]]", 90 | idle_queue: "multiprocessing.Queue[str]", 91 | unidle_queue: "multiprocessing.Queue[str]", 92 | results_queue: "multiprocessing.Queue[set[tuple[str, int]]]", 93 | metadata_queue: "multiprocessing.Queue[tuple[str, int]]", 94 | error_queue: "multiprocessing.Queue[Exception]", 95 | start_range: str, 96 | end_range: str, 97 | retry_config: 98 | "google.api_core.retry.retry_unary.Retry" = MODIFIED_RETRY, 99 | client: storage.Client = None, 100 | skip_compose: bool = True, 101 | list_directory_objects: bool = False, 102 | prefix: str = "", 103 | allowed_storage_classes: list[str] = DEFAULT_ALLOWED_CLASS, 104 | max_retries: int = 5, 105 | ): 106 | self.name = name 107 | self.gcs_project = gcs_project 108 | self.bucket = bucket 109 | self.send_work_stealing_needed_queue = send_work_stealing_needed_queue 110 | self.heartbeat_queue = heartbeat_queue 111 | self.direct_work_available_queue = direct_work_available_queue 112 | self.idle_queue = idle_queue 113 | self.unidle_queue = unidle_queue 114 | self.results_queue = results_queue 115 | self.metadata_queue = metadata_queue 116 | self.error_queue = error_queue 117 | self.start_range = start_range 118 | self.end_range = end_range 119 | self.results: set[tuple[str, int]] = set() 120 | self.client = client 121 | self.max_results = 5000 122 | self.splitter = None 123 | self.default_alph = "ab" 124 | self.skip_compose = skip_compose 125 | self.list_directory_objects = list_directory_objects 126 | self.prefix = prefix if prefix else "" 127 | self.allowed_storage_classes = allowed_storage_classes 128 | self.api_call_count = 0 129 | self.max_retries = max_retries 130 | self.retry_config = retry_config 131 | 132 | def wait_for_work(self) -> bool: 133 | """Indefinitely waits for available work and consumes it once available. 134 | 135 | Returns: 136 | Boolean value indicating that new work has been acquired. The function 137 | will only return False in response to receiving a shutdown signal (None) 138 | from the controller. 139 | """ 140 | self.send_work_stealing_needed_queue.put(self.name) 141 | self.idle_queue.put(self.name) 142 | logging.debug(f"Process {self.name} waiting for work...") 143 | while True: 144 | try: 145 | self.heartbeat_queue.put(self.name) 146 | new_range = self.direct_work_available_queue.get_nowait() 147 | # None is pushed onto the queue as the shutdown signal once all work is finished. 148 | if new_range[0] != None: 149 | self.unidle_queue.put(self.name) 150 | except queue.Empty: 151 | time.sleep(0.1) 152 | continue 153 | break 154 | if new_range[0] is None: 155 | logging.debug(f"Process {self.name} didn't receive work") 156 | # Upon receiving shutdown signal log all relevant metadata. 
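# The metadata tuple is (worker name, number of GCS list API calls made).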
157 | md = (self.name, self.api_call_count) 158 | self.metadata_queue.put(md) 159 | return False 160 | self.start_range = new_range[0] 161 | self.end_range = new_range[1] 162 | logging.debug(f"Process {self.name} got new range [{self.start_range}," 163 | f" {self.end_range}]") 164 | return True 165 | 166 | def run(self) -> None: 167 | """Runs the worker.""" 168 | logging.debug(f"Process {self.name} starting...") 169 | if not self.client: 170 | self.client = storage.Client( 171 | project=self.gcs_project, 172 | client_info=ClientInfo(user_agent="dataflux/0.0"), 173 | ) 174 | else: 175 | user_agent.add_dataflux_user_agent(self.client) 176 | self.splitter = range_splitter.new_rangesplitter(self.default_alph) 177 | # When worker has started, attempt to push to all queues. If the idle or unidle queue 178 | # push fails, the worker will not initialize and will be ignored by the controller. 179 | # This allows us to safely handle multiprocessing failures that occur on startup. 180 | self.idle_queue.put(self.name) 181 | self.unidle_queue.put(self.name) 182 | self.heartbeat_queue.put(self.name) 183 | if self.retry_config: 184 | # Post a heartbeat when retrying so the process doesn't get killed. 185 | # The retry class automatically logs the retry as a debug log. 186 | def on_error(e: Exception): 187 | self.heartbeat_queue.put(self.name) 188 | 189 | self.retry_config._on_error = on_error 190 | if self.start_range is None and self.end_range is None: 191 | if not self.wait_for_work(): 192 | return 193 | retries_remaining = self.max_retries 194 | while True: 195 | has_results = False 196 | try: 197 | list_blob_args = { 198 | "max_results": 199 | self.max_results, 200 | "start_offset": 201 | self.prefix + self.start_range, 202 | "end_offset": ("" if not self.end_range else self.prefix + 203 | self.end_range), 204 | "retry": 205 | self.retry_config, 206 | } 207 | if self.prefix: 208 | list_blob_args["prefix"] = self.prefix 209 | blobs = self.client.bucket( 210 | self.bucket).list_blobs(**list_blob_args) 211 | self.api_call_count += 1 212 | i = 0 213 | self.heartbeat_queue.put(self.name) 214 | for blob in blobs: 215 | i += 1 216 | if ((not self.skip_compose 217 | or not blob.name.startswith(COMPOSED_PREFIX)) and 218 | (self.list_directory_objects or blob.name[-1] != "/") 219 | and blob.storage_class 220 | in self.allowed_storage_classes): 221 | self.results.add((blob.name, blob.size)) 222 | # Remove the prefix from the name so that range calculations remain prefix-agnostic. 223 | # This is necessary due to the unbounded end-range when splitting string namespaces 224 | # of unknown size. 225 | self.start_range = remove_prefix(blob.name, self.prefix) 226 | if i == self.max_results: 227 | # Only allow work stealing when paging. 228 | has_results = True 229 | break 230 | retries_remaining = self.max_retries 231 | except Exception as e: 232 | retries_remaining -= 1 233 | logging.error( 234 | f"process {self.name} encountered error ({retries_remaining} retries left): {str(e)}" 235 | ) 236 | if retries_remaining == 0: 237 | logging.error("process " + self.name + 238 | " is out of retries; exiting") 239 | self.error_queue.put(e) 240 | return 241 | continue 242 | if has_results: 243 | # Check for work stealing. 
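# A waiting worker has signalled on send_work_stealing_needed_queue: split the
# remaining [start_range, end_range) namespace, hand the upper half to that
# worker via direct_work_available_queue, and keep listing the lower half.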
244 | try:
245 | self.send_work_stealing_needed_queue.get_nowait()
246 | except queue.Empty:
247 | continue
248 | split_points = self.splitter.split_range(
249 | self.start_range, self.end_range, 1)
250 | steal_range = (split_points[0], self.end_range)
251 | self.direct_work_available_queue.put(steal_range)
252 | self.end_range = split_points[0]
253 | self.max_results = 5000
254 | else:
255 | # All done, wait for work.
256 | if len(self.results) > 0:
257 | self.results_queue.put(self.results)
258 | self.results = set()
259 | if not self.wait_for_work():
260 | return
261 |
262 |
263 | def run_list_worker(
264 | name: str,
265 | gcs_project: str,
266 | bucket: str,
267 | send_work_stealing_needed_queue: "multiprocessing.Queue[str]",
268 | heartbeat_queue: "multiprocessing.Queue[str]",
269 | direct_work_available_queue: "multiprocessing.Queue[tuple[str, str]]",
270 | idle_queue: "multiprocessing.Queue[str]",
271 | unidle_queue: "multiprocessing.Queue[str]",
272 | results_queue: "multiprocessing.Queue[set[tuple[str, int]]]",
273 | metadata_queue: "multiprocessing.Queue[tuple[str, int]]",
274 | error_queue: "multiprocessing.Queue[Exception]",
275 | start_range: str,
276 | end_range: str,
277 | retry_config: "google.api_core.retry.retry_unary.Retry" = MODIFIED_RETRY,
278 | client: storage.Client = None,
279 | skip_compose: bool = True,
280 | prefix: str = "",
281 | allowed_storage_classes: list[str] = DEFAULT_ALLOWED_CLASS,
282 | ) -> None:
283 | """Helper function to execute a ListWorker.
284 |
285 | Args:
286 | name: String name of the list worker.
287 | gcs_project: String name of the google cloud project in use.
288 | bucket: String name of the google cloud bucket to list from.
289 | send_work_stealing_needed_queue: Multiprocessing queue pushed to when a worker needs more work.
290 | heartbeat_queue: Multiprocessing queue pushed to while a worker is running nominally.
291 | direct_work_available_queue: Multiprocessing queue to push available work stealing ranges to.
292 | idle_queue: Multiprocessing queue pushed to when worker is waiting for new work to steal.
293 | unidle_queue: Multiprocessing queue pushed to when the worker has successfully stolen work.
294 | results_queue: Multiprocessing queue onto which the worker pushes its listing results.
295 | metadata_queue: Multiprocessing queue on which the worker pushes tracking metadata.
296 | error_queue: Multiprocessing queue to track errors from the worker process.
297 | start_range: String start range worker will begin listing from.
298 | end_range: String end range worker will list until.
299 | retry_config: The retry parameter to supply to list_blob.
300 | client: The GCS storage client. When not provided, will be derived from background auth.
301 | skip_compose: When true, skip listing files with the composed object prefix.
302 | prefix: When provided, only list objects under this prefix.
303 | allowed_storage_classes: The set of GCS Storage Class types fast list will include.
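
Example (illustrative sketch; the queue objects below are placeholders that
ListingController normally creates and wires up):

    p = multiprocessing.Process(
        target=run_list_worker,
        args=("dataflux-listing-proc.0", "my-project", "my-bucket",
              steal_queue, heartbeat_queue, work_queue, idle_queue,
              unidle_queue, results_queue, metadata_queue, error_queue,
              "", ""),
    )
    p.start()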
304 | """ 305 | ListWorker( 306 | name, 307 | gcs_project, 308 | bucket, 309 | send_work_stealing_needed_queue, 310 | heartbeat_queue, 311 | direct_work_available_queue, 312 | idle_queue, 313 | unidle_queue, 314 | results_queue, 315 | metadata_queue, 316 | error_queue, 317 | start_range, 318 | end_range, 319 | retry_config, 320 | client, 321 | skip_compose=skip_compose, 322 | prefix=prefix, 323 | allowed_storage_classes=allowed_storage_classes, 324 | ).run() 325 | 326 | 327 | class ListingController(object): 328 | """This controller manages and monitors all listing workers operating on the GCS bucket. 329 | 330 | Attributes: 331 | max_parallelism: The maximum number of processes to start via the Multiprocessing library. 332 | gcs_project: The string name of the google cloud storage project to list from. 333 | bucket: The string name of the storage bucket to list from. 334 | inited: The set of ListWorker processes that have succesfully started. 335 | checkins: A dictionary tracking the last known checkin time for each inited ListWorker. 336 | waiting_for_work: The number of ListWorker processes currently waiting for new listing work. 337 | sort_results: Boolean indicating whether the final result set should be sorted or unsorted. 338 | skip_compose: When true, skip listing files with the composed object prefix. 339 | prefix: When provided, only list objects under this prefix. 340 | allowed_storage_classes: The set of GCS Storage Class types fast list will include. 341 | retry_config: The retry config passed to list_blobs. 342 | """ 343 | 344 | def __init__( 345 | self, 346 | max_parallelism: int, 347 | project: str, 348 | bucket: str, 349 | sort_results: bool = False, 350 | skip_compose: bool = True, 351 | prefix: str = "", 352 | allowed_storage_classes: list[str] = DEFAULT_ALLOWED_CLASS, 353 | retry_config=MODIFIED_RETRY, 354 | ): 355 | # The maximum number of threads utilized in the fast list operation. 356 | self.max_parallelism = max_parallelism 357 | self.gcs_project = project 358 | self.bucket = bucket 359 | self.inited = set() 360 | self.checkins = {} 361 | self.waiting_for_work = 0 362 | self.sort_results = sort_results 363 | self.client = None 364 | self.skip_compose = skip_compose 365 | self.prefix = prefix 366 | self.allowed_storage_classes = allowed_storage_classes 367 | self.retry_config = retry_config 368 | 369 | def manage_tracking_queues( 370 | self, 371 | idle_queue: "multiprocessing.Queue[str]", 372 | unidle_queue: "multiprocessing.Queue[str]", 373 | heartbeat_queue: "multiprocessing.Queue[str]", 374 | ) -> None: 375 | """Manages metadata queues to track execution of the listing operation. 376 | 377 | Args: 378 | idle_queue: the queue workers push to when in need of new work to steal. 379 | unidle_queue: the queue workers push to when they steal work. 380 | heartbeat_queue: the queue workers push to continuously while running nominally. 381 | """ 382 | while True: 383 | try: 384 | idle_queue.get_nowait() 385 | self.waiting_for_work += 1 386 | except queue.Empty: 387 | break 388 | while True: 389 | try: 390 | unidle_queue.get_nowait() 391 | self.waiting_for_work -= 1 392 | except queue.Empty: 393 | break 394 | while True: 395 | try: 396 | inited_worker = heartbeat_queue.get_nowait() 397 | current_time = time.time() 398 | self.inited.add(inited_worker) 399 | self.checkins[inited_worker] = current_time 400 | except queue.Empty: 401 | break 402 | 403 | def check_crashed_processes(self) -> bool: 404 | """Checks if any processes have crashed. 
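A worker is considered crashed when it has not posted a heartbeat within the
check-in window (at least 60 seconds, or twice the retry maximum delay when a
retry config is set).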
405 | 406 | Returns: 407 | A boolean indicating if any processes have crashed after initialization. 408 | If this function returns true, it indicates a need to restart the listing 409 | operation. 410 | """ 411 | logging.debug("checking for crashed procs...") 412 | now = time.time() 413 | crashed = [] 414 | # Wait at least 60 seconds or 2 times the API call retry delay for check-ins, 415 | # otherwise processes might appear to be crashed while retrying API calls. 416 | checkin_wait = 2 * self.retry_config._maximum if self.retry_config else 0 417 | checkin_wait = max(checkin_wait, 60) 418 | for inited_worker, last_checkin in self.checkins.items(): 419 | if now - last_checkin > checkin_wait: 420 | crashed.append(inited_worker) 421 | for proc in crashed: 422 | if proc in self.inited: 423 | logging.error( 424 | "process crash detected, ending list procedure...") 425 | return True 426 | return False 427 | 428 | def cleanup_processes( 429 | self, 430 | processes: "list[multiprocessing.Process]", 431 | results_queue: "multiprocessing.Queue[set[tuple[str, int]]]", 432 | metadata_queue: "multiprocessing.Queue[tuple[str, int]]", 433 | results: "set[tuple[str, int]]", 434 | ) -> list[tuple[str, int]]: 435 | """Allows processes to shut down, kills procs that failed to initialize. 436 | 437 | Args: 438 | processes: the list of processes. 439 | results_queue: the queue for transmitting all result tuples from listing. 440 | metadata_queue: the queue for transmitting all tracking metadata from workers. 441 | results: the set of unique results consumed from results_queue. 442 | 443 | Returns: 444 | A sorted list of (str, int) tuples indicating the name and file size of each 445 | unique file listed in the listing process. 446 | 447 | """ 448 | api_call_count = 0 449 | while True: 450 | alive = False 451 | live_procs = 0 452 | for p in processes: 453 | if p.is_alive(): 454 | alive = True 455 | live_procs += 1 456 | while True: 457 | try: 458 | result = results_queue.get_nowait() 459 | results.update(result) 460 | logging.debug(f"Result count: {len(results)}") 461 | except queue.Empty: 462 | break 463 | time.sleep(0.2) 464 | break 465 | while True: 466 | try: 467 | metadata = metadata_queue.get_nowait() 468 | api_call_count += metadata[1] 469 | except queue.Empty: 470 | break 471 | logging.debug("Live procs: %d", live_procs) 472 | logging.debug("Inited procs: %d", len(self.inited)) 473 | if live_procs <= self.max_parallelism - len(self.inited): 474 | alive = False 475 | # This prevents any memory leaks from multiple executions, but does kill 476 | # the stuck processes very aggressively. It does not cause issues in 477 | # execution, but looks very loud to the user if they are watching debug 478 | # output. 479 | for p in processes: 480 | if p.is_alive(): 481 | p.terminate() 482 | if not alive: 483 | logging.debug(f"Total GCS API call count: {api_call_count}") 484 | if self.sort_results: 485 | return sorted(results) 486 | return list(results) 487 | 488 | def terminate_now( 489 | self, processes: "list[multiprocessing.Process]") -> RuntimeError: 490 | """Terminates all processes immediately. 491 | 492 | Args: 493 | processes: The full list of multiprocessing processes. 
494 | 495 | Returns: 496 | RuntimeError indicating that one or more multiprocess processes has 497 | become unresponsive 498 | """ 499 | for p in processes: 500 | p.terminate() 501 | raise RuntimeError( 502 | "multiprocessing child process became unresponsive; check logs for underlying error" 503 | ) 504 | 505 | def run(self) -> list[tuple[str, int]]: 506 | """Runs the controller that manages fast listing. 507 | 508 | Returns: 509 | A sorted list of (str, int) tuples indicating the name and file size of each 510 | unique file listed in the listing process. 511 | """ 512 | # Define the queues. 513 | send_work_stealing_needed_queue: multiprocessing.Queue[str] = ( 514 | multiprocessing.Queue()) 515 | heartbeat_queue: multiprocessing.Queue[str] = multiprocessing.Queue() 516 | direct_work_available_queue: multiprocessing.Queue[tuple[str, str]] = ( 517 | multiprocessing.Queue()) 518 | idle_queue: multiprocessing.Queue[str] = multiprocessing.Queue() 519 | unidle_queue: multiprocessing.Queue[str] = multiprocessing.Queue() 520 | results_queue: multiprocessing.Queue[set[tuple[str, int]]] = ( 521 | multiprocessing.Queue()) 522 | metadata_queue: multiprocessing.Queue[tuple[ 523 | str, int]] = multiprocessing.Queue() 524 | error_queue: multiprocessing.Queue[Exception] = multiprocessing.Queue() 525 | processes = [] 526 | results: set[tuple[str, int]] = set() 527 | for i in range(self.max_parallelism): 528 | p = multiprocessing.Process( 529 | target=run_list_worker, 530 | args=( 531 | "dataflux-listing-proc." + str(i), 532 | self.gcs_project, 533 | self.bucket, 534 | send_work_stealing_needed_queue, 535 | heartbeat_queue, 536 | direct_work_available_queue, 537 | idle_queue, 538 | unidle_queue, 539 | results_queue, 540 | metadata_queue, 541 | error_queue, 542 | "" if i == 0 else None, 543 | "" if i == 0 else None, 544 | self.retry_config, 545 | self.client, 546 | self.skip_compose, 547 | self.prefix, 548 | self.allowed_storage_classes, 549 | ), 550 | ) 551 | processes.append(p) 552 | p.start() 553 | # Wait before starting the next process to avoid deadlock when multiple processes 554 | # attempt to register with the same multiprocessing queue. 555 | time.sleep(0.1) 556 | while True: 557 | time.sleep(0.2) 558 | try: 559 | e = error_queue.get_nowait() 560 | logging.error( 561 | f"Got error from child process; exiting. Check child process logs for more details. Error: {e}" 562 | ) 563 | return self.terminate_now(processes) 564 | except queue.Empty: 565 | pass 566 | alive = False 567 | for p in processes: 568 | if p.is_alive(): 569 | alive = True 570 | break 571 | new_results = set() 572 | while True: 573 | try: 574 | result = results_queue.get_nowait() 575 | new_results.update(result) 576 | except queue.Empty: 577 | break 578 | if len(new_results) > 0: 579 | results.update(new_results) 580 | logging.debug(f"Result count: {len(results)}") 581 | if not alive: 582 | break 583 | # Update all queues related to tracking process status. 
584 | self.manage_tracking_queues(idle_queue, unidle_queue, 585 | heartbeat_queue) 586 | if self.check_crashed_processes(): 587 | return self.terminate_now(processes) 588 | logging.debug("Inited procs: %d", len(self.inited)) 589 | logging.debug("Waiting for work: %d", self.waiting_for_work) 590 | if len(self.inited) == self.waiting_for_work and ( 591 | self.waiting_for_work > 0): 592 | logging.debug("Exiting, all processes are waiting for work") 593 | for _ in range(self.max_parallelism * 2): 594 | direct_work_available_queue.put((None, None)) 595 | break 596 | while True: 597 | try: 598 | result = results_queue.get_nowait() 599 | results.update(result) 600 | logging.debug(f"Result count: {len(results)}") 601 | except queue.Empty: 602 | break 603 | logging.debug("Got all results, waiting for processes to exit.") 604 | return self.cleanup_processes(processes, results_queue, metadata_queue, 605 | results) 606 | -------------------------------------------------------------------------------- /dataflux_core/performance_tests/list_and_download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import os 18 | import time 19 | import unittest 20 | from math import ceil 21 | 22 | from dataflux_core import download, fast_list 23 | 24 | FIFTY_GB = 50000000000 25 | 26 | 27 | class ClientPerformanceTest(unittest.TestCase): 28 | 29 | def get_config(self): 30 | config = {} 31 | # Gather env vars into dictionary. 32 | config["project"] = os.getenv("PROJECT") 33 | config["bucket"] = os.getenv("BUCKET") 34 | config["prefix"] = os.getenv("PREFIX") 35 | config["num_workers"] = os.getenv("LIST_WORKERS") 36 | config["expected_file_count"] = os.getenv("FILE_COUNT") 37 | config["expected_total_size"] = os.getenv("TOTAL_FILE_SIZE") 38 | config["max_compose_bytes"] = os.getenv("MAX_COMPOSE_BYTES") 39 | config["list_timeout"] = os.getenv("LIST_TIMEOUT") 40 | config["download_timeout"] = os.getenv("DOWNLOAD_TIMEOUT") 41 | config["parallelization"] = os.getenv("PARALLELIZATION") 42 | 43 | # Type convert env vars. 
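# Environment variables arrive as strings (or None); the numeric settings are
# converted below, with max_compose_bytes falling back to 100000000 and
# parallelization falling back to 1 when unset.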
44 | if config["num_workers"]: 45 | config["num_workers"] = int(config["num_workers"]) 46 | if config["expected_file_count"]: 47 | config["expected_file_count"] = int(config["expected_file_count"]) 48 | if config["expected_total_size"]: 49 | config["expected_total_size"] = int(config["expected_total_size"]) 50 | config["max_compose_bytes"] = (int(config["max_compose_bytes"]) 51 | if config["max_compose_bytes"] else 52 | 100000000) 53 | if config["list_timeout"]: 54 | config["list_timeout"] = float(config["list_timeout"]) 55 | if config["download_timeout"]: 56 | config["download_timeout"] = float(config["download_timeout"]) 57 | config["parallelization"] = (int(config["parallelization"]) 58 | if config["parallelization"] else 1) 59 | 60 | return config 61 | 62 | def run_list(self, config): 63 | list_start_time = time.time() 64 | list_result = fast_list.ListingController( 65 | config["num_workers"], 66 | config["project"], 67 | config["bucket"], 68 | prefix=config["prefix"], 69 | ).run() 70 | list_end_time = time.time() 71 | listing_time = list_end_time - list_start_time 72 | if (config["expected_file_count"] 73 | and len(list_result) != config["expected_file_count"]): 74 | raise AssertionError( 75 | f"Expected {config['expected_file_count']} files, but got {len(list_result)}" 76 | ) 77 | if config["list_timeout"] and listing_time > config["list_timeout"]: 78 | raise AssertionError( 79 | f"Expected list operation to complete in under {config['list_timeout']} seconds, but took {listing_time} seconds." 80 | ) 81 | return list_result 82 | 83 | def run_download(self, config, list_result): 84 | segmented = False 85 | if config["expected_total_size"] > FIFTY_GB: 86 | segmented = True 87 | download_params = download.DataFluxDownloadOptimizationParams( 88 | config["max_compose_bytes"]) 89 | download_start_time = time.time() 90 | download_result = None 91 | if config["parallelization"] and config["parallelization"] > 1: 92 | download_result = download.dataflux_download_parallel( 93 | config["project"], 94 | config["bucket"], 95 | list_result, 96 | dataflux_download_optimization_params=download_params, 97 | parallelization=config["parallelization"], 98 | ) 99 | else: 100 | download_result = download.dataflux_download( 101 | config["project"], 102 | config["bucket"], 103 | list_result, 104 | dataflux_download_optimization_params=download_params, 105 | ) 106 | download_end_time = time.time() 107 | downloading_time = download_end_time - download_start_time 108 | total_size = sum([len(x) for x in download_result]) 109 | if (not segmented and config["expected_total_size"] 110 | and total_size != config["expected_total_size"]): 111 | raise AssertionError( 112 | f"Expected {config['expected_total_size']} bytes but got {total_size} bytes" 113 | ) 114 | if config["download_timeout"] and downloading_time > config[ 115 | "download_timeout"]: 116 | raise AssertionError( 117 | f"Expected download operation to complete in under {config['download_timeout']} seconds, but took {downloading_time} seconds." 118 | ) 119 | return total_size 120 | 121 | def test_list_and_download_one_shot(self): 122 | config = self.get_config() 123 | list_result = self.run_list(config) 124 | self.run_download(config, list_result) 125 | 126 | def test_list_and_download_segmented(self): 127 | # This function is needed to avoid OOM errors when the dataset size 128 | # exceeds the memory of the VM. 
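# The listing is broken into roughly 50 GB segments: num_segments is
# expected_total_size / FIFTY_GB, each segment holds
# ceil(expected_file_count / num_segments) entries, and the segments are
# downloaded and size-checked one at a time.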
129 | config = self.get_config() 130 | list_result = self.run_list(config) 131 | num_segments = config["expected_total_size"] / FIFTY_GB 132 | segment_size = ceil(config["expected_file_count"] / num_segments) 133 | segments = [ 134 | list_result[i:i + segment_size] 135 | for i in range(0, len(list_result), segment_size) 136 | ] 137 | total_size = 0 138 | for seg in segments: 139 | total_size += self.run_download(config, seg) 140 | if (config["expected_total_size"] 141 | and total_size != config["expected_total_size"]): 142 | raise AssertionError( 143 | f"Expected {config['expected_total_size']} bytes but got {total_size} bytes" 144 | ) 145 | -------------------------------------------------------------------------------- /dataflux_core/performance_tests/list_only.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import argparse 18 | import time 19 | 20 | from dataflux_core import download, fast_list 21 | 22 | 23 | def parse_args(): 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--project", type=str) 26 | parser.add_argument("--bucket", type=str) 27 | parser.add_argument("--bucket-file-count", type=int, default=None) 28 | parser.add_argument("--bucket-file-size", type=int, default=None) 29 | parser.add_argument("--num-workers", type=int, default=10) 30 | parser.add_argument("--max-compose-bytes", type=int, default=100000000) 31 | parser.add_argument("--prefix", type=str, default="") 32 | return parser.parse_args() 33 | 34 | 35 | def main() -> None: 36 | args = parse_args() 37 | list_start_time = time.time() 38 | print(f"Listing operation started at {list_start_time}") 39 | list_result = fast_list.ListingController(args.num_workers, 40 | args.project, 41 | args.bucket, 42 | prefix=args.prefix).run() 43 | list_end_time = time.time() 44 | if args.bucket_file_count and len(list_result) != args.bucket_file_count: 45 | raise AssertionError( 46 | f"Expected {args.bucket_file_count} files, but got {len(list_result)}" 47 | ) 48 | print( 49 | f"{len(list_result)} objects listed in {list_end_time - list_start_time} seconds" 50 | ) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /dataflux_core/range_splitter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | from __future__ import annotations 18 | 19 | from collections.abc import Sequence 20 | from dataclasses import dataclass 21 | from fractions import Fraction 22 | from itertools import count 23 | 24 | 25 | @dataclass 26 | class MinimalIntRange: 27 | start_int: int 28 | end_int: int 29 | min_len: int 30 | 31 | 32 | @dataclass 33 | class GenerateSplitsOpts: 34 | min_int_range: MinimalIntRange 35 | num_splits: int 36 | start_range: str 37 | end_range: str 38 | 39 | 40 | class RangeSplitter(object): 41 | """Manages splits performed to facilitate the work-stealing algorithm. 42 | 43 | Attr: 44 | alphabet_map: An int mapping for an alphabet of arbitrary character size. 45 | sorted_alphabet: The sorted alphabet that initializes the RangeSplitter. 46 | """ 47 | 48 | min_splits = 2 49 | 50 | def __init__(self, alphabet_map: dict[int, str], 51 | sorted_alphabet: Sequence[str]): 52 | self.alphabet_map = alphabet_map 53 | self.sorted_alphabet = sorted_alphabet 54 | self.alphabet_set = set(sorted_alphabet) 55 | 56 | def split_range( 57 | self, 58 | start_range: str, 59 | end_range: str, 60 | num_splits: int, 61 | ) -> Sequence[str]: 62 | """Creates a given number of splits based on a provided start and end range. 63 | 64 | Args: 65 | start_range (str): The string marking the start of the split range. 66 | end_range (str): The string marking the end of the split range. 67 | num_splits (int): The number of splitpoints to return. 68 | 69 | Returns: 70 | A sequence of split points dividing up the provided range. 71 | """ 72 | if num_splits < 1: 73 | raise ValueError("Got num_splits of %s but need minimum of %s." % 74 | (num_splits, self.min_splits)) 75 | if len(end_range) != 0 and start_range >= end_range: 76 | return [] 77 | 78 | if self.is_range_equal_with_padding(start_range, end_range): 79 | return [] 80 | 81 | self.add_characters_to_alphabet(start_range + end_range) 82 | 83 | min_int_range = self.string_to_minimal_int_range( 84 | start_range, end_range, num_splits) 85 | 86 | split_points = self.generate_splits( 87 | GenerateSplitsOpts(min_int_range, num_splits, start_range, 88 | end_range)) 89 | return split_points 90 | 91 | def generate_splits(self, opts: GenerateSplitsOpts) -> Sequence[str]: 92 | """Generates a list of split points. 93 | 94 | Args: 95 | opts (GenerateSplitOpts): Set of options for generating splitpoints 96 | 97 | Returns: 98 | A list of split points. 99 | """ 100 | start_int = opts.min_int_range.start_int 101 | end_int = opts.min_int_range.end_int 102 | min_len = opts.min_int_range.min_len 103 | 104 | range_diff = end_int - start_int 105 | split_points = [] 106 | range_interval = opts.num_splits + 1 107 | adjustment = Fraction(range_diff / range_interval) 108 | 109 | for i in range(1, opts.num_splits + 1): 110 | split_point = start_int + adjustment * i 111 | split_string = self.int_to_string(int(split_point), min_len) 112 | 113 | is_greater_than_start = (len(split_string) > 0 114 | and split_string > opts.start_range) 115 | is_less_than_end = len( 116 | opts.end_range) == 0 or (len(split_string) > 0 117 | and split_string < opts.end_range) 118 | 119 | if is_greater_than_start and is_less_than_end: 120 | split_points.append(split_string) 121 | 122 | return split_points 123 | 124 | def int_to_string(self, split_point: int, string_len: int) -> str: 125 | """Converts the base len(alphabet) int back into a string. 
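The split point is treated as a base-len(alphabet) number: digits are emitted
least significant first and the result is reversed, padded to string_len
characters.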
126 | 127 | Args: 128 | split_point (int): A valid split point int to be converted to string. 129 | string_len (int): The required length of the resulting string. 130 | 131 | Returns: 132 | A string derived from a base len(alphabet) int. 133 | """ 134 | alphabet_len = len(self.sorted_alphabet) 135 | split_string = "" 136 | 137 | for _ in range(string_len): 138 | remainder = split_point % alphabet_len 139 | split_point //= alphabet_len 140 | split_string += self.sorted_alphabet[remainder] 141 | 142 | # This is assembeled backwards via division, so we reverse the final string. 143 | return split_string[::-1] 144 | 145 | def string_to_minimal_int_range(self, start_range: str, end_range: str, 146 | num_splits: int) -> MinimalIntRange: 147 | """Converts a string range to a minimal integer range. 148 | 149 | Args: 150 | start_range (str): The string marking the start of the split range. 151 | end_range (str): The string marking the end of the split range. 152 | num_splits (int): The number of splitpoints to return. 153 | 154 | Returns: 155 | A minimal integer range. 156 | """ 157 | 158 | start_int = 0 159 | end_int = 0 160 | 161 | alphabet_len = len(self.sorted_alphabet) 162 | start_char = self.sorted_alphabet[0] 163 | end_char = self.sorted_alphabet[-1] 164 | 165 | end_default_char = start_char 166 | if len(end_range) == 0: 167 | end_default_char = end_char 168 | 169 | for i in count(0): 170 | start_pos = self.alphabet_map[get_char_or_default( 171 | start_range, i, start_char)] 172 | start_int *= alphabet_len 173 | start_int += start_pos 174 | 175 | end_pos = self.alphabet_map[get_char_or_default( 176 | end_range, i, end_default_char)] 177 | end_int *= alphabet_len 178 | end_int += end_pos 179 | 180 | difference = end_int - start_int 181 | if difference > num_splits: 182 | # Due to zero indexing, min length must have 1 added to it. 183 | return MinimalIntRange(start_int, end_int, i + 1) 184 | 185 | def is_range_equal_with_padding(self, start_range: str, end_range: str): 186 | """Checks for equality between two string ranges. 187 | 188 | Args: 189 | start_range (str): The start range for the split. 190 | end_range (str): The end range for the split. 191 | 192 | Returns: 193 | Boolean indicating equality of the two provided ranges. 194 | """ 195 | 196 | if len(end_range) == 0: 197 | return False 198 | 199 | longest = max(len(start_range), len(end_range)) 200 | 201 | smallest_char = self.sorted_alphabet[0] 202 | 203 | for i in range(longest): 204 | char_start = get_char_or_default(start_range, i, smallest_char) 205 | char_end = get_char_or_default(end_range, i, smallest_char) 206 | 207 | if char_start != char_end: 208 | return False 209 | 210 | return True 211 | 212 | def add_characters_to_alphabet(self, characters: str): 213 | """Adds a character to the known alphabet. 214 | 215 | Args: 216 | characters: The string of characters to add to the library. 217 | """ 218 | unique_characters = set(characters) 219 | new_alphabet = self.alphabet_set.union(unique_characters) 220 | if len(new_alphabet) != len(self.alphabet_set): 221 | self.sorted_alphabet = sorted(new_alphabet) 222 | self.alphabet_map = { 223 | val: index 224 | for index, val in enumerate(self.sorted_alphabet) 225 | } 226 | 227 | 228 | def get_char_or_default(characters: str, index: int, default_char: str) -> str: 229 | """Returns the character at the given index or the default character if the index is out of bounds. 230 | 231 | Args: 232 | characters (str): The range string to check. 
233 | index (int): The current iteration index across characters. 234 | default_char (str): The smallest character in the implemented char set. 235 | 236 | Returns: 237 | The resulting character for the given index. 238 | """ 239 | if index < 0 or index >= len(characters): 240 | return default_char 241 | 242 | return characters[index] 243 | 244 | 245 | def new_rangesplitter(alphabet: str) -> RangeSplitter: 246 | """Creates a new RangeSplitter with the given alphabets. 247 | 248 | Note that the alphabets are a predetermined set of characters 249 | by the work-stealing algorithm, and the characters are guaranteed to be unique. 250 | 251 | Args: 252 | alphabet (str): The full set of characters used for this range splitter. 253 | 254 | Returns: 255 | An instance of the RangeSplitter class that is used to manage splits 256 | performed to facilitate the work-stealing algorithm. 257 | """ 258 | if len(alphabet) == 0: 259 | raise ValueError("Cannot split with an empty alphabet.") 260 | sorted_alphabet = sorted(alphabet) 261 | alphabet_map = {val: index for index, val in enumerate(sorted_alphabet)} 262 | return RangeSplitter(alphabet_map, sorted_alphabet) 263 | -------------------------------------------------------------------------------- /dataflux_core/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | from . import fake_gcs 17 | -------------------------------------------------------------------------------- /dataflux_core/tests/fake_gcs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | Fake GCS package supporting the GCS API methods used for Dataflux. 17 | 18 | The fake_gcs package provides Client, Bucket, and GCSObject classes matching the 19 | interfaces used in Dataflux code. The fake is implemented using these classes, 20 | rather than by using an HTTP server and connecting the actual GCS client to the 21 | server, which could be a future improvement. 
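
A minimal usage sketch (the names are arbitrary test values):

    client = Client()
    bucket = client.bucket("test_bucket")
    bucket._add_file("obj1", b"contents")
    blobs = bucket.list_blobs()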
22 | """ 23 | 24 | from __future__ import annotations 25 | 26 | import io 27 | 28 | from google.cloud.storage import _http 29 | 30 | 31 | class Bucket(object): 32 | """Bucket represents a bucket in GCS, containing objects.""" 33 | 34 | list_error = None 35 | """If set, an error which is returned when calling list_blobs""" 36 | 37 | def __init__(self, name: str): 38 | if not name: 39 | raise Exception("bucket name must not be empty") 40 | self.name = name 41 | self.blobs: dict[str, Blob] = dict() 42 | self.permissions: any = [] 43 | 44 | def list_blobs( 45 | self, 46 | max_results: int = 0, 47 | start_offset: str = "", 48 | end_offset: str = "", 49 | prefix: str = "", 50 | retry: "google.api_core.retry.retry_unary.Retry" = None, 51 | ) -> list[Blob]: 52 | results = [] 53 | for name in sorted(self.blobs): 54 | if max_results and len(results) == max_results: 55 | break 56 | if (not start_offset or name 57 | >= start_offset) and (not end_offset or name < end_offset): 58 | if name.startswith(prefix): 59 | results.append(self.blobs[name]) 60 | return results 61 | 62 | def blob(self, name: str, missing_path: bool = False): 63 | if name == "missing-path": 64 | missing_path = True 65 | if name not in self.blobs: 66 | self.blobs[name] = Blob( 67 | name, bucket=self, missing_bucket=missing_path) 68 | return self.blobs[name] 69 | 70 | def _add_file(self, 71 | filename: str, 72 | content: bytes, 73 | storage_class="STANDARD"): 74 | self.blobs[filename] = Blob(filename, 75 | content, 76 | self, 77 | storage_class=storage_class) 78 | 79 | def test_iam_permissions(self, permissions: any): 80 | return [p for p in permissions if p in self.permissions] 81 | 82 | 83 | class FakeBlobWriter(object): 84 | """Represents fake BlobWriter.""" 85 | 86 | def __init__(self, blob): 87 | self.blob = blob 88 | 89 | def write(self, data: bytes): 90 | self.blob.content += data 91 | 92 | def flush(self): 93 | pass 94 | 95 | def __enter__(self): 96 | return self 97 | 98 | def __exit__(self, exc_type, exc_val, exc_tb): 99 | pass 100 | 101 | 102 | class Blob(object): 103 | """Blob represents a GCS blob object. 104 | 105 | Attributes: 106 | name: The name of the blob. 107 | retry: A variable tracking the retry policy input. 108 | content: The byte content of the Blob. 109 | bucket: The bucket object in which this Blob resides. 110 | size: The size in bytes of the Blob. 
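storage_class: The storage class string for the Blob; defaults to "STANDARD".
missing_bucket: When true, exists() reports the Blob as missing.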
111 | """ 112 | 113 | def __init__( 114 | self, 115 | name: str, 116 | content: bytes = b"", 117 | bucket: Bucket = None, 118 | storage_class="STANDARD", 119 | missing_bucket: bool = False 120 | ): 121 | self.name = name 122 | self.retry = None 123 | self.content = content 124 | self.bucket = bucket 125 | self.size = len(self.content) 126 | self.storage_class = storage_class 127 | self.missing_bucket = missing_bucket 128 | 129 | def compose(self, sources: list[str], retry=None): 130 | b = b"" 131 | for item in sources: 132 | b += self.bucket.blobs[item.name].content 133 | self.content = b 134 | self.retry = retry 135 | 136 | def delete(self, retry=None): 137 | del self.bucket.blobs[self.name] 138 | 139 | def exists(self, retry=None): 140 | return not self.missing_bucket 141 | 142 | def download_as_bytes(self, retry=None): 143 | return self.content 144 | 145 | def download_to_file(self, file_obj: io.IOBase) -> None: 146 | file_obj.write(self.content) 147 | 148 | def open(self, mode: str, ignore_flush: bool = False): 149 | if mode == "rb": 150 | return io.BytesIO(self.content) 151 | elif mode == "wb": 152 | self.content = b"" 153 | return FakeBlobWriter(self) 154 | raise NotImplementedError( 155 | "Supported modes strings are 'rb' and 'wb' only.") 156 | 157 | 158 | class Client(object): 159 | """Client represents a GCS client which can provide bucket handles.""" 160 | 161 | def __init__(self): 162 | self.buckets: dict[str, Bucket] = dict() 163 | self.content: dict[str, tuple[str, str]] = dict() 164 | self._connection = _http.Connection(self) 165 | 166 | def bucket(self, name: str) -> Bucket: 167 | if name not in self.buckets: 168 | self.buckets[name] = Bucket(name) 169 | if name in self.content: 170 | self.buckets[name].content = self.content[name] 171 | return self.buckets[name] 172 | 173 | def _set_perm(self, permissions: any, name: str): 174 | self.buckets[name].permissions = permissions 175 | -------------------------------------------------------------------------------- /dataflux_core/tests/fake_multiprocess.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | Fake GCS package supporting the GCS API methods used for Dataflux. 17 | 18 | The fake_gcs package provides Client, Bucket, and GCSObject classes matching the 19 | interfaces used in Dataflux code. The fake is implemented using these classes, 20 | rather than by using an HTTP server and connecting the actual GCS client to the 21 | server, which could be a future improvement. 
22 | """ 23 | 24 | 25 | class FakeProcess(object): 26 | """A fake multiprocessing process for testing purposes.""" 27 | 28 | def __init__(self, name: str, alive: bool = False, term_tracker=[]): 29 | self.name = name 30 | self.alive = alive 31 | self.term_tracker = term_tracker 32 | 33 | def is_alive(self): 34 | if self.alive: 35 | self.alive = False 36 | return True 37 | return self.alive 38 | 39 | def terminate(self): 40 | self.term_tracker.append("") 41 | -------------------------------------------------------------------------------- /dataflux_core/tests/test_download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import unittest 18 | from unittest import mock 19 | 20 | from dataflux_core import download 21 | from dataflux_core.tests import fake_gcs 22 | 23 | 24 | class DownloadTestCase(unittest.TestCase): 25 | 26 | def test_compose(self): 27 | bucket_name = "test_bucket" 28 | destination_blob_name = "dest_name" 29 | objects = [("one", 3), ("two", 3), ("three", 5)] 30 | client = fake_gcs.Client() 31 | bucket = client.bucket(bucket_name) 32 | bucket._add_file("one", bytes("one", "utf-8")) 33 | bucket._add_file("two", bytes("two", "utf-8")) 34 | bucket._add_file("three", bytes("three", "utf-8")) 35 | expected_result = b"onetwothree" 36 | blob = download.compose("", bucket_name, destination_blob_name, 37 | objects, client) 38 | self.assertEqual(blob.name, destination_blob_name) 39 | self.assertEqual(blob.content, expected_result) 40 | self.assertIn("dataflux", client._connection.user_agent) 41 | 42 | def test_decompose(self): 43 | bucket_name = "test_bucket" 44 | object_name = "test_obj" 45 | objects = [("one", 3), ("two", 3), ("three", 5)] 46 | client = fake_gcs.Client() 47 | bucket = client.bucket(bucket_name) 48 | bucket._add_file(object_name, bytes("onetwothree", "utf-8")) 49 | result = download.decompose("", bucket_name, object_name, objects, 50 | client) 51 | self.assertEqual(result, [b"one", b"two", b"three"]) 52 | self.assertIn("dataflux", client._connection.user_agent) 53 | 54 | def test_download_single(self): 55 | client = fake_gcs.Client() 56 | bucket_name = "test_bucket" 57 | object_name = "test_obj" 58 | content = bytes("onetwothree", "utf-8") 59 | bucket = client.bucket(bucket_name) 60 | bucket._add_file(object_name, content) 61 | result = download.download_single(client, bucket_name, object_name) 62 | self.assertEqual(result, content) 63 | 64 | def test_dataflux_download(self): 65 | bucket_name = "test_bucket" 66 | objects = [("one", 3), ("two", 3), ("three", 5)] 67 | client = fake_gcs.Client() 68 | bucket = client.bucket(bucket_name) 69 | bucket._add_file("one", bytes("one", "utf-8")) 70 | bucket._add_file("two", bytes("two", "utf-8")) 71 | bucket._add_file("three", bytes("three", "utf-8")) 72 | params = download.DataFluxDownloadOptimizationParams(32) 73 | expected_result = [b"one", b"two", b"three"] 74 | result = 
download.dataflux_download("", bucket_name, objects, client, 75 | params) 76 | self.assertEqual(result, expected_result) 77 | # This checks for succesful deletion of the composed object. 78 | if len(bucket.blobs) != 3: 79 | self.fail( 80 | f"expected only 3 objects in bucket, but found {len(bucket.blobs)}" 81 | ) 82 | self.assertIn("dataflux", client._connection.user_agent) 83 | 84 | def test_dataflux_download_parallel(self): 85 | test_cases = [ 86 | { 87 | "name": "exceed number of items", 88 | "procs": 4 89 | }, 90 | { 91 | "name": "single proc", 92 | "procs": 1 93 | }, 94 | { 95 | "name": "standard", 96 | "procs": 2 97 | }, 98 | ] 99 | bucket_name = "test_bucket" 100 | objects = [("one", 3), ("two", 3), ("three", 5)] 101 | client = fake_gcs.Client() 102 | bucket = client.bucket(bucket_name) 103 | bucket._add_file("one", bytes("one", "utf-8")) 104 | bucket._add_file("two", bytes("two", "utf-8")) 105 | bucket._add_file("three", bytes("three", "utf-8")) 106 | params = download.DataFluxDownloadOptimizationParams(32) 107 | expected_result = [b"one", b"two", b"three"] 108 | for tc in test_cases: 109 | result = download.dataflux_download_parallel( 110 | "", 111 | bucket_name, 112 | objects, 113 | client, 114 | params, 115 | tc["procs"], 116 | ) 117 | self.assertEqual(result, expected_result) 118 | # This checks for succesful deletion of the composed object. 119 | if len(bucket.blobs) != 3: 120 | self.fail( 121 | f"{tc['name']} expected only 3 objects in bucket, but found {len(bucket.blobs)}" 122 | ) 123 | 124 | def test_dataflux_download_threaded(self): 125 | test_cases = [ 126 | { 127 | "name": "exceed number of items", 128 | "threads": 4 129 | }, 130 | { 131 | "name": "single thread", 132 | "threads": 1 133 | }, 134 | { 135 | "name": "standard", 136 | "threads": 2 137 | }, 138 | ] 139 | bucket_name = "test_bucket" 140 | objects = [("one", 3), ("two", 3), ("three", 5)] 141 | client = fake_gcs.Client() 142 | bucket = client.bucket(bucket_name) 143 | bucket._add_file("one", bytes("one", "utf-8")) 144 | bucket._add_file("two", bytes("two", "utf-8")) 145 | bucket._add_file("three", bytes("three", "utf-8")) 146 | params = download.DataFluxDownloadOptimizationParams(32) 147 | expected_result = [b"one", b"two", b"three"] 148 | for tc in test_cases: 149 | result = download.dataflux_download_threaded( 150 | "", 151 | bucket_name, 152 | objects, 153 | client, 154 | params, 155 | tc["threads"], 156 | ) 157 | self.assertEqual(result, expected_result) 158 | # This checks for succesful deletion of the composed object. 
159 | if len(bucket.blobs) != 3: 160 | self.fail( 161 | f"{tc['name']} expected only 3 objects in bucket, but found {len(bucket.blobs)}" 162 | ) 163 | self.assertIn("dataflux", client._connection.user_agent) 164 | 165 | def test_dataflux_download_lazy(self): 166 | test_cases = [ 167 | { 168 | "desc": "Need to compose objects before downloading", 169 | "max_composite_object_size": 100, 170 | }, 171 | { 172 | "desc": "Do not need to compose objects before downloading", 173 | "max_composite_object_size": 0, 174 | }, 175 | ] 176 | 177 | for tc in test_cases: 178 | bucket_name = "test_bucket" 179 | objects = [("one", 3), ("two", 3), ("three", 5)] 180 | client = fake_gcs.Client() 181 | bucket = client.bucket(bucket_name) 182 | bucket._add_file("one", bytes("one", "utf-8")) 183 | bucket._add_file("two", bytes("two", "utf-8")) 184 | bucket._add_file("three", bytes("three", "utf-8")) 185 | params = download.DataFluxDownloadOptimizationParams( 186 | tc["max_composite_object_size"]) 187 | expected_result = [b"one", b"two", b"three"] 188 | result = download.dataflux_download_lazy("", bucket_name, objects, 189 | client, params) 190 | self.assertEqual( 191 | list(result), 192 | expected_result, 193 | f"test {tc['desc']} got {list(result)} objects, wanted {expected_result}", 194 | ) 195 | # This checks for succesful deletion of the composed object. 196 | if len(bucket.blobs) != 3: 197 | self.fail( 198 | f"test {tc['desc']} expected only 3 objects in bucket, but found {len(bucket.blobs)}" 199 | ) 200 | self.assertIn("dataflux", client._connection.user_agent) 201 | 202 | def test_clean_composed_object(self): 203 | 204 | class ComposedObj: 205 | 206 | def __init__(self): 207 | self.deleted = False 208 | 209 | def delete(self, retry=None): 210 | self.deleted = True 211 | 212 | current_composed_object = ComposedObj() 213 | download.clean_composed_object(current_composed_object) 214 | if not current_composed_object.deleted: 215 | self.fail("expected composed object cleanup: True, got False") 216 | 217 | 218 | if __name__ == "__main__": 219 | unittest.main() 220 | -------------------------------------------------------------------------------- /dataflux_core/tests/test_fake_gcs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import io 18 | import unittest 19 | 20 | from dataflux_core.tests import fake_gcs 21 | 22 | 23 | class FakeGCSTest(unittest.TestCase): 24 | 25 | def test_list_blobs_empty(self): 26 | bucket = fake_gcs.Client().bucket("test-bucket") 27 | self.assertFalse(bucket.list_blobs()) 28 | 29 | def test_list_blobs_all(self): 30 | bucket = fake_gcs.Client().bucket("test-bucket") 31 | bucket._add_file("obj1", "a") 32 | bucket._add_file("obj2", "aa") 33 | want_objects = [ 34 | bucket.blobs["obj1"], 35 | bucket.blobs["obj2"], 36 | ] 37 | self.assertEqual(bucket.list_blobs(), want_objects) 38 | 39 | def test_list_blobs_with_start_range_equal(self): 40 | bucket = fake_gcs.Client().bucket("test-bucket") 41 | bucket._add_file("obj1", "a") 42 | bucket._add_file("obj2", "aa") 43 | want_objects = [bucket.blobs["obj1"], bucket.blobs["obj2"]] 44 | self.assertEqual(bucket.list_blobs(start_offset=want_objects[0].name), 45 | want_objects) 46 | 47 | def test_list_blobs_with_end_range_equal(self): 48 | bucket = fake_gcs.Client().bucket("test-bucket") 49 | bucket._add_file("obj1", "a") 50 | bucket._add_file("obj2", "aa") 51 | all_objects = [bucket.blobs["obj1"], bucket.blobs["obj2"]] 52 | want_objects = [all_objects[0]] 53 | self.assertEqual(bucket.list_blobs(end_offset=all_objects[1].name), 54 | want_objects) 55 | 56 | def test_list_blobs_with_start_range_greater(self): 57 | bucket = fake_gcs.Client().bucket("test-bucket") 58 | bucket._add_file("obj1", "a") 59 | bucket._add_file("obj2", "aa") 60 | all_objects = [bucket.blobs["obj1"], bucket.blobs["obj2"]] 61 | want_objects = [all_objects[1]] 62 | self.assertEqual(bucket.list_blobs(start_offset=all_objects[1].name), 63 | want_objects) 64 | 65 | def test_list_blobs_with_range(self): 66 | bucket = fake_gcs.Client().bucket("test-bucket") 67 | bucket._add_file("obj1", "a") 68 | bucket._add_file("obj2", "aa") 69 | bucket._add_file("obj3", "aaa") 70 | all_objects = [ 71 | bucket.blobs["obj1"], bucket.blobs["obj2"], bucket.blobs["obj3"] 72 | ] 73 | want_objects = [all_objects[1]] 74 | self.assertEqual( 75 | bucket.list_blobs(start_offset=all_objects[1].name, 76 | end_offset=all_objects[2].name), 77 | want_objects, 78 | ) 79 | 80 | def test_list_blobs_with_max_results(self): 81 | bucket = fake_gcs.Client().bucket("test-bucket") 82 | bucket._add_file("obj1", "a") 83 | bucket._add_file("obj2", "aa") 84 | bucket._add_file("obj3", "aaa") 85 | all_objects = [ 86 | bucket.blobs["obj1"], bucket.blobs["obj2"], bucket.blobs["obj3"] 87 | ] 88 | want_objects = [all_objects[0]] 89 | self.assertEqual(bucket.list_blobs(max_results=1), want_objects) 90 | 91 | def test_list_blobs_with_max_results_and_range(self): 92 | bucket = fake_gcs.Client().bucket("test-bucket") 93 | bucket._add_file("obj1", "a") 94 | bucket._add_file("obj2", "aa") 95 | bucket._add_file("obj3", "aaa") 96 | bucket._add_file("obj4", "aaaa") 97 | all_objects = [ 98 | bucket.blobs["obj1"], 99 | bucket.blobs["obj2"], 100 | bucket.blobs["obj3"], 101 | bucket.blobs["obj4"], 102 | ] 103 | want_objects = [all_objects[1], all_objects[2]] 104 | self.assertEqual( 105 | bucket.list_blobs( 106 | max_results=2, 107 | start_offset=all_objects[1].name, 108 | end_offset=all_objects[3].name, 109 | ), 110 | want_objects, 111 | ) 112 | 113 | def test_bucket_name_none_raises_error(self): 114 | try: 115 | fake_gcs.Client().bucket(None) 116 | except: 117 | return 118 | self.fail("Creating bucket with None name did not raise error") 119 | 120 | def test_blob_write(self): 121 | want_obj = "test" 122 | obj_bytes = 
str.encode(want_obj) 123 | bucket = fake_gcs.Bucket("test-bucket") 124 | blob = bucket.blob(want_obj) 125 | writer = fake_gcs.FakeBlobWriter(blob) 126 | writer.write(obj_bytes) 127 | self.assertEqual(blob.content, b'' + obj_bytes) 128 | 129 | def test_blob_read(self): 130 | bucket = fake_gcs.Bucket("test-bucket") 131 | blob = bucket.blob("test") 132 | self.assertIsInstance(blob.open("rb"), io.BytesIO) 133 | 134 | def test_blob_writer(self): 135 | bucket = fake_gcs.Bucket("test-bucket") 136 | blob = bucket.blob("test") 137 | self.assertIsInstance(blob.open("wb"), fake_gcs.FakeBlobWriter) 138 | 139 | def test_permissions(self): 140 | test_bucket = "test-bucket" 141 | test_perm = ["test-perm-1", "test-perm-3"] 142 | client = fake_gcs.Client() 143 | bucket = client.bucket(test_bucket) 144 | client._set_perm(["test-perm-1", "test-perm-2", "test-perm-3"], 145 | test_bucket) 146 | got_perm = bucket.test_iam_permissions(test_perm) 147 | self.assertEqual(got_perm, test_perm) 148 | 149 | def test_no_permissions(self): 150 | test_bucket = "test-bucket" 151 | test_perm = ["test-perm-1", "test-perm-3"] 152 | client = fake_gcs.Client() 153 | bucket = client.bucket(test_bucket) 154 | got_perm = bucket.test_iam_permissions(test_perm) 155 | self.assertEqual(got_perm, []) 156 | 157 | def test_download_to_file(self): 158 | bucket = fake_gcs.Client().bucket("test-bucket") 159 | name = "obj1" 160 | contents = b"aaaa" 161 | bucket._add_file(name, contents) 162 | 163 | stream = io.BytesIO() 164 | bucket.blob(name).download_to_file(stream) 165 | stream.seek(0) 166 | self.assertEqual(stream.read(), contents) 167 | 168 | 169 | if __name__ == "__main__": 170 | unittest.main() 171 | -------------------------------------------------------------------------------- /dataflux_core/tests/test_fast_list.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import queue 18 | import time 19 | import unittest 20 | 21 | from dataflux_core import fast_list 22 | from dataflux_core.tests import fake_gcs, fake_multiprocess 23 | 24 | 25 | class FastListTest(unittest.TestCase): 26 | 27 | def test_single_worker(self): 28 | """End to end test of a single ListWorker.""" 29 | test_cases = [ 30 | { 31 | "desc": "List 10k objects default", 32 | "object_count": 10000, 33 | "compose_obj_count": 1, 34 | "prefix_obj_count": 0, 35 | "archive_obj_count": 0, 36 | "prefix": "", 37 | "object_size": 10, 38 | "directory_obj_count": 10, 39 | "skip_compose": True, 40 | "list_directory_objects": False, 41 | "expected_objects": 10000, 42 | "expected_api_calls": 3, 43 | }, 44 | { 45 | "desc": "List 10k objects including compose", 46 | "object_count": 10000, 47 | "compose_obj_count": 1, 48 | "prefix_obj_count": 0, 49 | "archive_obj_count": 0, 50 | "prefix": "", 51 | "object_size": 10, 52 | "directory_obj_count": 0, 53 | "skip_compose": False, 54 | "list_directory_objects": False, 55 | "expected_objects": 10001, 56 | "expected_api_calls": 3, 57 | }, 58 | { 59 | "desc": "List 5k objects excluding compose", 60 | "object_count": 5000, 61 | "compose_obj_count": 5000, 62 | "prefix_obj_count": 0, 63 | "archive_obj_count": 0, 64 | "prefix": "", 65 | "object_size": 10, 66 | "directory_obj_count": 0, 67 | "skip_compose": True, 68 | "list_directory_objects": False, 69 | "expected_objects": 5000, 70 | "expected_api_calls": 3, 71 | }, 72 | { 73 | "desc": "List 2k objects, prefix only", 74 | "object_count": 5000, 75 | "compose_obj_count": 5000, 76 | "prefix_obj_count": 2000, 77 | "archive_obj_count": 0, 78 | "prefix": "test-prefix/", 79 | "object_size": 10, 80 | "directory_obj_count": 0, 81 | "skip_compose": True, 82 | "list_directory_objects": False, 83 | "expected_objects": 2000, 84 | "expected_api_calls": 1, 85 | }, 86 | { 87 | "desc": "List directory objects", 88 | "object_count": 10000, 89 | "compose_obj_count": 0, 90 | "prefix_obj_count": 0, 91 | "archive_obj_count": 0, 92 | "prefix": "", 93 | "object_size": 10, 94 | "directory_obj_count": 10, 95 | "skip_compose": True, 96 | "list_directory_objects": True, 97 | "expected_objects": 10010, 98 | "expected_api_calls": 3, 99 | }, 100 | { 101 | "desc": "Skip non-standard class", 102 | "object_count": 10000, 103 | "compose_obj_count": 0, 104 | "prefix_obj_count": 0, 105 | "archive_obj_count": 1000, 106 | "prefix": "", 107 | "object_size": 10, 108 | "directory_obj_count": 0, 109 | "skip_compose": True, 110 | "list_directory_objects": True, 111 | "expected_objects": 10000, 112 | "expected_api_calls": 3, 113 | }, 114 | ] 115 | for tc in test_cases: 116 | client = fake_gcs.Client() 117 | bucket_name = "test_bucket" 118 | bucket = client.bucket(bucket_name) 119 | object_count = tc["object_count"] 120 | object_size = tc["object_size"] 121 | results_queue = queue.Queue() 122 | metadata_queue = queue.Queue() 123 | work_queue = queue.Queue() 124 | work_queue.put((None, "")) 125 | 126 | for i in range(object_count): 127 | bucket._add_file(str(i), b"a" * object_size) 128 | # Add one composed object to make sure it is skipped. 
129 | for i in range(tc["compose_obj_count"]): 130 | bucket._add_file(f"dataflux-composed-objects/composed{i}.tar", 131 | b"a" * object_size) 132 | for i in range(tc["prefix_obj_count"]): 133 | bucket._add_file(f"{tc['prefix']}file{i}.txt", 134 | b"a" * object_size) 135 | for i in range(tc["directory_obj_count"]): 136 | bucket._add_file(f"{tc['prefix']}/dir{i}/", b"") 137 | for i in range(tc["archive_obj_count"]): 138 | bucket._add_file(f"archive_{i}", 139 | b"a" * object_size, 140 | storage_class="ARCHIVE") 141 | list_worker = fast_list.ListWorker( 142 | "test_worker", 143 | "", 144 | bucket_name, 145 | queue.Queue(), 146 | queue.Queue(), 147 | work_queue, 148 | queue.Queue(), 149 | queue.Queue(), 150 | results_queue, 151 | metadata_queue, 152 | queue.Queue(), 153 | "", 154 | "", 155 | skip_compose=tc["skip_compose"], 156 | list_directory_objects=tc["list_directory_objects"], 157 | prefix=tc["prefix"], 158 | ) 159 | list_worker.client = client 160 | list_worker.run() 161 | got_results = set() 162 | while True: 163 | try: 164 | new_results = results_queue.get_nowait() 165 | got_results.update(new_results) 166 | except queue.Empty: 167 | break 168 | expected_objects = tc["expected_objects"] 169 | if len(got_results) != expected_objects: 170 | self.fail( 171 | f"got {len(got_results)} results, want {expected_objects}") 172 | got_total_size = 0 173 | for result in got_results: 174 | got_total_size += result[1] 175 | want_total_size = ( 176 | expected_objects - 177 | (tc["directory_obj_count"] 178 | if tc["list_directory_objects"] else 0)) * object_size 179 | if got_total_size != want_total_size: 180 | self.fail( 181 | f"got {got_total_size} total size, want {want_total_size}") 182 | if list_worker.api_call_count != tc["expected_api_calls"]: 183 | self.fail(f"got {list_worker.api_call_count} API calls on test {tc['desc']}") 184 | self.assertIn("dataflux", client._connection.user_agent) 185 | 186 | def test_manage_tracking_queues(self): 187 | """Tests that all tracking queues are pushed to properly.""" 188 | controller = fast_list.ListingController(10, "", "") 189 | idle_queue = queue.Queue() 190 | idle_queue.put("one") 191 | idle_queue.put("two") 192 | idle_queue.put("three") 193 | unidle_queue = queue.Queue() 194 | unidle_queue.put("one") 195 | hb_queue = queue.Queue() 196 | hb_queue.put("four") 197 | controller.manage_tracking_queues(idle_queue, unidle_queue, hb_queue) 198 | if controller.waiting_for_work != 2: 199 | self.fail( 200 | f"got {controller.waiting_for_work} workers waiting, want 2") 201 | if "four" not in controller.inited: 202 | self.fail( 203 | "expected inited worker to be tracked, but was not added to inited" 204 | ) 205 | if "four" not in controller.checkins: 206 | self.fail( 207 | "expected hb_queue entry to be tracked in checkins, but was not found" 208 | ) 209 | 210 | def test_check_crashed_processes(self): 211 | """Tests that crashed processes are correctly discovered and mitigated.""" 212 | controller = fast_list.ListingController(10, "", "") 213 | controller.inited.add("one") 214 | controller.checkins["one"] = time.time() 215 | if controller.check_crashed_processes(): 216 | self.fail( 217 | "expected no crashed processes, but found crashed process") 218 | controller.checkins["one"] = time.time() - 100 219 | if not controller.check_crashed_processes(): 220 | self.fail( 221 | "expected crashed process to be detected, but found no crashed processes" 222 | ) 223 | 224 | def test_check_crashed_processes_follow_retry_timeout(self): 225 | """Tests that processes aren't considered to be
crashed while waiting to retry API calls""" 226 | controller = fast_list.ListingController( 227 | 10, 228 | "", 229 | "", 230 | retry_config=fast_list.MODIFIED_RETRY.with_delay(maximum=90)) 231 | controller.inited.add("one") 232 | controller.checkins["one"] = time.time() - 170 233 | if controller.check_crashed_processes(): 234 | self.fail( 235 | "expected no crashed processes, but found crashed process") 236 | controller.checkins["one"] = time.time() - 190 237 | if not controller.check_crashed_processes(): 238 | self.fail( 239 | "expected crashed process to be detected, but found no crashed processes" 240 | ) 241 | 242 | def test_cleanup_processes(self): 243 | """Tests that all processes are cleaned up at the end of execution.""" 244 | controller = fast_list.ListingController(10, "", "", True) 245 | procs = [] 246 | results_queue = queue.Queue() 247 | metadata_queue = queue.Queue() 248 | set1 = set() 249 | set2 = set() 250 | set1.add(("item", 1)) 251 | set2.add(("item2", 2)) 252 | results_queue.put(set1) 253 | results_queue.put(set2) 254 | results_set = set() 255 | for i in range(5): 256 | procs.append(fake_multiprocess.FakeProcess(f"proc{i}", False)) 257 | results = controller.cleanup_processes(procs, results_queue, 258 | metadata_queue, results_set) 259 | if results: 260 | self.fail("received results when no processes were alive") 261 | procs = [] 262 | expected = [("item", 1), ("item2", 2)] 263 | for i in range(5): 264 | procs.append(fake_multiprocess.FakeProcess(f"proc{i}", True)) 265 | results = controller.cleanup_processes(procs, results_queue, 266 | metadata_queue, results_set) 267 | self.assertEqual(results, expected) 268 | 269 | def test_terminate_now(self): 270 | controller = fast_list.ListingController(10, "", "", True) 271 | procs = [] 272 | term_tracker = [] 273 | proc_count = 5 274 | for i in range(proc_count): 275 | procs.append( 276 | fake_multiprocess.FakeProcess(f"proc{i}", False, term_tracker)) 277 | 278 | with self.assertRaises(RuntimeError): 279 | controller.terminate_now(procs) 280 | 281 | self.assertEqual(proc_count, len(term_tracker)) 282 | 283 | def test_list_controller_e2e(self): 284 | """Full end to end test of the fast list operation with one worker.""" 285 | client = fake_gcs.Client() 286 | bucket_name = "test_bucket" 287 | bucket = client.bucket(bucket_name) 288 | object_count = 1000 289 | object_size = 10 290 | for i in range(object_count): 291 | bucket._add_file(str(i), "aaaaaaaaaa") 292 | controller = fast_list.ListingController(1, "", bucket_name, True) 293 | controller.client = client 294 | results = controller.run() 295 | if len(results) != object_count: 296 | self.fail(f"got {len(results)} results, want {object_count}") 297 | got_total_size = 0 298 | for result in results: 299 | got_total_size += result[1] 300 | if got_total_size != object_count * object_size: 301 | self.fail( 302 | f"got {got_total_size} total size, want {object_count * object_size}" 303 | ) 304 | 305 | def test_list_controller_e2e_error(self): 306 | """Full end to end test of the fast list operation with one worker which exits with an error.""" 307 | client = fake_gcs.Client() 308 | controller = fast_list.ListingController(1, "", "", True) 309 | controller.client = client 310 | try: 311 | results = controller.run() 312 | except: 313 | return 314 | self.fail( 315 | "Expected controller to raise an error when child process raises an error but it did not" 316 | ) 317 | 318 | def test_wait_for_work_success(self): 319 | """Tests waiting for work when there is still work remaining.""" 320 |
client = fake_gcs.Client() 321 | worker_name = "test_worker" 322 | bucket_name = "test_bucket" 323 | send_work_needed_queue = queue.Queue() 324 | hb_queue = queue.Queue() 325 | direct_work_queue = queue.Queue() 326 | idle_queue = queue.Queue() 327 | unidle_queue = queue.Queue() 328 | results_queue = queue.Queue() 329 | metadata_queue = queue.Queue() 330 | direct_work_queue.put(("y", "z")) 331 | 332 | list_worker = fast_list.ListWorker( 333 | worker_name, 334 | "", 335 | bucket_name, 336 | send_work_needed_queue, 337 | hb_queue, 338 | direct_work_queue, 339 | idle_queue, 340 | unidle_queue, 341 | results_queue, 342 | metadata_queue, 343 | queue.Queue(), 344 | "", 345 | "", 346 | ) 347 | list_worker.client = client 348 | result = list_worker.wait_for_work() 349 | if not result: 350 | self.fail(f"got {result}, but expected True") 351 | self.assertEqual(send_work_needed_queue.get_nowait(), worker_name) 352 | self.assertEqual(idle_queue.get_nowait(), worker_name) 353 | self.assertEqual(hb_queue.get_nowait(), worker_name) 354 | self.assertEqual(unidle_queue.get_nowait(), worker_name) 355 | self.assertEqual(list_worker.start_range, "y") 356 | self.assertEqual(list_worker.end_range, "z") 357 | 358 | def test_wait_for_work_shutdown(self): 359 | """Tests that waiting for work correctly detects shutdown signal.""" 360 | client = fake_gcs.Client() 361 | worker_name = "test_worker" 362 | bucket_name = "test_bucket" 363 | send_work_needed_queue = queue.Queue() 364 | hb_queue = queue.Queue() 365 | direct_work_queue = queue.Queue() 366 | idle_queue = queue.Queue() 367 | unidle_queue = queue.Queue() 368 | results_queue = queue.Queue() 369 | metadata_queue = queue.Queue() 370 | direct_work_queue.put((None, None)) 371 | 372 | list_worker = fast_list.ListWorker( 373 | worker_name, 374 | "", 375 | bucket_name, 376 | send_work_needed_queue, 377 | hb_queue, 378 | direct_work_queue, 379 | idle_queue, 380 | unidle_queue, 381 | results_queue, 382 | metadata_queue, 383 | queue.Queue(), 384 | "", 385 | "", 386 | ) 387 | list_worker.client = client 388 | result = list_worker.wait_for_work() 389 | if result: 390 | self.fail(f"got {result}, but expected False") 391 | self.assertEqual(send_work_needed_queue.get_nowait(), worker_name) 392 | self.assertEqual(idle_queue.get_nowait(), worker_name) 393 | self.assertEqual(hb_queue.get_nowait(), worker_name) 394 | self.assertRaises(queue.Empty, unidle_queue.get_nowait) 395 | 396 | def test_fast_list_exits_on_error(self): 397 | """Test of a single ListWorker with an error.""" 398 | client = fake_gcs.Client() 399 | bucket_name = None 400 | results_queue = queue.Queue() 401 | metadata_queue = queue.Queue() 402 | work_queue = queue.Queue() 403 | work_queue.put((None, "")) 404 | 405 | list_worker = fast_list.ListWorker( 406 | "test_worker", 407 | "", 408 | bucket_name, 409 | queue.Queue(), 410 | queue.Queue(), 411 | work_queue, 412 | queue.Queue(), 413 | queue.Queue(), 414 | results_queue, 415 | metadata_queue, 416 | queue.Queue(), 417 | "", 418 | "", 419 | ) 420 | list_worker.client = client 421 | list_worker.run() 422 | got_results = set() 423 | while True: 424 | try: 425 | new_results = results_queue.get_nowait() 426 | got_results.update(new_results) 427 | except queue.Empty: 428 | break 429 | if len(got_results) != 0: 430 | self.fail(f"got {len(got_results)} results, want 0") 431 | 432 | 433 | if __name__ == "__main__": 434 | unittest.main() 435 | -------------------------------------------------------------------------------- /dataflux_core/tests/test_range_splitter.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import unittest 18 | 19 | from dataflux_core import range_splitter 20 | 21 | 22 | class RangeSplitterTest(unittest.TestCase): 23 | 24 | def test_range_splits(self): 25 | test_cases = [ 26 | { 27 | "desc": "less than one split", 28 | "start": "1", 29 | "end": "2", 30 | "splits": 0, 31 | "expected_error": ValueError, 32 | }, 33 | { 34 | "desc": "end smaller than start range", 35 | "start": "456", 36 | "end": "123", 37 | "splits": 1, 38 | "expected_error": None, 39 | "result": [], 40 | }, 41 | { 42 | "desc": "start and end equal after padding", 43 | "start": "9", 44 | "end": "90", 45 | "splits": 100, 46 | "expected_error": None, 47 | "result": [], 48 | }, 49 | { 50 | "desc": "tight range split", 51 | "start": "199999", 52 | "end": "2", 53 | "splits": 1, 54 | "expected_error": None, 55 | "result": ["1999995"], 56 | }, 57 | { 58 | "desc": 59 | "split full namespace", 60 | "start": 61 | "", 62 | "end": 63 | "", 64 | "splits": 65 | 24, 66 | "expected_error": 67 | None, 68 | "result": [ 69 | "03", 70 | "07", 71 | "11", 72 | "15", 73 | "19", 74 | "23", 75 | "27", 76 | "31", 77 | "35", 78 | "39", 79 | "43", 80 | "47", 81 | "51", 82 | "55", 83 | "59", 84 | "63", 85 | "67", 86 | "71", 87 | "75", 88 | "79", 89 | "83", 90 | "87", 91 | "91", 92 | "95", 93 | ], 94 | }, 95 | { 96 | "desc": "split with only start range", 97 | "start": "5555", 98 | "end": "", 99 | "splits": 4, 100 | "expected_error": None, 101 | "result": ["63", "72", "81", "90"], 102 | }, 103 | { 104 | "desc": "large gap small number of splits", 105 | "start": "0", 106 | "end": "9", 107 | "splits": 3, 108 | "expected_error": None, 109 | "result": ["2", "4", "6"], 110 | }, 111 | { 112 | "desc": "split with longer prefix", 113 | "start": "0123455111", 114 | "end": "012347", 115 | "splits": 1, 116 | "expected_error": None, 117 | "result": ["012346"], 118 | }, 119 | { 120 | "desc": "split with only end range", 121 | "start": "", 122 | "end": "9", 123 | "splits": 1, 124 | "expected_error": None, 125 | "result": ["4"], 126 | }, 127 | ] 128 | rs = range_splitter.new_rangesplitter("0123456789") 129 | for tc in test_cases: 130 | try: 131 | result = rs.split_range(tc["start"], tc["end"], tc["splits"]) 132 | self.assertEqual(result, tc["result"]) 133 | except tc["expected_error"]: 134 | pass 135 | 136 | def test_add_characters_to_alphabet(self): 137 | test_cases = [ 138 | { 139 | "desc": "empty strings", 140 | "chars": "", 141 | "expected_alphabet_map": { 142 | "7": 0, 143 | "8": 1, 144 | "9": 2 145 | }, 146 | }, 147 | { 148 | "desc": "no new characters", 149 | "chars": "998", 150 | "expected_alphabet_map": { 151 | "7": 0, 152 | "8": 1, 153 | "9": 2 154 | }, 155 | }, 156 | { 157 | "desc": "new characters", 158 | "chars": "102", 159 | "expected_alphabet_map": { 160 | "0": 0, 161 | "1": 1, 162 | "2": 2, 163 | "7": 3, 164 | "8": 4, 165 | "9": 5, 166 | }, 167 
| }, 168 | ] 169 | rs = range_splitter.new_rangesplitter("789") 170 | 171 | for tc in test_cases: 172 | rs.add_characters_to_alphabet(tc["chars"]) 173 | self.assertEqual(rs.alphabet_map, tc["expected_alphabet_map"], 174 | tc["desc"]) 175 | 176 | def test_int_to_string(self): 177 | test_cases = [ 178 | { 179 | "desc": "get a string", 180 | "split_point": 15, 181 | "string_len": 3, 182 | "result": "023", 183 | }, 184 | { 185 | "desc": "max number", 186 | "split_point": 215, 187 | "string_len": 3, 188 | "result": "BBB", 189 | }, 190 | { 191 | "desc": "larger than max number", 192 | "split_point": 220, 193 | "string_len": 3, 194 | "result": "00A", 195 | }, 196 | ] 197 | rs = range_splitter.new_rangesplitter("0123AB") 198 | for tc in test_cases: 199 | result = rs.int_to_string(tc["split_point"], tc["string_len"]) 200 | self.assertEqual(result, tc["result"], tc["desc"]) 201 | 202 | def test_int_to_string_empty_range(self): 203 | test_cases = [ 204 | { 205 | "desc": "get a string", 206 | "split_point": 9, 207 | "string_len": 3, 208 | "result": "", 209 | "expected_error": ValueError, 210 | }, 211 | ] 212 | for tc in test_cases: 213 | try: 214 | rs = range_splitter.new_rangesplitter("") 215 | result = rs.int_to_string(tc["split_point"], tc["string_len"]) 216 | self.assertEqual(result, tc["result"], tc["desc"]) 217 | except tc["expected_error"]: 218 | pass 219 | 220 | def test_get_char_or_default(self): 221 | test_cases = [ 222 | { 223 | "desc": "index larger than character string", 224 | "characters": "15", 225 | "index": 3, 226 | "default_char": "0", 227 | "result": "0", 228 | }, 229 | { 230 | "desc": "index in string length", 231 | "characters": "15ABC", 232 | "index": 2, 233 | "default_char": "0", 234 | "result": "A", 235 | }, 236 | { 237 | "desc": "index less than 0", 238 | "characters": "15ABC", 239 | "index": -3, 240 | "default_char": "0", 241 | "result": "0", 242 | }, 243 | { 244 | "desc": "empty character", 245 | "characters": "", 246 | "index": 1, 247 | "default_char": "0", 248 | "result": "0", 249 | }, 250 | ] 251 | for tc in test_cases: 252 | result = range_splitter.get_char_or_default( 253 | tc["characters"], tc["index"], tc["default_char"]) 254 | self.assertEqual(result, tc["result"], tc["desc"]) 255 | 256 | def test_is_range_equal_with_padding(self): 257 | test_cases = [ 258 | { 259 | "desc": "start and end range with padding are equal", 260 | "start": "15", 261 | "end": "1500", 262 | "result": True, 263 | }, 264 | { 265 | "desc": "start and end range with padding are not equal", 266 | "start": "15", 267 | "end": "150A", 268 | "result": False, 269 | }, 270 | { 271 | "desc": "end range is empty", 272 | "start": "15", 273 | "end": "", 274 | "result": False, 275 | }, 276 | { 277 | "desc": "start range is empty", 278 | "start": "", 279 | "end": "09", 280 | "result": False, 281 | }, 282 | { 283 | "desc": "start range is empty", 284 | "start": "", 285 | "end": "0", 286 | "result": True, 287 | }, 288 | { 289 | "desc": "start and end range are empty", 290 | "start": "", 291 | "end": "", 292 | "result": False, 293 | }, 294 | { 295 | "desc": "start and end range are not equal", 296 | "start": "21", 297 | "end": "12", 298 | "result": False, 299 | }, 300 | { 301 | "desc": "start and end range are equal", 302 | "start": "21", 303 | "end": "21", 304 | "result": True, 305 | }, 306 | ] 307 | rs = range_splitter.new_rangesplitter("01A") 308 | for tc in test_cases: 309 | result = rs.is_range_equal_with_padding(tc["start"], tc["end"]) 310 | self.assertEqual(result, tc["result"], tc["desc"]) 311 | 312 |
def test_string_to_minimal_int_range(self): 313 | test_cases = [ 314 | { 315 | "desc": 316 | "split numbers", 317 | "start": 318 | "00", 319 | "end": 320 | "20", 321 | "splits": 322 | 3, 323 | "result": 324 | range_splitter.MinimalIntRange(start_int=0, 325 | end_int=20, 326 | min_len=2), 327 | }, 328 | { 329 | "desc": 330 | "start is non-zero", 331 | "start": 332 | "06", 333 | "end": 334 | "201", 335 | "splits": 336 | 4, 337 | "result": 338 | range_splitter.MinimalIntRange(start_int=6, 339 | end_int=20, 340 | min_len=2), 341 | }, 342 | { 343 | "desc": 344 | "start with smaller suffix", 345 | "start": 346 | "091", 347 | "end": 348 | "10", 349 | "splits": 350 | 2, 351 | "result": 352 | range_splitter.MinimalIntRange(start_int=91, 353 | end_int=100, 354 | min_len=3), 355 | }, 356 | { 357 | "desc": 358 | "start is empty", 359 | "start": 360 | "", 361 | "end": 362 | "10", 363 | "splits": 364 | 2, 365 | "result": 366 | range_splitter.MinimalIntRange(start_int=0, 367 | end_int=10, 368 | min_len=2), 369 | }, 370 | { 371 | "desc": 372 | "start and end are empty", 373 | "start": 374 | "", 375 | "end": 376 | "", 377 | "splits": 378 | 24, 379 | "result": 380 | range_splitter.MinimalIntRange(start_int=0, 381 | end_int=99, 382 | min_len=2), 383 | }, 384 | { 385 | "desc": 386 | "end is empty", 387 | "start": 388 | "5555", 389 | "end": 390 | "", 391 | "splits": 392 | 4, 393 | "result": 394 | range_splitter.MinimalIntRange(start_int=55, 395 | end_int=99, 396 | min_len=2), 397 | }, 398 | { 399 | "desc": 400 | "tight range split", 401 | "start": 402 | "199999", 403 | "end": 404 | "2", 405 | "splits": 406 | 1, 407 | "result": 408 | range_splitter.MinimalIntRange(start_int=1999990, 409 | end_int=2000000, 410 | min_len=7), 411 | }, 412 | { 413 | "desc": 414 | "tight range split", 415 | "start": 416 | "8100", 417 | "end": 418 | "9100", 419 | "splits": 420 | 3, 421 | "result": 422 | range_splitter.MinimalIntRange(start_int=81, 423 | end_int=91, 424 | min_len=2), 425 | }, 426 | ] 427 | rs = range_splitter.new_rangesplitter("0123456789") 428 | for tc in test_cases: 429 | result = rs.string_to_minimal_int_range(tc["start"], tc["end"], 430 | tc["splits"]) 431 | self.assertEqual(result, tc["result"], tc["desc"]) 432 | 433 | def test_generate_splits(self): 434 | test_cases = [ 435 | { 436 | "desc": "less than one split", 437 | "start": "1", 438 | "end": "2", 439 | "splits": 0, 440 | "result": [], 441 | }, 442 | { 443 | "desc": "tight range split", 444 | "start": "199999", 445 | "end": "2", 446 | "splits": 1, 447 | "result": ["1999995"], 448 | }, 449 | { 450 | "desc": 451 | "split full namespace", 452 | "start": 453 | "", 454 | "end": 455 | "", 456 | "splits": 457 | 24, 458 | "result": [ 459 | "03", 460 | "07", 461 | "11", 462 | "15", 463 | "19", 464 | "23", 465 | "27", 466 | "31", 467 | "35", 468 | "39", 469 | "43", 470 | "47", 471 | "51", 472 | "55", 473 | "59", 474 | "63", 475 | "67", 476 | "71", 477 | "75", 478 | "79", 479 | "83", 480 | "87", 481 | "91", 482 | "95", 483 | ], 484 | }, 485 | { 486 | "desc": "split with only start range", 487 | "start": "5555", 488 | "end": "", 489 | "splits": 4, 490 | "result": ["63", "72", "81", "90"], 491 | }, 492 | { 493 | "desc": "large gap small number of splits", 494 | "start": "0", 495 | "end": "9", 496 | "splits": 3, 497 | "result": ["2", "4", "6"], 498 | }, 499 | { 500 | "desc": "split with longer prefix", 501 | "start": "0123455111", 502 | "end": "012347", 503 | "splits": 1, 504 | "result": ["012346"], 505 | }, 506 | { 507 | "desc": "split with only end range", 508 | 
"start": "", 509 | "end": "9", 510 | "splits": 1, 511 | "result": ["4"], 512 | }, 513 | { 514 | "desc": "tight range split", 515 | "start": "8100", 516 | "end": "9100", 517 | "splits": 3, 518 | "result": ["83", "86", "88"], 519 | }, 520 | ] 521 | rs = range_splitter.new_rangesplitter("0123456789") 522 | for tc in test_cases: 523 | min_int_range = rs.string_to_minimal_int_range( 524 | tc["start"], tc["end"], tc["splits"]) 525 | opts = range_splitter.GenerateSplitsOpts(min_int_range, 526 | tc["splits"], tc["start"], 527 | tc["end"]) 528 | result = rs.generate_splits(opts) 529 | self.assertEqual(result, tc["result"], tc["desc"]) 530 | 531 | 532 | if __name__ == "__main__": 533 | unittest.main() 534 | -------------------------------------------------------------------------------- /dataflux_core/tests/test_user_agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import unittest 18 | 19 | from google.api_core.client_info import ClientInfo 20 | from google.cloud import storage 21 | 22 | from dataflux_core import user_agent 23 | 24 | 25 | class UserAgentTest(unittest.TestCase): 26 | 27 | def test_no_existing_info(self): 28 | client = storage.Client() 29 | user_agent.add_dataflux_user_agent(client) 30 | self.assertTrue(client._connection.user_agent.startswith("dataflux")) 31 | 32 | def test_no_existing_string(self): 33 | client = storage.Client(client_info=ClientInfo()) 34 | user_agent.add_dataflux_user_agent(client) 35 | self.assertTrue(client._connection.user_agent.startswith("dataflux")) 36 | 37 | def test_with_existing_string(self): 38 | existing_user_agent = "existing user agent" 39 | client = storage.Client(client_info=ClientInfo( 40 | user_agent=existing_user_agent)) 41 | user_agent.add_dataflux_user_agent(client) 42 | self.assertTrue(client._connection.user_agent.startswith("dataflux")) 43 | self.assertIn(existing_user_agent, 44 | client._connection._client_info.user_agent) 45 | -------------------------------------------------------------------------------- /dataflux_core/user_agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | from google.api_core.client_info import ClientInfo 18 | from google.cloud import storage 19 | 20 | user_agent_string = "dataflux/1.0" 21 | 22 | 23 | def add_dataflux_user_agent(storage_client: storage.Client): 24 | if not storage_client._connection: 25 | return 26 | if not storage_client._connection._client_info: 27 | storage_client._connection._client_info = ClientInfo( 28 | user_agent=user_agent_string) 29 | elif not storage_client._connection._client_info.user_agent: 30 | storage_client._connection._client_info.user_agent = user_agent_string 31 | elif user_agent_string not in storage_client._connection._client_info.user_agent: 32 | storage_client._connection._client_info.user_agent = user_agent_string + \ 33 | " " + storage_client._connection._client_info.user_agent 34 | -------------------------------------------------------------------------------- /docs/code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of 9 | experience, education, socio-economic status, nationality, personal appearance, 10 | race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or reject 41 | comments, commits, code, wiki edits, issues, and other contributions that are 42 | not aligned to this Code of Conduct, or to ban temporarily or permanently any 43 | contributor for other behaviors that they deem inappropriate, threatening, 44 | offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 
54 | 55 | This Code of Conduct also applies outside the project spaces when the Project 56 | Steward has a reasonable belief that an individual's behavior may have a 57 | negative impact on the project or its community. 58 | 59 | ## Conflict Resolution 60 | 61 | We do not believe that all conflict is bad; healthy debate and disagreement 62 | often yield positive results. However, it is never okay to be disrespectful or 63 | to engage in behavior that violates the project’s code of conduct. 64 | 65 | If you see someone violating the code of conduct, you are encouraged to address 66 | the behavior directly with those involved. Many issues can be resolved quickly 67 | and easily, and this gives people more control over the outcome of their 68 | dispute. If you are unable to resolve the matter for any reason, or if the 69 | behavior is threatening or harassing, report it. We are dedicated to providing 70 | an environment where participants feel welcome and safe. 71 | 72 | Reports should be directed to dataflux-customer-support@google.com, the 73 | Project Steward(s) for Dataflux. It is the Project Steward’s duty to 74 | receive and address reported violations of the code of conduct. They will then 75 | work with a committee consisting of representatives from the Open Source 76 | Programs Office and the Google Open Source Strategy team. If for any reason you 77 | are uncomfortable reaching out to the Project Steward, please email 78 | opensource@google.com. 79 | 80 | We will investigate every complaint, but you may not receive a direct response. 81 | We will use our discretion in determining when and how to follow up on reported 82 | incidents, which may range from not taking action to permanent expulsion from 83 | the project and project-sponsored spaces. We will notify the accused of the 84 | report and provide them an opportunity to discuss it before any action is taken. 85 | The identity of the reporter will be omitted from the details of the report 86 | supplied to the accused. In potentially harmful situations, such as ongoing 87 | harassment or threats to anyone's safety, we may take action without notice. 88 | 89 | ## Attribution 90 | 91 | This Code of Conduct is adapted from the Contributor Covenant, version 1.4, 92 | available at 93 | https://www.contributor-covenant.org/version/1/4/code-of-conduct/ -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We would love to accept your patches and contributions to this project. 4 | 5 | ## Before you begin 6 | 7 | ### Sign our Contributor License Agreement 8 | 9 | Contributions to this project must be accompanied by a 10 | [Contributor License Agreement](https://cla.developers.google.com/about) (CLA). 11 | You (or your employer) retain the copyright to your contribution; this simply 12 | gives us permission to use and redistribute your contributions as part of the 13 | project. 14 | 15 | If you or your current employer have already signed the Google CLA (even if it 16 | was for a different project), you probably don't need to do it again. 17 | 18 | Visit <https://cla.developers.google.com/> to see your current agreements or to 19 | sign a new one. 20 | 21 | ### Review our Community Guidelines 22 | 23 | This project follows [Google's Open Source Community 24 | Guidelines](https://opensource.google/conduct/).
25 | 26 | ## Contribution process 27 | 28 | ### Code Reviews 29 | 30 | All submissions, including submissions by project members, require review. We 31 | use [GitHub pull requests](https://docs.github.com/articles/about-pull-requests) 32 | for this purpose. -------------------------------------------------------------------------------- /kokoro/build.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | 17 | # Fail on any error. 18 | set -e 19 | 20 | # Code under repo is checked out to this directory. 21 | cd "${KOKORO_ARTIFACTS_DIR}/github/dataflux-client-python" 22 | 23 | function install_requirements() { 24 | echo Installing requirements. 25 | 26 | echo Installing python3-pip. 27 | sudo apt-get -y install python3-pip 28 | 29 | echo Installing required dependencies. 30 | pip install -r requirements.txt 31 | } 32 | 33 | function run_unit_tests() { 34 | echo Running unit tests. 35 | python -m pytest dataflux_core/tests -vvv --junit-xml="${KOKORO_ARTIFACTS_DIR}/unit_tests/sponge_log.xml" --log-cli-level=DEBUG 36 | } 37 | 38 | install_requirements 39 | run_unit_tests 40 | -------------------------------------------------------------------------------- /kokoro/continuous.cfg: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http:#www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | build_file: "dataflux-client-python/kokoro/build.sh" 16 | 17 | action { 18 | define_artifacts { 19 | regex: "**/unit_tests/sponge_log.xml" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /kokoro/hourly.cfg: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http:#www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | build_file: "dataflux-client-python/kokoro/performance_one_shot.sh" 16 | 17 | env_vars { 18 | key: "PROJECT" 19 | value: "dataflux-project" 20 | } 21 | 22 | env_vars { 23 | key: "BUCKET" 24 | value: "official-dataflux-tess" 25 | } 26 | 27 | env_vars { 28 | key: "PREFIX" 29 | value: "UNet3D/micro/100KB-500MB/train" 30 | } 31 | 32 | env_vars { 33 | key: "LIST_WORKERS" 34 | value: "32" 35 | } 36 | 37 | env_vars { 38 | key: "FILE_COUNT" 39 | value: "5000" 40 | } 41 | 42 | env_vars { 43 | key: "TOTAL_FILE_SIZE" 44 | value: "501770000" 45 | } 46 | 47 | env_vars { 48 | key: "MAX_COMPOSE_BYTES" 49 | value: "100000000" 50 | } 51 | 52 | env_vars { 53 | key: "LIST_TIMEOUT" 54 | value: "30" 55 | } 56 | 57 | env_vars { 58 | key: "DOWNLOAD_TIMEOUT" 59 | value: "400" 60 | } 61 | 62 | action { 63 | define_artifacts { 64 | regex: "**/unit_tests/sponge_log.xml" 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /kokoro/nightly.cfg: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http:#www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | build_file: "dataflux-client-python/kokoro/performance_seg.sh" 16 | 17 | env_vars { 18 | key: "PROJECT" 19 | value: "dataflux-project" 20 | } 21 | 22 | env_vars { 23 | key: "BUCKET" 24 | value: "official-dataflux-tess" 25 | } 26 | 27 | env_vars { 28 | key: "PREFIX" 29 | value: "UNet3D/large/150MB-750GB/train" 30 | } 31 | 32 | env_vars { 33 | key: "LIST_WORKERS" 34 | value: "32" 35 | } 36 | 37 | env_vars { 38 | key: "FILE_COUNT" 39 | value: "5000" 40 | } 41 | 42 | env_vars { 43 | key: "TOTAL_FILE_SIZE" 44 | value: "749947535000" 45 | } 46 | 47 | env_vars { 48 | key: "MAX_COMPOSE_BYTES" 49 | value: "10" 50 | } 51 | 52 | env_vars { 53 | key: "LIST_TIMEOUT" 54 | value: "10" 55 | } 56 | 57 | env_vars { 58 | key: "DOWNLOAD_TIMEOUT" 59 | value: "1400" 60 | } 61 | 62 | env_vars { 63 | key: "PARALLELIZATION" 64 | value: "32" 65 | } 66 | 67 | action { 68 | define_artifacts { 69 | regex: "**/unit_tests/sponge_log.xml" 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /kokoro/performance_one_shot.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | 17 | # Fail on any error. 18 | set -e 19 | 20 | # Code under repo is checked out to this directory. 21 | cd "${KOKORO_ARTIFACTS_DIR}/github/dataflux-client-python" 22 | 23 | function install_requirements() { 24 | echo Installing requirements. 25 | 26 | echo Installing python3-pip. 27 | sudo apt-get -y install python3-pip 28 | 29 | echo Installing required dependencies. 30 | pip install -r requirements.txt 31 | 32 | echo Installing dataflux core. 33 | pip install . 34 | } 35 | 36 | function run_one_shot_tests() { 37 | echo Running performance tests. 38 | # -k one_shot triggers a full list and download, loading all files into memory in one shot. 39 | # Alternatively, the segmented test allows us to divide the download into multiple passes 40 | # to avoid OOM errors. 41 | python3 -m pytest dataflux_core/performance_tests/list_and_download.py -k one_shot -vv --junit-xml="${KOKORO_ARTIFACTS_DIR}/unit_tests/sponge_log.xml" --log-cli-level=DEBUG 42 | } 43 | 44 | install_requirements 45 | run_one_shot_tests 46 | -------------------------------------------------------------------------------- /kokoro/performance_seg.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | 17 | # Fail on any error. 18 | set -e 19 | 20 | # Code under repo is checked out to this directory. 21 | cd "${KOKORO_ARTIFACTS_DIR}/github/dataflux-client-python" 22 | 23 | function install_requirements() { 24 | echo Installing requirements. 25 | 26 | echo Installing python3-pip. 27 | sudo apt-get -y install python3-pip 28 | 29 | echo Installing required dependencies. 30 | pip install -r requirements.txt 31 | 32 | echo Installing dataflux core. 33 | pip install . 34 | } 35 | 36 | function run_segmented_tests() { 37 | echo Running performance tests. 38 | # -k segmented triggers a full list and download, batching the download into 50GB chunks. 39 | # This test sequence is designed to handle volumes of data that exceed memory of the machine. 40 | python3 -m pytest dataflux_core/performance_tests/list_and_download.py -k segmented -vv --junit-xml="${KOKORO_ARTIFACTS_DIR}/unit_tests/sponge_log.xml" --log-cli-level=DEBUG 41 | } 42 | 43 | install_requirements 44 | run_segmented_tests 45 | 46 | -------------------------------------------------------------------------------- /kokoro/presubmit.cfg: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http:#www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | build_file: "dataflux-client-python/kokoro/presubmit.sh" 16 | 17 | env_vars { 18 | key: "PROJECT" 19 | value: "dataflux-project" 20 | } 21 | 22 | env_vars { 23 | key: "BUCKET" 24 | value: "official-dataflux-tess" 25 | } 26 | 27 | env_vars { 28 | key: "PREFIX" 29 | value: "UNet3D/micro/100KB-500MB/train" 30 | } 31 | 32 | env_vars { 33 | key: "LIST_WORKERS" 34 | value: "32" 35 | } 36 | 37 | env_vars { 38 | key: "FILE_COUNT" 39 | value: "5000" 40 | } 41 | 42 | env_vars { 43 | key: "TOTAL_FILE_SIZE" 44 | value: "501770000" 45 | } 46 | 47 | env_vars { 48 | key: "MAX_COMPOSE_BYTES" 49 | value: "100000000" 50 | } 51 | 52 | env_vars { 53 | key: "LIST_TIMEOUT" 54 | value: "10" 55 | } 56 | 57 | env_vars { 58 | key: "DOWNLOAD_TIMEOUT" 59 | value: "400" 60 | } 61 | 62 | env_vars { 63 | key: "PARALLELIZATION" 64 | value: "32" 65 | } 66 | 67 | action { 68 | define_artifacts { 69 | regex: "**/unit_tests/sponge_log.xml" 70 | } 71 | } 72 | 73 | action { 74 | define_artifacts { 75 | regex: "**/integration_tests/sponge_log.xml" 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /kokoro/presubmit.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | 17 | # Fail on any error. 18 | set -e 19 | 20 | # Code under repo is checked out to this directory. 21 | cd "${KOKORO_ARTIFACTS_DIR}/github/dataflux-client-python" 22 | 23 | function install_requirements() { 24 | echo Installing requirements. 25 | 26 | echo Installing python3-pip. 27 | sudo apt-get -y install python3-pip 28 | 29 | echo Installing required dependencies. 30 | pip install -r requirements.txt 31 | 32 | echo Installing dataflux core. 33 | pip install . 34 | } 35 | 36 | function run_presubmit_tests() { 37 | echo Running unit tests. 38 | python3 -m pytest dataflux_core/tests -vv --junit-xml="${KOKORO_ARTIFACTS_DIR}/unit_tests/sponge_log.xml" --log-cli-level=DEBUG 39 | echo Running performance tests. 40 | # -k one_shot triggers a full list and download, loading all files into memory in one shot. 41 | # Alternatively, the segmented test allows us to divide the download into multiple passes 42 | # to avoid OOM errors. 
43 | python3 -m pytest dataflux_core/performance_tests/list_and_download.py -k one_shot -vv --junit-xml="${KOKORO_ARTIFACTS_DIR}/integration_tests/sponge_log.xml" --log-cli-level=DEBUG 44 | } 45 | 46 | install_requirements 47 | run_presubmit_tests 48 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | google-auth==2.39.0 2 | google-cloud-storage 3 | absl-py 4 | pytest 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="dataflux_client_python", 5 | packages=find_packages(), 6 | ) 7 | --------------------------------------------------------------------------------