├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .vscode ├── extensions.json └── settings.json ├── LICENSE ├── README.md ├── __init__.py ├── dataflux_core ├── __init__.py ├── benchmarking │ ├── dataflux_client_bench.py │ ├── dataflux_client_parallel_bench.py │ └── dataflux_client_threaded_bench.py ├── download.py ├── fast_list.py ├── performance_tests │ ├── list_and_download.py │ └── list_only.py ├── range_splitter.py ├── tests │ ├── __init__.py │ ├── fake_gcs.py │ ├── fake_multiprocess.py │ ├── test_download.py │ ├── test_fake_gcs.py │ ├── test_fast_list.py │ ├── test_range_splitter.py │ └── test_user_agent.py └── user_agent.py ├── docs ├── code-of-conduct.md └── contributing.md ├── kokoro ├── build.sh ├── continuous.cfg ├── hourly.cfg ├── nightly.cfg ├── performance_one_shot.sh ├── performance_seg.sh ├── presubmit.cfg └── presubmit.sh ├── requirements.txt └── setup.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Enforce top level code-owners CodeOwners (this will auto-tag reviewers) 2 | * @GoogleCloudPlatform/dataflux-python 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Expected Behavior 2 | 3 | 4 | ## Actual Behavior 5 | 6 | 7 | ## Steps to Reproduce the Problem 8 | 9 | 1. 10 | 1. 11 | 1. 12 | 13 | ## Specifications 14 | 15 | - Version: 16 | - Platform: -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes # 2 | 3 | > It's a good idea to open an issue first for discussion. 4 | 5 | - [ ] Tests pass 6 | - [ ] Appropriate changes to documentation are included in the PR -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Reference: https://github.com/github/gitignore/blob/main/Python.gitignore 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
161 | #.idea/ -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "eeyore.yapf", 4 | "ms-python.flake8", 5 | "ms-python.isort", 6 | "ms-python.python", 7 | ], 8 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.formatOnSaveMode": "file", 4 | "editor.formatOnSave": true, 5 | "editor.defaultFormatter": "eeyore.yapf", 6 | "editor.formatOnType": false, 7 | "files.trimTrailingWhitespace": true, 8 | }, 9 | "python.testing.pytestArgs": [ 10 | "-s", 11 | "dataflux_core/tests", 12 | ], 13 | "python.testing.pytestEnabled": true, 14 | "python.testing.unittestEnabled": false, 15 | "editor.codeActionsOnSave": { 16 | "source.organizeImports": "always" 17 | }, 18 | "files.insertFinalNewline": true, 19 | "files.trimFinalNewlines": true, 20 | } 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dataflux for Google Cloud Storage Python client library 2 | 3 | ## Overview 4 | 5 | This is the client library backing the [Dataflux Dataset for Pytorch](https://github.com/GoogleCloudPlatform/dataflux-pytorch). The purpose of this client is to quickly list and download data stored in GCS for use in Python machine learning applications. 
The core functionalities of this client can be broken down into two key parts. 6 | 7 | ## Fast List 8 | 9 | The fast list component of this client leverages Python multiprocessing to parallelize the listing of files within a GCS bucket. It does this by implementing a work-stealing algorithm, where each worker in the list operation is able to steal work from its siblings once it has finished all currently slated listing work. This parallelization leads to real-world listing speeds up to 10 times faster than sequential listing. Note that parallelization is limited by the machine on which the client runs, and optimal performance is typically found with a worker count that is 1:1 with the available cores. Benchmarking has demonstrated that the larger the object count, the better Dataflux performs when compared to a linear listing. 10 | 11 | ### Example Code 12 | ```python 13 | from dataflux_core import fast_list 14 | 15 | number_of_workers = 20 16 | project = "MyProject" 17 | bucket = "TargetBucket" 18 | target_folder_prefix = "folder1/" 19 | 20 | print("Fast list operation starting...") 21 | list_result = fast_list.ListingController( 22 | max_parallelism=number_of_workers, 23 | project=project, 24 | bucket=bucket, 25 | prefix=target_folder_prefix, 26 | ).run() 27 | ``` 28 | 29 | #### Storage Class 30 | 31 | By default, fast list will only list objects of STANDARD class in GCS buckets. This can be overridden by passing in a string list of storage classes to include while running the Listing Controller. Note that this default behavior was chosen to avoid the cost associated with downloading non-standard GCS classes. Details on GCS Storage Classes can be further explored in the [Storage Class Documentation](https://cloud.google.com/storage/docs/storage-classes). 32 | 33 | #### API Call Count 34 | 35 | Using fast list increases the total number of calls made to the GCS bucket. The increased amount can vary greatly based on the size of the workload and the number of threads, but our benchmarking has shown an upper bound of approximately 2x the number of API calls when compared to a sequential list. API call count tracking will be displayed in logs if the log level is set to debug. To enable these logs, we recommend using the `--log-cli-level=DEBUG` flag. 36 | 37 | ### Fast List Benchmark Results 38 | |File Count|VM Core Count|List Time Without Dataflux|List Time With Dataflux| 39 | |------------|-------------|--------------------------|-----------------------| 40 | |17944239 Obj|48 Core |1630.75s |79.55s | 41 | |5000000 Obj |48 Core |289.95s |23.43s | 42 | |1999002 Obj |48 Core |117.61s |12.45s | 43 | |578411 Obj |48 Core |30.70s |9.39s | 44 | |10013 Obj |48 Core |2.35s |6.06s | 45 | 46 | ## Composed Download 47 | 48 | The composed download component of the client uses the results of the fast list to efficiently download the files necessary for a machine learning workload. When downloading files from remote stores, small file size often bottlenecks the speed at which files can be downloaded. To avoid this bottleneck, composed download leverages the [GCS Compose Objects API](https://cloud.google.com/storage/docs/composing-objects) to concatenate small files into larger composed files in GCS prior to downloading. This greatly improves download performance, particularly on datasets with very large numbers of small files. 49 | 50 | ### Example Code 51 | ```python 52 | from dataflux_core import download 53 | 54 | # The maximum size in bytes of a composite download object.
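# Objects no larger than this size are grouped into composite objects of at most 32 source objects each (the GCS compose limit).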
55 | # If this value is set to 0, no composition will occur. 56 | max_compose_bytes = 10000000 57 | project = "MyProject" 58 | bucket = "TargetBucket" 59 | 60 | download_params = download.DataFluxDownloadOptimizationParams( 61 | max_compose_bytes 62 | ) 63 | 64 | print("Download operation starting...") 65 | download_result = download.dataflux_download( 66 | project_name=project, 67 | bucket_name=bucket, 68 | # The objects parameter takes list_result, the value returned by fast list in the previous code example. 69 | objects=list_result, 70 | dataflux_download_optimization_params=download_params, 71 | ) 72 | ``` 73 | 74 | #### Multiple Download Options 75 | 76 | Looking at the [download code](dataflux_core/download.py) you will notice three distinct download functions. The default function used in the dataflux-pytorch client is `dataflux_download`. The other functions serve to improve performance for specific use cases. 77 | 78 | ###### Parallel Download 79 | 80 | The `dataflux_download_parallel` function is the most performant stand-alone download function. When using the dataflux client library in isolation, this is the recommended download function. Parallelization must be tuned based on available CPU power and network bandwidth. 81 | 82 | ###### Threaded Download 83 | 84 | The `dataflux_download_threaded` function allows for some amount of download parallelization while running within daemonic processes (e.g. a distributed ML workload leveraging [ray](https://www.ray.io/)). Daemonic processes are not permitted to spin up child processes, and thus threading must be used in these instances. Threaded download performance is similar to that of multiprocessing for most use cases, but loses ground as the thread/process count increases. Additionally, threading does not allow for signal interruption, so SIGINT cleanup triggers are disabled when running a threaded download. 85 | 86 | ### Dataflux Download Benchmark Results 87 | 88 | These benchmarks were performed on an n2-standard-48 (48 vCPU) virtual machine on files of approximately 10 KB each. 89 | 90 | |Number of Objects|Standard Linear Download|Dataflux Composed Download|Dataflux Threaded Composed Download (48 Threads)|Dataflux Parallel Composed Download (48 Processes)| 91 | |-----------------|------------------------|--------------------------|------------------------------------------------|--------------------------------------------------| 92 | |111 |18.27 Seconds |5.17 Seconds |3.94 Seconds |2.06 Seconds | 93 | |1111 |176.22 Seconds |61.78 Seconds |5.21 Seconds |3.14 Seconds | 94 | |11098 |1396.98 Seconds |392.23 Seconds |16.85 Seconds |14.88 Seconds | 95 | 96 | 97 | ## Getting Started 98 | 99 | To get started leveraging the dataflux client library, we encourage you to start from the [Dataflux Dataset for Pytorch](https://github.com/GoogleCloudPlatform/dataflux-pytorch). For an example of client-specific implementation, please see the [benchmark code](dataflux_core/benchmarking/dataflux_client_bench.py). 100 | 101 | ## Support 102 | 103 | * Please file a GitHub issue in this repository 104 | * If you need to get in touch with us, email dataflux-customer-support@google.com 105 | 106 | ## Contributing 107 | 108 | We welcome your feedback, issues, and bug fixes. We have a tight roadmap at this time, so if you have a major feature or change in functionality you'd like to contribute, please open a GitHub Issue for discussion prior to sending a pull request.
Please see [CONTRIBUTING](docs/contributing.md) for more information on how to report bugs or submit pull requests. 109 | 110 | ## Code of Conduct 111 | 112 | This project has adopted the Google Open Source Code of Conduct. Please see [code-of-conduct.md](docs/code-of-conduct.md) for more information. 113 | 114 | ## License 115 | 116 | The Dataflux Python Client has an Apache License 2.0. Please see the [LICENSE](LICENSE) file for more information. 117 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | -------------------------------------------------------------------------------- /dataflux_core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | from . import download, fast_list, user_agent 17 | -------------------------------------------------------------------------------- /dataflux_core/benchmarking/dataflux_client_bench.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | 16 | Example benchmark execution: 17 | python3 dataflux_client_bench.py --project=test-project --bucket=test-bucket --bucket-file-count=5 --bucket-file-size=172635220 --num-workers=5 --max-compose=32 18 | """ 19 | 20 | import argparse 21 | import time 22 | 23 | from dataflux_core import download, fast_list 24 | 25 | 26 | def parse_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--project", type=str) 29 | parser.add_argument("--bucket", type=str) 30 | parser.add_argument("--bucket-file-count", type=int, default=None) 31 | parser.add_argument("--bucket-file-size", type=int, default=None) 32 | parser.add_argument("--num-workers", type=int, default=10) 33 | parser.add_argument("--max-compose-bytes", type=int, default=100000000) 34 | parser.add_argument("--prefix", type=str, default="") 35 | return parser.parse_args() 36 | 37 | 38 | def main() -> None: 39 | args = parse_args() 40 | list_start_time = time.time() 41 | print(f"Listing operation started at {list_start_time}") 42 | list_result = fast_list.ListingController(args.num_workers, 43 | args.project, 44 | args.bucket, 45 | prefix=args.prefix).run() 46 | list_end_time = time.time() 47 | if args.bucket_file_count and len(list_result) != args.bucket_file_count: 48 | raise AssertionError( 49 | f"Expected {args.bucket_file_count} files, but got {len(list_result)}" 50 | ) 51 | print( 52 | f"{len(list_result)} objects listed in {list_end_time - list_start_time} seconds" 53 | ) 54 | size = sum([x[1] for x in list_result]) 55 | print(f"Starting download of: {size} bytes of data...") 56 | download_params = download.DataFluxDownloadOptimizationParams( 57 | args.max_compose_bytes) 58 | download_start_time = time.time() 59 | print(f"Download operation started at {download_start_time}") 60 | download_result = download.dataflux_download( 61 | args.project, 62 | args.bucket, 63 | list_result, 64 | dataflux_download_optimization_params=download_params, 65 | ) 66 | download_end_time = time.time() 67 | total_size = sum([len(x) for x in download_result]) 68 | if args.bucket_file_size and total_size != args.bucket_file_size: 69 | raise AssertionError( 70 | f"Expected {args.bucket_file_size} bytes but got {total_size} bytes" 71 | ) 72 | print( 73 | f"{total_size} bytes across {len(list_result)} objects downloaded in {download_end_time - download_start_time} seconds" 74 | ) 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /dataflux_core/benchmarking/dataflux_client_parallel_bench.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | 16 | Example benchmark execution: 17 | python3 dataflux_client_parallel_bench.py --project=test-project --bucket=test-bucket --bucket-file-count=5 --bucket-file-size=172635220 --num-workers=5 --parallelization=30 --max-compose=32 18 | """ 19 | 20 | import argparse 21 | import time 22 | 23 | from dataflux_core import download, fast_list 24 | 25 | 26 | def parse_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--project", type=str) 29 | parser.add_argument("--bucket", type=str) 30 | parser.add_argument("--bucket-file-count", type=int, default=None) 31 | parser.add_argument("--bucket-file-size", type=int, default=None) 32 | parser.add_argument("--num-workers", type=int, default=10) 33 | parser.add_argument("--max-compose-bytes", type=int, default=100000000) 34 | parser.add_argument("--parallelization", type=int, default=20) 35 | parser.add_argument("--prefix", type=str, default="") 36 | return parser.parse_args() 37 | 38 | 39 | def main() -> None: 40 | args = parse_args() 41 | list_start_time = time.time() 42 | print(f"Listing operation started at {list_start_time}") 43 | list_result = fast_list.ListingController(args.num_workers, 44 | args.project, 45 | args.bucket, 46 | prefix=args.prefix).run() 47 | list_end_time = time.time() 48 | if args.bucket_file_count and len(list_result) != args.bucket_file_count: 49 | raise AssertionError( 50 | f"Expected {args.bucket_file_count} files, but got {len(list_result)}" 51 | ) 52 | print( 53 | f"{len(list_result)} objects listed in {list_end_time - list_start_time} seconds" 54 | ) 55 | size = sum([x[1] for x in list_result]) 56 | print(f"Starting download of: {size} bytes of data...") 57 | download_params = download.DataFluxDownloadOptimizationParams( 58 | args.max_compose_bytes) 59 | download_start_time = time.time() 60 | print(f"Download operation started at {download_start_time}") 61 | download_result = download.dataflux_download_parallel( 62 | args.project, 63 | args.bucket, 64 | list_result, 65 | dataflux_download_optimization_params=download_params, 66 | parallelization=args.parallelization, 67 | ) 68 | download_end_time = time.time() 69 | total_size = sum([len(x) for x in download_result]) 70 | if args.bucket_file_size and total_size != args.bucket_file_size: 71 | raise AssertionError( 72 | f"Expected {args.bucket_file_size} bytes but got {total_size} bytes" 73 | ) 74 | print( 75 | f"{total_size} bytes across {len(list_result)} objects downloaded in {download_end_time - download_start_time} seconds using {args.parallelization} processes" 76 | ) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /dataflux_core/benchmarking/dataflux_client_threaded_bench.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | 16 | Example benchmark execution: 17 | python3 dataflux_client_threaded_bench.py --project=test-project --bucket=test-bucket --bucket-file-count=5 --bucket-file-size=172635220 --num-workers=5 --threads=30 --max-compose=32 18 | """ 19 | 20 | import argparse 21 | import time 22 | 23 | from dataflux_core import download, fast_list 24 | 25 | 26 | def parse_args(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--project", type=str) 29 | parser.add_argument("--bucket", type=str) 30 | parser.add_argument("--bucket-file-count", type=int, default=None) 31 | parser.add_argument("--bucket-file-size", type=int, default=None) 32 | parser.add_argument("--num-workers", type=int, default=10) 33 | parser.add_argument("--max-compose-bytes", type=int, default=100000000) 34 | parser.add_argument("--threads", type=int, default=20) 35 | parser.add_argument("--prefix", type=str, default="") 36 | return parser.parse_args() 37 | 38 | 39 | def main() -> None: 40 | args = parse_args() 41 | list_start_time = time.time() 42 | print(f"Listing operation started at {list_start_time}") 43 | list_result = fast_list.ListingController(args.num_workers, 44 | args.project, 45 | args.bucket, 46 | prefix=args.prefix).run() 47 | list_end_time = time.time() 48 | if args.bucket_file_count and len(list_result) != args.bucket_file_count: 49 | raise AssertionError( 50 | f"Expected {args.bucket_file_count} files, but got {len(list_result)}" 51 | ) 52 | print( 53 | f"{len(list_result)} objects listed in {list_end_time - list_start_time} seconds" 54 | ) 55 | size = sum([x[1] for x in list_result]) 56 | print(f"Starting download of: {size} bytes of data...") 57 | download_params = download.DataFluxDownloadOptimizationParams( 58 | args.max_compose_bytes) 59 | download_start_time = time.time() 60 | print(f"Download operation started at {download_start_time}") 61 | download_result = download.dataflux_download_threaded( 62 | args.project, 63 | args.bucket, 64 | list_result, 65 | dataflux_download_optimization_params=download_params, 66 | threads=args.threads, 67 | ) 68 | download_end_time = time.time() 69 | total_size = sum([len(x) for x in download_result]) 70 | if args.bucket_file_size and total_size != args.bucket_file_size: 71 | raise AssertionError( 72 | f"Expected {args.bucket_file_size} bytes but got {total_size} bytes" 73 | ) 74 | print( 75 | f"{total_size} bytes across {len(list_result)} objects downloaded in {download_end_time - download_start_time} seconds using {args.threads} threads" 76 | ) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /dataflux_core/download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | from __future__ import annotations 18 | 19 | import itertools 20 | import logging 21 | import math 22 | import multiprocessing 23 | import queue 24 | import signal 25 | import sys 26 | import threading 27 | import uuid 28 | from typing import Iterator 29 | 30 | from google.api_core.client_info import ClientInfo 31 | from google.cloud import storage 32 | from google.cloud.storage.retry import DEFAULT_RETRY 33 | 34 | from dataflux_core import user_agent 35 | 36 | # https://cloud.google.com/storage/docs/retry-strategy#python. 37 | MODIFIED_RETRY = DEFAULT_RETRY.with_deadline(300.0).with_delay(initial=1.0, 38 | multiplier=1.2, 39 | maximum=45.0) 40 | 41 | # https://cloud.google.com/storage/docs/composite-objects. 42 | MAX_NUM_OBJECTS_TO_COMPOSE = 32 43 | 44 | COMPOSED_PREFIX = "dataflux-composed-objects/" 45 | 46 | current_composed_object = None 47 | 48 | 49 | def compose( 50 | project_name: str, 51 | bucket_name: str, 52 | destination_blob_name: str, 53 | objects: list[tuple[str, int]], 54 | storage_client: object = None, 55 | retry_config: "google.api_core.retry.retry_unary.Retry" = MODIFIED_RETRY, 56 | ) -> object: 57 | """Compose the objects into a composite object, upload the composite object to the GCS bucket and returns it. 58 | 59 | Args: 60 | project_name: the name of the GCP project. 61 | bucket_name: the name of the GCS bucket that holds the objects to compose. 62 | The function uploads the the composed object to this bucket too. 63 | destination_blob_name: the name of the composite object to be created. 64 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket. 65 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 66 | storage_client: the google.cloud.storage.Client initialized with the project. 67 | If not defined, the function will initialize the client with the project_name. 68 | retry_config: The retry parameter to supply to the compose objects call. 69 | 70 | Returns: 71 | the "blob" of the composed object. 72 | """ 73 | if len(objects) > MAX_NUM_OBJECTS_TO_COMPOSE: 74 | raise ValueError( 75 | f"{MAX_NUM_OBJECTS_TO_COMPOSE} objects allowed to compose, received {len(objects)} objects." 76 | ) 77 | 78 | if storage_client is None: 79 | storage_client = storage.Client(project=project_name) 80 | user_agent.add_dataflux_user_agent(storage_client) 81 | 82 | bucket = storage_client.bucket(bucket_name) 83 | destination = bucket.blob(destination_blob_name) 84 | 85 | sources = list() 86 | for each_object in objects: 87 | blob_name = each_object[0] 88 | sources.append(bucket.blob(blob_name)) 89 | 90 | destination.compose(sources, retry=retry_config) 91 | 92 | return destination 93 | 94 | 95 | def decompose( 96 | project_name: str, 97 | bucket_name: str, 98 | composite_object_name: str, 99 | objects: list[tuple[str, int]], 100 | storage_client: object = None, 101 | retry_config: "google.api_core.retry.retry_unary.Retry" = MODIFIED_RETRY, 102 | ) -> list[bytes]: 103 | """Decompose the composite objects and return the decomposed objects contents in bytes. 104 | 105 | Args: 106 | project_name: the name of the GCP project. 107 | bucket_name: the name of the GCS bucket that holds the objects to compose. 108 | The function uploads the the composed object to this bucket too. 109 | composite_object_name: the name of the composite object to be created. 110 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket. 
111 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 112 | storage_client: the google.cloud.storage.Client initialized with the project. 113 | If not defined, the function will initialize the client with the project_name. 114 | retry_config: The retry parameter supplied to the download_as_bytes call. 115 | 116 | Returns: 117 | the contents (in bytes) of the decomposed objects. 118 | """ 119 | if storage_client is None: 120 | storage_client = storage.Client(project=project_name) 121 | user_agent.add_dataflux_user_agent(storage_client) 122 | 123 | res = [] 124 | composed_object_content = download_single( 125 | storage_client, 126 | bucket_name, 127 | composite_object_name, 128 | retry_config=retry_config, 129 | ) 130 | 131 | start = 0 132 | for each_object in objects: 133 | blob_size = each_object[1] 134 | content = composed_object_content[start:start + blob_size] 135 | res.append(content) 136 | start += blob_size 137 | 138 | if start != len(composed_object_content): 139 | logging.error( 140 | "decomposed object length = %s bytes, wanted = %s bytes.", 141 | start, 142 | len(composed_object_content), 143 | ) 144 | return res 145 | 146 | 147 | def download_single( 148 | storage_client: object, 149 | bucket_name: str, 150 | object_name: str, 151 | retry_config: "google.api_core.retry.retry_unary.Retry" = MODIFIED_RETRY, 152 | ) -> bytes: 153 | """Download the contents of this object as a bytes object and return it. 154 | 155 | Args: 156 | storage_client: the google.cloud.storage.Client initialized with the project. 157 | bucket_name: the name of the GCS bucket that holds the object. 158 | object_name: the name of the object to download. 159 | retry_config: The retry parameter supplied to the download_as_bytes call. 160 | 161 | Returns: 162 | the contents of the object in bytes. 163 | """ 164 | bucket_handle = storage_client.bucket(bucket_name) 165 | blob = bucket_handle.blob(object_name) 166 | return blob.download_as_bytes(retry=retry_config) 167 | 168 | 169 | class DataFluxDownloadOptimizationParams: 170 | """Parameters used to optimize DataFlux download performance. 171 | 172 | Attributes: 173 | max_composite_object_size: An integer indicating a cap for the maximum size of the composite object. 174 | 175 | """ 176 | 177 | def __init__(self, max_composite_object_size): 178 | self.max_composite_object_size = max_composite_object_size 179 | 180 | 181 | def df_download_thread( 182 | results_queue: queue.Queue[list[bytes]], 183 | project_name: str, 184 | bucket_name: str, 185 | objects: list[tuple[str, int]], 186 | storage_client: object = None, 187 | dataflux_download_optimization_params: 188 | DataFluxDownloadOptimizationParams = None, 189 | retry_config=MODIFIED_RETRY, 190 | ): 191 | """Threading helper that calls dataflux_download and places results onto queue. 192 | 193 | Args: 194 | results_queue: the queue on which to put all download results. 195 | project_name: the name of the GCP project. 196 | bucket_name: the name of the GCS bucket that holds the objects to compose. 197 | The function uploads the the composed object to this bucket too. 198 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket. 199 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 200 | storage_client: the google.cloud.storage.Client initialized with the project. 201 | If not defined, the function will initialize the client with the project_name. 202 | dataflux_download_optimization_params: the paramemters used to optimize the download performance. 
203 | retry_config: The retry configuration to pass to all retryable download operations 204 | """ 205 | result = dataflux_download( 206 | project_name, 207 | bucket_name, 208 | objects, 209 | storage_client, 210 | dataflux_download_optimization_params, 211 | # Always signify threading enabled so that signal handling is disabled. 212 | threading_enabled=True, 213 | retry_config=retry_config, 214 | ) 215 | results_queue.put(result) 216 | 217 | 218 | def dataflux_download_threaded( 219 | project_name: str, 220 | bucket_name: str, 221 | objects: list[tuple[str, int]], 222 | storage_client: object = None, 223 | dataflux_download_optimization_params: 224 | DataFluxDownloadOptimizationParams = None, 225 | threads: int = 1, 226 | retry_config=MODIFIED_RETRY, 227 | ) -> list[bytes]: 228 | """Perform the DataFlux download algorithm threaded to performantly download the object contents as bytes and return. 229 | 230 | Args: 231 | project_name: the name of the GCP project. 232 | bucket_name: the name of the GCS bucket that holds the objects to compose. 233 | The function uploads the composed object to this bucket too. 234 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket. 235 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 236 | storage_client: the google.cloud.storage.Client initialized with the project. 237 | If not defined, the function will initialize the client with the project_name. 238 | dataflux_download_optimization_params: the parameters used to optimize the download performance. 239 | threads: The number of threads on which to download at any given time. 240 | retry_config: The retry configuration to pass to all retryable download operations 241 | Returns: 242 | the contents of the object in bytes. 243 | """ 244 | chunk_size = math.ceil(len(objects) / threads) 245 | chunks = [] 246 | for i in range(threads): 247 | chunk = objects[i * chunk_size:(i + 1) * chunk_size] 248 | if chunk: 249 | chunks.append(chunk) 250 | results_queues = [queue.Queue() for _ in chunks] 251 | thread_list = [] 252 | for i, chunk in enumerate(chunks): 253 | thread = threading.Thread( 254 | target=df_download_thread, 255 | args=( 256 | results_queues[i], 257 | project_name, 258 | bucket_name, 259 | chunk, 260 | storage_client, 261 | dataflux_download_optimization_params, 262 | retry_config, 263 | ), 264 | ) 265 | thread_list.append(thread) 266 | thread.start() 267 | for thread in thread_list: 268 | thread.join() 269 | results = [] 270 | for q in results_queues: 271 | while not q.empty(): 272 | results.extend(q.get()) 273 | return results 274 | 275 | 276 | def dataflux_download_parallel( 277 | project_name: str, 278 | bucket_name: str, 279 | objects: list[tuple[str, int]], 280 | storage_client: object = None, 281 | dataflux_download_optimization_params: 282 | DataFluxDownloadOptimizationParams = None, 283 | parallelization: int = 1, 284 | retry_config=MODIFIED_RETRY, 285 | ) -> list[bytes]: 286 | """Perform the DataFlux download algorithm in parallel to download the object contents as bytes and return. 287 | 288 | Args: 289 | project_name: the name of the GCP project. 290 | bucket_name: the name of the GCS bucket that holds the objects to compose. 291 | The function uploads the composed object to this bucket too. 292 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket.
293 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 294 | storage_client: the google.cloud.storage.Client initialized with the project. 295 | If not defined, the function will initialize the client with the project_name. 296 | dataflux_download_optimization_params: the paramemters used to optimize the download performance. 297 | parallelization: The number of parallel processes that will simultaneously execute the download. 298 | retry_config: The retry configuration to pass to all retryable download operations 299 | Returns: 300 | the contents of the object in bytes. 301 | """ 302 | chunk_size = math.ceil(len(objects) / parallelization) 303 | chunks = [] 304 | for i in range(parallelization): 305 | chunk = objects[i * chunk_size:(i + 1) * chunk_size] 306 | if chunk: 307 | chunks.append(chunk) 308 | with multiprocessing.Pool(processes=len(chunks)) as pool: 309 | results = pool.starmap( 310 | dataflux_download, 311 | (( 312 | project_name, 313 | bucket_name, 314 | chunk, 315 | storage_client, 316 | dataflux_download_optimization_params, 317 | False, 318 | retry_config, 319 | ) for chunk in chunks), 320 | ) 321 | return list(itertools.chain.from_iterable(results)) 322 | 323 | 324 | def dataflux_download( 325 | project_name: str, 326 | bucket_name: str, 327 | objects: list[tuple[str, int]], 328 | storage_client: object = None, 329 | dataflux_download_optimization_params: 330 | DataFluxDownloadOptimizationParams = None, 331 | threading_enabled=False, 332 | retry_config=MODIFIED_RETRY, 333 | ) -> list[bytes]: 334 | """Perform the DataFlux download algorithm to download the object contents as bytes and return. 335 | 336 | Args: 337 | project_name: the name of the GCP project. 338 | bucket_name: the name of the GCS bucket that holds the objects to compose. 339 | The function uploads the the composed object to this bucket too. 340 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket. 341 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 342 | storage_client: the google.cloud.storage.Client initialized with the project. 343 | If not defined, the function will initialize the client with the project_name. 344 | dataflux_download_optimization_params: the paramemters used to optimize the download performance. 345 | retry_config: The retry configuration to pass to all retryable download operations 346 | Returns: 347 | the contents of the object in bytes. 348 | """ 349 | if storage_client is None: 350 | storage_client = storage.Client(project=project_name) 351 | user_agent.add_dataflux_user_agent(storage_client) 352 | 353 | res = [] 354 | max_composite_object_size = ( 355 | dataflux_download_optimization_params.max_composite_object_size) 356 | 357 | i = 0 358 | # Register the cleanup signal handler for SIGINT. 359 | if not threading_enabled: 360 | signal.signal(signal.SIGINT, term_signal_handler) 361 | global current_composed_object 362 | while i < len(objects): 363 | curr_object_name = objects[i][0] 364 | curr_object_size = objects[i][1] 365 | 366 | if curr_object_size > max_composite_object_size: 367 | # Download the single object. 368 | curr_object_content = download_single( 369 | storage_client=storage_client, 370 | bucket_name=bucket_name, 371 | object_name=curr_object_name, 372 | retry_config=retry_config, 373 | ) 374 | res.append(curr_object_content) 375 | i += 1 376 | else: 377 | # Dynamically compose and decompose based on the object size. 
378 | objects_slice = [] 379 | curr_size = 0 380 | 381 | while (i < len(objects) and curr_size <= max_composite_object_size 382 | and len(objects_slice) < MAX_NUM_OBJECTS_TO_COMPOSE): 383 | curr_size += objects[i][1] 384 | objects_slice.append(objects[i]) 385 | i += 1 386 | 387 | if len(objects_slice) == 1: 388 | object_name = objects_slice[0][0] 389 | curr_object_content = download_single( 390 | storage_client=storage_client, 391 | bucket_name=bucket_name, 392 | object_name=object_name, 393 | retry_config=retry_config, 394 | ) 395 | res.append(curr_object_content) 396 | else: 397 | # If the number of objects > 1, we want to compose, download, decompose and delete the composite object. 398 | # Need to create a unique composite name to avoid mutation on the same object among processes. 399 | composed_object_name = COMPOSED_PREFIX + str(uuid.uuid4()) 400 | composed_object = compose( 401 | project_name, 402 | bucket_name, 403 | composed_object_name, 404 | objects_slice, 405 | storage_client, 406 | retry_config=retry_config, 407 | ) 408 | current_composed_object = composed_object 409 | res.extend( 410 | decompose( 411 | project_name, 412 | bucket_name, 413 | composed_object_name, 414 | objects_slice, 415 | storage_client, 416 | retry_config=retry_config, 417 | )) 418 | 419 | try: 420 | composed_object.delete(retry=retry_config) 421 | current_composed_object = None 422 | except Exception as e: 423 | logging.exception( 424 | f"exception while deleting the composite object: {e}") 425 | return res 426 | 427 | 428 | def dataflux_download_lazy( 429 | project_name: str, 430 | bucket_name: str, 431 | objects: list[tuple[str, int]], 432 | storage_client: object = None, 433 | dataflux_download_optimization_params: 434 | DataFluxDownloadOptimizationParams = None, 435 | threading_enabled=False, 436 | retry_config: "google.api_core.retry.retry_unary.Retry" = MODIFIED_RETRY, 437 | ) -> Iterator[bytes]: 438 | """Perform the DataFlux download algorithm to download the object contents as bytes in a lazy fashion. 439 | 440 | Args: 441 | project_name: the name of the GCP project. 442 | bucket_name: the name of the GCS bucket that holds the objects to compose. 443 | The function uploads the composed object to this bucket too. 444 | objects: A list of tuples which indicate the object names and sizes (in bytes) in the bucket. 445 | Example: [("object_name_A", 1000), ("object_name_B", 2000)] 446 | storage_client: the google.cloud.storage.Client initialized with the project. 447 | If not defined, the function will initialize the client with the project_name. 448 | dataflux_download_optimization_params: the parameters used to optimize the download performance. 449 | retry_config: The retry parameter to supply to the compose objects call. 450 | Returns: 451 | An iterator of the contents of the object in bytes. 452 | """ 453 | if storage_client is None: 454 | storage_client = storage.Client(project=project_name) 455 | user_agent.add_dataflux_user_agent(storage_client) 456 | 457 | max_composite_object_size = ( 458 | dataflux_download_optimization_params.max_composite_object_size) 459 | 460 | i = 0 461 | # Register the cleanup signal handler for SIGINT. 462 | if not threading_enabled: 463 | signal.signal(signal.SIGINT, term_signal_handler) 464 | global current_composed_object 465 | while i < len(objects): 466 | curr_object_name = objects[i][0] 467 | curr_object_size = objects[i][1] 468 | 469 | if curr_object_size > max_composite_object_size: 470 | # Download the single object.
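# Oversized objects bypass composition and are downloaded and yielded one at a time.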
471 | curr_object_content = download_single( 472 | storage_client=storage_client, 473 | bucket_name=bucket_name, 474 | object_name=curr_object_name, 475 | retry_config=retry_config, 476 | ) 477 | yield from [curr_object_content] 478 | i += 1 479 | else: 480 | # Dynamically compose and decompose based on the object size. 481 | objects_slice = [] 482 | curr_size = 0 483 | 484 | while (i < len(objects) and curr_size <= max_composite_object_size 485 | and len(objects_slice) < MAX_NUM_OBJECTS_TO_COMPOSE): 486 | curr_size += objects[i][1] 487 | objects_slice.append(objects[i]) 488 | i += 1 489 | 490 | if len(objects_slice) == 1: 491 | object_name = objects_slice[0][0] 492 | curr_object_content = download_single( 493 | storage_client=storage_client, 494 | bucket_name=bucket_name, 495 | object_name=object_name, 496 | retry_config=retry_config, 497 | ) 498 | yield from [curr_object_content] 499 | else: 500 | # If the number of objects > 1, we want to compose, download, decompose and delete the composite object. 501 | # Need to create a unique composite name to avoid mutation on the same object among processes. 502 | composed_object_name = COMPOSED_PREFIX + str(uuid.uuid4()) 503 | composed_object = compose( 504 | project_name, 505 | bucket_name, 506 | composed_object_name, 507 | objects_slice, 508 | storage_client, 509 | retry_config=retry_config, 510 | ) 511 | current_composed_object = composed_object 512 | yield from (decompose( 513 | project_name, 514 | bucket_name, 515 | composed_object_name, 516 | objects_slice, 517 | storage_client, 518 | retry_config=retry_config, 519 | )) 520 | 521 | try: 522 | composed_object.delete(retry=retry_config) 523 | current_composed_object = None 524 | except Exception as e: 525 | logging.exception( 526 | f"exception while deleting the composite object: {e}") 527 | 528 | 529 | def clean_composed_object(composed_object): 530 | if composed_object: 531 | try: 532 | composed_object.delete(retry=MODIFIED_RETRY) 533 | except Exception as e: 534 | logging.exception( 535 | f"exception while deleting composite object: {e}") 536 | 537 | 538 | def term_signal_handler(signal_num, frame): 539 | print("Ctrl+C interrupt detected. Cleaning up and exiting...") 540 | clean_composed_object(current_composed_object) 541 | sys.exit(0) 542 | -------------------------------------------------------------------------------- /dataflux_core/fast_list.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """
16 |
17 | from __future__ import annotations
18 |
19 | import logging
20 | import multiprocessing
21 | import queue
22 | import time
23 |
24 | from dataflux_core import range_splitter, user_agent
25 | from dataflux_core.download import COMPOSED_PREFIX
26 | from google.api_core.client_info import ClientInfo
27 | from google.cloud import storage
28 | from google.cloud.storage.retry import DEFAULT_RETRY
29 |
30 | DEFAULT_ALLOWED_CLASS = ["STANDARD"]
31 | MODIFIED_RETRY = DEFAULT_RETRY.with_deadline(300.0).with_delay(initial=1.0,
32 | multiplier=1.2,
33 | maximum=45.0)
34 |
35 |
36 | def remove_prefix(text: str, prefix: str):
37 | """Helper function that removes prefix from a string.
38 |
39 | Args:
40 | text: String of text to trim a prefix from.
41 | prefix: String of text that will be trimmed from text.
42 |
43 | Returns:
44 | Text value with the specified prefix removed.
45 | """
46 | # Note that as of python 3.9 removeprefix is built into string.
47 | if text.startswith(prefix):
48 | return text[len(prefix):]
49 | return text
50 |
51 |
52 | class ListWorker(object):
53 | """Worker that lists a range of objects from a GCS bucket.
54 |
55 | Attributes:
56 | name: String name of the worker.
57 | gcs_project: The string name of the google cloud storage project to list from.
58 | bucket: The string name of the storage bucket to list from.
59 | send_work_stealing_needed_queue: Multiprocessing queue pushed to when a worker needs more work.
60 | heartbeat_queue: Multiprocessing queue pushed to indicating worker is running nominally.
61 | direct_work_available_queue: Multiprocessing queue to push available work stealing ranges to.
62 | idle_queue: Multiprocessing queue pushed to when worker is waiting for new work to steal.
63 | unidle_queue: Multiprocessing queue pushed to when the worker has successfully stolen work.
64 | results_queue: Multiprocessing queue onto which the worker pushes its listing results.
65 | metadata_queue: Multiprocessing queue on which the worker pushes tracking metadata.
66 | start_range: String start range worker will begin listing from.
67 | end_range: String end range worker will list until.
68 | retry_config: The retry parameter to supply to list_blob.
69 |
70 | results: Set storing aggregate results prior to pushing onto results_queue.
71 | client: The GCS client through which all GCS list operations are executed.
72 | skip_compose: When true, skip listing files with the composed object prefix.
73 | list_directory_objects: When true, include files with names ending in "/" in the listing. Default false.
74 | prefix: When provided, only list objects under this prefix.
75 | allowed_storage_classes: The set of GCS Storage Class types fast list will include.
76 | max_results: The maximum results per list call (set to max page size of 5000).
77 | splitter: The range_splitter object used by this worker to divide work.
78 | default_alph: The baseline alphabet used to initialize the range_splitter.
79 | api_call_count: Variable tracking the number of GCS list calls made by the worker.
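error_queue: Multiprocessing queue onto which the worker pushes a fatal exception before exiting.
max_retries: The maximum number of consecutive list failures before the worker reports the error on error_queue and exits.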
80 | """ 81 | 82 | def __init__( 83 | self, 84 | name: str, 85 | gcs_project: str, 86 | bucket: str, 87 | send_work_stealing_needed_queue: "multiprocessing.Queue[str]", 88 | heartbeat_queue: "multiprocessing.Queue[str]", 89 | direct_work_available_queue: "multiprocessing.Queue[tuple[str, str]]", 90 | idle_queue: "multiprocessing.Queue[str]", 91 | unidle_queue: "multiprocessing.Queue[str]", 92 | results_queue: "multiprocessing.Queue[set[tuple[str, int]]]", 93 | metadata_queue: "multiprocessing.Queue[tuple[str, int]]", 94 | error_queue: "multiprocessing.Queue[Exception]", 95 | start_range: str, 96 | end_range: str, 97 | retry_config: 98 | "google.api_core.retry.retry_unary.Retry" = MODIFIED_RETRY, 99 | client: storage.Client = None, 100 | skip_compose: bool = True, 101 | list_directory_objects: bool = False, 102 | prefix: str = "", 103 | allowed_storage_classes: list[str] = DEFAULT_ALLOWED_CLASS, 104 | max_retries: int = 5, 105 | ): 106 | self.name = name 107 | self.gcs_project = gcs_project 108 | self.bucket = bucket 109 | self.send_work_stealing_needed_queue = send_work_stealing_needed_queue 110 | self.heartbeat_queue = heartbeat_queue 111 | self.direct_work_available_queue = direct_work_available_queue 112 | self.idle_queue = idle_queue 113 | self.unidle_queue = unidle_queue 114 | self.results_queue = results_queue 115 | self.metadata_queue = metadata_queue 116 | self.error_queue = error_queue 117 | self.start_range = start_range 118 | self.end_range = end_range 119 | self.results: set[tuple[str, int]] = set() 120 | self.client = client 121 | self.max_results = 5000 122 | self.splitter = None 123 | self.default_alph = "ab" 124 | self.skip_compose = skip_compose 125 | self.list_directory_objects = list_directory_objects 126 | self.prefix = prefix if prefix else "" 127 | self.allowed_storage_classes = allowed_storage_classes 128 | self.api_call_count = 0 129 | self.max_retries = max_retries 130 | self.retry_config = retry_config 131 | 132 | def wait_for_work(self) -> bool: 133 | """Indefinitely waits for available work and consumes it once available. 134 | 135 | Returns: 136 | Boolean value indicating that new work has been acquired. The function 137 | will only return False in response to receiving a shutdown signal (None) 138 | from the controller. 139 | """ 140 | self.send_work_stealing_needed_queue.put(self.name) 141 | self.idle_queue.put(self.name) 142 | logging.debug(f"Process {self.name} waiting for work...") 143 | while True: 144 | try: 145 | self.heartbeat_queue.put(self.name) 146 | new_range = self.direct_work_available_queue.get_nowait() 147 | # None is pushed onto the queue as the shutdown signal once all work is finished. 148 | if new_range[0] != None: 149 | self.unidle_queue.put(self.name) 150 | except queue.Empty: 151 | time.sleep(0.1) 152 | continue 153 | break 154 | if new_range[0] is None: 155 | logging.debug(f"Process {self.name} didn't receive work") 156 | # Upon receiving shutdown signal log all relevant metadata. 
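# The metadata tuple is (worker name, number of GCS list API calls made).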
157 | md = (self.name, self.api_call_count) 158 | self.metadata_queue.put(md) 159 | return False 160 | self.start_range = new_range[0] 161 | self.end_range = new_range[1] 162 | logging.debug(f"Process {self.name} got new range [{self.start_range}," 163 | f" {self.end_range}]") 164 | return True 165 | 166 | def run(self) -> None: 167 | """Runs the worker.""" 168 | logging.debug(f"Process {self.name} starting...") 169 | if not self.client: 170 | self.client = storage.Client( 171 | project=self.gcs_project, 172 | client_info=ClientInfo(user_agent="dataflux/0.0"), 173 | ) 174 | else: 175 | user_agent.add_dataflux_user_agent(self.client) 176 | self.splitter = range_splitter.new_rangesplitter(self.default_alph) 177 | # When worker has started, attempt to push to all queues. If the idle or unidle queue 178 | # push fails, the worker will not initialize and will be ignored by the controller. 179 | # This allows us to safely handle multiprocessing failures that occur on startup. 180 | self.idle_queue.put(self.name) 181 | self.unidle_queue.put(self.name) 182 | self.heartbeat_queue.put(self.name) 183 | if self.retry_config: 184 | # Post a heartbeat when retrying so the process doesn't get killed. 185 | # The retry class automatically logs the retry as a debug log. 186 | def on_error(e: Exception): 187 | self.heartbeat_queue.put(self.name) 188 | 189 | self.retry_config._on_error = on_error 190 | if self.start_range is None and self.end_range is None: 191 | if not self.wait_for_work(): 192 | return 193 | retries_remaining = self.max_retries 194 | while True: 195 | has_results = False 196 | try: 197 | list_blob_args = { 198 | "max_results": 199 | self.max_results, 200 | "start_offset": 201 | self.prefix + self.start_range, 202 | "end_offset": ("" if not self.end_range else self.prefix + 203 | self.end_range), 204 | "retry": 205 | self.retry_config, 206 | } 207 | if self.prefix: 208 | list_blob_args["prefix"] = self.prefix 209 | blobs = self.client.bucket( 210 | self.bucket).list_blobs(**list_blob_args) 211 | self.api_call_count += 1 212 | i = 0 213 | self.heartbeat_queue.put(self.name) 214 | for blob in blobs: 215 | i += 1 216 | if ((not self.skip_compose 217 | or not blob.name.startswith(COMPOSED_PREFIX)) and 218 | (self.list_directory_objects or blob.name[-1] != "/") 219 | and blob.storage_class 220 | in self.allowed_storage_classes): 221 | self.results.add((blob.name, blob.size)) 222 | # Remove the prefix from the name so that range calculations remain prefix-agnostic. 223 | # This is necessary due to the unbounded end-range when splitting string namespaces 224 | # of unknown size. 225 | self.start_range = remove_prefix(blob.name, self.prefix) 226 | if i == self.max_results: 227 | # Only allow work stealing when paging. 228 | has_results = True 229 | break 230 | retries_remaining = self.max_retries 231 | except Exception as e: 232 | retries_remaining -= 1 233 | logging.error( 234 | f"process {self.name} encountered error ({retries_remaining} retries left): {str(e)}" 235 | ) 236 | if retries_remaining == 0: 237 | logging.error("process " + self.name + 238 | " is out of retries; exiting") 239 | self.error_queue.put(e) 240 | return 241 | continue 242 | if has_results: 243 | # Check for work stealing. 
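# A waiting worker has signalled on send_work_stealing_needed_queue: split the
# remaining [start_range, end_range) namespace, hand the upper half to that
# worker via direct_work_available_queue, and keep listing the lower half.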
244 | try:
245 | self.send_work_stealing_needed_queue.get_nowait()
246 | except queue.Empty:
247 | continue
248 | split_points = self.splitter.split_range(
249 | self.start_range, self.end_range, 1)
250 | steal_range = (split_points[0], self.end_range)
251 | self.direct_work_available_queue.put(steal_range)
252 | self.end_range = split_points[0]
253 | self.max_results = 5000
254 | else:
255 | # All done, wait for work.
256 | if len(self.results) > 0:
257 | self.results_queue.put(self.results)
258 | self.results = set()
259 | if not self.wait_for_work():
260 | return
261 |
262 |
263 | def run_list_worker(
264 | name: str,
265 | gcs_project: str,
266 | bucket: str,
267 | send_work_stealing_needed_queue: "multiprocessing.Queue[str]",
268 | heartbeat_queue: "multiprocessing.Queue[str]",
269 | direct_work_available_queue: "multiprocessing.Queue[tuple[str, str]]",
270 | idle_queue: "multiprocessing.Queue[str]",
271 | unidle_queue: "multiprocessing.Queue[str]",
272 | results_queue: "multiprocessing.Queue[set[tuple[str, int]]]",
273 | metadata_queue: "multiprocessing.Queue[tuple[str, int]]",
274 | error_queue: "multiprocessing.Queue[Exception]",
275 | start_range: str,
276 | end_range: str,
277 | retry_config: "google.api_core.retry.retry_unary.Retry" = MODIFIED_RETRY,
278 | client: storage.Client = None,
279 | skip_compose: bool = True,
280 | prefix: str = "",
281 | allowed_storage_classes: list[str] = DEFAULT_ALLOWED_CLASS,
282 | ) -> None:
283 | """Helper function to execute a ListWorker.
284 |
285 | Args:
286 | name: String name of the list worker.
287 | gcs_project: String name of the google cloud project in use.
288 | bucket: String name of the google cloud bucket to list from.
289 | send_work_stealing_needed_queue: Multiprocessing queue pushed to when a worker needs more work.
290 | heartbeat_queue: Multiprocessing queue pushed to while a worker is running nominally.
291 | direct_work_available_queue: Multiprocessing queue to push available work stealing ranges to.
292 | idle_queue: Multiprocessing queue pushed to when worker is waiting for new work to steal.
293 | unidle_queue: Multiprocessing queue pushed to when the worker has successfully stolen work.
294 | results_queue: Multiprocessing queue onto which the worker pushes its listing results.
295 | metadata_queue: Multiprocessing queue on which the worker pushes tracking metadata.
296 | error_queue: Multiprocessing queue to track errors from the worker process.
297 | start_range: String start range worker will begin listing from.
298 | end_range: String end range worker will list until.
299 | retry_config: The retry parameter to supply to list_blob.
300 | client: The GCS storage client. When not provided, will be derived from background auth.
301 | skip_compose: When true, skip listing files with the composed object prefix.
302 | prefix: When provided, only list objects under this prefix.
303 | allowed_storage_classes: The set of GCS Storage Class types fast list will include.
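
Example (illustrative sketch; the queue objects below are placeholders that
ListingController normally creates and wires up):

    p = multiprocessing.Process(
        target=run_list_worker,
        args=("dataflux-listing-proc.0", "my-project", "my-bucket",
              steal_queue, heartbeat_queue, work_queue, idle_queue,
              unidle_queue, results_queue, metadata_queue, error_queue,
              "", ""),
    )
    p.start()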
304 | """ 305 | ListWorker( 306 | name, 307 | gcs_project, 308 | bucket, 309 | send_work_stealing_needed_queue, 310 | heartbeat_queue, 311 | direct_work_available_queue, 312 | idle_queue, 313 | unidle_queue, 314 | results_queue, 315 | metadata_queue, 316 | error_queue, 317 | start_range, 318 | end_range, 319 | retry_config, 320 | client, 321 | skip_compose=skip_compose, 322 | prefix=prefix, 323 | allowed_storage_classes=allowed_storage_classes, 324 | ).run() 325 | 326 | 327 | class ListingController(object): 328 | """This controller manages and monitors all listing workers operating on the GCS bucket. 329 | 330 | Attributes: 331 | max_parallelism: The maximum number of processes to start via the Multiprocessing library. 332 | gcs_project: The string name of the google cloud storage project to list from. 333 | bucket: The string name of the storage bucket to list from. 334 | inited: The set of ListWorker processes that have succesfully started. 335 | checkins: A dictionary tracking the last known checkin time for each inited ListWorker. 336 | waiting_for_work: The number of ListWorker processes currently waiting for new listing work. 337 | sort_results: Boolean indicating whether the final result set should be sorted or unsorted. 338 | skip_compose: When true, skip listing files with the composed object prefix. 339 | prefix: When provided, only list objects under this prefix. 340 | allowed_storage_classes: The set of GCS Storage Class types fast list will include. 341 | retry_config: The retry config passed to list_blobs. 342 | """ 343 | 344 | def __init__( 345 | self, 346 | max_parallelism: int, 347 | project: str, 348 | bucket: str, 349 | sort_results: bool = False, 350 | skip_compose: bool = True, 351 | prefix: str = "", 352 | allowed_storage_classes: list[str] = DEFAULT_ALLOWED_CLASS, 353 | retry_config=MODIFIED_RETRY, 354 | ): 355 | # The maximum number of threads utilized in the fast list operation. 356 | self.max_parallelism = max_parallelism 357 | self.gcs_project = project 358 | self.bucket = bucket 359 | self.inited = set() 360 | self.checkins = {} 361 | self.waiting_for_work = 0 362 | self.sort_results = sort_results 363 | self.client = None 364 | self.skip_compose = skip_compose 365 | self.prefix = prefix 366 | self.allowed_storage_classes = allowed_storage_classes 367 | self.retry_config = retry_config 368 | 369 | def manage_tracking_queues( 370 | self, 371 | idle_queue: "multiprocessing.Queue[str]", 372 | unidle_queue: "multiprocessing.Queue[str]", 373 | heartbeat_queue: "multiprocessing.Queue[str]", 374 | ) -> None: 375 | """Manages metadata queues to track execution of the listing operation. 376 | 377 | Args: 378 | idle_queue: the queue workers push to when in need of new work to steal. 379 | unidle_queue: the queue workers push to when they steal work. 380 | heartbeat_queue: the queue workers push to continuously while running nominally. 381 | """ 382 | while True: 383 | try: 384 | idle_queue.get_nowait() 385 | self.waiting_for_work += 1 386 | except queue.Empty: 387 | break 388 | while True: 389 | try: 390 | unidle_queue.get_nowait() 391 | self.waiting_for_work -= 1 392 | except queue.Empty: 393 | break 394 | while True: 395 | try: 396 | inited_worker = heartbeat_queue.get_nowait() 397 | current_time = time.time() 398 | self.inited.add(inited_worker) 399 | self.checkins[inited_worker] = current_time 400 | except queue.Empty: 401 | break 402 | 403 | def check_crashed_processes(self) -> bool: 404 | """Checks if any processes have crashed. 
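A worker is considered crashed when it has not posted a heartbeat within the
check-in window (at least 60 seconds, or twice the retry maximum delay when a
retry config is set).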
405 | 406 | Returns: 407 | A boolean indicating if any processes have crashed after initialization. 408 | If this function returns true, it indicates a need to restart the listing 409 | operation. 410 | """ 411 | logging.debug("checking for crashed procs...") 412 | now = time.time() 413 | crashed = [] 414 | # Wait at least 60 seconds or 2 times the API call retry delay for check-ins, 415 | # otherwise processes might appear to be crashed while retrying API calls. 416 | checkin_wait = 2 * self.retry_config._maximum if self.retry_config else 0 417 | checkin_wait = max(checkin_wait, 60) 418 | for inited_worker, last_checkin in self.checkins.items(): 419 | if now - last_checkin > checkin_wait: 420 | crashed.append(inited_worker) 421 | for proc in crashed: 422 | if proc in self.inited: 423 | logging.error( 424 | "process crash detected, ending list procedure...") 425 | return True 426 | return False 427 | 428 | def cleanup_processes( 429 | self, 430 | processes: "list[multiprocessing.Process]", 431 | results_queue: "multiprocessing.Queue[set[tuple[str, int]]]", 432 | metadata_queue: "multiprocessing.Queue[tuple[str, int]]", 433 | results: "set[tuple[str, int]]", 434 | ) -> list[tuple[str, int]]: 435 | """Allows processes to shut down, kills procs that failed to initialize. 436 | 437 | Args: 438 | processes: the list of processes. 439 | results_queue: the queue for transmitting all result tuples from listing. 440 | metadata_queue: the queue for transmitting all tracking metadata from workers. 441 | results: the set of unique results consumed from results_queue. 442 | 443 | Returns: 444 | A sorted list of (str, int) tuples indicating the name and file size of each 445 | unique file listed in the listing process. 446 | 447 | """ 448 | api_call_count = 0 449 | while True: 450 | alive = False 451 | live_procs = 0 452 | for p in processes: 453 | if p.is_alive(): 454 | alive = True 455 | live_procs += 1 456 | while True: 457 | try: 458 | result = results_queue.get_nowait() 459 | results.update(result) 460 | logging.debug(f"Result count: {len(results)}") 461 | except queue.Empty: 462 | break 463 | time.sleep(0.2) 464 | break 465 | while True: 466 | try: 467 | metadata = metadata_queue.get_nowait() 468 | api_call_count += metadata[1] 469 | except queue.Empty: 470 | break 471 | logging.debug("Live procs: %d", live_procs) 472 | logging.debug("Inited procs: %d", len(self.inited)) 473 | if live_procs <= self.max_parallelism - len(self.inited): 474 | alive = False 475 | # This prevents any memory leaks from multiple executions, but does kill 476 | # the stuck processes very aggressively. It does not cause issues in 477 | # execution, but looks very loud to the user if they are watching debug 478 | # output. 479 | for p in processes: 480 | if p.is_alive(): 481 | p.terminate() 482 | if not alive: 483 | logging.debug(f"Total GCS API call count: {api_call_count}") 484 | if self.sort_results: 485 | return sorted(results) 486 | return list(results) 487 | 488 | def terminate_now( 489 | self, processes: "list[multiprocessing.Process]") -> RuntimeError: 490 | """Terminates all processes immediately. 491 | 492 | Args: 493 | processes: The full list of multiprocessing processes. 
494 | 495 | Returns: 496 | RuntimeError indicating that one or more multiprocess processes has 497 | become unresponsive 498 | """ 499 | for p in processes: 500 | p.terminate() 501 | raise RuntimeError( 502 | "multiprocessing child process became unresponsive; check logs for underlying error" 503 | ) 504 | 505 | def run(self) -> list[tuple[str, int]]: 506 | """Runs the controller that manages fast listing. 507 | 508 | Returns: 509 | A sorted list of (str, int) tuples indicating the name and file size of each 510 | unique file listed in the listing process. 511 | """ 512 | # Define the queues. 513 | send_work_stealing_needed_queue: multiprocessing.Queue[str] = ( 514 | multiprocessing.Queue()) 515 | heartbeat_queue: multiprocessing.Queue[str] = multiprocessing.Queue() 516 | direct_work_available_queue: multiprocessing.Queue[tuple[str, str]] = ( 517 | multiprocessing.Queue()) 518 | idle_queue: multiprocessing.Queue[str] = multiprocessing.Queue() 519 | unidle_queue: multiprocessing.Queue[str] = multiprocessing.Queue() 520 | results_queue: multiprocessing.Queue[set[tuple[str, int]]] = ( 521 | multiprocessing.Queue()) 522 | metadata_queue: multiprocessing.Queue[tuple[ 523 | str, int]] = multiprocessing.Queue() 524 | error_queue: multiprocessing.Queue[Exception] = multiprocessing.Queue() 525 | processes = [] 526 | results: set[tuple[str, int]] = set() 527 | for i in range(self.max_parallelism): 528 | p = multiprocessing.Process( 529 | target=run_list_worker, 530 | args=( 531 | "dataflux-listing-proc." + str(i), 532 | self.gcs_project, 533 | self.bucket, 534 | send_work_stealing_needed_queue, 535 | heartbeat_queue, 536 | direct_work_available_queue, 537 | idle_queue, 538 | unidle_queue, 539 | results_queue, 540 | metadata_queue, 541 | error_queue, 542 | "" if i == 0 else None, 543 | "" if i == 0 else None, 544 | self.retry_config, 545 | self.client, 546 | self.skip_compose, 547 | self.prefix, 548 | self.allowed_storage_classes, 549 | ), 550 | ) 551 | processes.append(p) 552 | p.start() 553 | # Wait before starting the next process to avoid deadlock when multiple processes 554 | # attempt to register with the same multiprocessing queue. 555 | time.sleep(0.1) 556 | while True: 557 | time.sleep(0.2) 558 | try: 559 | e = error_queue.get_nowait() 560 | logging.error( 561 | f"Got error from child process; exiting. Check child process logs for more details. Error: {e}" 562 | ) 563 | return self.terminate_now(processes) 564 | except queue.Empty: 565 | pass 566 | alive = False 567 | for p in processes: 568 | if p.is_alive(): 569 | alive = True 570 | break 571 | new_results = set() 572 | while True: 573 | try: 574 | result = results_queue.get_nowait() 575 | new_results.update(result) 576 | except queue.Empty: 577 | break 578 | if len(new_results) > 0: 579 | results.update(new_results) 580 | logging.debug(f"Result count: {len(results)}") 581 | if not alive: 582 | break 583 | # Update all queues related to tracking process status. 
584 | self.manage_tracking_queues(idle_queue, unidle_queue, 585 | heartbeat_queue) 586 | if self.check_crashed_processes(): 587 | return self.terminate_now(processes) 588 | logging.debug("Inited procs: %d", len(self.inited)) 589 | logging.debug("Waiting for work: %d", self.waiting_for_work) 590 | if len(self.inited) == self.waiting_for_work and ( 591 | self.waiting_for_work > 0): 592 | logging.debug("Exiting, all processes are waiting for work") 593 | for _ in range(self.max_parallelism * 2): 594 | direct_work_available_queue.put((None, None)) 595 | break 596 | while True: 597 | try: 598 | result = results_queue.get_nowait() 599 | results.update(result) 600 | logging.debug(f"Result count: {len(results)}") 601 | except queue.Empty: 602 | break 603 | logging.debug("Got all results, waiting for processes to exit.") 604 | return self.cleanup_processes(processes, results_queue, metadata_queue, 605 | results) 606 | -------------------------------------------------------------------------------- /dataflux_core/performance_tests/list_and_download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import os 18 | import time 19 | import unittest 20 | from math import ceil 21 | 22 | from dataflux_core import download, fast_list 23 | 24 | FIFTY_GB = 50000000000 25 | 26 | 27 | class ClientPerformanceTest(unittest.TestCase): 28 | 29 | def get_config(self): 30 | config = {} 31 | # Gather env vars into dictionary. 32 | config["project"] = os.getenv("PROJECT") 33 | config["bucket"] = os.getenv("BUCKET") 34 | config["prefix"] = os.getenv("PREFIX") 35 | config["num_workers"] = os.getenv("LIST_WORKERS") 36 | config["expected_file_count"] = os.getenv("FILE_COUNT") 37 | config["expected_total_size"] = os.getenv("TOTAL_FILE_SIZE") 38 | config["max_compose_bytes"] = os.getenv("MAX_COMPOSE_BYTES") 39 | config["list_timeout"] = os.getenv("LIST_TIMEOUT") 40 | config["download_timeout"] = os.getenv("DOWNLOAD_TIMEOUT") 41 | config["parallelization"] = os.getenv("PARALLELIZATION") 42 | 43 | # Type convert env vars. 
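# Environment variables arrive as strings (or None); the numeric settings are
# converted below, with max_compose_bytes falling back to 100000000 and
# parallelization falling back to 1 when unset.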
44 | if config["num_workers"]: 45 | config["num_workers"] = int(config["num_workers"]) 46 | if config["expected_file_count"]: 47 | config["expected_file_count"] = int(config["expected_file_count"]) 48 | if config["expected_total_size"]: 49 | config["expected_total_size"] = int(config["expected_total_size"]) 50 | config["max_compose_bytes"] = (int(config["max_compose_bytes"]) 51 | if config["max_compose_bytes"] else 52 | 100000000) 53 | if config["list_timeout"]: 54 | config["list_timeout"] = float(config["list_timeout"]) 55 | if config["download_timeout"]: 56 | config["download_timeout"] = float(config["download_timeout"]) 57 | config["parallelization"] = (int(config["parallelization"]) 58 | if config["parallelization"] else 1) 59 | 60 | return config 61 | 62 | def run_list(self, config): 63 | list_start_time = time.time() 64 | list_result = fast_list.ListingController( 65 | config["num_workers"], 66 | config["project"], 67 | config["bucket"], 68 | prefix=config["prefix"], 69 | ).run() 70 | list_end_time = time.time() 71 | listing_time = list_end_time - list_start_time 72 | if (config["expected_file_count"] 73 | and len(list_result) != config["expected_file_count"]): 74 | raise AssertionError( 75 | f"Expected {config['expected_file_count']} files, but got {len(list_result)}" 76 | ) 77 | if config["list_timeout"] and listing_time > config["list_timeout"]: 78 | raise AssertionError( 79 | f"Expected list operation to complete in under {config['list_timeout']} seconds, but took {listing_time} seconds." 80 | ) 81 | return list_result 82 | 83 | def run_download(self, config, list_result): 84 | segmented = False 85 | if config["expected_total_size"] > FIFTY_GB: 86 | segmented = True 87 | download_params = download.DataFluxDownloadOptimizationParams( 88 | config["max_compose_bytes"]) 89 | download_start_time = time.time() 90 | download_result = None 91 | if config["parallelization"] and config["parallelization"] > 1: 92 | download_result = download.dataflux_download_parallel( 93 | config["project"], 94 | config["bucket"], 95 | list_result, 96 | dataflux_download_optimization_params=download_params, 97 | parallelization=config["parallelization"], 98 | ) 99 | else: 100 | download_result = download.dataflux_download( 101 | config["project"], 102 | config["bucket"], 103 | list_result, 104 | dataflux_download_optimization_params=download_params, 105 | ) 106 | download_end_time = time.time() 107 | downloading_time = download_end_time - download_start_time 108 | total_size = sum([len(x) for x in download_result]) 109 | if (not segmented and config["expected_total_size"] 110 | and total_size != config["expected_total_size"]): 111 | raise AssertionError( 112 | f"Expected {config['expected_total_size']} bytes but got {total_size} bytes" 113 | ) 114 | if config["download_timeout"] and downloading_time > config[ 115 | "download_timeout"]: 116 | raise AssertionError( 117 | f"Expected download operation to complete in under {config['download_timeout']} seconds, but took {downloading_time} seconds." 118 | ) 119 | return total_size 120 | 121 | def test_list_and_download_one_shot(self): 122 | config = self.get_config() 123 | list_result = self.run_list(config) 124 | self.run_download(config, list_result) 125 | 126 | def test_list_and_download_segmented(self): 127 | # This function is needed to avoid OOM errors when the dataset size 128 | # exceeds the memory of the VM. 
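# The listing is broken into roughly 50 GB segments: num_segments is
# expected_total_size / FIFTY_GB, each segment holds
# ceil(expected_file_count / num_segments) entries, and the segments are
# downloaded and size-checked one at a time.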
129 | config = self.get_config() 130 | list_result = self.run_list(config) 131 | num_segments = config["expected_total_size"] / FIFTY_GB 132 | segment_size = ceil(config["expected_file_count"] / num_segments) 133 | segments = [ 134 | list_result[i:i + segment_size] 135 | for i in range(0, len(list_result), segment_size) 136 | ] 137 | total_size = 0 138 | for seg in segments: 139 | total_size += self.run_download(config, seg) 140 | if (config["expected_total_size"] 141 | and total_size != config["expected_total_size"]): 142 | raise AssertionError( 143 | f"Expected {config['expected_total_size']} bytes but got {total_size} bytes" 144 | ) 145 | -------------------------------------------------------------------------------- /dataflux_core/performance_tests/list_only.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import argparse 18 | import time 19 | 20 | from dataflux_core import download, fast_list 21 | 22 | 23 | def parse_args(): 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--project", type=str) 26 | parser.add_argument("--bucket", type=str) 27 | parser.add_argument("--bucket-file-count", type=int, default=None) 28 | parser.add_argument("--bucket-file-size", type=int, default=None) 29 | parser.add_argument("--num-workers", type=int, default=10) 30 | parser.add_argument("--max-compose-bytes", type=int, default=100000000) 31 | parser.add_argument("--prefix", type=str, default="") 32 | return parser.parse_args() 33 | 34 | 35 | def main() -> None: 36 | args = parse_args() 37 | list_start_time = time.time() 38 | print(f"Listing operation started at {list_start_time}") 39 | list_result = fast_list.ListingController(args.num_workers, 40 | args.project, 41 | args.bucket, 42 | prefix=args.prefix).run() 43 | list_end_time = time.time() 44 | if args.bucket_file_count and len(list_result) != args.bucket_file_count: 45 | raise AssertionError( 46 | f"Expected {args.bucket_file_count} files, but got {len(list_result)}" 47 | ) 48 | print( 49 | f"{len(list_result)} objects listed in {list_end_time - list_start_time} seconds" 50 | ) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /dataflux_core/range_splitter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | from __future__ import annotations 18 | 19 | from collections.abc import Sequence 20 | from dataclasses import dataclass 21 | from fractions import Fraction 22 | from itertools import count 23 | 24 | 25 | @dataclass 26 | class MinimalIntRange: 27 | start_int: int 28 | end_int: int 29 | min_len: int 30 | 31 | 32 | @dataclass 33 | class GenerateSplitsOpts: 34 | min_int_range: MinimalIntRange 35 | num_splits: int 36 | start_range: str 37 | end_range: str 38 | 39 | 40 | class RangeSplitter(object): 41 | """Manages splits performed to facilitate the work-stealing algorithm. 42 | 43 | Attr: 44 | alphabet_map: An int mapping for an alphabet of arbitrary character size. 45 | sorted_alphabet: The sorted alphabet that initializes the RangeSplitter. 46 | """ 47 | 48 | min_splits = 2 49 | 50 | def __init__(self, alphabet_map: dict[int, str], 51 | sorted_alphabet: Sequence[str]): 52 | self.alphabet_map = alphabet_map 53 | self.sorted_alphabet = sorted_alphabet 54 | self.alphabet_set = set(sorted_alphabet) 55 | 56 | def split_range( 57 | self, 58 | start_range: str, 59 | end_range: str, 60 | num_splits: int, 61 | ) -> Sequence[str]: 62 | """Creates a given number of splits based on a provided start and end range. 63 | 64 | Args: 65 | start_range (str): The string marking the start of the split range. 66 | end_range (str): The string marking the end of the split range. 67 | num_splits (int): The number of splitpoints to return. 68 | 69 | Returns: 70 | A sequence of split points dividing up the provided range. 71 | """ 72 | if num_splits < 1: 73 | raise ValueError("Got num_splits of %s but need minimum of %s." % 74 | (num_splits, self.min_splits)) 75 | if len(end_range) != 0 and start_range >= end_range: 76 | return [] 77 | 78 | if self.is_range_equal_with_padding(start_range, end_range): 79 | return [] 80 | 81 | self.add_characters_to_alphabet(start_range + end_range) 82 | 83 | min_int_range = self.string_to_minimal_int_range( 84 | start_range, end_range, num_splits) 85 | 86 | split_points = self.generate_splits( 87 | GenerateSplitsOpts(min_int_range, num_splits, start_range, 88 | end_range)) 89 | return split_points 90 | 91 | def generate_splits(self, opts: GenerateSplitsOpts) -> Sequence[str]: 92 | """Generates a list of split points. 93 | 94 | Args: 95 | opts (GenerateSplitOpts): Set of options for generating splitpoints 96 | 97 | Returns: 98 | A list of split points. 99 | """ 100 | start_int = opts.min_int_range.start_int 101 | end_int = opts.min_int_range.end_int 102 | min_len = opts.min_int_range.min_len 103 | 104 | range_diff = end_int - start_int 105 | split_points = [] 106 | range_interval = opts.num_splits + 1 107 | adjustment = Fraction(range_diff / range_interval) 108 | 109 | for i in range(1, opts.num_splits + 1): 110 | split_point = start_int + adjustment * i 111 | split_string = self.int_to_string(int(split_point), min_len) 112 | 113 | is_greater_than_start = (len(split_string) > 0 114 | and split_string > opts.start_range) 115 | is_less_than_end = len( 116 | opts.end_range) == 0 or (len(split_string) > 0 117 | and split_string < opts.end_range) 118 | 119 | if is_greater_than_start and is_less_than_end: 120 | split_points.append(split_string) 121 | 122 | return split_points 123 | 124 | def int_to_string(self, split_point: int, string_len: int) -> str: 125 | """Converts the base len(alphabet) int back into a string. 
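The split point is treated as a base-len(alphabet) number: digits are emitted
least significant first and the result is reversed, padded to string_len
characters.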
126 | 127 | Args: 128 | split_point (int): A valid split point int to be converted to string. 129 | string_len (int): The required length of the resulting string. 130 | 131 | Returns: 132 | A string derived from a base len(alphabet) int. 133 | """ 134 | alphabet_len = len(self.sorted_alphabet) 135 | split_string = "" 136 | 137 | for _ in range(string_len): 138 | remainder = split_point % alphabet_len 139 | split_point //= alphabet_len 140 | split_string += self.sorted_alphabet[remainder] 141 | 142 | # This is assembeled backwards via division, so we reverse the final string. 143 | return split_string[::-1] 144 | 145 | def string_to_minimal_int_range(self, start_range: str, end_range: str, 146 | num_splits: int) -> MinimalIntRange: 147 | """Converts a string range to a minimal integer range. 148 | 149 | Args: 150 | start_range (str): The string marking the start of the split range. 151 | end_range (str): The string marking the end of the split range. 152 | num_splits (int): The number of splitpoints to return. 153 | 154 | Returns: 155 | A minimal integer range. 156 | """ 157 | 158 | start_int = 0 159 | end_int = 0 160 | 161 | alphabet_len = len(self.sorted_alphabet) 162 | start_char = self.sorted_alphabet[0] 163 | end_char = self.sorted_alphabet[-1] 164 | 165 | end_default_char = start_char 166 | if len(end_range) == 0: 167 | end_default_char = end_char 168 | 169 | for i in count(0): 170 | start_pos = self.alphabet_map[get_char_or_default( 171 | start_range, i, start_char)] 172 | start_int *= alphabet_len 173 | start_int += start_pos 174 | 175 | end_pos = self.alphabet_map[get_char_or_default( 176 | end_range, i, end_default_char)] 177 | end_int *= alphabet_len 178 | end_int += end_pos 179 | 180 | difference = end_int - start_int 181 | if difference > num_splits: 182 | # Due to zero indexing, min length must have 1 added to it. 183 | return MinimalIntRange(start_int, end_int, i + 1) 184 | 185 | def is_range_equal_with_padding(self, start_range: str, end_range: str): 186 | """Checks for equality between two string ranges. 187 | 188 | Args: 189 | start_range (str): The start range for the split. 190 | end_range (str): The end range for the split. 191 | 192 | Returns: 193 | Boolean indicating equality of the two provided ranges. 194 | """ 195 | 196 | if len(end_range) == 0: 197 | return False 198 | 199 | longest = max(len(start_range), len(end_range)) 200 | 201 | smallest_char = self.sorted_alphabet[0] 202 | 203 | for i in range(longest): 204 | char_start = get_char_or_default(start_range, i, smallest_char) 205 | char_end = get_char_or_default(end_range, i, smallest_char) 206 | 207 | if char_start != char_end: 208 | return False 209 | 210 | return True 211 | 212 | def add_characters_to_alphabet(self, characters: str): 213 | """Adds a character to the known alphabet. 214 | 215 | Args: 216 | characters: The string of characters to add to the library. 217 | """ 218 | unique_characters = set(characters) 219 | new_alphabet = self.alphabet_set.union(unique_characters) 220 | if len(new_alphabet) != len(self.alphabet_set): 221 | self.sorted_alphabet = sorted(new_alphabet) 222 | self.alphabet_map = { 223 | val: index 224 | for index, val in enumerate(self.sorted_alphabet) 225 | } 226 | 227 | 228 | def get_char_or_default(characters: str, index: int, default_char: str) -> str: 229 | """Returns the character at the given index or the default character if the index is out of bounds. 230 | 231 | Args: 232 | characters (str): The range string to check. 
233 | index (int): The current iteration index across characters. 234 | default_char (str): The smallest character in the implemented char set. 235 | 236 | Returns: 237 | The resulting character for the given index. 238 | """ 239 | if index < 0 or index >= len(characters): 240 | return default_char 241 | 242 | return characters[index] 243 | 244 | 245 | def new_rangesplitter(alphabet: str) -> RangeSplitter: 246 | """Creates a new RangeSplitter with the given alphabets. 247 | 248 | Note that the alphabets are a predetermined set of characters 249 | by the work-stealing algorithm, and the characters are guaranteed to be unique. 250 | 251 | Args: 252 | alphabet (str): The full set of characters used for this range splitter. 253 | 254 | Returns: 255 | An instance of the RangeSplitter class that is used to manage splits 256 | performed to facilitate the work-stealing algorithm. 257 | """ 258 | if len(alphabet) == 0: 259 | raise ValueError("Cannot split with an empty alphabet.") 260 | sorted_alphabet = sorted(alphabet) 261 | alphabet_map = {val: index for index, val in enumerate(sorted_alphabet)} 262 | return RangeSplitter(alphabet_map, sorted_alphabet) 263 | -------------------------------------------------------------------------------- /dataflux_core/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | from . import fake_gcs 17 | -------------------------------------------------------------------------------- /dataflux_core/tests/fake_gcs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | Fake GCS package supporting the GCS API methods used for Dataflux. 17 | 18 | The fake_gcs package provides Client, Bucket, and GCSObject classes matching the 19 | interfaces used in Dataflux code. The fake is implemented using these classes, 20 | rather than by using an HTTP server and connecting the actual GCS client to the 21 | server, which could be a future improvement. 
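
A minimal usage sketch (the names are arbitrary test values):

    client = Client()
    bucket = client.bucket("test_bucket")
    bucket._add_file("obj1", b"contents")
    blobs = bucket.list_blobs()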
22 | """ 23 | 24 | from __future__ import annotations 25 | 26 | import io 27 | 28 | from google.cloud.storage import _http 29 | 30 | 31 | class Bucket(object): 32 | """Bucket represents a bucket in GCS, containing objects.""" 33 | 34 | list_error = None 35 | """If set, an error which is returned when calling list_blobs""" 36 | 37 | def __init__(self, name: str): 38 | if not name: 39 | raise Exception("bucket name must not be empty") 40 | self.name = name 41 | self.blobs: dict[str, Blob] = dict() 42 | self.permissions: any = [] 43 | 44 | def list_blobs( 45 | self, 46 | max_results: int = 0, 47 | start_offset: str = "", 48 | end_offset: str = "", 49 | prefix: str = "", 50 | retry: "google.api_core.retry.retry_unary.Retry" = None, 51 | ) -> list[Blob]: 52 | results = [] 53 | for name in sorted(self.blobs): 54 | if max_results and len(results) == max_results: 55 | break 56 | if (not start_offset or name 57 | >= start_offset) and (not end_offset or name < end_offset): 58 | if name.startswith(prefix): 59 | results.append(self.blobs[name]) 60 | return results 61 | 62 | def blob(self, name: str, missing_path: bool = False): 63 | if name == "missing-path": 64 | missing_path = True 65 | if name not in self.blobs: 66 | self.blobs[name] = Blob( 67 | name, bucket=self, missing_bucket=missing_path) 68 | return self.blobs[name] 69 | 70 | def _add_file(self, 71 | filename: str, 72 | content: bytes, 73 | storage_class="STANDARD"): 74 | self.blobs[filename] = Blob(filename, 75 | content, 76 | self, 77 | storage_class=storage_class) 78 | 79 | def test_iam_permissions(self, permissions: any): 80 | return [p for p in permissions if p in self.permissions] 81 | 82 | 83 | class FakeBlobWriter(object): 84 | """Represents fake BlobWriter.""" 85 | 86 | def __init__(self, blob): 87 | self.blob = blob 88 | 89 | def write(self, data: bytes): 90 | self.blob.content += data 91 | 92 | def flush(self): 93 | pass 94 | 95 | def __enter__(self): 96 | return self 97 | 98 | def __exit__(self, exc_type, exc_val, exc_tb): 99 | pass 100 | 101 | 102 | class Blob(object): 103 | """Blob represents a GCS blob object. 104 | 105 | Attributes: 106 | name: The name of the blob. 107 | retry: A variable tracking the retry policy input. 108 | content: The byte content of the Blob. 109 | bucket: The bucket object in which this Blob resides. 110 | size: The size in bytes of the Blob. 
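storage_class: The storage class string for the Blob; defaults to "STANDARD".
missing_bucket: When true, exists() reports the Blob as missing.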
111 | """ 112 | 113 | def __init__( 114 | self, 115 | name: str, 116 | content: bytes = b"", 117 | bucket: Bucket = None, 118 | storage_class="STANDARD", 119 | missing_bucket: bool = False 120 | ): 121 | self.name = name 122 | self.retry = None 123 | self.content = content 124 | self.bucket = bucket 125 | self.size = len(self.content) 126 | self.storage_class = storage_class 127 | self.missing_bucket = missing_bucket 128 | 129 | def compose(self, sources: list[str], retry=None): 130 | b = b"" 131 | for item in sources: 132 | b += self.bucket.blobs[item.name].content 133 | self.content = b 134 | self.retry = retry 135 | 136 | def delete(self, retry=None): 137 | del self.bucket.blobs[self.name] 138 | 139 | def exists(self, retry=None): 140 | return not self.missing_bucket 141 | 142 | def download_as_bytes(self, retry=None): 143 | return self.content 144 | 145 | def download_to_file(self, file_obj: io.IOBase) -> None: 146 | file_obj.write(self.content) 147 | 148 | def open(self, mode: str, ignore_flush: bool = False): 149 | if mode == "rb": 150 | return io.BytesIO(self.content) 151 | elif mode == "wb": 152 | self.content = b"" 153 | return FakeBlobWriter(self) 154 | raise NotImplementedError( 155 | "Supported modes strings are 'rb' and 'wb' only.") 156 | 157 | 158 | class Client(object): 159 | """Client represents a GCS client which can provide bucket handles.""" 160 | 161 | def __init__(self): 162 | self.buckets: dict[str, Bucket] = dict() 163 | self.content: dict[str, tuple[str, str]] = dict() 164 | self._connection = _http.Connection(self) 165 | 166 | def bucket(self, name: str) -> Bucket: 167 | if name not in self.buckets: 168 | self.buckets[name] = Bucket(name) 169 | if name in self.content: 170 | self.buckets[name].content = self.content[name] 171 | return self.buckets[name] 172 | 173 | def _set_perm(self, permissions: any, name: str): 174 | self.buckets[name].permissions = permissions 175 | -------------------------------------------------------------------------------- /dataflux_core/tests/fake_multiprocess.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | Fake GCS package supporting the GCS API methods used for Dataflux. 17 | 18 | The fake_gcs package provides Client, Bucket, and GCSObject classes matching the 19 | interfaces used in Dataflux code. The fake is implemented using these classes, 20 | rather than by using an HTTP server and connecting the actual GCS client to the 21 | server, which could be a future improvement. 
22 | """ 23 | 24 | 25 | class FakeProcess(object): 26 | """A fake multiprocessing process for testing purposes.""" 27 | 28 | def __init__(self, name: str, alive: bool = False, term_tracker=[]): 29 | self.name = name 30 | self.alive = alive 31 | self.term_tracker = term_tracker 32 | 33 | def is_alive(self): 34 | if self.alive: 35 | self.alive = False 36 | return True 37 | return self.alive 38 | 39 | def terminate(self): 40 | self.term_tracker.append("") 41 | -------------------------------------------------------------------------------- /dataflux_core/tests/test_download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import unittest 18 | from unittest import mock 19 | 20 | from dataflux_core import download 21 | from dataflux_core.tests import fake_gcs 22 | 23 | 24 | class DownloadTestCase(unittest.TestCase): 25 | 26 | def test_compose(self): 27 | bucket_name = "test_bucket" 28 | destination_blob_name = "dest_name" 29 | objects = [("one", 3), ("two", 3), ("three", 5)] 30 | client = fake_gcs.Client() 31 | bucket = client.bucket(bucket_name) 32 | bucket._add_file("one", bytes("one", "utf-8")) 33 | bucket._add_file("two", bytes("two", "utf-8")) 34 | bucket._add_file("three", bytes("three", "utf-8")) 35 | expected_result = b"onetwothree" 36 | blob = download.compose("", bucket_name, destination_blob_name, 37 | objects, client) 38 | self.assertEqual(blob.name, destination_blob_name) 39 | self.assertEqual(blob.content, expected_result) 40 | self.assertIn("dataflux", client._connection.user_agent) 41 | 42 | def test_decompose(self): 43 | bucket_name = "test_bucket" 44 | object_name = "test_obj" 45 | objects = [("one", 3), ("two", 3), ("three", 5)] 46 | client = fake_gcs.Client() 47 | bucket = client.bucket(bucket_name) 48 | bucket._add_file(object_name, bytes("onetwothree", "utf-8")) 49 | result = download.decompose("", bucket_name, object_name, objects, 50 | client) 51 | self.assertEqual(result, [b"one", b"two", b"three"]) 52 | self.assertIn("dataflux", client._connection.user_agent) 53 | 54 | def test_download_single(self): 55 | client = fake_gcs.Client() 56 | bucket_name = "test_bucket" 57 | object_name = "test_obj" 58 | content = bytes("onetwothree", "utf-8") 59 | bucket = client.bucket(bucket_name) 60 | bucket._add_file(object_name, content) 61 | result = download.download_single(client, bucket_name, object_name) 62 | self.assertEqual(result, content) 63 | 64 | def test_dataflux_download(self): 65 | bucket_name = "test_bucket" 66 | objects = [("one", 3), ("two", 3), ("three", 5)] 67 | client = fake_gcs.Client() 68 | bucket = client.bucket(bucket_name) 69 | bucket._add_file("one", bytes("one", "utf-8")) 70 | bucket._add_file("two", bytes("two", "utf-8")) 71 | bucket._add_file("three", bytes("three", "utf-8")) 72 | params = download.DataFluxDownloadOptimizationParams(32) 73 | expected_result = [b"one", b"two", b"three"] 74 | result = 
download.dataflux_download("", bucket_name, objects, client, 75 | params) 76 | self.assertEqual(result, expected_result) 77 | # This checks for succesful deletion of the composed object. 78 | if len(bucket.blobs) != 3: 79 | self.fail( 80 | f"expected only 3 objects in bucket, but found {len(bucket.blobs)}" 81 | ) 82 | self.assertIn("dataflux", client._connection.user_agent) 83 | 84 | def test_dataflux_download_parallel(self): 85 | test_cases = [ 86 | { 87 | "name": "exceed number of items", 88 | "procs": 4 89 | }, 90 | { 91 | "name": "single proc", 92 | "procs": 1 93 | }, 94 | { 95 | "name": "standard", 96 | "procs": 2 97 | }, 98 | ] 99 | bucket_name = "test_bucket" 100 | objects = [("one", 3), ("two", 3), ("three", 5)] 101 | client = fake_gcs.Client() 102 | bucket = client.bucket(bucket_name) 103 | bucket._add_file("one", bytes("one", "utf-8")) 104 | bucket._add_file("two", bytes("two", "utf-8")) 105 | bucket._add_file("three", bytes("three", "utf-8")) 106 | params = download.DataFluxDownloadOptimizationParams(32) 107 | expected_result = [b"one", b"two", b"three"] 108 | for tc in test_cases: 109 | result = download.dataflux_download_parallel( 110 | "", 111 | bucket_name, 112 | objects, 113 | client, 114 | params, 115 | tc["procs"], 116 | ) 117 | self.assertEqual(result, expected_result) 118 | # This checks for succesful deletion of the composed object. 119 | if len(bucket.blobs) != 3: 120 | self.fail( 121 | f"{tc['name']} expected only 3 objects in bucket, but found {len(bucket.blobs)}" 122 | ) 123 | 124 | def test_dataflux_download_threaded(self): 125 | test_cases = [ 126 | { 127 | "name": "exceed number of items", 128 | "threads": 4 129 | }, 130 | { 131 | "name": "single thread", 132 | "threads": 1 133 | }, 134 | { 135 | "name": "standard", 136 | "threads": 2 137 | }, 138 | ] 139 | bucket_name = "test_bucket" 140 | objects = [("one", 3), ("two", 3), ("three", 5)] 141 | client = fake_gcs.Client() 142 | bucket = client.bucket(bucket_name) 143 | bucket._add_file("one", bytes("one", "utf-8")) 144 | bucket._add_file("two", bytes("two", "utf-8")) 145 | bucket._add_file("three", bytes("three", "utf-8")) 146 | params = download.DataFluxDownloadOptimizationParams(32) 147 | expected_result = [b"one", b"two", b"three"] 148 | for tc in test_cases: 149 | result = download.dataflux_download_threaded( 150 | "", 151 | bucket_name, 152 | objects, 153 | client, 154 | params, 155 | tc["threads"], 156 | ) 157 | self.assertEqual(result, expected_result) 158 | # This checks for succesful deletion of the composed object. 
159 | if len(bucket.blobs) != 3: 160 | self.fail( 161 | f"{tc['name']} expected only 3 objects in bucket, but found {len(bucket.blobs)}" 162 | ) 163 | self.assertIn("dataflux", client._connection.user_agent) 164 | 165 | def test_dataflux_download_lazy(self): 166 | test_cases = [ 167 | { 168 | "desc": "Need to compose objects before downloading", 169 | "max_composite_object_size": 100, 170 | }, 171 | { 172 | "desc": "Do not need to compose objects before downloading", 173 | "max_composite_object_size": 0, 174 | }, 175 | ] 176 | 177 | for tc in test_cases: 178 | bucket_name = "test_bucket" 179 | objects = [("one", 3), ("two", 3), ("three", 5)] 180 | client = fake_gcs.Client() 181 | bucket = client.bucket(bucket_name) 182 | bucket._add_file("one", bytes("one", "utf-8")) 183 | bucket._add_file("two", bytes("two", "utf-8")) 184 | bucket._add_file("three", bytes("three", "utf-8")) 185 | params = download.DataFluxDownloadOptimizationParams( 186 | tc["max_composite_object_size"]) 187 | expected_result = [b"one", b"two", b"three"] 188 | result = download.dataflux_download_lazy("", bucket_name, objects, 189 | client, params) 190 | self.assertEqual( 191 | list(result), 192 | expected_result, 193 | f"test {tc['desc']} got {list(result)} objects, wanted {expected_result}", 194 | ) 195 | # This checks for succesful deletion of the composed object. 196 | if len(bucket.blobs) != 3: 197 | self.fail( 198 | f"test {tc['desc']} expected only 3 objects in bucket, but found {len(bucket.blobs)}" 199 | ) 200 | self.assertIn("dataflux", client._connection.user_agent) 201 | 202 | def test_clean_composed_object(self): 203 | 204 | class ComposedObj: 205 | 206 | def __init__(self): 207 | self.deleted = False 208 | 209 | def delete(self, retry=None): 210 | self.deleted = True 211 | 212 | current_composed_object = ComposedObj() 213 | download.clean_composed_object(current_composed_object) 214 | if not current_composed_object.deleted: 215 | self.fail("expected composed object cleanup: True, got False") 216 | 217 | 218 | if __name__ == "__main__": 219 | unittest.main() 220 | -------------------------------------------------------------------------------- /dataflux_core/tests/test_fake_gcs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import io 18 | import unittest 19 | 20 | from dataflux_core.tests import fake_gcs 21 | 22 | 23 | class FakeGCSTest(unittest.TestCase): 24 | 25 | def test_list_blobs_empty(self): 26 | bucket = fake_gcs.Client().bucket("test-bucket") 27 | self.assertFalse(bucket.list_blobs()) 28 | 29 | def test_list_blobs_all(self): 30 | bucket = fake_gcs.Client().bucket("test-bucket") 31 | bucket._add_file("obj1", "a") 32 | bucket._add_file("obj2", "aa") 33 | want_objects = [ 34 | bucket.blobs["obj1"], 35 | bucket.blobs["obj2"], 36 | ] 37 | self.assertEqual(bucket.list_blobs(), want_objects) 38 | 39 | def test_list_blobs_with_start_range_equal(self): 40 | bucket = fake_gcs.Client().bucket("test-bucket") 41 | bucket._add_file("obj1", "a") 42 | bucket._add_file("obj2", "aa") 43 | want_objects = [bucket.blobs["obj1"], bucket.blobs["obj2"]] 44 | self.assertEqual(bucket.list_blobs(start_offset=want_objects[0].name), 45 | want_objects) 46 | 47 | def test_list_blobs_with_end_range_equal(self): 48 | bucket = fake_gcs.Client().bucket("test-bucket") 49 | bucket._add_file("obj1", "a") 50 | bucket._add_file("obj2", "aa") 51 | all_objects = [bucket.blobs["obj1"], bucket.blobs["obj2"]] 52 | want_objects = [all_objects[0]] 53 | self.assertEqual(bucket.list_blobs(end_offset=all_objects[1].name), 54 | want_objects) 55 | 56 | def test_list_blobs_with_start_range_greater(self): 57 | bucket = fake_gcs.Client().bucket("test-bucket") 58 | bucket._add_file("obj1", "a") 59 | bucket._add_file("obj2", "aa") 60 | all_objects = [bucket.blobs["obj1"], bucket.blobs["obj2"]] 61 | want_objects = [all_objects[1]] 62 | self.assertEqual(bucket.list_blobs(start_offset=all_objects[1].name), 63 | want_objects) 64 | 65 | def test_list_blobs_with_range(self): 66 | bucket = fake_gcs.Client().bucket("test-bucket") 67 | bucket._add_file("obj1", "a") 68 | bucket._add_file("obj2", "aa") 69 | bucket._add_file("obj3", "aaa") 70 | all_objects = [ 71 | bucket.blobs["obj1"], bucket.blobs["obj2"], bucket.blobs["obj3"] 72 | ] 73 | want_objects = [all_objects[1]] 74 | self.assertEqual( 75 | bucket.list_blobs(start_offset=all_objects[1].name, 76 | end_offset=all_objects[2].name), 77 | want_objects, 78 | ) 79 | 80 | def test_list_blobs_with_max_results(self): 81 | bucket = fake_gcs.Client().bucket("test-bucket") 82 | bucket._add_file("obj1", "a") 83 | bucket._add_file("obj2", "aa") 84 | bucket._add_file("obj3", "aaa") 85 | all_objects = [ 86 | bucket.blobs["obj1"], bucket.blobs["obj2"], bucket.blobs["obj3"] 87 | ] 88 | want_objects = [all_objects[0]] 89 | self.assertEqual(bucket.list_blobs(max_results=1), want_objects) 90 | 91 | def test_list_blobs_with_max_results_and_range(self): 92 | bucket = fake_gcs.Client().bucket("test-bucket") 93 | bucket._add_file("obj1", "a") 94 | bucket._add_file("obj2", "aa") 95 | bucket._add_file("obj3", "aaa") 96 | bucket._add_file("obj4", "aaaa") 97 | all_objects = [ 98 | bucket.blobs["obj1"], 99 | bucket.blobs["obj2"], 100 | bucket.blobs["obj3"], 101 | bucket.blobs["obj4"], 102 | ] 103 | want_objects = [all_objects[1], all_objects[2]] 104 | self.assertEqual( 105 | bucket.list_blobs( 106 | max_results=2, 107 | start_offset=all_objects[1].name, 108 | end_offset=all_objects[3].name, 109 | ), 110 | want_objects, 111 | ) 112 | 113 | def test_bucket_name_none_raises_error(self): 114 | try: 115 | fake_gcs.Client().bucket(None) 116 | except: 117 | return 118 | self.fail("Creating bucket with None name did not raise error") 119 | 120 | def test_blob_write(self): 121 | want_obj = "test" 122 | obj_bytes = 
str.encode(want_obj) 123 | bucket = fake_gcs.Bucket("test-bucket") 124 | blob = bucket.blob(want_obj) 125 | writer = fake_gcs.FakeBlobWriter(blob) 126 | writer.write(obj_bytes) 127 | self.assertEqual(blob.content, b'' + obj_bytes) 128 | 129 | def test_blob_read(self): 130 | bucket = fake_gcs.Bucket("test-bucket") 131 | blob = bucket.blob("test") 132 | self.assertIsInstance(blob.open("rb"), io.BytesIO) 133 | 134 | def test_blob_writer(self): 135 | bucket = fake_gcs.Bucket("test-bucket") 136 | blob = bucket.blob("test") 137 | self.assertIsInstance(blob.open("wb"), fake_gcs.FakeBlobWriter) 138 | 139 | def test_permissions(self): 140 | test_bucket = "test-bucket" 141 | test_perm = ["test-perm-1", "test-perm-3"] 142 | client = fake_gcs.Client() 143 | bucket = client.bucket(test_bucket) 144 | client._set_perm(["test-perm-1", "test-perm-2", "test-perm-3"], 145 | test_bucket) 146 | got_perm = bucket.test_iam_permissions(test_perm) 147 | self.assertEqual(got_perm, test_perm) 148 | 149 | def test_no_permissions(self): 150 | test_bucket = "test-bucket" 151 | test_perm = ["test-perm-1", "test-perm-3"] 152 | client = fake_gcs.Client() 153 | bucket = client.bucket(test_bucket) 154 | got_perm = bucket.test_iam_permissions(test_perm) 155 | self.assertEqual(got_perm, []) 156 | 157 | def test_download_to_file(self): 158 | bucket = fake_gcs.Client().bucket("test-bucket") 159 | name = "obj1" 160 | contents = b"aaaa" 161 | bucket._add_file(name, contents) 162 | 163 | stream = io.BytesIO() 164 | bucket.blob(name).download_to_file(stream) 165 | stream.seek(0) 166 | self.assertEqual(stream.read(), contents) 167 | 168 | 169 | if __name__ == "__main__": 170 | unittest.main() 171 | -------------------------------------------------------------------------------- /dataflux_core/tests/test_fast_list.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import queue 18 | import time 19 | import unittest 20 | 21 | from dataflux_core import fast_list 22 | from dataflux_core.tests import fake_gcs, fake_multiprocess 23 | 24 | 25 | class FastListTest(unittest.TestCase): 26 | 27 | def test_single_worker(self): 28 | """End to end test of a single ListWorker.""" 29 | test_cases = [ 30 | { 31 | "desc": "List 10k objects default", 32 | "object_count": 10000, 33 | "compose_obj_count": 1, 34 | "prefix_obj_count": 0, 35 | "archive_obj_count": 0, 36 | "prefix": "", 37 | "object_size": 10, 38 | "directory_obj_count": 10, 39 | "skip_compose": True, 40 | "list_directory_objects": False, 41 | "expected_objects": 10000, 42 | "expected_api_calls": 3, 43 | }, 44 | { 45 | "desc": "List 10k objects including compose", 46 | "object_count": 10000, 47 | "compose_obj_count": 1, 48 | "prefix_obj_count": 0, 49 | "archive_obj_count": 0, 50 | "prefix": "", 51 | "object_size": 10, 52 | "directory_obj_count": 0, 53 | "skip_compose": False, 54 | "list_directory_objects": False, 55 | "expected_objects": 10001, 56 | "expected_api_calls": 3, 57 | }, 58 | { 59 | "desc": "List 5k objects excluding compose", 60 | "object_count": 5000, 61 | "compose_obj_count": 5000, 62 | "prefix_obj_count": 0, 63 | "archive_obj_count": 0, 64 | "prefix": "", 65 | "object_size": 10, 66 | "directory_obj_count": 0, 67 | "skip_compose": True, 68 | "list_directory_objects": False, 69 | "expected_objects": 5000, 70 | "expected_api_calls": 3, 71 | }, 72 | { 73 | "desc": "List 2k objects, prefix only", 74 | "object_count": 5000, 75 | "compose_obj_count": 5000, 76 | "prefix_obj_count": 2000, 77 | "archive_obj_count": 0, 78 | "prefix": "test-prefix/", 79 | "object_size": 10, 80 | "directory_obj_count": 0, 81 | "skip_compose": True, 82 | "list_directory_objects": False, 83 | "expected_objects": 2000, 84 | "expected_api_calls": 1, 85 | }, 86 | { 87 | "desc": "List directory objects", 88 | "object_count": 10000, 89 | "compose_obj_count": 0, 90 | "prefix_obj_count": 0, 91 | "archive_obj_count": 0, 92 | "prefix": "", 93 | "object_size": 10, 94 | "directory_obj_count": 10, 95 | "skip_compose": True, 96 | "list_directory_objects": True, 97 | "expected_objects": 10010, 98 | "expected_api_calls": 3, 99 | }, 100 | { 101 | "desc": "Skip non-standard class", 102 | "object_count": 10000, 103 | "compose_obj_count": 0, 104 | "prefix_obj_count": 0, 105 | "archive_obj_count": 1000, 106 | "prefix": "", 107 | "object_size": 10, 108 | "directory_obj_count": 0, 109 | "skip_compose": True, 110 | "list_directory_objects": True, 111 | "expected_objects": 10000, 112 | "expected_api_calls": 3, 113 | }, 114 | ] 115 | for tc in test_cases: 116 | client = fake_gcs.Client() 117 | bucket_name = "test_bucket" 118 | bucket = client.bucket(bucket_name) 119 | object_count = tc["object_count"] 120 | object_size = tc["object_size"] 121 | results_queue = queue.Queue() 122 | metadata_queue = queue.Queue() 123 | work_queue = queue.Queue() 124 | work_queue.put((None, "")) 125 | 126 | for i in range(object_count): 127 | bucket._add_file(str(i), b"a" * object_size) 128 | # Add one composed object to make sure it is skipped. 
129 | for i in range(tc["compose_obj_count"]): 130 | bucket._add_file(f"dataflux-composed-objects/composed{i}.tar", 131 | b"a" * object_size) 132 | for i in range(tc["prefix_obj_count"]): 133 | bucket._add_file(f"{tc['prefix']}file{i}.txt", 134 | b"a" * object_size) 135 | for i in range(tc["directory_obj_count"]): 136 | bucket._add_file(f"{tc['prefix']}/dir{i}/", b"") 137 | for i in range(tc["archive_obj_count"]): 138 | bucket._add_file(f"archive_{i}", 139 | b"a" * object_size, 140 | storage_class="ARCHIVE") 141 | list_worker = fast_list.ListWorker( 142 | "test_worker", 143 | "", 144 | bucket_name, 145 | queue.Queue(), 146 | queue.Queue(), 147 | work_queue, 148 | queue.Queue(), 149 | queue.Queue(), 150 | results_queue, 151 | metadata_queue, 152 | queue.Queue(), 153 | "", 154 | "", 155 | skip_compose=tc["skip_compose"], 156 | list_directory_objects=tc["list_directory_objects"], 157 | prefix=tc["prefix"], 158 | ) 159 | list_worker.client = client 160 | list_worker.run() 161 | got_results = set() 162 | while True: 163 | try: 164 | new_results = results_queue.get_nowait() 165 | got_results.update(new_results) 166 | except queue.Empty: 167 | break 168 | expected_objects = tc["expected_objects"] 169 | if len(got_results) != expected_objects: 170 | self.fail( 171 | f"got {len(got_results)} results, want {expected_objects}") 172 | got_total_size = 0 173 | for result in got_results: 174 | got_total_size += result[1] 175 | want_total_size = ( 176 | expected_objects - 177 | (tc["directory_obj_count"] 178 | if tc["list_directory_objects"] else 0)) * object_size 179 | if got_total_size != want_total_size: 180 | self.fail( 181 | f"got {got_total_size} total size, want {want_total_size}") 182 | if list_worker.api_call_count != tc["expected_api_calls"]: 183 | self.fail(f"got {list_worker.api_call_count} API calls on test {tc['desc']}") 184 | self.assertIn("dataflux", client._connection.user_agent) 185 | 186 | def test_manage_tracking_queues(self): 187 | """Tests that all tracking queues are pushed to properly.""" 188 | controller = fast_list.ListingController(10, "", "") 189 | idle_queue = queue.Queue() 190 | idle_queue.put("one") 191 | idle_queue.put("two") 192 | idle_queue.put("three") 193 | unidle_queue = queue.Queue() 194 | unidle_queue.put("one") 195 | hb_queue = queue.Queue() 196 | hb_queue.put("four") 197 | controller.manage_tracking_queues(idle_queue, unidle_queue, hb_queue) 198 | if controller.waiting_for_work != 2: 199 | self.fail( 200 | f"got {controller.waiting_for_work} workers waiting, want 2") 201 | if "four" not in controller.inited: 202 | self.fail( 203 | "expected inited worker to be tracked, but was not added to inited" 204 | ) 205 | if "four" not in controller.checkins: 206 | self.fail( 207 | "expected hb_queue entry to be tracked in checkins, but was not found" 208 | ) 209 | 210 | def test_check_crashed_processes(self): 211 | """Tests that crashed processes are correctly discovered and mitigated.""" 212 | controller = fast_list.ListingController(10, "", "") 213 | controller.inited.add("one") 214 | controller.checkins["one"] = time.time() 215 | if controller.check_crashed_processes(): 216 | self.fail( 217 | "expected no crashed processes, but found crashed process") 218 | controller.checkins["one"] = time.time() - 100 219 | if not controller.check_crashed_processes(): 220 | self.fail( 221 | "expected crashed process to be detected, but found no crashed processes" 222 | ) 223 | 224 | def test_check_crashed_processes_follow_retry_timeout(self): 225 | """Tests that processes aren't considered to be
crashed while waiting to retry API calls""" 226 | controller = fast_list.ListingController( 227 | 10, 228 | "", 229 | "", 230 | retry_config=fast_list.MODIFIED_RETRY.with_delay(maximum=90)) 231 | controller.inited.add("one") 232 | controller.checkins["one"] = time.time() - 170 233 | if controller.check_crashed_processes(): 234 | self.fail( 235 | "expected no crashed processes, but found crashed process") 236 | controller.checkins["one"] = time.time() - 190 237 | if not controller.check_crashed_processes(): 238 | self.fail( 239 | "expected crashed process to be detected, but found no crashed processes" 240 | ) 241 | 242 | def test_cleanup_processes(self): 243 | """Tests that all processes are cleaned up at the end of execution.""" 244 | controller = fast_list.ListingController(10, "", "", True) 245 | procs = [] 246 | results_queue = queue.Queue() 247 | metadata_queue = queue.Queue() 248 | set1 = set() 249 | set2 = set() 250 | set1.add(("item", 1)) 251 | set2.add(("item2", 2)) 252 | results_queue.put(set1) 253 | results_queue.put(set2) 254 | results_set = set() 255 | for i in range(5): 256 | procs.append(fake_multiprocess.FakeProcess(f"proc{i}", False)) 257 | results = controller.cleanup_processes(procs, results_queue, 258 | metadata_queue, results_set) 259 | if results: 260 | self.fail("received results when no processes were alive") 261 | procs = [] 262 | expected = [("item", 1), ("item2", 2)] 263 | for i in range(5): 264 | procs.append(fake_multiprocess.FakeProcess(f"proc{i}", True)) 265 | results = controller.cleanup_processes(procs, results_queue, 266 | metadata_queue, results_set) 267 | self.assertEqual(results, expected) 268 | 269 | def test_terminate_now(self): 270 | controller = fast_list.ListingController(10, "", "", True) 271 | procs = [] 272 | term_tracker = [] 273 | proc_count = 5 274 | for i in range(proc_count): 275 | procs.append( 276 | fake_multiprocess.FakeProcess(f"proc{i}", False, term_tracker)) 277 | 278 | with self.assertRaises(RuntimeError): 279 | controller.terminate_now(procs) 280 | 281 | self.assertEqual(proc_count, len(term_tracker)) 282 | 283 | def test_list_controller_e2e(self): 284 | """Full end to end test of the fast list operation with one worker.""" 285 | client = fake_gcs.Client() 286 | bucket_name = "test_bucket" 287 | bucket = client.bucket(bucket_name) 288 | object_count = 1000 289 | object_size = 10 290 | for i in range(object_count): 291 | bucket._add_file(str(i), "aaaaaaaaaa") 292 | controller = fast_list.ListingController(1, "", bucket_name, True) 293 | controller.client = client 294 | results = controller.run() 295 | if len(results) != object_count: 296 | self.fail(f"got {len(results)} results, want {object_count}") 297 | got_total_size = 0 298 | for result in results: 299 | got_total_size += result[1] 300 | if got_total_size != object_count * object_size: 301 | self.fail( 302 | f"got {got_total_size} total size, want {object_count * object_size}" 303 | ) 304 | 305 | def test_list_controller_e2e_error(self): 306 | """Full end to end test of the fast list operation with one worker which exits with an error.""" 307 | client = fake_gcs.Client() 308 | controller = fast_list.ListingController(1, "", "", True) 309 | controller.client = client 310 | try: 311 | results = controller.run() 312 | except: 313 | return 314 | self.fail( 315 | "Expected controller to raise an error when child process raises an error but it did not" 316 | ) 317 | 318 | def test_wait_for_work_success(self): 319 | """Tests waiting for work when there is still work remaining.""" 320 |
client = fake_gcs.Client() 321 | worker_name = "test_worker" 322 | bucket_name = "test_bucket" 323 | send_work_needed_queue = queue.Queue() 324 | hb_queue = queue.Queue() 325 | direct_work_queue = queue.Queue() 326 | idle_queue = queue.Queue() 327 | unidle_queue = queue.Queue() 328 | results_queue = queue.Queue() 329 | metadata_queue = queue.Queue() 330 | direct_work_queue.put(("y", "z")) 331 | 332 | list_worker = fast_list.ListWorker( 333 | worker_name, 334 | "", 335 | bucket_name, 336 | send_work_needed_queue, 337 | hb_queue, 338 | direct_work_queue, 339 | idle_queue, 340 | unidle_queue, 341 | results_queue, 342 | metadata_queue, 343 | queue.Queue(), 344 | "", 345 | "", 346 | ) 347 | list_worker.client = client 348 | result = list_worker.wait_for_work() 349 | if not result: 350 | self.fail(f"got {result}, but expected True") 351 | self.assertEqual(send_work_needed_queue.get_nowait(), worker_name) 352 | self.assertEqual(idle_queue.get_nowait(), worker_name) 353 | self.assertEqual(hb_queue.get_nowait(), worker_name) 354 | self.assertEqual(unidle_queue.get_nowait(), worker_name) 355 | self.assertEqual(list_worker.start_range, "y") 356 | self.assertEqual(list_worker.end_range, "z") 357 | 358 | def test_wait_for_work_shutdown(self): 359 | """Tests that waiting for work correctly detects shutdown signal.""" 360 | client = fake_gcs.Client() 361 | worker_name = "test_worker" 362 | bucket_name = "test_bucket" 363 | send_work_needed_queue = queue.Queue() 364 | hb_queue = queue.Queue() 365 | direct_work_queue = queue.Queue() 366 | idle_queue = queue.Queue() 367 | unidle_queue = queue.Queue() 368 | results_queue = queue.Queue() 369 | metadata_queue = queue.Queue() 370 | direct_work_queue.put((None, None)) 371 | 372 | list_worker = fast_list.ListWorker( 373 | worker_name, 374 | "", 375 | bucket_name, 376 | send_work_needed_queue, 377 | hb_queue, 378 | direct_work_queue, 379 | idle_queue, 380 | unidle_queue, 381 | results_queue, 382 | metadata_queue, 383 | queue.Queue(), 384 | "", 385 | "", 386 | ) 387 | list_worker.client = client 388 | result = list_worker.wait_for_work() 389 | if result: 390 | self.fail(f"got {result}, but expected False") 391 | self.assertEqual(send_work_needed_queue.get_nowait(), worker_name) 392 | self.assertEqual(idle_queue.get_nowait(), worker_name) 393 | self.assertEqual(hb_queue.get_nowait(), worker_name) 394 | self.assertRaises(queue.Empty, unidle_queue.get_nowait) 395 | 396 | def test_fast_list_exits_on_error(self): 397 | """Test of a single ListWorker with an error.""" 398 | client = fake_gcs.Client() 399 | bucket_name = None 400 | results_queue = queue.Queue() 401 | metadata_queue = queue.Queue() 402 | work_queue = queue.Queue() 403 | work_queue.put((None, "")) 404 | 405 | list_worker = fast_list.ListWorker( 406 | "test_worker", 407 | "", 408 | bucket_name, 409 | queue.Queue(), 410 | queue.Queue(), 411 | work_queue, 412 | queue.Queue(), 413 | queue.Queue(), 414 | results_queue, 415 | metadata_queue, 416 | queue.Queue(), 417 | "", 418 | "", 419 | ) 420 | list_worker.client = client 421 | list_worker.run() 422 | got_results = set() 423 | while True: 424 | try: 425 | new_results = results_queue.get_nowait() 426 | got_results.update(new_results) 427 | except queue.Empty: 428 | break 429 | if len(got_results) != 0: 430 | self.fail(f"got {len(got_results)} results, want 0") 431 | 432 | 433 | if __name__ == "__main__": 434 | unittest.main() 435 | -------------------------------------------------------------------------------- /dataflux_core/tests/test_range_splitter.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import unittest 18 | 19 | from dataflux_core import range_splitter 20 | 21 | 22 | class RangeSplitterTest(unittest.TestCase): 23 | 24 | def test_range_splits(self): 25 | test_cases = [ 26 | { 27 | "desc": "less than one split", 28 | "start": "1", 29 | "end": "2", 30 | "splits": 0, 31 | "expected_error": ValueError, 32 | }, 33 | { 34 | "desc": "end smaller than start range", 35 | "start": "456", 36 | "end": "123", 37 | "splits": 1, 38 | "expected_error": None, 39 | "result": [], 40 | }, 41 | { 42 | "desc": "start and end equal after padding", 43 | "start": "9", 44 | "end": "90", 45 | "splits": 100, 46 | "expected_error": None, 47 | "result": [], 48 | }, 49 | { 50 | "desc": "tight range split", 51 | "start": "199999", 52 | "end": "2", 53 | "splits": 1, 54 | "expected_error": None, 55 | "result": ["1999995"], 56 | }, 57 | { 58 | "desc": 59 | "split full namespace", 60 | "start": 61 | "", 62 | "end": 63 | "", 64 | "splits": 65 | 24, 66 | "expected_error": 67 | None, 68 | "result": [ 69 | "03", 70 | "07", 71 | "11", 72 | "15", 73 | "19", 74 | "23", 75 | "27", 76 | "31", 77 | "35", 78 | "39", 79 | "43", 80 | "47", 81 | "51", 82 | "55", 83 | "59", 84 | "63", 85 | "67", 86 | "71", 87 | "75", 88 | "79", 89 | "83", 90 | "87", 91 | "91", 92 | "95", 93 | ], 94 | }, 95 | { 96 | "desc": "split with only start range", 97 | "start": "5555", 98 | "end": "", 99 | "splits": 4, 100 | "expected_error": None, 101 | "result": ["63", "72", "81", "90"], 102 | }, 103 | { 104 | "desc": "large gap small number of splits", 105 | "start": "0", 106 | "end": "9", 107 | "splits": 3, 108 | "expected_error": None, 109 | "result": ["2", "4", "6"], 110 | }, 111 | { 112 | "desc": "split with longer prefix", 113 | "start": "0123455111", 114 | "end": "012347", 115 | "splits": 1, 116 | "expected_error": None, 117 | "result": ["012346"], 118 | }, 119 | { 120 | "desc": "split with only end range", 121 | "start": "", 122 | "end": "9", 123 | "splits": 1, 124 | "expected_error": None, 125 | "result": ["4"], 126 | }, 127 | ] 128 | rs = range_splitter.new_rangesplitter("0123456789") 129 | for tc in test_cases: 130 | try: 131 | result = rs.split_range(tc["start"], tc["end"], tc["splits"]) 132 | self.assertEqual(result, tc["result"]) 133 | except tc["expected_error"]: 134 | pass 135 | 136 | def test_add_characters_to_alphabet(self): 137 | test_cases = [ 138 | { 139 | "desc": "empty strings", 140 | "chars": "", 141 | "expected_alphabet_map": { 142 | "7": 0, 143 | "8": 1, 144 | "9": 2 145 | }, 146 | }, 147 | { 148 | "desc": "no new characters", 149 | "chars": "998", 150 | "expected_alphabet_map": { 151 | "7": 0, 152 | "8": 1, 153 | "9": 2 154 | }, 155 | }, 156 | { 157 | "desc": "new characters", 158 | "chars": "102", 159 | "expected_alphabet_map": { 160 | "0": 0, 161 | "1": 1, 162 | "2": 2, 163 | "7": 3, 164 | "8": 4, 165 | "9": 5, 166 | }, 167 
| }, 168 | ] 169 | rs = range_splitter.new_rangesplitter("789") 170 | 171 | for tc in test_cases: 172 | rs.add_characters_to_alphabet(tc["chars"]) 173 | self.assertEqual(rs.alphabet_map, tc["expected_alphabet_map"], 174 | tc["desc"]) 175 | 176 | def test_int_to_string(self): 177 | test_cases = [ 178 | { 179 | "desc": "get a string", 180 | "split_point": 15, 181 | "string_len": 3, 182 | "result": "023", 183 | }, 184 | { 185 | "desc": "max number", 186 | "split_point": 215, 187 | "string_len": 3, 188 | "result": "BBB", 189 | }, 190 | { 191 | "desc": "larger than max number", 192 | "split_point": 220, 193 | "string_len": 3, 194 | "result": "00A", 195 | }, 196 | ] 197 | rs = range_splitter.new_rangesplitter("0123AB") 198 | for tc in test_cases: 199 | result = rs.int_to_string(tc["split_point"], tc["string_len"]) 200 | self.assertEqual(result, tc["result"], tc["desc"]) 201 | 202 | def test_int_to_string_empty_range(self): 203 | test_cases = [ 204 | { 205 | "desc": "get a string", 206 | "split_point": 9, 207 | "string_len": 3, 208 | "result": "", 209 | "expected_error": ValueError, 210 | }, 211 | ] 212 | for tc in test_cases: 213 | try: 214 | rs = range_splitter.new_rangesplitter("") 215 | result = rs.int_to_string(tc["split_point"], tc["string_len"]) 216 | self.assertEqual(result, tc["result"], tc["desc"]) 217 | except tc["expected_error"]: 218 | pass 219 | 220 | def test_get_char_or_default(self): 221 | test_cases = [ 222 | { 223 | "desc": "index larger than character string", 224 | "characters": "15", 225 | "index": 3, 226 | "default_char": "0", 227 | "result": "0", 228 | }, 229 | { 230 | "desc": "index in string length", 231 | "characters": "15ABC", 232 | "index": 2, 233 | "default_char": "0", 234 | "result": "A", 235 | }, 236 | { 237 | "desc": "index less than 0", 238 | "characters": "15ABC", 239 | "index": -3, 240 | "default_char": "0", 241 | "result": "0", 242 | }, 243 | { 244 | "desc": "empty character", 245 | "characters": "", 246 | "index": 1, 247 | "default_char": "0", 248 | "result": "0", 249 | }, 250 | ] 251 | for tc in test_cases: 252 | result = range_splitter.get_char_or_default( 253 | tc["characters"], tc["index"], tc["default_char"]) 254 | self.assertEqual(result, tc["result"], tc["desc"]) 255 | 256 | def test_is_range_equal_with_padding(self): 257 | test_cases = [ 258 | { 259 | "desc": "start and end range with padding are equal", 260 | "start": "15", 261 | "end": "1500", 262 | "result": True, 263 | }, 264 | { 265 | "desc": "start and end range with padding are not equal", 266 | "start": "15", 267 | "end": "150A", 268 | "result": False, 269 | }, 270 | { 271 | "desc": "end range is empty", 272 | "start": "15", 273 | "end": "", 274 | "result": False, 275 | }, 276 | { 277 | "desc": "start range is empty", 278 | "start": "", 279 | "end": "09", 280 | "result": False, 281 | }, 282 | { 283 | "desc": "start range is empty", 284 | "start": "", 285 | "end": "0", 286 | "result": True, 287 | }, 288 | { 289 | "desc": "start and end range are empty", 290 | "start": "", 291 | "end": "", 292 | "result": False, 293 | }, 294 | { 295 | "desc": "start and end range are not equal", 296 | "start": "21", 297 | "end": "12", 298 | "result": False, 299 | }, 300 | { 301 | "desc": "start and end range are equal", 302 | "start": "21", 303 | "end": "21", 304 | "result": True, 305 | }, 306 | ] 307 | rs = range_splitter.new_rangesplitter("01A") 308 | for tc in test_cases: 309 | result = rs.is_range_equal_with_padding(tc["start"], tc["end"]) 310 | self.assertEqual(result, tc["result"], tc["desc"]) 311 | 312 |
def test_string_to_minimal_int_range(self): 313 | test_cases = [ 314 | { 315 | "desc": 316 | "split numbers", 317 | "start": 318 | "00", 319 | "end": 320 | "20", 321 | "splits": 322 | 3, 323 | "result": 324 | range_splitter.MinimalIntRange(start_int=0, 325 | end_int=20, 326 | min_len=2), 327 | }, 328 | { 329 | "desc": 330 | "start is non-zero", 331 | "start": 332 | "06", 333 | "end": 334 | "201", 335 | "splits": 336 | 4, 337 | "result": 338 | range_splitter.MinimalIntRange(start_int=6, 339 | end_int=20, 340 | min_len=2), 341 | }, 342 | { 343 | "desc": 344 | "start with smaller suffix", 345 | "start": 346 | "091", 347 | "end": 348 | "10", 349 | "splits": 350 | 2, 351 | "result": 352 | range_splitter.MinimalIntRange(start_int=91, 353 | end_int=100, 354 | min_len=3), 355 | }, 356 | { 357 | "desc": 358 | "start is empty", 359 | "start": 360 | "", 361 | "end": 362 | "10", 363 | "splits": 364 | 2, 365 | "result": 366 | range_splitter.MinimalIntRange(start_int=0, 367 | end_int=10, 368 | min_len=2), 369 | }, 370 | { 371 | "desc": 372 | "start and end are empty", 373 | "start": 374 | "", 375 | "end": 376 | "", 377 | "splits": 378 | 24, 379 | "result": 380 | range_splitter.MinimalIntRange(start_int=0, 381 | end_int=99, 382 | min_len=2), 383 | }, 384 | { 385 | "desc": 386 | "end is empty", 387 | "start": 388 | "5555", 389 | "end": 390 | "", 391 | "splits": 392 | 4, 393 | "result": 394 | range_splitter.MinimalIntRange(start_int=55, 395 | end_int=99, 396 | min_len=2), 397 | }, 398 | { 399 | "desc": 400 | "tight range split", 401 | "start": 402 | "199999", 403 | "end": 404 | "2", 405 | "splits": 406 | 1, 407 | "result": 408 | range_splitter.MinimalIntRange(start_int=1999990, 409 | end_int=2000000, 410 | min_len=7), 411 | }, 412 | { 413 | "desc": 414 | "tight range split", 415 | "start": 416 | "8100", 417 | "end": 418 | "9100", 419 | "splits": 420 | 3, 421 | "result": 422 | range_splitter.MinimalIntRange(start_int=81, 423 | end_int=91, 424 | min_len=2), 425 | }, 426 | ] 427 | rs = range_splitter.new_rangesplitter("0123456789") 428 | for tc in test_cases: 429 | result = rs.string_to_minimal_int_range(tc["start"], tc["end"], 430 | tc["splits"]) 431 | self.assertEqual(result, tc["result"], tc["desc"]) 432 | 433 | def test_generate_splits(self): 434 | test_cases = [ 435 | { 436 | "desc": "less than one split", 437 | "start": "1", 438 | "end": "2", 439 | "splits": 0, 440 | "result": [], 441 | }, 442 | { 443 | "desc": "tight range split", 444 | "start": "199999", 445 | "end": "2", 446 | "splits": 1, 447 | "result": ["1999995"], 448 | }, 449 | { 450 | "desc": 451 | "split full namespace", 452 | "start": 453 | "", 454 | "end": 455 | "", 456 | "splits": 457 | 24, 458 | "result": [ 459 | "03", 460 | "07", 461 | "11", 462 | "15", 463 | "19", 464 | "23", 465 | "27", 466 | "31", 467 | "35", 468 | "39", 469 | "43", 470 | "47", 471 | "51", 472 | "55", 473 | "59", 474 | "63", 475 | "67", 476 | "71", 477 | "75", 478 | "79", 479 | "83", 480 | "87", 481 | "91", 482 | "95", 483 | ], 484 | }, 485 | { 486 | "desc": "split with only start range", 487 | "start": "5555", 488 | "end": "", 489 | "splits": 4, 490 | "result": ["63", "72", "81", "90"], 491 | }, 492 | { 493 | "desc": "large gap small number of splits", 494 | "start": "0", 495 | "end": "9", 496 | "splits": 3, 497 | "result": ["2", "4", "6"], 498 | }, 499 | { 500 | "desc": "split with longer prefix", 501 | "start": "0123455111", 502 | "end": "012347", 503 | "splits": 1, 504 | "result": ["012346"], 505 | }, 506 | { 507 | "desc": "split with only end range", 508 | 
"start": "", 509 | "end": "9", 510 | "splits": 1, 511 | "result": ["4"], 512 | }, 513 | { 514 | "desc": "tight range split", 515 | "start": "8100", 516 | "end": "9100", 517 | "splits": 3, 518 | "result": ["83", "86", "88"], 519 | }, 520 | ] 521 | rs = range_splitter.new_rangesplitter("0123456789") 522 | for tc in test_cases: 523 | min_int_range = rs.string_to_minimal_int_range( 524 | tc["start"], tc["end"], tc["splits"]) 525 | opts = range_splitter.GenerateSplitsOpts(min_int_range, 526 | tc["splits"], tc["start"], 527 | tc["end"]) 528 | result = rs.generate_splits(opts) 529 | self.assertEqual(result, tc["result"], tc["desc"]) 530 | 531 | 532 | if __name__ == "__main__": 533 | unittest.main() 534 | -------------------------------------------------------------------------------- /dataflux_core/tests/test_user_agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import unittest 18 | 19 | from google.api_core.client_info import ClientInfo 20 | from google.cloud import storage 21 | 22 | from dataflux_core import user_agent 23 | 24 | 25 | class UserAgentTest(unittest.TestCase): 26 | 27 | def test_no_existing_info(self): 28 | client = storage.Client() 29 | user_agent.add_dataflux_user_agent(client) 30 | self.assertTrue(client._connection.user_agent.startswith("dataflux")) 31 | 32 | def test_no_existing_string(self): 33 | client = storage.Client(client_info=ClientInfo()) 34 | user_agent.add_dataflux_user_agent(client) 35 | self.assertTrue(client._connection.user_agent.startswith("dataflux")) 36 | 37 | def test_with_existing_string(self): 38 | existing_user_agent = "existing user agent" 39 | client = storage.Client(client_info=ClientInfo( 40 | user_agent=existing_user_agent)) 41 | user_agent.add_dataflux_user_agent(client) 42 | self.assertTrue(client._connection.user_agent.startswith("dataflux")) 43 | self.assertIn(existing_user_agent, 44 | client._connection._client_info.user_agent) 45 | -------------------------------------------------------------------------------- /dataflux_core/user_agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | from google.api_core.client_info import ClientInfo 18 | from google.cloud import storage 19 | 20 | user_agent_string = "dataflux/1.0" 21 | 22 | 23 | def add_dataflux_user_agent(storage_client: storage.Client): 24 | if not storage_client._connection: 25 | return 26 | if not storage_client._connection._client_info: 27 | storage_client._connection._client_info = ClientInfo( 28 | user_agent=user_agent_string) 29 | elif not storage_client._connection._client_info.user_agent: 30 | storage_client._connection._client_info.user_agent = user_agent_string 31 | elif user_agent_string not in storage_client._connection._client_info.user_agent: 32 | storage_client._connection._client_info.user_agent = user_agent_string + \ 33 | " " + storage_client._connection._client_info.user_agent 34 | -------------------------------------------------------------------------------- /docs/code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of 9 | experience, education, socio-economic status, nationality, personal appearance, 10 | race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or reject 41 | comments, commits, code, wiki edits, issues, and other contributions that are 42 | not aligned to this Code of Conduct, or to ban temporarily or permanently any 43 | contributor for other behaviors that they deem inappropriate, threatening, 44 | offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 
54 | 55 | This Code of Conduct also applies outside the project spaces when the Project 56 | Steward has a reasonable belief that an individual's behavior may have a 57 | negative impact on the project or its community. 58 | 59 | ## Conflict Resolution 60 | 61 | We do not believe that all conflict is bad; healthy debate and disagreement 62 | often yield positive results. However, it is never okay to be disrespectful or 63 | to engage in behavior that violates the project’s code of conduct. 64 | 65 | If you see someone violating the code of conduct, you are encouraged to address 66 | the behavior directly with those involved. Many issues can be resolved quickly 67 | and easily, and this gives people more control over the outcome of their 68 | dispute. If you are unable to resolve the matter for any reason, or if the 69 | behavior is threatening or harassing, report it. We are dedicated to providing 70 | an environment where participants feel welcome and safe. 71 | 72 | Reports should be directed to dataflux-customer-support@google.com, the 73 | Project Steward(s) for Dataflux. It is the Project Steward’s duty to 74 | receive and address reported violations of the code of conduct. They will then 75 | work with a committee consisting of representatives from the Open Source 76 | Programs Office and the Google Open Source Strategy team. If for any reason you 77 | are uncomfortable reaching out to the Project Steward, please email 78 | opensource@google.com. 79 | 80 | We will investigate every complaint, but you may not receive a direct response. 81 | We will use our discretion in determining when and how to follow up on reported 82 | incidents, which may range from not taking action to permanent expulsion from 83 | the project and project-sponsored spaces. We will notify the accused of the 84 | report and provide them an opportunity to discuss it before any action is taken. 85 | The identity of the reporter will be omitted from the details of the report 86 | supplied to the accused. In potentially harmful situations, such as ongoing 87 | harassment or threats to anyone's safety, we may take action without notice. 88 | 89 | ## Attribution 90 | 91 | This Code of Conduct is adapted from the Contributor Covenant, version 1.4, 92 | available at 93 | https://www.contributor-covenant.org/version/1/4/code-of-conduct/ -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We would love to accept your patches and contributions to this project. 4 | 5 | ## Before you begin 6 | 7 | ### Sign our Contributor License Agreement 8 | 9 | Contributions to this project must be accompanied by a 10 | [Contributor License Agreement](https://cla.developers.google.com/about) (CLA). 11 | You (or your employer) retain the copyright to your contribution; this simply 12 | gives us permission to use and redistribute your contributions as part of the 13 | project. 14 | 15 | If you or your current employer have already signed the Google CLA (even if it 16 | was for a different project), you probably don't need to do it again. 17 | 18 | Visit <https://cla.developers.google.com/> to see your current agreements or to 19 | sign a new one. 20 | 21 | ### Review our Community Guidelines 22 | 23 | This project follows [Google's Open Source Community 24 | Guidelines](https://opensource.google/conduct/).
25 | 26 | ## Contribution process 27 | 28 | ### Code Reviews 29 | 30 | All submissions, including submissions by project members, require review. We 31 | use [GitHub pull requests](https://docs.github.com/articles/about-pull-requests) 32 | for this purpose. -------------------------------------------------------------------------------- /kokoro/build.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | 17 | # Fail on any error. 18 | set -e 19 | 20 | # Code under repo is checked out to this directory. 21 | cd "${KOKORO_ARTIFACTS_DIR}/github/dataflux-client-python" 22 | 23 | function install_requirements() { 24 | echo Installing requirements. 25 | 26 | echo Installing python3-pip. 27 | sudo apt-get -y install python3-pip 28 | 29 | echo Installing required dependencies. 30 | pip install -r requirements.txt 31 | } 32 | 33 | function run_unit_tests() { 34 | echo Running unit tests. 35 | python -m pytest dataflux_core/tests -vvv --junit-xml="${KOKORO_ARTIFACTS_DIR}/unit_tests/sponge_log.xml" --log-cli-level=DEBUG 36 | } 37 | 38 | install_requirements 39 | run_unit_tests 40 | -------------------------------------------------------------------------------- /kokoro/continuous.cfg: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http:#www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | build_file: "dataflux-client-python/kokoro/build.sh" 16 | 17 | action { 18 | define_artifacts { 19 | regex: "**/unit_tests/sponge_log.xml" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /kokoro/hourly.cfg: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http:#www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | build_file: "dataflux-client-python/kokoro/performance_one_shot.sh" 16 | 17 | env_vars { 18 | key: "PROJECT" 19 | value: "dataflux-project" 20 | } 21 | 22 | env_vars { 23 | key: "BUCKET" 24 | value: "official-dataflux-tess" 25 | } 26 | 27 | env_vars { 28 | key: "PREFIX" 29 | value: "UNet3D/micro/100KB-500MB/train" 30 | } 31 | 32 | env_vars { 33 | key: "LIST_WORKERS" 34 | value: "32" 35 | } 36 | 37 | env_vars { 38 | key: "FILE_COUNT" 39 | value: "5000" 40 | } 41 | 42 | env_vars { 43 | key: "TOTAL_FILE_SIZE" 44 | value: "501770000" 45 | } 46 | 47 | env_vars { 48 | key: "MAX_COMPOSE_BYTES" 49 | value: "100000000" 50 | } 51 | 52 | env_vars { 53 | key: "LIST_TIMEOUT" 54 | value: "30" 55 | } 56 | 57 | env_vars { 58 | key: "DOWNLOAD_TIMEOUT" 59 | value: "400" 60 | } 61 | 62 | action { 63 | define_artifacts { 64 | regex: "**/unit_tests/sponge_log.xml" 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /kokoro/nightly.cfg: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http:#www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | build_file: "dataflux-client-python/kokoro/performance_seg.sh" 16 | 17 | env_vars { 18 | key: "PROJECT" 19 | value: "dataflux-project" 20 | } 21 | 22 | env_vars { 23 | key: "BUCKET" 24 | value: "official-dataflux-tess" 25 | } 26 | 27 | env_vars { 28 | key: "PREFIX" 29 | value: "UNet3D/large/150MB-750GB/train" 30 | } 31 | 32 | env_vars { 33 | key: "LIST_WORKERS" 34 | value: "32" 35 | } 36 | 37 | env_vars { 38 | key: "FILE_COUNT" 39 | value: "5000" 40 | } 41 | 42 | env_vars { 43 | key: "TOTAL_FILE_SIZE" 44 | value: "749947535000" 45 | } 46 | 47 | env_vars { 48 | key: "MAX_COMPOSE_BYTES" 49 | value: "10" 50 | } 51 | 52 | env_vars { 53 | key: "LIST_TIMEOUT" 54 | value: "10" 55 | } 56 | 57 | env_vars { 58 | key: "DOWNLOAD_TIMEOUT" 59 | value: "1400" 60 | } 61 | 62 | env_vars { 63 | key: "PARALLELIZATION" 64 | value: "32" 65 | } 66 | 67 | action { 68 | define_artifacts { 69 | regex: "**/unit_tests/sponge_log.xml" 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /kokoro/performance_one_shot.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | 17 | # Fail on any error. 18 | set -e 19 | 20 | # Code under repo is checked out to this directory. 21 | cd "${KOKORO_ARTIFACTS_DIR}/github/dataflux-client-python" 22 | 23 | function install_requirements() { 24 | echo Installing requirements. 25 | 26 | echo Installing python3-pip. 27 | sudo apt-get -y install python3-pip 28 | 29 | echo Installing required dependencies. 30 | pip install -r requirements.txt 31 | 32 | echo Installing dataflux core. 33 | pip install . 34 | } 35 | 36 | function run_one_shot_tests() { 37 | echo Running performance tests. 38 | # -k one_shot triggers a full list and download, loading all files into memory in one shot. 39 | # Alternatively, the segmented test allows us to divide the download into multiple passes 40 | # to avoid OOM errors. 41 | python3 -m pytest dataflux_core/performance_tests/list_and_download.py -k one_shot -vv --junit-xml="${KOKORO_ARTIFACTS_DIR}/unit_tests/sponge_log.xml" --log-cli-level=DEBUG 42 | } 43 | 44 | install_requirements 45 | run_one_shot_tests 46 | -------------------------------------------------------------------------------- /kokoro/performance_seg.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | 17 | # Fail on any error. 18 | set -e 19 | 20 | # Code under repo is checked out to this directory. 21 | cd "${KOKORO_ARTIFACTS_DIR}/github/dataflux-client-python" 22 | 23 | function install_requirements() { 24 | echo Installing requirements. 25 | 26 | echo Installing python3-pip. 27 | sudo apt-get -y install python3-pip 28 | 29 | echo Installing required dependencies. 30 | pip install -r requirements.txt 31 | 32 | echo Installing dataflux core. 33 | pip install . 34 | } 35 | 36 | function run_segmented_tests() { 37 | echo Running performance tests. 38 | # -k segmented triggers a full list and download, batching the download into 50GB chunks. 39 | # This test sequence is designed to handle volumes of data that exceed memory of the machine. 40 | python3 -m pytest dataflux_core/performance_tests/list_and_download.py -k segmented -vv --junit-xml="${KOKORO_ARTIFACTS_DIR}/unit_tests/sponge_log.xml" --log-cli-level=DEBUG 41 | } 42 | 43 | install_requirements 44 | run_segmented_tests 45 | 46 | -------------------------------------------------------------------------------- /kokoro/presubmit.cfg: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http:#www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | build_file: "dataflux-client-python/kokoro/presubmit.sh" 16 | 17 | env_vars { 18 | key: "PROJECT" 19 | value: "dataflux-project" 20 | } 21 | 22 | env_vars { 23 | key: "BUCKET" 24 | value: "official-dataflux-tess" 25 | } 26 | 27 | env_vars { 28 | key: "PREFIX" 29 | value: "UNet3D/micro/100KB-500MB/train" 30 | } 31 | 32 | env_vars { 33 | key: "LIST_WORKERS" 34 | value: "32" 35 | } 36 | 37 | env_vars { 38 | key: "FILE_COUNT" 39 | value: "5000" 40 | } 41 | 42 | env_vars { 43 | key: "TOTAL_FILE_SIZE" 44 | value: "501770000" 45 | } 46 | 47 | env_vars { 48 | key: "MAX_COMPOSE_BYTES" 49 | value: "100000000" 50 | } 51 | 52 | env_vars { 53 | key: "LIST_TIMEOUT" 54 | value: "10" 55 | } 56 | 57 | env_vars { 58 | key: "DOWNLOAD_TIMEOUT" 59 | value: "400" 60 | } 61 | 62 | env_vars { 63 | key: "PARALLELIZATION" 64 | value: "32" 65 | } 66 | 67 | action { 68 | define_artifacts { 69 | regex: "**/unit_tests/sponge_log.xml" 70 | } 71 | } 72 | 73 | action { 74 | define_artifacts { 75 | regex: "**/integration_tests/sponge_log.xml" 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /kokoro/presubmit.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | 17 | # Fail on any error. 18 | set -e 19 | 20 | # Code under repo is checked out to this directory. 21 | cd "${KOKORO_ARTIFACTS_DIR}/github/dataflux-client-python" 22 | 23 | function install_requirements() { 24 | echo Installing requirements. 25 | 26 | echo Installing python3-pip. 27 | sudo apt-get -y install python3-pip 28 | 29 | echo Installing required dependencies. 30 | pip install -r requirements.txt 31 | 32 | echo Installing dataflux core. 33 | pip install . 34 | } 35 | 36 | function run_presubmit_tests() { 37 | echo Running unit tests. 38 | python3 -m pytest dataflux_core/tests -vv --junit-xml="${KOKORO_ARTIFACTS_DIR}/unit_tests/sponge_log.xml" --log-cli-level=DEBUG 39 | echo Running performance tests. 40 | # -k one_shot triggers a full list and download, loading all files into memory in one shot. 41 | # Alternatively, the segmented test allows us to divide the download into multiple passes 42 | # to avoid OOM errors. 
43 | python3 -m pytest dataflux_core/performance_tests/list_and_download.py -k one_shot -vv --junit-xml="${KOKORO_ARTIFACTS_DIR}/integration_tests/sponge_log.xml" --log-cli-level=DEBUG 44 | } 45 | 46 | install_requirements 47 | run_presubmit_tests 48 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | google-auth==2.39.0 2 | google-cloud-storage 3 | absl-py 4 | pytest 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="dataflux_client_python", 5 | packages=find_packages(), 6 | ) 7 | --------------------------------------------------------------------------------