├── .github └── workflows │ └── python-package.yml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── anypathlib ├── __init__.py ├── anypath.py ├── cli.py └── path_handlers │ ├── __init__.py │ ├── azure_handler.py │ ├── base_path_handler.py │ ├── local_handler.py │ ├── path_types.py │ └── s3_handler.py ├── docs ├── anypathlib_logo.png └── wsc_logo.png ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── fixtures_anypath.py ├── test_anypath_flows.py ├── test_cli.py ├── test_copy_file_to_dir.py ├── test_download_from_cloud.py ├── test_iterdir_glob_rglob.py ├── test_pathlib_properties.py ├── test_str_path_interoperability.py ├── test_upload_to_cloud.py └── tests_urls.py /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.8", "3.9", "3.10", "3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. 
The GitHub editor is 127 chars wide 37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Version 0.2.0 2 | - Type `AnyPathLikeType` was added, which can be used to init an `AnyPath` instance or in `copy` target 3 | - `listdir` is now deprecated, replaced by `iterdir`, `rglob`, and `glob` 4 | - `copy` now supports the case where the source is a file and target is a directory -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2024] [WSC Sports Technologies Ltd.] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

logo

2 | 3 | 4 |
5 | 6 | [![wsc_logo](https://raw.githubusercontent.com/kfirgoldberg/AnyPathLib/master/docs/wsc_logo.png)](https://wsc-sports.com/) 7 | 8 |
9 | 10 | # AnyPathLib - Crossing Cloud Borders With a Simple API 11 | 12 |

13 | PyPI version 14 | Downloads 15 | All Contributors 16 | License: Apache 2.0 17 |

18 | 19 | 20 | Welcome to AnyPathLib, a Python library designed to allow hassle-free file operations across different cloud and local 21 | storage 22 | 23 | ## Why `AnyPathLib`? 24 | 25 | With `AnyPathLib` you can write the same code to handle files across different storage systems, without worrying about 26 | the 27 | underlying details. 28 | Operations can be optimized per-backend and the library is easily extendable to support additional cloud storage 29 | providers. 30 | 31 | ## Getting Started With `AnyPathLib` with 3 easy examples ️🛣️ 32 | 33 | ### ️🛣️ 1/3 Copying a file or directory from anywhere to anywhere ️🛣️ 34 | 35 | ```python 36 | from anypathlib import AnyPath 37 | 38 | # Create an AnyPath instance for a local file 39 | local_file = AnyPath("/path/to/local/file.txt") 40 | 41 | # Create an AnyPath instance for an S3 object 42 | s3_file = AnyPath("s3://bucket/path/to/object.txt") 43 | 44 | # Copy a file from local to S3 45 | local_file.copy(s3_file) 46 | 47 | # Copy a directory from S3 to Azure 48 | s3_dir = AnyPath("s3://bucket/path/to/dir") 49 | azure_dir = AnyPath("https://account_name.blob.core.windows.net/container_name/path") 50 | s3_dir.copy(azure_dir) 51 | ``` 52 | 53 | ### ️🛣️ 2/3 Local caching for quicker access ️🛣️ 54 | 55 | Use "copy" without a target to get a local copy of the file which is stored in a local cache. 
56 | Use `force_overwrite=False` to prevent repeated downloads of the same file 57 | 58 | ```python 59 | my_dir = AnyPath("https://account_name.blob.core.windows.net/container_name/path/to/dir") 60 | local_dir_path = my_dir.copy() 61 | 62 | my_file = AnyPath("s3://bucket/path/to/file.txt") 63 | local_file_path = my_file.copy() 64 | local_file_path = my_file.copy(force_overwrite=False) # Returns the path of the previously downloaded file 65 | ``` 66 | 67 | ### 🛣️ 3/3 A simplified pathlib-like Interface 🛣️ 68 | 69 | ```python 70 | my_dir = AnyPath("https://account_name.blob.core.windows.net/container_name/path/to/dir") 71 | my_dir.exists() # True if my_path exists, otherwise False 72 | parent, name, stem = my_dir.parent, my_dir.name, my_dir.stem 73 | files_in_dir: List[AnyPath] = my_dir.rglob('*') # List of AnyPath instances for files in the directory 74 | 75 | my_file = AnyPath("s3://bucket/path/to/file.txt") 76 | my_file.is_file() # True if my_path exists, otherwise False 77 | my_file.is_dir() # False 78 | my_file.remove() 79 | ``` 80 | 81 | ### CLI Usage 82 | 83 | `AnyPathLib` also comes with a CLI tool that allows you to perform file operations from the command line. 84 | You can run `anypathlib --help` to get a list of available commands and options. 85 | 86 | Here are some examples: 87 | 88 | Copy: 89 | ```bash 90 | anypathlib copy -i /path/to/source -o /path/to/destination 91 | ``` 92 | 93 | Remove a file or directory: 94 | ```bash 95 | anypathlib remove -p /path/to/file_or_directory 96 | ``` 97 | 98 | ### Key Features 99 | 100 | * **Unified, Cloud Agnostic, API**: Perform file operations across different storage backends using the same set of 101 | methods. 102 | * **Path-like Operations**: Supports common path operations like joining paths, listing directories, checking file 103 | existence, etc. 104 | * **Performance**: Local caching for repeated downloads across different sessions, multithreading, and more. 
105 | * **Extensibility**: Easily extendable to support additional cloud storage providers. 106 | 107 | ### Security and Credentials 108 | 109 | `AnyPath` does not store any credentials in it. In order to access cloud storage, you need to have the necessary 110 | environment variables defined. 111 | 112 | #### Azure 113 | 114 | ```bash 115 | export AZURE_SUBSCRIPTION_ID="your-subscription-id" 116 | export AZURE_RESOURCE_GROUP_NAME="your-resource-group-name" 117 | ``` 118 | 119 | #### AWS S3 120 | 121 | Same as Boto3: 122 | 123 | ```bash 124 | export AWS_DEFAULT_REGION="your-region" 125 | export AWS_SECRET_ACCESS_KEY="your-secret" 126 | export AWS_ACCESS_KEY_ID="your-key" 127 | ``` 128 | 129 | # TODOs: 130 | 131 | - [ ] Add support for additional cloud storage providers. 132 | 133 | > GCP 134 | 135 | - [ ] Improve API 136 | 137 | > Add __open__ method for reading files, etc. 138 | 139 | - [ ] Implement cloud-to-cloud ops more efficiently. 140 | 141 | > cache azure credentials to avoid repeated logins 142 | 143 | - [ ] Improve logging and add verbose mode. 144 | 145 | > progress bar, etc. 146 | 147 | ## Contributors ✨ 148 | 149 | Thanks goes to these wonderful people: 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 |

Yuval Shomer

🎨 🤔

Jeremy Levy

🎨 🤔
162 | -------------------------------------------------------------------------------- /anypathlib/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.0" 2 | 3 | from anypathlib.anypath import AnyPath 4 | from anypathlib.path_handlers.path_types import PathType 5 | -------------------------------------------------------------------------------- /anypathlib/anypath.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | from pathlib import Path, PurePath 4 | from typing import Union, Optional, List, Dict, NewType 5 | from urllib.parse import urlparse 6 | 7 | from anypathlib.path_handlers.azure_handler import AzureHandler 8 | from anypathlib.path_handlers.base_path_handler import BasePathHandler 9 | from anypathlib.path_handlers.local_handler import LocalPathHandler 10 | from anypathlib.path_handlers.path_types import PathType 11 | from anypathlib.path_handlers.s3_handler import S3Handler 12 | 13 | AnyPathLikeType = NewType('AnyPathLikeType', Union[str, Path, 'AnyPath']) 14 | 15 | 16 | class AnyPath: 17 | PATH_HANDLERS: Dict[PathType, BasePathHandler] = {PathType.local: LocalPathHandler, 18 | PathType.s3: S3Handler, 19 | PathType.azure: AzureHandler} 20 | LOCAL_CACHE_PATH = Path(tempfile.gettempdir()) / 'AnyPath' 21 | 22 | def __init__(self, base_path: AnyPathLikeType): 23 | if type(base_path) is str: 24 | self._base_path = base_path 25 | elif issubclass(type(base_path), PurePath): 26 | self._base_path = base_path.absolute().as_posix() 27 | elif type(base_path) is AnyPath: 28 | self._base_path = base_path.base_path 29 | else: 30 | raise ValueError(f'base_path must be of type str, Path or AnyPath, got {type(base_path)}') 31 | self.path_type = self.get_path_type(self._base_path) 32 | self.path_handler = self.PATH_HANDLERS[self.path_type] 33 | 34 | @staticmethod 35 | def get_path_type(url: str) -> PathType: 36 | parsed_url = 
urlparse(url) 37 | if parsed_url.scheme in ['http', 'https']: 38 | if 'blob.core.windows.net' in parsed_url.netloc: 39 | return PathType.azure 40 | elif 'amazonaws.com' in parsed_url.netloc or 's3' in parsed_url.netloc: 41 | return PathType.s3 42 | elif parsed_url.scheme in ['s3']: 43 | return PathType.s3 44 | elif parsed_url.scheme in ['file', '']: 45 | return PathType.local 46 | else: 47 | # Assume local 48 | return PathType.local 49 | 50 | def __repr__(self): 51 | return self.base_path 52 | 53 | @property 54 | def is_s3(self) -> bool: 55 | return self.path_type == PathType.s3 56 | 57 | @property 58 | def is_local(self) -> bool: 59 | return self.path_type == PathType.local 60 | 61 | @property 62 | def is_azure(self) -> bool: 63 | return self.path_type == PathType.azure 64 | 65 | # define truediv to allow for concatenation 66 | def __truediv__(self, other: str) -> 'AnyPath': 67 | if self.is_local: 68 | return AnyPath(Path(self.base_path) / other) 69 | else: 70 | valid_other = other[1:] if other.startswith('/') else other 71 | valid_base = self.base_path if self.base_path.endswith('/') else self.base_path + '/' 72 | 73 | return AnyPath(f'{valid_base}{valid_other}') 74 | 75 | @property 76 | def base_path(self) -> str: 77 | if self.path_type == PathType.s3: 78 | base_path = self._base_path 79 | base_path = base_path.replace('//', '/') 80 | if base_path.startswith('s3:/') and not base_path.startswith('s3://'): 81 | base_path = base_path.replace('s3:/', 's3://') 82 | if base_path[-1] == '/': 83 | base_path = base_path[:-1] 84 | elif self.path_type == PathType.local: 85 | base_path = Path(self._base_path).as_posix() 86 | else: 87 | base_path = self._base_path 88 | return base_path 89 | 90 | def is_dir(self) -> bool: 91 | return self.path_handler.is_dir(self.base_path) 92 | 93 | def is_file(self) -> bool: 94 | return self.path_handler.is_file(self.base_path) 95 | 96 | def exists(self) -> bool: 97 | return self.path_handler.exists(self.base_path) 98 | 99 | def 
remove(self): 100 | self.path_handler.remove(self.base_path) 101 | 102 | @property 103 | def parent(self) -> 'AnyPath': 104 | return AnyPath(self.path_handler.parent(self.base_path)) 105 | 106 | @property 107 | def stem(self) -> str: 108 | return self.path_handler.stem(self.base_path) 109 | 110 | @property 111 | def name(self) -> str: 112 | return self.path_handler.name(self.base_path) 113 | 114 | def iterdir(self) -> List['AnyPath']: 115 | return [AnyPath(p) for p in self.path_handler.iterdir(self.base_path)] 116 | 117 | def glob(self, pattern: str) -> List['AnyPath']: 118 | return [AnyPath(p) for p in self.path_handler.glob(self.base_path, pattern)] 119 | 120 | def rglob(self, pattern: str) -> List['AnyPath']: 121 | return [AnyPath(p) for p in self.path_handler.rglob(self.base_path, pattern)] 122 | 123 | def __get_local_path(self, target_path: Optional[Path] = None, force_overwrite: bool = False, 124 | verbose: bool = False) -> Optional[Path]: 125 | if target_path is None: 126 | if self.is_dir(): 127 | valid_target_path = Path(tempfile.mkdtemp()) 128 | else: 129 | valid_target_path = Path(tempfile.mktemp()) 130 | else: 131 | if target_path.exists(): 132 | assert target_path.is_dir() == self.is_dir() 133 | assert target_path.is_file() == self.is_file() 134 | valid_target_path = target_path 135 | if self.path_type == PathType.local: 136 | if not target_path.exists() or force_overwrite: 137 | if self.is_dir(): 138 | shutil.copytree(self.base_path, valid_target_path, dirs_exist_ok=True) 139 | else: 140 | Path(valid_target_path).parent.mkdir(exist_ok=True, parents=True) 141 | shutil.copy(self.base_path, valid_target_path) 142 | return valid_target_path 143 | else: 144 | if self.is_dir(): 145 | result = self.path_handler.download_directory(url=self.base_path, 146 | force_overwrite=force_overwrite, 147 | target_dir=valid_target_path, 148 | verbose=verbose) 149 | if result is not None: 150 | local_path, _ = result 151 | else: 152 | return None 153 | 154 | else: 155 | 
local_path = self.path_handler.download_file(url=self.base_path, force_overwrite=force_overwrite, 156 | target_path=valid_target_path) 157 | 158 | assert local_path == valid_target_path, \ 159 | f'local_path {local_path} is not equal to valid_target_path {valid_target_path}' 160 | return Path(local_path) 161 | 162 | def __get_local_cache_path(self) -> 'AnyPath': 163 | handler_prefix = 's3' if self.is_s3 else 'azure' if self.is_azure else 'local' 164 | local_cache_path = self.LOCAL_CACHE_PATH / handler_prefix / self.path_handler.relative_path(self.base_path) 165 | if self.is_dir(): 166 | local_cache_path.mkdir(exist_ok=True, parents=True) 167 | elif self.is_file(): 168 | local_cache_path.parent.mkdir(exist_ok=True, parents=True) 169 | return AnyPath(local_cache_path) 170 | 171 | def copy(self, target: Optional[AnyPathLikeType] = None, force_overwrite: bool = True, 172 | verbose: bool = False) -> 'AnyPath': 173 | assert self.exists(), f'source path: {self.base_path} does not exist' 174 | if target is None: 175 | valid_target = self.__get_local_cache_path() 176 | else: 177 | input_target = AnyPath(target) 178 | # if source is a file and target is either an existing dir copy the file to the target dir 179 | if self.is_file() and input_target.is_dir(): 180 | valid_target = input_target / self.name 181 | else: 182 | valid_target = input_target 183 | if valid_target.is_local: 184 | self.__get_local_path(target_path=Path(valid_target.base_path), force_overwrite=force_overwrite, 185 | verbose=verbose) 186 | else: 187 | if valid_target.is_s3 and self.is_s3: 188 | S3Handler.copy(source_url=self.base_path, target_url=valid_target.base_path) 189 | elif valid_target.is_azure and self.is_azure: 190 | AzureHandler.copy(source_url=self.base_path, target_url=valid_target.base_path) 191 | else: 192 | # valid_target and source are different, 193 | # so we need to download the source and upload it to the valid_target 194 | 195 | local_path = Path(self.base_path) if self.is_local else 
self.__get_local_path( 196 | force_overwrite=force_overwrite, verbose=verbose) 197 | target_path_handler = valid_target.path_handler 198 | if self.is_dir(): 199 | target_path_handler.upload_directory(local_dir=local_path, target_url=valid_target.base_path, 200 | verbose=verbose) 201 | else: 202 | target_path_handler.upload_file(local_path=str(local_path), target_url=valid_target.base_path) 203 | return valid_target 204 | -------------------------------------------------------------------------------- /anypathlib/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | from anypathlib import AnyPath 3 | 4 | 5 | @click.group() 6 | def cli(): 7 | pass 8 | 9 | 10 | @click.command() 11 | @click.option('-i', '--input', 'input_path', required=True, type=click.STRING, help='Input path to copy from') 12 | @click.option('-o', '--output', 'output_path', type=click.STRING, help='Output path to copy to') 13 | @click.option('-v', '--verbose', is_flag=True, default=False, help='Verbose flag') 14 | @click.option('-f', '--force/--no-force', is_flag=True, default=True, help='Force overwrite flag') 15 | def copy(input_path, output_path, verbose, force): 16 | """Copy files from input to output path. """ 17 | target_path = AnyPath(input_path).copy(target=AnyPath(output_path) if output_path else None, 18 | verbose=verbose, force_overwrite=force) 19 | click.echo(f'Copied Successfully to {target_path}') 20 | 21 | 22 | @click.command() 23 | @click.option('-p', '--path', required=True, type=click.STRING, help='Path to check') 24 | def exists(path): 25 | """Check if the path exists. """ 26 | click.echo(AnyPath(path).exists()) 27 | 28 | 29 | @click.command() 30 | @click.option('-p', 'path', required=True, type=click.STRING, help='Path to list') 31 | def iterdir(path): 32 | """List the directory. 
""" 33 | click.echo(AnyPath(path).iterdir()) 34 | 35 | 36 | @click.command() 37 | @click.option('-p', 'path', required=True, type=click.STRING, help='Path to remove') 38 | def remove(path): 39 | """Remove the path. """ 40 | AnyPath(path).remove() 41 | 42 | 43 | cli.add_command(copy) 44 | cli.add_command(exists) 45 | cli.add_command(iterdir) 46 | cli.add_command(remove) 47 | 48 | if __name__ == '__main__': 49 | cli() 50 | -------------------------------------------------------------------------------- /anypathlib/path_handlers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kfirgoldberg/AnyPathLib/af04c5ef9cd17b57234feb8d511b6fb874caeeb0/anypathlib/path_handlers/__init__.py -------------------------------------------------------------------------------- /anypathlib/path_handlers/azure_handler.py: -------------------------------------------------------------------------------- 1 | import fnmatch 2 | import os 3 | from concurrent.futures import ThreadPoolExecutor 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | from typing import Optional, List, Tuple 7 | from urllib.parse import urlparse 8 | 9 | from tqdm import tqdm 10 | from azure.core.exceptions import ResourceNotFoundError 11 | 12 | from azure.identity import DefaultAzureCredential 13 | from azure.mgmt.storage import StorageManagementClient 14 | from azure.storage.blob import BlobServiceClient, ContainerClient 15 | 16 | from loguru import logger 17 | 18 | from anypathlib.path_handlers.base_path_handler import BasePathHandler 19 | 20 | 21 | @dataclass 22 | class AzureStoragePath: 23 | storage_account: str 24 | container_name: str 25 | blob_name: str 26 | connection_string: Optional[str] = None 27 | _blob_service_client: Optional[BlobServiceClient] = field(init=False, default=None) 28 | _container_client: Optional[ContainerClient] = field(init=False, default=None) 29 | 30 | def __post_init__(self): 31 | if 
self.connection_string is None: 32 | self.connection_string = AzureHandler.get_connection_string(self.storage_account) 33 | self._container_client = None 34 | self._blob_service_client = None 35 | 36 | @property 37 | def http_url(self) -> str: 38 | return f'https://{self.storage_account}.{AzureHandler.AZURE_URL_SUFFIX}/{self.container_name}/{self.blob_name}' 39 | 40 | @property 41 | def blob_service_client(self) -> BlobServiceClient: 42 | if self._blob_service_client is None: 43 | self._blob_service_client = BlobServiceClient.from_connection_string(self.connection_string) 44 | return self._blob_service_client 45 | 46 | @property 47 | def container_client(cls) -> ContainerClient: 48 | if cls._container_client is None: 49 | cls._container_client = cls.blob_service_client.get_container_client(cls.container_name) 50 | 51 | return cls._container_client 52 | 53 | 54 | class AzureHandler(BasePathHandler): 55 | DEFAULT_SUBSCRIPTION_ID = os.environ.get('AZURE_SUBSCRIPTION_ID', None) 56 | 57 | DEFAULT_GROUP_NAME = os.environ.get('AZURE_RESOURCE_GROUP_NAME', None) 58 | AZURE_URL_SUFFIX = r'blob.core.windows.net' 59 | 60 | @classmethod 61 | def refresh_credentials(cls): 62 | if cls.DEFAULT_SUBSCRIPTION_ID is None: 63 | cls.DEFAULT_SUBSCRIPTION_ID = os.environ.get('AZURE_SUBSCRIPTION_ID', None) 64 | if cls.DEFAULT_GROUP_NAME is None: 65 | cls.DEFAULT_GROUP_NAME = os.environ.get('AZURE_RESOURCE_GROUP_NAME', None) 66 | 67 | @classmethod 68 | def relative_path(cls, url: str) -> str: 69 | storage_path = cls.http_to_storage_params(url) 70 | return f'{storage_path.container_name}/{storage_path.blob_name}' 71 | 72 | @classmethod 73 | def is_dir(cls, url: str) -> bool: 74 | return cls.exists(url) and not cls.is_file(url) 75 | 76 | @classmethod 77 | def is_file(cls, url: str) -> bool: 78 | storage_path = cls.http_to_storage_params(url) 79 | container_client = storage_path.container_client 80 | blob_client = container_client.get_blob_client(storage_path.blob_name) 81 | 82 | try: 83 | 
blob_properties = blob_client.get_blob_properties() 84 | # If the blob exists and is not a directory placeholder, it's a file 85 | return not blob_properties.metadata.get('hdi_isfolder', False) 86 | except Exception: 87 | return False # If exception is raised, the blob does not exist or is not a file 88 | 89 | @classmethod 90 | def exists(cls, url: str) -> bool: 91 | storage_path = cls.http_to_storage_params(url) 92 | container_client = storage_path.container_client 93 | return len([p for p in container_client.list_blobs(name_starts_with=storage_path.blob_name)]) > 0 94 | 95 | @classmethod 96 | def get_connection_string(cls, storage_account: str, subscription_id: Optional[str] = None, 97 | resource_group_name: Optional[str] = None) -> str: 98 | cls.refresh_credentials() 99 | account_key = cls.get_storage_account_key(storage_account_name=storage_account, subscription_id=subscription_id, 100 | resource_group_name=resource_group_name) 101 | connection_string = (f"DefaultEndpointsProtocol=https;AccountName={storage_account};" 102 | f"AccountKey={account_key};EndpointSuffix=core.windows.net") 103 | return connection_string 104 | 105 | @classmethod 106 | def http_to_storage_params(cls, url: str) -> AzureStoragePath: 107 | parsed_url = urlparse(url) 108 | account_name = parsed_url.netloc.split('.')[0] 109 | container_name, *blob_path_parts = parsed_url.path.lstrip('/').split('/') 110 | blob_path = '/'.join(blob_path_parts) 111 | 112 | azure_storage_path = AzureStoragePath(storage_account=account_name, container_name=container_name, 113 | blob_name=blob_path) 114 | return azure_storage_path 115 | 116 | @classmethod 117 | def get_storage_account_key(cls, 118 | storage_account_name: str, 119 | subscription_id: Optional[str] = None, 120 | resource_group_name: Optional[str] = None, 121 | ) -> str: 122 | """ 123 | Retrieves the access key for a storage account in Azure. 124 | 125 | Args: 126 | subscription_id (str): The subscription ID of the Azure account. 
127 | resource_group_name (str): The name of the resource group containing the storage account. 128 | storage_account_name (str): The name of the storage account. 129 | 130 | Returns: 131 | str: The access key for the storage account. 132 | """ 133 | try: 134 | if subscription_id is None: 135 | subscription_id = cls.DEFAULT_SUBSCRIPTION_ID 136 | if subscription_id is None: 137 | raise ValueError( 138 | """ 139 | No subscription ID was provided. 140 | Set the AZURE_SUBSCRIPTION_ID environment variable, or pass it as an argument. 141 | """ 142 | ) 143 | if resource_group_name is None: 144 | resource_group_name = cls.DEFAULT_GROUP_NAME 145 | if resource_group_name is None: 146 | raise ValueError( 147 | """ 148 | No resource group name was provided. 149 | Set the AZURE_RESOURCE_GROUP_NAME environment variable, or pass it as an argument. 150 | """ 151 | ) 152 | client = StorageManagementClient(credential=DefaultAzureCredential(), subscription_id=subscription_id) 153 | response = client.storage_accounts.list_keys(resource_group_name=resource_group_name, 154 | account_name=storage_account_name, ) 155 | if not response.keys: 156 | raise ValueError( 157 | """ 158 | No keys were found for the storage account. 159 | Ask the MLOps guys for the access key, or try and get it from the Azure portal 160 | """ 161 | ) 162 | return response.keys[0].value # Returns the first key of the storage account 163 | except Exception as e: 164 | logger.exception(e) 165 | logger.exception( 166 | """ 167 | There was an error fetching the storage account key. 168 | Make sure you are connected to VPN, and config is correct. 169 | If it still fails, get it from the Azure portal, 170 | or ask the MLOps guys for the access key. 
171 | """ 172 | ) 173 | raise e 174 | 175 | @classmethod 176 | def download_file(cls, url: str, target_path: Path, force_overwrite: bool = True) -> Path: 177 | if target_path.exists() and not force_overwrite: 178 | return target_path 179 | azure_storage_path = cls.http_to_storage_params(url) 180 | # Construct the Blob Service Client 181 | blob_service_client = BlobServiceClient( 182 | account_url=f"https://{azure_storage_path.storage_account}.{cls.AZURE_URL_SUFFIX}") 183 | 184 | # Get a client to interact with the specified container and blob 185 | blob_client = blob_service_client.get_blob_client(container=azure_storage_path.container_name, 186 | blob=azure_storage_path.blob_name) 187 | 188 | # Ensure the directory exists 189 | target_path.parent.mkdir(parents=True, exist_ok=True) 190 | 191 | # Download the blob to a local file 192 | with open(target_path, "wb") as download_file: 193 | download_file.write(blob_client.download_blob().readall()) 194 | 195 | return target_path 196 | 197 | @classmethod 198 | def upload_file(cls, local_path: str, target_url: str): 199 | """Upload a single file to Azure Blob Storage.""" 200 | azure_storage_path = cls.http_to_storage_params(target_url) 201 | blob_service_client = azure_storage_path.blob_service_client 202 | container_client = azure_storage_path.container_client 203 | # Check if the container exists and create if it does not 204 | try: 205 | container_client.get_container_properties() 206 | except Exception as e: 207 | # Assuming exception means container does not exist. 
Create new container 208 | container_client.create_container() 209 | 210 | # Now, upload the file 211 | blob_client = blob_service_client.get_blob_client(container=azure_storage_path.container_name, 212 | blob=azure_storage_path.blob_name) 213 | with open(local_path, "rb") as data: 214 | blob_client.upload_blob(data, overwrite=True) 215 | 216 | @classmethod 217 | def remove_directory(cls, url: str): 218 | """Remove a directory (all blobs with the same prefix) from Azure Blob Storage.""" 219 | azure_storage_path = cls.http_to_storage_params(url) 220 | container_client = azure_storage_path.container_client 221 | for blob in container_client.list_blobs(name_starts_with=azure_storage_path.blob_name): 222 | container_client.delete_blob(blob.name) 223 | 224 | @classmethod 225 | def remove(cls, url: str, allow_missing: bool = False): 226 | """Remove a single file/directory from Azure Blob Storage.""" 227 | if url.endswith('/'): 228 | cls.remove_directory(url) 229 | else: 230 | azure_storage_path = cls.http_to_storage_params(url) 231 | container_client = azure_storage_path.container_client 232 | try: 233 | container_client.delete_blob(azure_storage_path.blob_name) 234 | except ResourceNotFoundError as e: 235 | if not allow_missing: 236 | raise e 237 | 238 | @classmethod 239 | def download_directory(cls, url: str, force_overwrite: bool, target_dir: Path, verbose: bool) -> \ 240 | Optional[Tuple[Path, List[Path]]]: 241 | """Download a directory (all blobs with the same prefix) from Azure Blob Storage.""" 242 | assert target_dir.is_dir() 243 | azure_storage_path = cls.http_to_storage_params(url) 244 | 245 | container_client = azure_storage_path.container_client 246 | local_paths = [] 247 | 248 | if verbose: 249 | container_iterator = container_client.list_blobs(name_starts_with=azure_storage_path.blob_name) 250 | progress_bar = tqdm(container_iterator, desc='Downloading directory', 251 | total=len(list(container_iterator))) 252 | else: 253 | progress_bar = 
container_client.list_blobs(name_starts_with=azure_storage_path.blob_name) 254 | 255 | for blob in progress_bar: 256 | blob_url = AzureStoragePath(storage_account=azure_storage_path.storage_account, 257 | container_name=azure_storage_path.container_name, blob_name=blob.name, 258 | connection_string=azure_storage_path.connection_string).http_url 259 | local_target = target_dir / Path(blob_url).relative_to(Path(url)) 260 | local_path = cls.download_file(url=blob_url, force_overwrite=force_overwrite, target_path=local_target) 261 | assert local_path is not None, f'could not download from {url}' 262 | local_paths.append(Path(local_path)) 263 | if len(local_paths) == 0: 264 | return None 265 | return local_paths[0].parent, local_paths 266 | 267 | @classmethod 268 | def upload_directory(cls, local_dir: Path, target_url: str, verbose: bool): 269 | """Upload a directory to Azure Blob Storage.""" 270 | azure_storage_path = cls.http_to_storage_params(target_url) 271 | # Check if the container exists and create if it does not 272 | container_client = azure_storage_path.container_client 273 | try: 274 | container_client.get_container_properties() 275 | except Exception as e: 276 | # Assuming exception means container does not exist. 
Create new container 277 | container_client.create_container() 278 | 279 | def upload_file_wrapper(local_path: str, blob_name: str): 280 | azure_url = rf'https://{azure_storage_path.storage_account}.{cls.AZURE_URL_SUFFIX}/{azure_storage_path.container_name}/{blob_name}' 281 | cls.upload_file(local_path=local_path, target_url=azure_url) 282 | 283 | # Collect all files to upload 284 | files_to_upload = [] 285 | for file_path in local_dir.rglob('*'): 286 | if not file_path.is_file(): 287 | continue 288 | blob_name = os.path.join(azure_storage_path.blob_name, file_path.relative_to(local_dir)) 289 | files_to_upload.append((file_path, blob_name)) 290 | 291 | # Upload files in parallel 292 | with ThreadPoolExecutor() as executor: 293 | futures = [executor.submit(upload_file_wrapper, str(local_path), blob_name) for local_path, blob_name in 294 | files_to_upload] 295 | if verbose: 296 | with tqdm(total=len(files_to_upload), desc='Uploading directory') as pbar: 297 | for future in futures: 298 | future.result() # Wait for each upload to complete 299 | pbar.update(1) 300 | else: 301 | for future in futures: 302 | future.result() # Wait for each upload to complete 303 | 304 | @classmethod 305 | def copy(cls, source_url: str, target_url: str): 306 | source_storage_path = cls.http_to_storage_params(source_url) 307 | target_storage_path = cls.http_to_storage_params(target_url) 308 | 309 | target_blob_service_client = target_storage_path.blob_service_client 310 | source_container_client = source_storage_path.container_client 311 | 312 | blobs_to_rename = source_container_client.list_blobs(name_starts_with=source_storage_path.blob_name) 313 | 314 | def copy_blob(blob): 315 | source_blob_url = AzureStoragePath(storage_account=source_storage_path.storage_account, 316 | container_name=source_storage_path.container_name, blob_name=blob.name, 317 | connection_string=source_storage_path.connection_string).http_url 318 | target_blob_name = blob.name.replace(source_storage_path.blob_name, 
target_storage_path.blob_name, 1) 319 | 320 | # Copy to new location 321 | target_blob = target_blob_service_client.get_blob_client(container=target_storage_path.container_name, 322 | blob=target_blob_name) 323 | target_blob.start_copy_from_url(source_blob_url) 324 | 325 | # Execute copy and delete operations in parallel 326 | with ThreadPoolExecutor() as executor: 327 | futures = [executor.submit(copy_blob, blob) for blob in blobs_to_rename] 328 | for future in futures: 329 | future.result() # Wait for each operation to complete 330 | 331 | @classmethod 332 | def parent(cls, url: str) -> str: 333 | parsed_url = urlparse(url) 334 | account_name = parsed_url.netloc.split('.')[0] 335 | container_name, *blob_path_parts = parsed_url.path.lstrip('/').split('/') 336 | if blob_path_parts[-1] == "": 337 | blob_path_parts = blob_path_parts[:-1] 338 | blob_path = '/'.join(blob_path_parts[:-1]) 339 | parent_url = f'https://{account_name}.{cls.AZURE_URL_SUFFIX}/{container_name}/{blob_path}/' 340 | return parent_url 341 | 342 | @classmethod 343 | def name(cls, url: str) -> str: 344 | parsed_url = urlparse(url) 345 | container_name, *blob_path_parts = parsed_url.path.lstrip('/').split('/') 346 | if blob_path_parts[-1] == "": 347 | blob_path_parts = blob_path_parts[:-1] 348 | blob_name = blob_path_parts[-1] 349 | return blob_name 350 | 351 | @classmethod 352 | def stem(cls, url: str) -> str: 353 | parsed_url = urlparse(url) 354 | container_name, *blob_path_parts = parsed_url.path.lstrip('/').split('/') 355 | if blob_path_parts[-1] == "": 356 | blob_path_parts = blob_path_parts[:-1] 357 | blob_name = blob_path_parts[-1] 358 | return Path(blob_name).stem 359 | 360 | @classmethod 361 | def iterdir(cls, url: str) -> List[str]: 362 | return cls.glob(url, pattern='*') 363 | 364 | @classmethod 365 | def glob(cls, url: str, pattern: str) -> List[str]: 366 | storage_path = cls.http_to_storage_params(url) 367 | container_client = storage_path.container_client 368 | blob_names = [blob.name 
for blob in 369 | container_client.walk_blobs(name_starts_with=storage_path.blob_name, delimiter='/')] 370 | all_blobs = [ 371 | f"https://{storage_path.storage_account}.{cls.AZURE_URL_SUFFIX}/{storage_path.container_name}/{blob}" for 372 | blob in blob_names] 373 | matched_blobs = [blob for blob in all_blobs if fnmatch.fnmatch(blob, pattern)] 374 | return matched_blobs 375 | 376 | @classmethod 377 | def rglob(cls, url: str, pattern: str) -> List[str]: 378 | storage_path = cls.http_to_storage_params(url) 379 | container_client = storage_path.container_client 380 | blobs = [blob for blob in container_client.list_blob_names(name_starts_with=storage_path.blob_name)] 381 | all_blobs = [ 382 | f"https://{storage_path.storage_account}.{cls.AZURE_URL_SUFFIX}/{storage_path.container_name}/{blob}" for 383 | blob in blobs] 384 | matched_blobs = [blob for blob in all_blobs if fnmatch.fnmatch(blob, pattern)] 385 | all_dirs = list(set([cls.parent(url) for url in matched_blobs])) 386 | dirs_under_url = [dir.rstrip('/') for dir in all_dirs if dir.startswith(url) and dir != url] 387 | return matched_blobs + dirs_under_url 388 | -------------------------------------------------------------------------------- /anypathlib/path_handlers/base_path_handler.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod, ABC 2 | from pathlib import Path 3 | from typing import List, Optional, Tuple 4 | 5 | 6 | class BasePathHandler(ABC): 7 | @classmethod 8 | @abstractmethod 9 | def download_file(cls, url: str, target_path: Path, force_overwrite: bool = True) -> Path: 10 | pass 11 | 12 | 13 | @classmethod 14 | @abstractmethod 15 | def remove(cls, url: str): 16 | pass 17 | 18 | @classmethod 19 | @abstractmethod 20 | def download_directory(cls, url: str, force_overwrite: bool, target_dir: Path, 21 | verbose: bool) -> Optional[Tuple[Path, List[Path]]]: 22 | pass 23 | 24 | @classmethod 25 | @abstractmethod 26 | def upload_file(cls, 
local_path: str, target_url: str): 27 | pass 28 | 29 | @classmethod 30 | @abstractmethod 31 | def upload_directory(cls, local_dir: Path, target_url: str, verbose: bool): 32 | pass 33 | 34 | @classmethod 35 | @abstractmethod 36 | def copy(cls, source_url: str, target_url: str): 37 | pass 38 | 39 | @classmethod 40 | @abstractmethod 41 | def is_dir(cls, url: str) -> bool: 42 | pass 43 | 44 | @classmethod 45 | @abstractmethod 46 | def is_file(cls, url: str) -> bool: 47 | pass 48 | 49 | @classmethod 50 | @abstractmethod 51 | def exists(cls, url: str) -> bool: 52 | pass 53 | 54 | @classmethod 55 | @abstractmethod 56 | def relative_path(cls, url: str) -> str: 57 | pass 58 | 59 | @classmethod 60 | @abstractmethod 61 | def parent(cls, url: str) -> str: 62 | pass 63 | 64 | @classmethod 65 | @abstractmethod 66 | def name(cls, url: str) -> str: 67 | pass 68 | 69 | @classmethod 70 | @abstractmethod 71 | def stem(cls, url: str) -> str: 72 | pass 73 | 74 | @classmethod 75 | @abstractmethod 76 | def iterdir(cls, url: str) -> List[str]: 77 | """ 78 | Lists all files and directories directly under the given directory 79 | """ 80 | pass 81 | 82 | @classmethod 83 | @abstractmethod 84 | def glob(cls, url: str, pattern: str) -> List[str]: 85 | """ 86 | Finds all the paths matching a specific pattern, which can include wildcards, but does not search recursively 87 | """ 88 | pass 89 | 90 | @classmethod 91 | @abstractmethod 92 | def rglob(cls, url: str, pattern: str) -> List[str]: 93 | """ 94 | Finds all the paths matching a specific pattern, including wildcards, and searches recursively in all subdirectories 95 | """ 96 | pass 97 | -------------------------------------------------------------------------------- /anypathlib/path_handlers/local_handler.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from pathlib import Path 3 | from typing import List, Optional, Tuple 4 | 5 | from anypathlib.path_handlers.base_path_handler import 
BasePathHandler 6 | 7 | 8 | class LocalPathHandler(BasePathHandler): 9 | 10 | @classmethod 11 | def is_dir(cls, url: str) -> bool: 12 | return Path(url).is_dir() 13 | 14 | @classmethod 15 | def is_file(cls, url: str) -> bool: 16 | return Path(url).is_file() 17 | 18 | @classmethod 19 | def exists(cls, url: str) -> bool: 20 | return Path(url).exists() 21 | 22 | @classmethod 23 | def remove(cls, url: str): 24 | local_path = Path(url) 25 | if local_path.is_file(): 26 | local_path.unlink() 27 | elif local_path.is_dir(): 28 | shutil.rmtree(local_path) 29 | 30 | @classmethod 31 | def upload_file(cls, local_path: str, target_url: str): 32 | cls.copy_path(url=Path(local_path).absolute().as_posix(), target_path=Path(target_url), force_overwrite=True) 33 | 34 | @classmethod 35 | def upload_directory(cls, local_dir: Path, target_url: str, verbose: bool): 36 | cls.copy_path(url=local_dir.absolute().as_posix(), target_path=Path(target_url), force_overwrite=True) 37 | 38 | @classmethod 39 | def copy(cls, source_url: str, target_url: str): 40 | cls.copy_path(url=source_url, target_path=Path(target_url), force_overwrite=True) 41 | 42 | @classmethod 43 | def copy_path(cls, url: str, target_path: Path, force_overwrite: bool = True) -> Path: 44 | if target_path.exists() and not force_overwrite: 45 | return target_path 46 | if target_path.exists() and force_overwrite: 47 | cls.remove(url=target_path.as_posix()) 48 | local_path = Path(url) 49 | if local_path.is_dir(): 50 | shutil.copytree(local_path, target_path) 51 | else: 52 | shutil.copy(local_path, target_path) 53 | 54 | @classmethod 55 | def download_directory(cls, url: str, force_overwrite: bool, target_dir: Path, verbose: bool) -> \ 56 | Optional[Tuple[Path, List[Path]]]: 57 | cls.copy_path(url=url, target_path=target_dir, force_overwrite=force_overwrite) 58 | return target_dir, [p for p in target_dir.rglob('*')] 59 | 60 | @classmethod 61 | def download_file(cls, url: str, target_path: Path, force_overwrite: bool = True) -> Path: 
62 | return cls.copy_path(url=url, target_path=target_path, force_overwrite=force_overwrite) 63 | 64 | @classmethod 65 | def relative_path(cls, url: str) -> str: 66 | return Path(url).relative_to(Path(url).anchor).as_posix() 67 | 68 | @classmethod 69 | def parent(cls, url: str) -> str: 70 | return Path(url).parent.as_posix() 71 | 72 | @classmethod 73 | def stem(cls, url: str) -> str: 74 | return Path(url).stem 75 | 76 | @classmethod 77 | def name(cls, url: str) -> str: 78 | return Path(url).name 79 | 80 | @classmethod 81 | def iterdir(cls, url: str) -> List[str]: 82 | return [str(p) for p in Path(url).iterdir()] 83 | 84 | @classmethod 85 | def glob(cls, url: str, pattern: str) -> List[str]: 86 | return [str(p) for p in Path(url).glob(pattern)] 87 | 88 | @classmethod 89 | def rglob(cls, url: str, pattern: str) -> List[str]: 90 | return [str(p) for p in Path(url).rglob(pattern)] 91 | -------------------------------------------------------------------------------- /anypathlib/path_handlers/path_types.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class PathType(Enum): 5 | local = 'local' 6 | s3 = 's3' 7 | azure = 'azure' 8 | -------------------------------------------------------------------------------- /anypathlib/path_handlers/s3_handler.py: -------------------------------------------------------------------------------- 1 | import fnmatch 2 | import os 3 | from concurrent.futures import ThreadPoolExecutor, as_completed 4 | from pathlib import Path 5 | from typing import List, Tuple, Optional, ClassVar 6 | from urllib.parse import urlparse 7 | 8 | import boto3 as boto3 9 | import botocore 10 | from tqdm import tqdm 11 | 12 | from anypathlib.path_handlers.base_path_handler import BasePathHandler 13 | 14 | 15 | class S3Handler(BasePathHandler): 16 | AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', None) 17 | MAX_POOL_CONNECTIONS = 50 18 | # Create a boto3 S3 client 19 | s3_client: 
ClassVar[boto3.client] = boto3.client('s3', config=botocore.config.Config( 20 | max_pool_connections=MAX_POOL_CONNECTIONS)) 21 | 22 | @classmethod 23 | def refresh_credentials(cls): 24 | if cls.AWS_ACCESS_KEY_ID is None: 25 | cls.AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', None) 26 | cls.s3_client = boto3.client('s3', 27 | config=botocore.config.Config(max_pool_connections=cls.MAX_POOL_CONNECTIONS)) 28 | 29 | @classmethod 30 | def relative_path(cls, url: str) -> str: 31 | bucket, key = cls.get_bucket_and_key_from_uri(url) 32 | return f'{bucket}/{key}' 33 | 34 | @classmethod 35 | def is_dir(cls, url: str) -> bool: 36 | return cls.exists(url) and not cls.is_file(url) 37 | 38 | @classmethod 39 | def is_file(cls, url: str) -> bool: 40 | bucket_name, object_key = cls.get_bucket_and_key_from_uri(url) 41 | try: 42 | cls.s3_client.head_object(Bucket=bucket_name, Key=object_key) 43 | return True # If the head object doesn't raise an error, it's a file 44 | except (cls.s3_client.exceptions.NoSuchKey, cls.s3_client.exceptions.ClientError): 45 | return False # If a NoSuchKey error is raised, it's not a file 46 | 47 | @classmethod 48 | def parent(cls, url: str) -> str: 49 | bucket, key = cls.get_bucket_and_key_from_uri(url) 50 | return cls.get_full_path(bucket=bucket, key=Path(key).parent.as_posix()) 51 | 52 | @classmethod 53 | def stem(cls, url: str) -> str: 54 | bucket, key = cls.get_bucket_and_key_from_uri(url) 55 | return Path(key).stem 56 | 57 | @classmethod 58 | def name(cls, url: str) -> str: 59 | bucket, key = cls.get_bucket_and_key_from_uri(url) 60 | return Path(key).name 61 | 62 | @classmethod 63 | def exists(cls, url: str) -> bool: 64 | bucket, key = cls.get_bucket_and_key_from_uri(url) 65 | try: 66 | resp = cls.s3_client.list_objects(Bucket=bucket, Prefix=key, Delimiter='/', MaxKeys=1) 67 | return 'Contents' in resp or 'CommonPrefixes' in resp 68 | except cls.s3_client.exceptions.NoSuchKey: 69 | return False 70 | 71 | @classmethod 72 | def 
get_bucket_and_key_from_uri(cls, s3_uri: str) -> Tuple[str, str]: 73 | parsed_uri = urlparse(s3_uri) 74 | bucket = parsed_uri.netloc 75 | key = parsed_uri.path.lstrip('/') 76 | cls.refresh_credentials() 77 | return bucket, key 78 | 79 | @classmethod 80 | def get_full_path(cls, bucket: str, key: str) -> str: 81 | 82 | return f's3://{bucket}/{key}' 83 | 84 | @classmethod 85 | def download_file(cls, url: str, target_path: Path, force_overwrite: bool = True) -> Path: 86 | # Convert the local path to a Path object 87 | local_file_path = Path(target_path) 88 | if not force_overwrite and local_file_path.exists(): 89 | return local_file_path 90 | # Parse the S3 URL 91 | bucket, key = cls.get_bucket_and_key_from_uri(url) 92 | 93 | # Ensure the local directory exists 94 | local_file_path.parent.mkdir(parents=True, exist_ok=True) 95 | # Download the file 96 | cls.s3_client.download_file(Bucket=bucket, Key=key, Filename=local_file_path.absolute().as_posix()) 97 | return local_file_path 98 | 99 | @classmethod 100 | def remove(cls, url: str): 101 | bucket, key = cls.get_bucket_and_key_from_uri(url) 102 | s3_resource = boto3.resource('s3') 103 | bucket = s3_resource.Bucket(bucket) 104 | bucket.objects.filter(Prefix=key).delete() 105 | 106 | @classmethod 107 | def download_directory(cls, url: str, force_overwrite: bool, target_dir: Path, verbose: bool) -> \ 108 | Optional[Tuple[Path, List[Path]]]: 109 | 110 | s3_resource = boto3.resource('s3') 111 | 112 | bucket, source_key = cls.get_bucket_and_key_from_uri(url) 113 | bucket = s3_resource.Bucket(bucket) 114 | all_files = [] 115 | 116 | # Prepare the list of s3_paths to download 117 | s3_paths: List[str] = [cls.get_full_path(bucket=bucket.name, key=obj.key) for obj in 118 | bucket.objects.filter(Prefix=source_key)] 119 | s3_paths = [s3_path for s3_path in s3_paths if s3_path.rstrip('/') != url] 120 | 121 | def s3_path_to_local_file_path(s3_path: str, local_base_path: Path) -> Path: 122 | _, key = 
cls.get_bucket_and_key_from_uri(s3_path) 123 | local_file_relative_path = Path(key).relative_to(source_key) 124 | return local_base_path / local_file_relative_path 125 | 126 | # Download in parallel 127 | with ThreadPoolExecutor() as executor: 128 | future_to_s3_path = {executor.submit(cls.download_file, 129 | url=s3_path, 130 | target_path=s3_path_to_local_file_path(s3_path=s3_path, 131 | local_base_path=target_dir), 132 | force_overwrite=force_overwrite): s3_path for s3_path in s3_paths} 133 | 134 | def process_futures(): 135 | for future in as_completed(future_to_s3_path): 136 | s3_path = future_to_s3_path[future] 137 | try: 138 | local_path = future.result() 139 | if local_path: 140 | all_files.append(local_path) 141 | except Exception as exc: 142 | print(f'{s3_path} generated an exception: {exc}') 143 | 144 | yield None 145 | 146 | if verbose: 147 | with tqdm(total=len(s3_paths), desc='Downloading directory') as pbar: 148 | for _ in process_futures(): 149 | pbar.update(1) 150 | else: 151 | for _ in process_futures(): 152 | pass 153 | 154 | return target_dir, all_files 155 | 156 | @classmethod 157 | def upload_file(cls, local_path: str, target_url: str): 158 | bucket, key = cls.get_bucket_and_key_from_uri(target_url) 159 | cls.s3_client.upload_file(local_path, bucket, key) 160 | 161 | @classmethod 162 | def upload_directory(cls, local_dir: Path, target_url: str, verbose: bool = False): 163 | bucket, key = cls.get_bucket_and_key_from_uri(target_url) 164 | 165 | total_files = 0 166 | if verbose: 167 | for root, dirs, files in os.walk(local_dir): 168 | total_files += len(files) 169 | 170 | if verbose: 171 | progress_bar = tqdm(os.walk(local_dir), desc='Uploading directory', total=total_files) 172 | else: 173 | progress_bar = os.walk(local_dir) 174 | 175 | for root, dirs, files in progress_bar: 176 | for file in files: 177 | local_path = Path(root) / file 178 | s3_key = f'{key.rstrip("/")}/{local_path.relative_to(local_dir).as_posix()}' 179 | 
cls.s3_client.upload_file(local_path, bucket, s3_key) 180 | 181 | if verbose: 182 | progress_bar.update(len(files)) 183 | 184 | @classmethod 185 | def copy(cls, source_url: str, target_url: str): 186 | s3_resource = boto3.resource('s3') 187 | source_bucket_name, source_key = cls.get_bucket_and_key_from_uri(source_url) 188 | target_bucket_name, target_key = cls.get_bucket_and_key_from_uri(target_url) 189 | 190 | source_bucket = s3_resource.Bucket(source_bucket_name) 191 | objects = list(source_bucket.objects.filter(Prefix=source_key)) 192 | 193 | def copy_and_delete(obj): 194 | new_key = obj.key.replace(source_key, target_key, 1) 195 | copy_source = { 196 | 'Bucket': source_bucket_name, 197 | 'Key': obj.key 198 | } 199 | # Copy object to the new location 200 | s3_resource.meta.client.copy(copy_source, target_bucket_name, new_key) 201 | 202 | # Use ThreadPoolExecutor to parallelize the operation 203 | with ThreadPoolExecutor() as executor: 204 | futures = [executor.submit(copy_and_delete, obj) for obj in objects] 205 | 206 | for future in as_completed(futures): 207 | try: 208 | future.result() # If needed, handle result or exceptions here 209 | except Exception as exc: 210 | print(f'Operation generated an exception: {exc}') 211 | 212 | @classmethod 213 | def _get_bucket_objects(cls, url: str) -> List[str]: 214 | bucket, key = cls.get_bucket_and_key_from_uri(url) 215 | s3_resource = boto3.resource('s3') 216 | bucket_obj = s3_resource.Bucket(bucket) 217 | return [cls.get_full_path(bucket=bucket, key=obj.key) for obj in bucket_obj.objects.filter(Prefix=key)] 218 | 219 | @classmethod 220 | def iterdir(cls, url: str) -> List[str]: 221 | return cls.glob(url, pattern='*') 222 | 223 | @classmethod 224 | def _get_dirs_under_url(cls, base_url: str, url_list: List[str]) -> List[str]: 225 | all_dirs = list(set([cls.parent(url) for url in url_list])) 226 | dirs_under_url = [dir.rstrip('/') for dir in all_dirs if dir.startswith(base_url) and dir != base_url] 227 | return 
dirs_under_url 228 | 229 | @classmethod 230 | def glob(cls, url: str, pattern: str) -> List[str]: 231 | objects = cls._get_bucket_objects(url) 232 | matched_objects = [obj for obj in objects if fnmatch.fnmatch(obj, pattern)] 233 | # return only top level matched objects 234 | top_level_objects = [obj for obj in matched_objects if obj.count('/') == url.rstrip('/').count('/') + 1] 235 | all_subdirs = cls._get_dirs_under_url(base_url=url, url_list=matched_objects) 236 | subdirs_in_top_level = [dir for dir in all_subdirs if dir.count('/') == url.rstrip('/').count('/') + 1] 237 | return top_level_objects + subdirs_in_top_level 238 | 239 | @classmethod 240 | def rglob(cls, url: str, pattern: str) -> List[str]: 241 | """ 242 | Finds all the paths matching a specific pattern, including wildcards, and searches recursively in all subdirectories 243 | """ 244 | objects = cls._get_bucket_objects(url) 245 | matched_objects = [obj for obj in objects if fnmatch.fnmatch(obj, pattern)] 246 | dirs = cls._get_dirs_under_url(base_url=url, url_list=matched_objects) 247 | return matched_objects + dirs 248 | -------------------------------------------------------------------------------- /docs/anypathlib_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kfirgoldberg/AnyPathLib/af04c5ef9cd17b57234feb8d511b6fb874caeeb0/docs/anypathlib_logo.png -------------------------------------------------------------------------------- /docs/wsc_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kfirgoldberg/AnyPathLib/af04c5ef9cd17b57234feb8d511b6fb874caeeb0/docs/wsc_logo.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | azure-storage-blob>=12.14.0 2 | azure-identity>=1.10.0 3 | azure-mgmt-storage>=21.1.0 4 | 
boto3>=1.34.23 5 | loguru 6 | tqdm 7 | click==8.1.7 8 | pytest==8.2.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import setuptools 3 | import codecs 4 | import os 5 | import re 6 | 7 | with open("README.md", "r", encoding='utf-8') as fh: 8 | long_description = fh.read() 9 | packages = setuptools.find_namespace_packages(include=["anypathlib*"]) 10 | print("PACKAGES FOUND:", packages) 11 | print(sys.version_info) 12 | 13 | 14 | def find_version(*file_paths: str) -> str: 15 | with codecs.open(os.path.join(*file_paths), "r") as fp: 16 | version_file = fp.read() 17 | version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M) 18 | if version_match: 19 | return version_match.group(1) 20 | raise RuntimeError("Unable to find version string.") 21 | 22 | 23 | setuptools.setup( 24 | name="AnyPathLib", 25 | version=find_version("anypathlib", "__init__.py"), 26 | author="Kfir Goldberg @ WSC-Sports", 27 | description="A unified API for every storage resource", 28 | long_description=long_description, 29 | long_description_content_type="text/markdown", 30 | url="", 31 | packages=packages, 32 | package_data={"AnyPathLib": ["py.typed"]}, 33 | license='Apache License 2.0', 34 | classifiers=[ 35 | "Programming Language :: Python :: 3", 36 | 'License :: OSI Approved :: Apache Software License', 37 | "Operating System :: OS Independent", 38 | ], 39 | python_requires=">=3.7", 40 | install_requires=[ 41 | "azure-storage-blob>=12.14.0", 42 | "azure-identity>=1.15.0", 43 | "azure-mgmt-storage>=21.1.0", 44 | "boto3>=1.34.23", 45 | "loguru", 46 | "tqdm", 47 | 'Click' 48 | ], 49 | setup_requires=["pre-commit"], 50 | py_modules=["anypathlib"], 51 | entry_points={"console_scripts": ["anypathlib = anypathlib.cli:cli"]} 52 | ) 53 | -------------------------------------------------------------------------------- 
import random
import string

import pytest
from pathlib import Path
import tempfile

from click.testing import CliRunner

from anypathlib import PathType

from tests.tests_urls import PATH_TYPE_TO_BASE_TEST_PATH, PATH_TYPE_TO_HANDLER


def create_files_in_directory(directory: Path, n_files: int = 5):
    """Drop ``n_files`` small text files with random names and contents into *directory*."""
    for _ in range(n_files):
        random_stem = ''.join(random.choices(string.ascii_lowercase, k=10))
        payload = ''.join(random.choices(string.ascii_letters + string.digits, k=20))
        (directory / (random_stem + '.txt')).write_text(payload)


@pytest.fixture
def temp_dir_with_files():
    """Yield ``(directory, files)`` for a throwaway directory pre-populated with files."""
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        create_files_in_directory(root)
        yield root, list(root.iterdir())


@pytest.fixture
def temp_nested_dir():
    """Yield ``(root, top_level_entries, nested_files)`` for a two-level populated tree.

    Note: the nested directory itself appears among the top-level entries.
    """
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        inner = tempfile.TemporaryDirectory(dir=workdir)
        create_files_in_directory(root)
        create_files_in_directory(Path(inner.name))
        yield root, list(root.iterdir()), list(Path(inner.name).iterdir())


@pytest.fixture
def temp_local_dir():
    """Yield an empty throwaway directory as a ``Path``."""
    with tempfile.TemporaryDirectory() as workdir:
        yield Path(workdir)


@pytest.fixture
def clean_remote_dir(request, path_type: PathType):
    """Yield a per-test remote URL, guaranteed empty before and after the test."""
    base_url = PATH_TYPE_TO_BASE_TEST_PATH[path_type]
    handler = PATH_TYPE_TO_HANDLER[path_type]
    test_name = request.node.name
    remote_dir = f"{base_url}{test_name}/"
    handler.remove(remote_dir)  # start from a clean slate
    yield remote_dir
    handler.remove(remote_dir)  # tidy up after the test


@pytest.fixture
def cli_runner():
    """Provide a Click ``CliRunner`` for invoking the CLI in-process."""
    return CliRunner()
@pytest.mark.usefixtures("temp_dir_with_files", "clean_remote_dir")
@pytest.mark.parametrize("path_type", [PathType.azure, PathType.s3, PathType.local])
def test_is_file(path_type: PathType, temp_dir_with_files, clean_remote_dir):
    """A single uploaded file reports is_file() and never is_dir()."""
    src_dir, src_files = temp_dir_with_files
    chosen = src_files[0]
    remote_dir = clean_remote_dir
    remote_path = AnyPath(f'{remote_dir}/{chosen.name}')
    # Nothing uploaded yet: neither a file nor a directory.
    assert not remote_path.is_dir()
    assert not remote_path.is_file()
    AnyPath(chosen).copy(target=remote_path)
    assert not remote_path.is_dir()
    assert remote_path.is_file()
    remote_path.remove()
    assert not remote_path.exists()
    assert not remote_path.is_file()


@pytest.mark.usefixtures("clean_remote_dir")
@pytest.mark.parametrize("path_type", [PathType.azure, PathType.s3, PathType.local])
@pytest.mark.parametrize("verbose", [True, False])
def test_caching(path_type: PathType, temp_dir_with_files, clean_remote_dir, verbose: bool):
    """Two downloads without force_overwrite land in the same local cache path."""
    handler = PATH_TYPE_TO_HANDLER[path_type]
    src_dir, _ = temp_dir_with_files
    remote_dir = clean_remote_dir
    handler.upload_directory(local_dir=src_dir, target_url=remote_dir, verbose=verbose)
    first = AnyPath(remote_dir).copy(target=None, force_overwrite=False, verbose=verbose)
    second = AnyPath(remote_dir).copy(target=None, force_overwrite=False, verbose=verbose)
    first.remove()
    if second.exists():
        second.remove()
    assert first.base_path == second.base_path
@pytest.mark.usefixtures("temp_dir_with_files", 'cli_runner')
def test_iterdir_command_with_files(temp_dir_with_files, cli_runner):
    """`iterdir` output names every file in a populated directory."""
    dir_path, dir_files = temp_dir_with_files
    result = cli_runner.invoke(cli, ['iterdir', '-p', dir_path])
    assert result.exit_code == 0
    assert all(entry.name in result.output for entry in dir_files)


@pytest.mark.usefixtures("temp_local_dir", 'cli_runner')
def test_iterdir_command_empty(temp_local_dir, cli_runner):
    """`iterdir` on an empty directory prints an empty list."""
    result = cli_runner.invoke(cli, ['iterdir', '-p', temp_local_dir])
    assert result.exit_code == 0
    assert result.output.strip() == '[]'


@pytest.mark.usefixtures("temp_dir_with_files", 'cli_runner')
def test_remove_command_success(temp_dir_with_files, cli_runner):
    """`remove` deletes an existing file and exits cleanly."""
    _, dir_files = temp_dir_with_files
    target = dir_files[0]
    result = cli_runner.invoke(cli, ['remove', '-p', target])
    assert result.exit_code == 0
    assert not target.exists()


@pytest.mark.usefixtures("temp_dir_with_files", 'cli_runner')
def test_remove_command_non_existing_path(temp_dir_with_files, cli_runner):
    """`remove` exits cleanly for the given path.

    NOTE(review): the fixture file still exists here, so this currently
    duplicates test_remove_command_success rather than exercising a missing
    path -- consider unlinking the file first, after confirming the CLI's
    behaviour on a genuinely absent path.
    """
    _, dir_files = temp_dir_with_files
    target = dir_files[0]
    result = cli_runner.invoke(cli, ['remove', '-p', target])
    assert result.exit_code == 0
@pytest.mark.usefixtures("temp_dir_with_files", "temp_local_dir", "clean_remote_dir")
@pytest.mark.parametrize("path_type", [PathType.azure, PathType.s3, PathType.local])
def test_copy_file_to_dir(path_type: PathType, temp_dir_with_files, temp_local_dir, clean_remote_dir):
    """Copying a remote file onto a local *directory* drops the file inside it."""
    handler = PATH_TYPE_TO_HANDLER[path_type]
    src_dir, _ = temp_dir_with_files

    remote_dir = clean_remote_dir
    handler.upload_directory(local_dir=src_dir, target_url=remote_dir, verbose=False)
    remote_file = AnyPath(remote_dir).iterdir()[0]
    downloaded = AnyPath(remote_file).copy(target=temp_local_dir, force_overwrite=True)
    assert downloaded.exists()
    assert downloaded.name == remote_file.name
    assert downloaded.is_file()
    # The file must land directly inside the target directory.
    assert downloaded.parent.base_path == AnyPath(temp_local_dir).base_path
@pytest.mark.usefixtures("temp_nested_dir", "clean_remote_dir")
@pytest.mark.parametrize("path_type", [PathType.local, PathType.azure, PathType.s3])
def test_rglob_glob_iterdir(path_type: PathType, temp_nested_dir, clean_remote_dir):
    """rglob sees the whole tree recursively; glob and iterdir see only the top level."""
    handler = PATH_TYPE_TO_HANDLER[path_type]
    src_dir, top_level_entries, nested_files = temp_nested_dir
    remote_dir = clean_remote_dir
    handler.upload_directory(local_dir=src_dir, target_url=remote_dir, verbose=False)
    remote = AnyPath(remote_dir)

    def names(paths):
        # Compare name sets order-independently.
        return sorted(p.name for p in paths)

    assert names(remote.rglob(pattern='*')) == names(top_level_entries + nested_files)
    assert names(remote.glob(pattern='*')) == names(top_level_entries)
    assert names(remote.iterdir()) == names(top_level_entries)
def test_pathlib_properties():
    """stem, name and parent mimic pathlib semantics on s3, azure and local paths."""
    # (path, expected_stem, expected_name, expected_parent)
    cases = [
        (r's3://bucket/AnyPath/tests/', 'tests', 'tests',
         r's3://bucket/AnyPath/'),
        (r's3://bucket/AnyPath/tests/a.txt', 'a', 'a.txt',
         r's3://bucket/AnyPath/tests/'),
        (r'https://storage_account.blob.core.windows.net/container/AnyPath/tests/', 'tests', 'tests',
         r'https://storage_account.blob.core.windows.net/container/AnyPath/'),
        (r'https://storage_account.blob.core.windows.net/container/AnyPath/tests/a.txt', 'a', 'a.txt',
         r'https://storage_account.blob.core.windows.net/container/AnyPath/tests/'),
        (r'/tmp/AnyPath/tests/', 'tests', 'tests',
         r'/tmp/AnyPath'),
        (r'/tmp/AnyPath/tests/a.txt', 'a', 'a.txt',
         r'/tmp/AnyPath/tests'),
    ]
    for raw, expected_stem, expected_name, expected_parent in cases:
        path = AnyPath(raw)
        assert path.stem == expected_stem
        assert path.name == expected_name
        assert path.parent.base_path == AnyPath(expected_parent).base_path
def test_init_anypath_from_anypath(tmpdir):
    """AnyPath built from a str, a Path, or another AnyPath resolves identically."""
    as_str = str(Path(tmpdir.dirname))
    baseline = AnyPath(as_str)
    assert AnyPath(baseline).base_path == baseline.base_path
    assert AnyPath(Path(as_str)).base_path == baseline.base_path


@pytest.mark.usefixtures("temp_dir_with_files", "temp_local_dir", "clean_remote_dir")
@pytest.mark.parametrize("target_type", [str, Path, AnyPath])
@pytest.mark.parametrize("path_type", [PathType.azure, PathType.s3])
def test_copy_targets(path_type: PathType, target_type, temp_dir_with_files, temp_local_dir, clean_remote_dir):
    """copy() accepts str, Path and AnyPath targets interchangeably."""
    handler = PATH_TYPE_TO_HANDLER[path_type]
    src_dir, _ = temp_dir_with_files
    target = target_type(temp_local_dir)
    remote_dir = clean_remote_dir
    handler.upload_directory(local_dir=src_dir, target_url=remote_dir, verbose=False)
    downloaded = AnyPath(remote_dir).copy(target=target, force_overwrite=True)
    remote_names = sorted(f.name for f in AnyPath(remote_dir).rglob('*'))
    assert remote_names == sorted(f.name for f in downloaded.rglob('*'))
@pytest.mark.usefixtures("temp_dir_with_files", "temp_local_dir", "clean_remote_dir")
@pytest.mark.parametrize("path_type", [PathType.azure, PathType.s3])
def test_copy_from_local_to_cloud(path_type: PathType, temp_dir_with_files, temp_local_dir, clean_remote_dir):
    """Uploading a local directory puts every file under the remote prefix."""
    handler = PATH_TYPE_TO_HANDLER[path_type]
    src_dir, src_files = temp_dir_with_files
    remote_dir = clean_remote_dir
    AnyPath(src_dir).copy(target=AnyPath(remote_dir))
    uploaded = handler.rglob(remote_dir, pattern='*')
    handler.remove(remote_dir)
    # Compare by basename: remote entries are full URLs.
    assert sorted(url.split('/')[-1] for url in uploaded) == sorted(f.name for f in src_files)