├── .github
└── workflows
│ └── python-package.yml
├── CHANGELOG.md
├── LICENSE
├── README.md
├── anypathlib
├── __init__.py
├── anypath.py
├── cli.py
└── path_handlers
│ ├── __init__.py
│ ├── azure_handler.py
│ ├── base_path_handler.py
│ ├── local_handler.py
│ ├── path_types.py
│ └── s3_handler.py
├── docs
├── anypathlib_logo.png
└── wsc_logo.png
├── requirements.txt
├── setup.py
└── tests
├── __init__.py
├── fixtures_anypath.py
├── test_anypath_flows.py
├── test_cli.py
├── test_copy_file_to_dir.py
├── test_download_from_cloud.py
├── test_iterdir_glob_rglob.py
├── test_pathlib_properties.py
├── test_str_path_interoperability.py
├── test_upload_to_cloud.py
└── tests_urls.py
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python package

on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      # Run every matrix job to completion even if one Python version fails,
      # so a single-version regression is visible in one CI run.
      fail-fast: false
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11"]

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install flake8 pytest
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Version 0.2.0
2 | - Type `AnyPathLikeType` was added, which can be used to init an `AnyPath` instance or in `copy` target
3 | - `listdir` is now deprecated, replaced by `iterdir`, `rglob`, and `glob`
4 | - `copy` now supports the case where the source is a file and target is a directory
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [2024] [WSC Sports Technologies Ltd.]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |

2 |
3 |
4 |
5 |
6 | [](https://wsc-sports.com/)
7 |
8 |
9 |
10 | # AnyPathLib - Crossing Cloud Borders With a Simple API
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | Welcome to AnyPathLib, a Python library designed to allow hassle-free file operations across different cloud and local
21 | storage
22 |
23 | ## Why `AnyPathLib`?
24 |
25 | With `AnyPathLib` you can write the same code to handle files across different storage systems, without worrying about
26 | the
27 | underlying details.
28 | Operations can be optimized per-backend and the library is easily extendable to support additional cloud storage
29 | providers.
30 |
31 | ## Getting Started with `AnyPathLib`: 3 easy examples ️🛣️
32 |
33 | ### ️🛣️ 1/3 Copying a file or directory from anywhere to anywhere ️🛣️
34 |
35 | ```python
36 | from anypathlib import AnyPath
37 |
38 | # Create an AnyPath instance for a local file
39 | local_file = AnyPath("/path/to/local/file.txt")
40 |
41 | # Create an AnyPath instance for an S3 object
42 | s3_file = AnyPath("s3://bucket/path/to/object.txt")
43 |
44 | # Copy a file from local to S3
45 | local_file.copy(s3_file)
46 |
47 | # Copy a directory from S3 to Azure
48 | s3_dir = AnyPath("s3://bucket/path/to/dir")
49 | azure_dir = AnyPath("https://account_name.blob.core.windows.net/container_name/path")
50 | s3_dir.copy(azure_dir)
51 | ```
52 |
53 | ### ️🛣️ 2/3 Local caching for quicker access ️🛣️
54 |
55 | Use "copy" without a target to get a local copy of the file which is stored in a local cache.
56 | Use `force_overwrite=False` to prevent repeated downloads of the same file
57 |
58 | ```python
59 | my_dir = AnyPath("https://account_name.blob.core.windows.net/container_name/path/to/dir")
60 | local_dir_path = my_dir.copy()
61 |
62 | my_file = AnyPath("s3://bucket/path/to/file.txt")
63 | local_file_path = my_file.copy()
64 | local_file_path = my_file.copy(force_overwrite=False) # Returns the path of the previously downloaded file
65 | ```
66 |
67 | ### 🛣️ 3/3 A simplified pathlib-like Interface 🛣️
68 |
69 | ```python
70 | my_dir = AnyPath("https://account_name.blob.core.windows.net/container_name/path/to/dir")
71 | my_dir.exists() # True if my_dir exists, otherwise False
72 | parent, name, stem = my_dir.parent, my_dir.name, my_dir.stem
73 | files_in_dir: List[AnyPath] = my_dir.rglob('*') # List of AnyPath instances for files in the directory
74 |
75 | my_file = AnyPath("s3://bucket/path/to/file.txt")
76 | my_file.is_file() # True if my_file is an existing file, otherwise False
77 | my_file.is_dir() # False
78 | my_file.remove()
79 | ```
80 |
81 | ### CLI Usage
82 |
83 | `AnyPathLib` also comes with a CLI tool that allows you to perform file operations from the command line.
84 | You can run `anypathlib --help` to get a list of available commands and options.
85 |
86 | Here are some examples:
87 |
88 | Copy:
89 | ```bash
90 | anypathlib copy -i /path/to/source -o /path/to/destination
91 | ```
92 |
93 | Remove a file or directory:
94 | ```bash
95 | anypathlib remove -p /path/to/file_or_directory
96 | ```
97 |
98 | ### Key Features
99 |
100 | * **Unified, Cloud Agnostic, API**: Perform file operations across different storage backends using the same set of
101 | methods.
102 | * **Path-like Operations**: Supports common path operations like joining paths, listing directories, checking file
103 | existence, etc.
104 | * **Performance**: Local caching for repeated downloads across different sessions, multithreading, and more.
105 | * **Extensibility**: Easily extendable to support additional cloud storage providers.
106 |
107 | ### Security and Credentials
108 |
109 | `AnyPath` does not store any credentials in it. In order to access cloud storage, you need to have the necessary
110 | environment variables defined.
111 |
112 | #### Azure
113 |
114 | ```bash
115 | export AZURE_SUBSCRIPTION_ID="your-subscription-id"
116 | export AZURE_RESOURCE_GROUP_NAME="your-resource-group-name"
117 | ```
118 |
119 | #### AWS S3
120 |
121 | Same as Boto3:
122 |
123 | ```bash
124 | export AWS_DEFAULT_REGION="your-region"
125 | export AWS_SECRET_ACCESS_KEY="your-secret"
126 | export AWS_ACCESS_KEY_ID="your-key"
127 | ```
128 |
129 | # TODOs:
130 |
131 | - [ ] Add support for additional cloud storage providers.
132 |
133 | > GCP
134 |
135 | - [ ] Improve API
136 |
137 | > Add __open__ method for reading files, etc.
138 |
139 | - [ ] Implement cloud-to-cloud ops more efficiently.
140 |
141 | > cache azure credentials to avoid repeated logins
142 |
143 | - [ ] Improve logging and add verbose mode.
144 |
145 | > progress bar, etc.
146 |
147 | ## Contributors ✨
148 |
149 | Thanks goes to these wonderful people:
150 |
151 |
152 |
153 |
154 |
162 |
--------------------------------------------------------------------------------
/anypathlib/__init__.py:
--------------------------------------------------------------------------------
"""AnyPathLib: a unified, pathlib-like API over local, S3 and Azure storage."""

__version__ = "0.2.0"

from anypathlib.anypath import AnyPath
from anypathlib.path_handlers.path_types import PathType

# Declare the package's public API explicitly.
__all__ = ['AnyPath', 'PathType']
5 |
--------------------------------------------------------------------------------
/anypathlib/anypath.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import tempfile
3 | from pathlib import Path, PurePath
4 | from typing import Union, Optional, List, Dict, NewType
5 | from urllib.parse import urlparse
6 |
7 | from anypathlib.path_handlers.azure_handler import AzureHandler
8 | from anypathlib.path_handlers.base_path_handler import BasePathHandler
9 | from anypathlib.path_handlers.local_handler import LocalPathHandler
10 | from anypathlib.path_handlers.path_types import PathType
11 | from anypathlib.path_handlers.s3_handler import S3Handler
12 |
# Anything accepted wherever a path is expected: a raw string, a pathlib path,
# or an existing AnyPath. A plain Union alias (rather than typing.NewType) is
# the correct construct here: NewType declares a *distinct* nominal type whose
# values must be produced by calling it, while callers of this library pass
# plain str/Path/AnyPath values directly.
AnyPathLikeType = Union[str, Path, 'AnyPath']
14 |
15 |
class AnyPath:
    """A pathlib-like wrapper offering one API over local, S3 and Azure paths.

    All storage-specific behaviour is delegated to the handler class selected
    from ``PATH_HANDLERS`` according to the detected ``PathType`` of the input.
    """

    # Backend dispatch table: PathType -> handler class implementing it.
    PATH_HANDLERS: Dict[PathType, BasePathHandler] = {PathType.local: LocalPathHandler,
                                                      PathType.s3: S3Handler,
                                                      PathType.azure: AzureHandler}
    # Root of the local cache used by copy() when no explicit target is given.
    LOCAL_CACHE_PATH = Path(tempfile.gettempdir()) / 'AnyPath'

    def __init__(self, base_path: AnyPathLikeType):
        """Wrap ``base_path`` (str, pathlib path or AnyPath) and select its handler.

        Raises:
            ValueError: if ``base_path`` is of an unsupported type.
        """
        if type(base_path) is str:
            self._base_path = base_path
        elif issubclass(type(base_path), PurePath):
            # Normalise pathlib inputs to an absolute POSIX-style string.
            self._base_path = base_path.absolute().as_posix()
        elif type(base_path) is AnyPath:
            self._base_path = base_path.base_path
        else:
            raise ValueError(f'base_path must be of type str, Path or AnyPath, got {type(base_path)}')
        self.path_type = self.get_path_type(self._base_path)
        self.path_handler = self.PATH_HANDLERS[self.path_type]

    @staticmethod
    def get_path_type(url: str) -> PathType:
        """Classify ``url`` as azure, s3 or local based on its scheme and host.

        NOTE(review): an http(s) URL whose host matches neither the Azure nor
        the S3 patterns falls through every branch and implicitly returns
        None, which would raise a KeyError in __init__ — confirm intent.
        """
        parsed_url = urlparse(url)
        if parsed_url.scheme in ['http', 'https']:
            if 'blob.core.windows.net' in parsed_url.netloc:
                return PathType.azure
            # NOTE(review): a bare 's3' substring in the host is a loose heuristic.
            elif 'amazonaws.com' in parsed_url.netloc or 's3' in parsed_url.netloc:
                return PathType.s3
        elif parsed_url.scheme in ['s3']:
            return PathType.s3
        elif parsed_url.scheme in ['file', '']:
            return PathType.local
        else:
            # Assume local
            return PathType.local

    def __repr__(self):
        return self.base_path

    @property
    def is_s3(self) -> bool:
        """True when the wrapped path lives on AWS S3."""
        return self.path_type == PathType.s3

    @property
    def is_local(self) -> bool:
        """True when the wrapped path lives on the local filesystem."""
        return self.path_type == PathType.local

    @property
    def is_azure(self) -> bool:
        """True when the wrapped path lives on Azure blob storage."""
        return self.path_type == PathType.azure

    # define truediv to allow for concatenation
    def __truediv__(self, other: str) -> 'AnyPath':
        """Join ``other`` onto this path, mirroring pathlib's ``/`` operator."""
        if self.is_local:
            return AnyPath(Path(self.base_path) / other)
        else:
            # Join URLs textually, avoiding a doubled '/' at the seam.
            valid_other = other[1:] if other.startswith('/') else other
            valid_base = self.base_path if self.base_path.endswith('/') else self.base_path + '/'

            return AnyPath(f'{valid_base}{valid_other}')

    @property
    def base_path(self) -> str:
        """The normalised string form of the wrapped path."""
        if self.path_type == PathType.s3:
            base_path = self._base_path
            # Collapsing '//' also mangles the 's3://' prefix; the next two
            # lines restore it, and any trailing slash is then dropped.
            base_path = base_path.replace('//', '/')
            if base_path.startswith('s3:/') and not base_path.startswith('s3://'):
                base_path = base_path.replace('s3:/', 's3://')
            if base_path[-1] == '/':
                base_path = base_path[:-1]
        elif self.path_type == PathType.local:
            base_path = Path(self._base_path).as_posix()
        else:
            base_path = self._base_path
        return base_path

    def is_dir(self) -> bool:
        """True if the path is a directory (delegated to the backend handler)."""
        return self.path_handler.is_dir(self.base_path)

    def is_file(self) -> bool:
        """True if the path is a file (delegated to the backend handler)."""
        return self.path_handler.is_file(self.base_path)

    def exists(self) -> bool:
        """True if the path exists on its backend."""
        return self.path_handler.exists(self.base_path)

    def remove(self):
        """Delete the path on its backend."""
        self.path_handler.remove(self.base_path)

    @property
    def parent(self) -> 'AnyPath':
        """The containing directory, as an AnyPath."""
        return AnyPath(self.path_handler.parent(self.base_path))

    @property
    def stem(self) -> str:
        """The final path component without its suffix."""
        return self.path_handler.stem(self.base_path)

    @property
    def name(self) -> str:
        """The final path component."""
        return self.path_handler.name(self.base_path)

    def iterdir(self) -> List['AnyPath']:
        """List the direct children of this directory."""
        return [AnyPath(p) for p in self.path_handler.iterdir(self.base_path)]

    def glob(self, pattern: str) -> List['AnyPath']:
        """List children matching ``pattern`` (non-recursive)."""
        return [AnyPath(p) for p in self.path_handler.glob(self.base_path, pattern)]

    def rglob(self, pattern: str) -> List['AnyPath']:
        """List descendants matching ``pattern`` recursively."""
        return [AnyPath(p) for p in self.path_handler.rglob(self.base_path, pattern)]

    def __get_local_path(self, target_path: Optional[Path] = None, force_overwrite: bool = False,
                         verbose: bool = False) -> Optional[Path]:
        """Materialise this path on the local filesystem and return its location.

        With no ``target_path`` a fresh temporary location is used. Returns
        None when a remote directory download reports failure.
        """
        if target_path is None:
            if self.is_dir():
                valid_target_path = Path(tempfile.mkdtemp())
            else:
                # NOTE(review): tempfile.mktemp is deprecated/race-prone; kept as-is.
                valid_target_path = Path(tempfile.mktemp())
        else:
            if target_path.exists():
                # An existing target must be the same kind (file/dir) as the source.
                assert target_path.is_dir() == self.is_dir()
                assert target_path.is_file() == self.is_file()
            valid_target_path = target_path
        if self.path_type == PathType.local:
            # NOTE(review): this branch dereferences target_path directly, so local
            # sources appear to require an explicit target_path — confirm callers.
            if not target_path.exists() or force_overwrite:
                if self.is_dir():
                    shutil.copytree(self.base_path, valid_target_path, dirs_exist_ok=True)
                else:
                    Path(valid_target_path).parent.mkdir(exist_ok=True, parents=True)
                    shutil.copy(self.base_path, valid_target_path)
            return valid_target_path
        else:
            if self.is_dir():
                result = self.path_handler.download_directory(url=self.base_path,
                                                              force_overwrite=force_overwrite,
                                                              target_dir=valid_target_path,
                                                              verbose=verbose)
                if result is not None:
                    local_path, _ = result
                else:
                    return None

            else:
                local_path = self.path_handler.download_file(url=self.base_path, force_overwrite=force_overwrite,
                                                             target_path=valid_target_path)

            assert local_path == valid_target_path, \
                f'local_path {local_path} is not equal to valid_target_path {valid_target_path}'
            return Path(local_path)

    def __get_local_cache_path(self) -> 'AnyPath':
        """The deterministic per-backend cache location for this path.

        Creates the cache directory (or its parent, for files) as a side effect.
        """
        handler_prefix = 's3' if self.is_s3 else 'azure' if self.is_azure else 'local'
        local_cache_path = self.LOCAL_CACHE_PATH / handler_prefix / self.path_handler.relative_path(self.base_path)
        if self.is_dir():
            local_cache_path.mkdir(exist_ok=True, parents=True)
        elif self.is_file():
            local_cache_path.parent.mkdir(exist_ok=True, parents=True)
        return AnyPath(local_cache_path)

    def copy(self, target: Optional[AnyPathLikeType] = None, force_overwrite: bool = True,
             verbose: bool = False) -> 'AnyPath':
        """Copy this path to ``target`` (or into the local cache when None).

        Same-backend cloud copies use the backend's native copy; cross-backend
        copies download to a local path first and then upload. Returns the
        effective target as an AnyPath.
        """
        assert self.exists(), f'source path: {self.base_path} does not exist'
        if target is None:
            valid_target = self.__get_local_cache_path()
        else:
            input_target = AnyPath(target)
            # if source is a file and target is either an existing dir copy the file to the target dir
            if self.is_file() and input_target.is_dir():
                valid_target = input_target / self.name
            else:
                valid_target = input_target
        if valid_target.is_local:
            self.__get_local_path(target_path=Path(valid_target.base_path), force_overwrite=force_overwrite,
                                  verbose=verbose)
        else:
            if valid_target.is_s3 and self.is_s3:
                S3Handler.copy(source_url=self.base_path, target_url=valid_target.base_path)
            elif valid_target.is_azure and self.is_azure:
                AzureHandler.copy(source_url=self.base_path, target_url=valid_target.base_path)
            else:
                # valid_target and source are different,
                # so we need to download the source and upload it to the valid_target

                local_path = Path(self.base_path) if self.is_local else self.__get_local_path(
                    force_overwrite=force_overwrite, verbose=verbose)
                target_path_handler = valid_target.path_handler
                if self.is_dir():
                    target_path_handler.upload_directory(local_dir=local_path, target_url=valid_target.base_path,
                                                         verbose=verbose)
                else:
                    target_path_handler.upload_file(local_path=str(local_path), target_url=valid_target.base_path)
        return valid_target
204 |
--------------------------------------------------------------------------------
/anypathlib/cli.py:
--------------------------------------------------------------------------------
1 | import click
2 | from anypathlib import AnyPath
3 |
4 |
@click.group()
def cli():
    """Entry point for the anypathlib command-line interface."""
    pass
8 |
9 |
@click.command()
@click.option('-i', '--input', 'input_path', required=True, type=click.STRING, help='Input path to copy from')
@click.option('-o', '--output', 'output_path', type=click.STRING, help='Output path to copy to')
@click.option('-v', '--verbose', is_flag=True, default=False, help='Verbose flag')
@click.option('-f', '--force/--no-force', is_flag=True, default=True, help='Force overwrite flag')
def copy(input_path, output_path, verbose, force):
    """Copy files from input to output path. """
    # With no output path, copy() falls back to the local cache location.
    source = AnyPath(input_path)
    destination = AnyPath(output_path) if output_path else None
    target_path = source.copy(target=destination, verbose=verbose, force_overwrite=force)
    click.echo(f'Copied Successfully to {target_path}')
20 |
21 |
@click.command()
@click.option('-p', '--path', required=True, type=click.STRING, help='Path to check')
def exists(path):
    """Check if the path exists. """
    # Echo the boolean result so it can be consumed by shell scripts.
    result = AnyPath(path).exists()
    click.echo(result)
27 |
28 |
@click.command()
@click.option('-p', 'path', required=True, type=click.STRING, help='Path to list')
def iterdir(path):
    """List the directory. """
    # Print the directory's direct children.
    entries = AnyPath(path).iterdir()
    click.echo(entries)
34 |
35 |
@click.command()
@click.option('-p', 'path', required=True, type=click.STRING, help='Path to remove')
def remove(path):
    """Remove the path. """
    # Deletion is delegated to the backend-specific handler.
    target = AnyPath(path)
    target.remove()
41 |
42 |
# Register all subcommands on the top-level CLI group.
cli.add_command(copy)
cli.add_command(exists)
cli.add_command(iterdir)
cli.add_command(remove)

if __name__ == '__main__':
    cli()
50 |
--------------------------------------------------------------------------------
/anypathlib/path_handlers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kfirgoldberg/AnyPathLib/af04c5ef9cd17b57234feb8d511b6fb874caeeb0/anypathlib/path_handlers/__init__.py
--------------------------------------------------------------------------------
/anypathlib/path_handlers/azure_handler.py:
--------------------------------------------------------------------------------
1 | import fnmatch
2 | import os
3 | from concurrent.futures import ThreadPoolExecutor
4 | from dataclasses import dataclass, field
5 | from pathlib import Path
6 | from typing import Optional, List, Tuple
7 | from urllib.parse import urlparse
8 |
9 | from tqdm import tqdm
10 | from azure.core.exceptions import ResourceNotFoundError
11 |
12 | from azure.identity import DefaultAzureCredential
13 | from azure.mgmt.storage import StorageManagementClient
14 | from azure.storage.blob import BlobServiceClient, ContainerClient
15 |
16 | from loguru import logger
17 |
18 | from anypathlib.path_handlers.base_path_handler import BasePathHandler
19 |
20 |
@dataclass
class AzureStoragePath:
    """Location of a blob: storage account + container + blob name.

    The connection string is resolved on construction when not supplied, and
    the Azure SDK clients are created lazily on first use and then cached.
    """
    storage_account: str
    container_name: str
    blob_name: str
    connection_string: Optional[str] = None
    # Lazily-built SDK clients; the field defaults (None) stand in until the
    # corresponding property is first accessed.
    _blob_service_client: Optional[BlobServiceClient] = field(init=False, default=None)
    _container_client: Optional[ContainerClient] = field(init=False, default=None)

    def __post_init__(self):
        # Resolve the connection string via the management API when not given.
        # (No need to reset the client fields here: their dataclass defaults
        # already make them None until first use.)
        if self.connection_string is None:
            self.connection_string = AzureHandler.get_connection_string(self.storage_account)

    @property
    def http_url(self) -> str:
        """HTTPS URL of the blob in the canonical Azure form."""
        return f'https://{self.storage_account}.{AzureHandler.AZURE_URL_SUFFIX}/{self.container_name}/{self.blob_name}'

    @property
    def blob_service_client(self) -> BlobServiceClient:
        """Service-level client, created on first access and cached."""
        if self._blob_service_client is None:
            self._blob_service_client = BlobServiceClient.from_connection_string(self.connection_string)
        return self._blob_service_client

    @property
    def container_client(self) -> ContainerClient:
        """Container-level client, created on first access and cached.

        (This is an instance property; the parameter was previously named
        ``cls``, which misleadingly suggested a classmethod.)
        """
        if self._container_client is None:
            self._container_client = self.blob_service_client.get_container_client(self.container_name)
        return self._container_client
52 |
53 |
class AzureHandler(BasePathHandler):
    """BasePathHandler implementation for Azure Blob Storage https URLs.

    URLs have the form ``https://<account>.blob.core.windows.net/<container>/<blob>``.
    Account keys are resolved through DefaultAzureCredential + the management API.
    """
    # Read once at import time; refresh_credentials() re-reads them if still unset.
    DEFAULT_SUBSCRIPTION_ID = os.environ.get('AZURE_SUBSCRIPTION_ID', None)

    DEFAULT_GROUP_NAME = os.environ.get('AZURE_RESOURCE_GROUP_NAME', None)
    AZURE_URL_SUFFIX = r'blob.core.windows.net'

    @classmethod
    def refresh_credentials(cls):
        """Re-read subscription / resource-group ids from the environment if they were unset."""
        if cls.DEFAULT_SUBSCRIPTION_ID is None:
            cls.DEFAULT_SUBSCRIPTION_ID = os.environ.get('AZURE_SUBSCRIPTION_ID', None)
        if cls.DEFAULT_GROUP_NAME is None:
            cls.DEFAULT_GROUP_NAME = os.environ.get('AZURE_RESOURCE_GROUP_NAME', None)

    @classmethod
    def relative_path(cls, url: str) -> str:
        """Return ``'<container>/<blob>'`` extracted from an https blob URL."""
        storage_path = cls.http_to_storage_params(url)
        return f'{storage_path.container_name}/{storage_path.blob_name}'

    @classmethod
    def is_dir(cls, url: str) -> bool:
        """A URL is treated as a directory when blobs exist under it but it is not itself a blob."""
        return cls.exists(url) and not cls.is_file(url)

    @classmethod
    def is_file(cls, url: str) -> bool:
        """Return True if ``url`` points at an existing blob that is not a folder placeholder."""
        storage_path = cls.http_to_storage_params(url)
        container_client = storage_path.container_client
        blob_client = container_client.get_blob_client(storage_path.blob_name)

        try:
            blob_properties = blob_client.get_blob_properties()
            # If the blob exists and is not a directory placeholder, it's a file
            return not blob_properties.metadata.get('hdi_isfolder', False)
        except Exception:
            # NOTE(review): broad catch — auth/network failures are also reported as "not a file".
            return False  # If exception is raised, the blob does not exist or is not a file

    @classmethod
    def exists(cls, url: str) -> bool:
        """Return True if any blob exists whose name starts with this URL's blob prefix."""
        storage_path = cls.http_to_storage_params(url)
        container_client = storage_path.container_client
        # NOTE(review): this materializes the entire listing just to test emptiness;
        # any(...) over the iterator would short-circuit after the first blob.
        return len([p for p in container_client.list_blobs(name_starts_with=storage_path.blob_name)]) > 0

    @classmethod
    def get_connection_string(cls, storage_account: str, subscription_id: Optional[str] = None,
                              resource_group_name: Optional[str] = None) -> str:
        """Build an account-key connection string for ``storage_account``."""
        cls.refresh_credentials()
        account_key = cls.get_storage_account_key(storage_account_name=storage_account, subscription_id=subscription_id,
                                                  resource_group_name=resource_group_name)
        connection_string = (f"DefaultEndpointsProtocol=https;AccountName={storage_account};"
                             f"AccountKey={account_key};EndpointSuffix=core.windows.net")
        return connection_string

    @classmethod
    def http_to_storage_params(cls, url: str) -> AzureStoragePath:
        """Parse an https blob URL into an AzureStoragePath (account / container / blob)."""
        parsed_url = urlparse(url)
        account_name = parsed_url.netloc.split('.')[0]
        container_name, *blob_path_parts = parsed_url.path.lstrip('/').split('/')
        blob_path = '/'.join(blob_path_parts)

        azure_storage_path = AzureStoragePath(storage_account=account_name, container_name=container_name,
                                              blob_name=blob_path)
        return azure_storage_path

    @classmethod
    def get_storage_account_key(cls,
                                storage_account_name: str,
                                subscription_id: Optional[str] = None,
                                resource_group_name: Optional[str] = None,
                                ) -> str:
        """
        Retrieves the access key for a storage account in Azure.

        Args:
            storage_account_name (str): The name of the storage account.
            subscription_id (str, optional): The subscription ID of the Azure account.
                Defaults to the AZURE_SUBSCRIPTION_ID environment variable.
            resource_group_name (str, optional): The name of the resource group containing the
                storage account. Defaults to the AZURE_RESOURCE_GROUP_NAME environment variable.

        Returns:
            str: The access key for the storage account.

        Raises:
            ValueError: If no subscription id / resource group can be resolved, or the
                account has no keys.
        """
        try:
            if subscription_id is None:
                subscription_id = cls.DEFAULT_SUBSCRIPTION_ID
                if subscription_id is None:
                    raise ValueError(
                        """
                        No subscription ID was provided.
                        Set the AZURE_SUBSCRIPTION_ID environment variable, or pass it as an argument.
                        """
                    )
            if resource_group_name is None:
                resource_group_name = cls.DEFAULT_GROUP_NAME
                if resource_group_name is None:
                    raise ValueError(
                        """
                        No resource group name was provided.
                        Set the AZURE_RESOURCE_GROUP_NAME environment variable, or pass it as an argument.
                        """
                    )
            client = StorageManagementClient(credential=DefaultAzureCredential(), subscription_id=subscription_id)
            response = client.storage_accounts.list_keys(resource_group_name=resource_group_name,
                                                         account_name=storage_account_name, )
            if not response.keys:
                raise ValueError(
                    """
                    No keys were found for the storage account.
                    Ask the MLOps guys for the access key, or try and get it from the Azure portal
                    """
                )
            return response.keys[0].value  # Returns the first key of the storage account
        except Exception as e:
            logger.exception(e)
            logger.exception(
                """
                There was an error fetching the storage account key.
                Make sure you are connected to VPN, and config is correct.
                If it still fails, get it from the Azure portal,
                or ask the MLOps guys for the access key.
                """
            )
            raise e

    @classmethod
    def download_file(cls, url: str, target_path: Path, force_overwrite: bool = True) -> Path:
        """Download a single blob to ``target_path``; skip when it exists and force_overwrite is False."""
        if target_path.exists() and not force_overwrite:
            return target_path
        azure_storage_path = cls.http_to_storage_params(url)
        # Construct the Blob Service Client
        # NOTE(review): no credential is passed here, so this download relies on
        # anonymous/public access to the account — confirm this is intended.
        blob_service_client = BlobServiceClient(
            account_url=f"https://{azure_storage_path.storage_account}.{cls.AZURE_URL_SUFFIX}")

        # Get a client to interact with the specified container and blob
        blob_client = blob_service_client.get_blob_client(container=azure_storage_path.container_name,
                                                          blob=azure_storage_path.blob_name)

        # Ensure the directory exists
        target_path.parent.mkdir(parents=True, exist_ok=True)

        # Download the blob to a local file
        # (the context variable shadows the method name `download_file` inside this scope)
        with open(target_path, "wb") as download_file:
            download_file.write(blob_client.download_blob().readall())

        return target_path

    @classmethod
    def upload_file(cls, local_path: str, target_url: str):
        """Upload a single file to Azure Blob Storage."""
        azure_storage_path = cls.http_to_storage_params(target_url)
        blob_service_client = azure_storage_path.blob_service_client
        container_client = azure_storage_path.container_client
        # Check if the container exists and create if it does not
        try:
            container_client.get_container_properties()
        except Exception as e:
            # Assuming exception means container does not exist. Create new container
            # NOTE(review): broad catch — a transient/auth error would also trigger creation.
            container_client.create_container()

        # Now, upload the file
        blob_client = blob_service_client.get_blob_client(container=azure_storage_path.container_name,
                                                          blob=azure_storage_path.blob_name)
        with open(local_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

    @classmethod
    def remove_directory(cls, url: str):
        """Remove a directory (all blobs with the same prefix) from Azure Blob Storage."""
        azure_storage_path = cls.http_to_storage_params(url)
        container_client = azure_storage_path.container_client
        for blob in container_client.list_blobs(name_starts_with=azure_storage_path.blob_name):
            container_client.delete_blob(blob.name)

    @classmethod
    def remove(cls, url: str, allow_missing: bool = False):
        """Remove a single file/directory from Azure Blob Storage."""
        # URLs ending with '/' are treated as directories.
        # NOTE(review): allow_missing is not honored on the directory branch.
        if url.endswith('/'):
            cls.remove_directory(url)
        else:
            azure_storage_path = cls.http_to_storage_params(url)
            container_client = azure_storage_path.container_client
            try:
                container_client.delete_blob(azure_storage_path.blob_name)
            except ResourceNotFoundError as e:
                if not allow_missing:
                    raise e

    @classmethod
    def download_directory(cls, url: str, force_overwrite: bool, target_dir: Path, verbose: bool) -> \
            Optional[Tuple[Path, List[Path]]]:
        """Download a directory (all blobs with the same prefix) from Azure Blob Storage."""
        assert target_dir.is_dir()
        azure_storage_path = cls.http_to_storage_params(url)

        container_client = azure_storage_path.container_client
        local_paths = []

        if verbose:
            # NOTE(review): len(list(container_iterator)) consumes the pager, and tqdm then
            # iterates the already-exhausted iterator — verify that blobs are actually
            # downloaded in verbose mode (a fresh list_blobs() call may be required).
            container_iterator = container_client.list_blobs(name_starts_with=azure_storage_path.blob_name)
            progress_bar = tqdm(container_iterator, desc='Downloading directory',
                                total=len(list(container_iterator)))
        else:
            progress_bar = container_client.list_blobs(name_starts_with=azure_storage_path.blob_name)

        for blob in progress_bar:
            blob_url = AzureStoragePath(storage_account=azure_storage_path.storage_account,
                                        container_name=azure_storage_path.container_name, blob_name=blob.name,
                                        connection_string=azure_storage_path.connection_string).http_url
            local_target = target_dir / Path(blob_url).relative_to(Path(url))
            local_path = cls.download_file(url=blob_url, force_overwrite=force_overwrite, target_path=local_target)
            assert local_path is not None, f'could not download from {url}'
            local_paths.append(Path(local_path))
        if len(local_paths) == 0:
            return None
        return local_paths[0].parent, local_paths

    @classmethod
    def upload_directory(cls, local_dir: Path, target_url: str, verbose: bool):
        """Upload a directory to Azure Blob Storage."""
        azure_storage_path = cls.http_to_storage_params(target_url)
        # Check if the container exists and create if it does not
        container_client = azure_storage_path.container_client
        try:
            container_client.get_container_properties()
        except Exception as e:
            # Assuming exception means container does not exist. Create new container
            container_client.create_container()

        def upload_file_wrapper(local_path: str, blob_name: str):
            # Build the destination https URL for a single blob and delegate to upload_file.
            azure_url = rf'https://{azure_storage_path.storage_account}.{cls.AZURE_URL_SUFFIX}/{azure_storage_path.container_name}/{blob_name}'
            cls.upload_file(local_path=local_path, target_url=azure_url)

        # Collect all files to upload
        files_to_upload = []
        for file_path in local_dir.rglob('*'):
            if not file_path.is_file():
                continue
            # NOTE(review): os.path.join uses the OS separator — on Windows this would put
            # backslashes into blob names; confirm posix-style joining is not required.
            blob_name = os.path.join(azure_storage_path.blob_name, file_path.relative_to(local_dir))
            files_to_upload.append((file_path, blob_name))

        # Upload files in parallel
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(upload_file_wrapper, str(local_path), blob_name) for local_path, blob_name in
                       files_to_upload]
            if verbose:
                with tqdm(total=len(files_to_upload), desc='Uploading directory') as pbar:
                    for future in futures:
                        future.result()  # Wait for each upload to complete
                        pbar.update(1)
            else:
                for future in futures:
                    future.result()  # Wait for each upload to complete

    @classmethod
    def copy(cls, source_url: str, target_url: str):
        """Server-side copy of every blob under ``source_url`` to ``target_url``."""
        source_storage_path = cls.http_to_storage_params(source_url)
        target_storage_path = cls.http_to_storage_params(target_url)

        target_blob_service_client = target_storage_path.blob_service_client
        source_container_client = source_storage_path.container_client

        blobs_to_rename = source_container_client.list_blobs(name_starts_with=source_storage_path.blob_name)

        def copy_blob(blob):
            source_blob_url = AzureStoragePath(storage_account=source_storage_path.storage_account,
                                               container_name=source_storage_path.container_name, blob_name=blob.name,
                                               connection_string=source_storage_path.connection_string).http_url
            target_blob_name = blob.name.replace(source_storage_path.blob_name, target_storage_path.blob_name, 1)

            # Copy to new location
            # NOTE(review): start_copy_from_url only *initiates* a server-side copy;
            # completion of the copy itself is not polled here.
            target_blob = target_blob_service_client.get_blob_client(container=target_storage_path.container_name,
                                                                     blob=target_blob_name)
            target_blob.start_copy_from_url(source_blob_url)

        # Execute copy operations in parallel (nothing is deleted here — this is a pure copy)
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(copy_blob, blob) for blob in blobs_to_rename]
            for future in futures:
                future.result()  # Wait for each operation to complete

    @classmethod
    def parent(cls, url: str) -> str:
        """Return the parent "directory" URL (always with a trailing slash)."""
        parsed_url = urlparse(url)
        account_name = parsed_url.netloc.split('.')[0]
        container_name, *blob_path_parts = parsed_url.path.lstrip('/').split('/')
        # Drop the trailing empty component produced by URLs that end with '/'.
        # NOTE(review): container-root URLs (no blob path) would raise IndexError here.
        if blob_path_parts[-1] == "":
            blob_path_parts = blob_path_parts[:-1]
        blob_path = '/'.join(blob_path_parts[:-1])
        parent_url = f'https://{account_name}.{cls.AZURE_URL_SUFFIX}/{container_name}/{blob_path}/'
        return parent_url

    @classmethod
    def name(cls, url: str) -> str:
        """Return the final blob-path component of the URL."""
        parsed_url = urlparse(url)
        container_name, *blob_path_parts = parsed_url.path.lstrip('/').split('/')
        if blob_path_parts[-1] == "":
            blob_path_parts = blob_path_parts[:-1]
        blob_name = blob_path_parts[-1]
        return blob_name

    @classmethod
    def stem(cls, url: str) -> str:
        """Return the final blob-path component without its suffix."""
        parsed_url = urlparse(url)
        container_name, *blob_path_parts = parsed_url.path.lstrip('/').split('/')
        if blob_path_parts[-1] == "":
            blob_path_parts = blob_path_parts[:-1]
        blob_name = blob_path_parts[-1]
        return Path(blob_name).stem

    @classmethod
    def iterdir(cls, url: str) -> List[str]:
        """List the immediate children of the given directory URL."""
        return cls.glob(url, pattern='*')

    @classmethod
    def glob(cls, url: str, pattern: str) -> List[str]:
        """Non-recursive match: walk one hierarchy level (delimiter '/') and fnmatch against pattern."""
        storage_path = cls.http_to_storage_params(url)
        container_client = storage_path.container_client
        blob_names = [blob.name for blob in
                      container_client.walk_blobs(name_starts_with=storage_path.blob_name, delimiter='/')]
        all_blobs = [
            f"https://{storage_path.storage_account}.{cls.AZURE_URL_SUFFIX}/{storage_path.container_name}/{blob}" for
            blob in blob_names]
        matched_blobs = [blob for blob in all_blobs if fnmatch.fnmatch(blob, pattern)]
        return matched_blobs

    @classmethod
    def rglob(cls, url: str, pattern: str) -> List[str]:
        """Recursive match: list every blob under the prefix, fnmatch, and add matching sub-"directories"."""
        storage_path = cls.http_to_storage_params(url)
        container_client = storage_path.container_client
        blobs = [blob for blob in container_client.list_blob_names(name_starts_with=storage_path.blob_name)]
        all_blobs = [
            f"https://{storage_path.storage_account}.{cls.AZURE_URL_SUFFIX}/{storage_path.container_name}/{blob}" for
            blob in blobs]
        matched_blobs = [blob for blob in all_blobs if fnmatch.fnmatch(blob, pattern)]
        # Directory URLs are synthesized from the parents of the matched blobs.
        all_dirs = list(set([cls.parent(url) for url in matched_blobs]))
        dirs_under_url = [dir.rstrip('/') for dir in all_dirs if dir.startswith(url) and dir != url]
        return matched_blobs + dirs_under_url
388 |
--------------------------------------------------------------------------------
/anypathlib/path_handlers/base_path_handler.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod, ABC
2 | from pathlib import Path
3 | from typing import List, Optional, Tuple
4 |
5 |
class BasePathHandler(ABC):
    """Abstract interface implemented by every storage backend (local / S3 / Azure).

    All operations are classmethods that receive backend-specific URL/path strings.
    """
    @classmethod
    @abstractmethod
    def download_file(cls, url: str, target_path: Path, force_overwrite: bool = True) -> Path:
        """Download the file at ``url`` to ``target_path`` and return the local path."""
        pass


    @classmethod
    @abstractmethod
    def remove(cls, url: str):
        """Delete the file or directory at ``url``."""
        pass

    @classmethod
    @abstractmethod
    def download_directory(cls, url: str, force_overwrite: bool, target_dir: Path,
                           verbose: bool) -> Optional[Tuple[Path, List[Path]]]:
        """Download a whole directory into ``target_dir``; return (base dir, downloaded paths) or None."""
        pass

    @classmethod
    @abstractmethod
    def upload_file(cls, local_path: str, target_url: str):
        """Upload a single local file to ``target_url``."""
        pass

    @classmethod
    @abstractmethod
    def upload_directory(cls, local_dir: Path, target_url: str, verbose: bool):
        """Upload a whole local directory to ``target_url``."""
        pass

    @classmethod
    @abstractmethod
    def copy(cls, source_url: str, target_url: str):
        """Copy a file or directory between two URLs of the same backend."""
        pass

    @classmethod
    @abstractmethod
    def is_dir(cls, url: str) -> bool:
        """Return True if ``url`` refers to a directory."""
        pass

    @classmethod
    @abstractmethod
    def is_file(cls, url: str) -> bool:
        """Return True if ``url`` refers to a file."""
        pass

    @classmethod
    @abstractmethod
    def exists(cls, url: str) -> bool:
        """Return True if ``url`` exists (as either a file or a directory)."""
        pass

    @classmethod
    @abstractmethod
    def relative_path(cls, url: str) -> str:
        """Return the path portion of ``url`` relative to its root (bucket/container/anchor)."""
        pass

    @classmethod
    @abstractmethod
    def parent(cls, url: str) -> str:
        """Return the URL of the parent directory."""
        pass

    @classmethod
    @abstractmethod
    def name(cls, url: str) -> str:
        """Return the final path component of ``url``."""
        pass

    @classmethod
    @abstractmethod
    def stem(cls, url: str) -> str:
        """Return the final path component of ``url`` without its suffix."""
        pass

    @classmethod
    @abstractmethod
    def iterdir(cls, url: str) -> List[str]:
        """
        Lists all files and directories directly under the given directory
        """
        pass

    @classmethod
    @abstractmethod
    def glob(cls, url: str, pattern: str) -> List[str]:
        """
        Finds all the paths matching a specific pattern, which can include wildcards, but does not search recursively
        """
        pass

    @classmethod
    @abstractmethod
    def rglob(cls, url: str, pattern: str) -> List[str]:
        """
        Finds all the paths matching a specific pattern, including wildcards, and searches recursively in all subdirectories
        """
        pass
97 |
--------------------------------------------------------------------------------
/anypathlib/path_handlers/local_handler.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from pathlib import Path
3 | from typing import List, Optional, Tuple
4 |
5 | from anypathlib.path_handlers.base_path_handler import BasePathHandler
6 |
7 |
class LocalPathHandler(BasePathHandler):
    """BasePathHandler implementation for plain local filesystem paths."""

    @classmethod
    def is_dir(cls, url: str) -> bool:
        """Return True if ``url`` is an existing directory."""
        return Path(url).is_dir()

    @classmethod
    def is_file(cls, url: str) -> bool:
        """Return True if ``url`` is an existing regular file."""
        return Path(url).is_file()

    @classmethod
    def exists(cls, url: str) -> bool:
        """Return True if ``url`` exists as either a file or a directory."""
        return Path(url).exists()

    @classmethod
    def remove(cls, url: str):
        """Delete a file or a whole directory tree; a missing path is silently ignored."""
        local_path = Path(url)
        if local_path.is_file():
            local_path.unlink()
        elif local_path.is_dir():
            shutil.rmtree(local_path)

    @classmethod
    def upload_file(cls, local_path: str, target_url: str):
        """Local 'upload' == copy the file to the target path, overwriting it."""
        cls.copy_path(url=Path(local_path).absolute().as_posix(), target_path=Path(target_url), force_overwrite=True)

    @classmethod
    def upload_directory(cls, local_dir: Path, target_url: str, verbose: bool):
        """Local 'upload' == copy the directory tree to the target path, overwriting it."""
        cls.copy_path(url=local_dir.absolute().as_posix(), target_path=Path(target_url), force_overwrite=True)

    @classmethod
    def copy(cls, source_url: str, target_url: str):
        """Copy a file or directory from ``source_url`` to ``target_url``, overwriting the target."""
        cls.copy_path(url=source_url, target_path=Path(target_url), force_overwrite=True)

    @classmethod
    def copy_path(cls, url: str, target_path: Path, force_overwrite: bool = True) -> Path:
        """Copy the file/directory at ``url`` onto ``target_path`` and return ``target_path``.

        If the target exists it is returned untouched when ``force_overwrite`` is False,
        otherwise it is removed before the copy.

        BUGFIX: the original implementation declared ``-> Path`` but fell off the end
        without a return statement, so ``download_file`` (and every caller expecting
        the resulting local path) received None.
        """
        if target_path.exists():
            if not force_overwrite:
                return target_path
            cls.remove(url=target_path.as_posix())
        source = Path(url)
        if source.is_dir():
            shutil.copytree(source, target_path)
        else:
            # Ensure the destination directory exists, mirroring the behavior of the
            # cloud handlers' download_file implementations.
            target_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(source, target_path)
        return target_path

    @classmethod
    def download_directory(cls, url: str, force_overwrite: bool, target_dir: Path, verbose: bool) -> \
            Optional[Tuple[Path, List[Path]]]:
        """Copy the directory at ``url`` into ``target_dir`` and return it with all contained paths."""
        cls.copy_path(url=url, target_path=target_dir, force_overwrite=force_overwrite)
        return target_dir, [p for p in target_dir.rglob('*')]

    @classmethod
    def download_file(cls, url: str, target_path: Path, force_overwrite: bool = True) -> Path:
        """Copy the file at ``url`` to ``target_path`` and return the local path."""
        return cls.copy_path(url=url, target_path=target_path, force_overwrite=force_overwrite)

    @classmethod
    def relative_path(cls, url: str) -> str:
        """Return the path relative to its filesystem anchor (drive/root), POSIX-style."""
        return Path(url).relative_to(Path(url).anchor).as_posix()

    @classmethod
    def parent(cls, url: str) -> str:
        """Return the parent directory as a POSIX-style string."""
        return Path(url).parent.as_posix()

    @classmethod
    def stem(cls, url: str) -> str:
        """Return the final path component without its suffix."""
        return Path(url).stem

    @classmethod
    def name(cls, url: str) -> str:
        """Return the final path component."""
        return Path(url).name

    @classmethod
    def iterdir(cls, url: str) -> List[str]:
        """List the immediate children of the directory as strings."""
        return [str(p) for p in Path(url).iterdir()]

    @classmethod
    def glob(cls, url: str, pattern: str) -> List[str]:
        """Non-recursive glob under the directory; matches returned as strings."""
        return [str(p) for p in Path(url).glob(pattern)]

    @classmethod
    def rglob(cls, url: str, pattern: str) -> List[str]:
        """Recursive glob under the directory; matches returned as strings."""
        return [str(p) for p in Path(url).rglob(pattern)]
91 |
--------------------------------------------------------------------------------
/anypathlib/path_handlers/path_types.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 |
class PathType(Enum):
    """Identifies which storage backend a given path/URL belongs to."""
    local = 'local'
    s3 = 's3'
    azure = 'azure'
8 |
--------------------------------------------------------------------------------
/anypathlib/path_handlers/s3_handler.py:
--------------------------------------------------------------------------------
1 | import fnmatch
2 | import os
3 | from concurrent.futures import ThreadPoolExecutor, as_completed
4 | from pathlib import Path
5 | from typing import List, Tuple, Optional, ClassVar
6 | from urllib.parse import urlparse
7 |
8 | import boto3 as boto3
9 | import botocore
10 | from tqdm import tqdm
11 |
12 | from anypathlib.path_handlers.base_path_handler import BasePathHandler
13 |
14 |
class S3Handler(BasePathHandler):
    """BasePathHandler implementation for ``s3://bucket/key`` URLs, backed by boto3."""
    AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', None)
    MAX_POOL_CONNECTIONS = 50
    # Create a boto3 S3 client
    s3_client: ClassVar[boto3.client] = boto3.client('s3', config=botocore.config.Config(
        max_pool_connections=MAX_POOL_CONNECTIONS))

    @classmethod
    def refresh_credentials(cls):
        """Rebuild the client if AWS_ACCESS_KEY_ID was unset when the module was imported.

        NOTE(review): while the env var remains unset, every call re-creates the client —
        and this is invoked from get_bucket_and_key_from_uri, i.e. on every URL parse.
        """
        if cls.AWS_ACCESS_KEY_ID is None:
            cls.AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', None)
            cls.s3_client = boto3.client('s3',
                                         config=botocore.config.Config(max_pool_connections=cls.MAX_POOL_CONNECTIONS))

    @classmethod
    def relative_path(cls, url: str) -> str:
        """Return ``'<bucket>/<key>'`` for the given s3 URL."""
        bucket, key = cls.get_bucket_and_key_from_uri(url)
        return f'{bucket}/{key}'

    @classmethod
    def is_dir(cls, url: str) -> bool:
        """A URL is treated as a directory when keys exist under it but it is not itself an object."""
        return cls.exists(url) and not cls.is_file(url)

    @classmethod
    def is_file(cls, url: str) -> bool:
        """Return True if ``url`` names an existing S3 object (HEAD succeeds)."""
        bucket_name, object_key = cls.get_bucket_and_key_from_uri(url)
        try:
            cls.s3_client.head_object(Bucket=bucket_name, Key=object_key)
            return True  # If the head object doesn't raise an error, it's a file
        except (cls.s3_client.exceptions.NoSuchKey, cls.s3_client.exceptions.ClientError):
            return False  # If a NoSuchKey error is raised, it's not a file

    @classmethod
    def parent(cls, url: str) -> str:
        """Return the s3 URL of the key's parent "directory"."""
        bucket, key = cls.get_bucket_and_key_from_uri(url)
        return cls.get_full_path(bucket=bucket, key=Path(key).parent.as_posix())

    @classmethod
    def stem(cls, url: str) -> str:
        """Return the final key component without its suffix."""
        bucket, key = cls.get_bucket_and_key_from_uri(url)
        return Path(key).stem

    @classmethod
    def name(cls, url: str) -> str:
        """Return the final key component."""
        bucket, key = cls.get_bucket_and_key_from_uri(url)
        return Path(key).name

    @classmethod
    def exists(cls, url: str) -> bool:
        """Return True if an object or a "directory" prefix exists at the URL."""
        bucket, key = cls.get_bucket_and_key_from_uri(url)
        try:
            resp = cls.s3_client.list_objects(Bucket=bucket, Prefix=key, Delimiter='/', MaxKeys=1)
            return 'Contents' in resp or 'CommonPrefixes' in resp
        except cls.s3_client.exceptions.NoSuchKey:
            return False

    @classmethod
    def get_bucket_and_key_from_uri(cls, s3_uri: str) -> Tuple[str, str]:
        """Split an ``s3://bucket/key`` URI into (bucket, key).

        NOTE(review): also refreshes credentials as a side effect on every parse.
        """
        parsed_uri = urlparse(s3_uri)
        bucket = parsed_uri.netloc
        key = parsed_uri.path.lstrip('/')
        cls.refresh_credentials()
        return bucket, key

    @classmethod
    def get_full_path(cls, bucket: str, key: str) -> str:
        """Assemble an ``s3://bucket/key`` URL."""

        return f's3://{bucket}/{key}'

    @classmethod
    def download_file(cls, url: str, target_path: Path, force_overwrite: bool = True) -> Path:
        """Download one object to ``target_path``; skip when it exists and force_overwrite is False."""
        # Convert the local path to a Path object
        local_file_path = Path(target_path)
        if not force_overwrite and local_file_path.exists():
            return local_file_path
        # Parse the S3 URL
        bucket, key = cls.get_bucket_and_key_from_uri(url)

        # Ensure the local directory exists
        local_file_path.parent.mkdir(parents=True, exist_ok=True)
        # Download the file
        cls.s3_client.download_file(Bucket=bucket, Key=key, Filename=local_file_path.absolute().as_posix())
        return local_file_path

    @classmethod
    def remove(cls, url: str):
        """Delete every object under the URL's prefix (covers both files and directories)."""
        bucket, key = cls.get_bucket_and_key_from_uri(url)
        s3_resource = boto3.resource('s3')
        # NOTE(review): `bucket` is rebound from the bucket-name string to a Bucket resource.
        bucket = s3_resource.Bucket(bucket)
        bucket.objects.filter(Prefix=key).delete()

    @classmethod
    def download_directory(cls, url: str, force_overwrite: bool, target_dir: Path, verbose: bool) -> \
            Optional[Tuple[Path, List[Path]]]:
        """Download every object under the URL's prefix into ``target_dir`` in parallel.

        Returns:
            (target_dir, list of downloaded local paths).
        """
        s3_resource = boto3.resource('s3')

        bucket, source_key = cls.get_bucket_and_key_from_uri(url)
        bucket = s3_resource.Bucket(bucket)
        all_files = []

        # Prepare the list of s3_paths to download
        s3_paths: List[str] = [cls.get_full_path(bucket=bucket.name, key=obj.key) for obj in
                               bucket.objects.filter(Prefix=source_key)]
        # Skip the prefix "object" itself when it appears in the listing.
        s3_paths = [s3_path for s3_path in s3_paths if s3_path.rstrip('/') != url]

        def s3_path_to_local_file_path(s3_path: str, local_base_path: Path) -> Path:
            # Map an s3 path onto target_dir, preserving the layout relative to source_key.
            _, key = cls.get_bucket_and_key_from_uri(s3_path)
            local_file_relative_path = Path(key).relative_to(source_key)
            return local_base_path / local_file_relative_path

        # Download in parallel
        with ThreadPoolExecutor() as executor:
            future_to_s3_path = {executor.submit(cls.download_file,
                                                 url=s3_path,
                                                 target_path=s3_path_to_local_file_path(s3_path=s3_path,
                                                                                        local_base_path=target_dir),
                                                 force_overwrite=force_overwrite): s3_path for s3_path in s3_paths}

            def process_futures():
                # Generator so the same completion loop can drive an optional progress bar.
                for future in as_completed(future_to_s3_path):
                    s3_path = future_to_s3_path[future]
                    try:
                        local_path = future.result()
                        if local_path:
                            all_files.append(local_path)
                    except Exception as exc:
                        print(f'{s3_path} generated an exception: {exc}')

                    yield None

            if verbose:
                with tqdm(total=len(s3_paths), desc='Downloading directory') as pbar:
                    for _ in process_futures():
                        pbar.update(1)
            else:
                for _ in process_futures():
                    pass

        return target_dir, all_files

    @classmethod
    def upload_file(cls, local_path: str, target_url: str):
        """Upload a single local file to the given s3 URL."""
        bucket, key = cls.get_bucket_and_key_from_uri(target_url)
        cls.s3_client.upload_file(local_path, bucket, key)

    @classmethod
    def upload_directory(cls, local_dir: Path, target_url: str, verbose: bool = False):
        """Upload every file under ``local_dir`` to the target prefix."""
        bucket, key = cls.get_bucket_and_key_from_uri(target_url)

        # Pre-count files only when a progress bar is requested.
        total_files = 0
        if verbose:
            for root, dirs, files in os.walk(local_dir):
                total_files += len(files)

        if verbose:
            # NOTE(review): tqdm advances once per walked directory while iterating AND is
            # advanced by update(len(files)) below, so the bar over-counts vs. total_files.
            progress_bar = tqdm(os.walk(local_dir), desc='Uploading directory', total=total_files)
        else:
            progress_bar = os.walk(local_dir)

        for root, dirs, files in progress_bar:
            for file in files:
                local_path = Path(root) / file
                s3_key = f'{key.rstrip("/")}/{local_path.relative_to(local_dir).as_posix()}'
                cls.s3_client.upload_file(local_path, bucket, s3_key)

            if verbose:
                progress_bar.update(len(files))

    @classmethod
    def copy(cls, source_url: str, target_url: str):
        """Server-side copy of every object under ``source_url`` to ``target_url`` in parallel."""
        s3_resource = boto3.resource('s3')
        source_bucket_name, source_key = cls.get_bucket_and_key_from_uri(source_url)
        target_bucket_name, target_key = cls.get_bucket_and_key_from_uri(target_url)

        source_bucket = s3_resource.Bucket(source_bucket_name)
        objects = list(source_bucket.objects.filter(Prefix=source_key))

        def copy_and_delete(obj):
            # NOTE(review): despite its name, this only copies — nothing is deleted.
            new_key = obj.key.replace(source_key, target_key, 1)
            copy_source = {
                'Bucket': source_bucket_name,
                'Key': obj.key
            }
            # Copy object to the new location
            s3_resource.meta.client.copy(copy_source, target_bucket_name, new_key)

        # Use ThreadPoolExecutor to parallelize the operation
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(copy_and_delete, obj) for obj in objects]

            for future in as_completed(futures):
                try:
                    future.result()  # If needed, handle result or exceptions here
                except Exception as exc:
                    print(f'Operation generated an exception: {exc}')

    @classmethod
    def _get_bucket_objects(cls, url: str) -> List[str]:
        """Return the s3 URLs of every object under the URL's prefix."""
        bucket, key = cls.get_bucket_and_key_from_uri(url)
        s3_resource = boto3.resource('s3')
        bucket_obj = s3_resource.Bucket(bucket)
        return [cls.get_full_path(bucket=bucket, key=obj.key) for obj in bucket_obj.objects.filter(Prefix=key)]

    @classmethod
    def iterdir(cls, url: str) -> List[str]:
        """List the immediate children of the given directory URL."""
        return cls.glob(url, pattern='*')

    @classmethod
    def _get_dirs_under_url(cls, base_url: str, url_list: List[str]) -> List[str]:
        """Synthesize the "directory" URLs (parents of the given URLs) lying under base_url."""
        all_dirs = list(set([cls.parent(url) for url in url_list]))
        dirs_under_url = [dir.rstrip('/') for dir in all_dirs if dir.startswith(base_url) and dir != base_url]
        return dirs_under_url

    @classmethod
    def glob(cls, url: str, pattern: str) -> List[str]:
        """Non-recursive match of objects and sub-directories directly under ``url``."""
        objects = cls._get_bucket_objects(url)
        matched_objects = [obj for obj in objects if fnmatch.fnmatch(obj, pattern)]
        # return only top level matched objects
        top_level_objects = [obj for obj in matched_objects if obj.count('/') == url.rstrip('/').count('/') + 1]
        all_subdirs = cls._get_dirs_under_url(base_url=url, url_list=matched_objects)
        subdirs_in_top_level = [dir for dir in all_subdirs if dir.count('/') == url.rstrip('/').count('/') + 1]
        return top_level_objects + subdirs_in_top_level

    @classmethod
    def rglob(cls, url: str, pattern: str) -> List[str]:
        """
        Finds all the paths matching a specific pattern, including wildcards, and searches recursively in all subdirectories
        """
        objects = cls._get_bucket_objects(url)
        matched_objects = [obj for obj in objects if fnmatch.fnmatch(obj, pattern)]
        dirs = cls._get_dirs_under_url(base_url=url, url_list=matched_objects)
        return matched_objects + dirs
248 |
--------------------------------------------------------------------------------
/docs/anypathlib_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kfirgoldberg/AnyPathLib/af04c5ef9cd17b57234feb8d511b6fb874caeeb0/docs/anypathlib_logo.png
--------------------------------------------------------------------------------
/docs/wsc_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kfirgoldberg/AnyPathLib/af04c5ef9cd17b57234feb8d511b6fb874caeeb0/docs/wsc_logo.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | azure-storage-blob>=12.14.0
2 | azure-identity>=1.10.0
3 | azure-mgmt-storage>=21.1.0
4 | boto3>=1.34.23
5 | loguru
6 | tqdm
7 | click==8.1.7
8 | pytest==8.2.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import setuptools
3 | import codecs
4 | import os
5 | import re
6 |
# Read the README so PyPI can render it as the long description.
with open("README.md", "r", encoding='utf-8') as fh:
    long_description = fh.read()
# Discover every package under the anypathlib namespace.
packages = setuptools.find_namespace_packages(include=["anypathlib*"])
# Debug output for CI/build logs.
print("PACKAGES FOUND:", packages)
print(sys.version_info)
12 |
13 |
def find_version(*file_paths: str) -> str:
    """Extract the package version from a ``__version__`` assignment in a file.

    Args:
        *file_paths: Path components joined via ``os.path.join`` pointing at a file
            containing a line like ``__version__ = "x.y.z"``.

    Returns:
        The version string.

    Raises:
        RuntimeError: If no ``__version__`` assignment is found.
    """
    # Built-in open() with an explicit encoding supersedes the legacy codecs.open().
    with open(os.path.join(*file_paths), "r", encoding="utf-8") as fp:
        version_file = fp.read()
    version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
    if version_match:
        return version_match.group(1)
    raise RuntimeError("Unable to find version string.")
21 |
22 |
# Package metadata and build configuration for the AnyPathLib distribution.
setuptools.setup(
    name="AnyPathLib",
    version=find_version("anypathlib", "__init__.py"),
    author="Kfir Goldberg @ WSC-Sports",
    description="A unified API for every storage resource",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="",
    packages=packages,
    # BUGFIX: package_data keys must match the actual package name ("anypathlib",
    # as discovered by find_namespace_packages above); the previous "AnyPathLib"
    # key meant the py.typed marker was never included in the distribution.
    package_data={"anypathlib": ["py.typed"]},
    license='Apache License 2.0',
    classifiers=[
        "Programming Language :: Python :: 3",
        'License :: OSI Approved :: Apache Software License',
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.7",
    install_requires=[
        "azure-storage-blob>=12.14.0",
        "azure-identity>=1.15.0",
        "azure-mgmt-storage>=21.1.0",
        "boto3>=1.34.23",
        "loguru",
        "tqdm",
        'Click'
    ],
    setup_requires=["pre-commit"],
    # NOTE(review): "anypathlib" is a package (handled by `packages` above), not a
    # single-file module — confirm whether this py_modules entry is still needed.
    py_modules=["anypathlib"],
    entry_points={"console_scripts": ["anypathlib = anypathlib.cli:cli"]}
)
53 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kfirgoldberg/AnyPathLib/af04c5ef9cd17b57234feb8d511b6fb874caeeb0/tests/__init__.py
--------------------------------------------------------------------------------
/tests/fixtures_anypath.py:
--------------------------------------------------------------------------------
1 | import random
2 | import string
3 |
4 | import pytest
5 | from pathlib import Path
6 | import tempfile
7 |
8 | from click.testing import CliRunner
9 |
10 | from anypathlib import PathType
11 |
12 | from tests.tests_urls import PATH_TYPE_TO_BASE_TEST_PATH, PATH_TYPE_TO_HANDLER
13 |
14 |
def create_files_in_directory(directory: Path, n_files: int = 5):
    """Populate *directory* with ``n_files`` small text files.

    Each file gets a random 10-letter lowercase name with a ``.txt`` suffix
    and 20 random alphanumeric characters as content.
    """
    for _ in range(n_files):
        random_stem = ''.join(random.choice(string.ascii_lowercase) for _ in range(10))
        target = directory / f'{random_stem}.txt'
        payload = ''.join(random.choices(string.ascii_letters + string.digits, k=20))
        target.write_text(payload)
24 |
25 |
@pytest.fixture
def temp_dir_with_files():
    """Yield (dir_path, files) for a temp directory pre-populated with random files.

    The directory and its contents are removed automatically after the test.
    """
    with tempfile.TemporaryDirectory() as dir_name:
        dir_path = Path(dir_name)
        create_files_in_directory(dir_path)
        yield dir_path, list(dir_path.iterdir())
32 |
33 |
@pytest.fixture
def temp_nested_dir():
    """Yield a temp directory containing random files plus one populated sub-directory.

    Yields (top_dir, top_level_entries, nested_files). Note that
    ``top_level_entries`` is everything directly under ``top_dir`` — the random
    files *and* the nested directory entry itself.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        # Keep a reference to the nested TemporaryDirectory object so it is not
        # finalized (and its tree deleted) before the test body runs; the outer
        # directory's cleanup removes everything at the end regardless.
        nested = tempfile.TemporaryDirectory(dir=tmpdirname)
        create_files_in_directory(tmpdir)
        create_files_in_directory(Path(nested.name))
        yield tmpdir, list(tmpdir.iterdir()), list(Path(nested.name).iterdir())
42 |
43 |
@pytest.fixture
def temp_local_dir():
    """Yield an empty temporary directory as a Path; removed after the test."""
    with tempfile.TemporaryDirectory() as dir_name:
        yield Path(dir_name)
48 |
49 |
@pytest.fixture
def clean_remote_dir(request, path_type: PathType):
    """Yield a per-test remote directory URL, wiped both before and after use."""
    handler = PATH_TYPE_TO_HANDLER[path_type]
    base_dir = PATH_TYPE_TO_BASE_TEST_PATH[path_type]
    remote_dir = f"{base_dir}{request.node.name}/"
    handler.remove(remote_dir)  # start from a clean slate
    yield remote_dir
    handler.remove(remote_dir)  # leave no residue behind
59 |
60 |
@pytest.fixture
def cli_runner():
    """Provide a Click CliRunner for invoking the anypathlib CLI in-process."""
    runner = CliRunner()
    return runner
64 |
--------------------------------------------------------------------------------
/tests/test_anypath_flows.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from anypathlib import PathType, AnyPath
4 | from tests.tests_urls import PATH_TYPE_TO_HANDLER
5 | from fixtures_anypath import temp_dir_with_files, clean_remote_dir
6 |
7 |
@pytest.mark.usefixtures("temp_dir_with_files", "clean_remote_dir")
@pytest.mark.parametrize("path_type", [PathType.azure, PathType.s3, PathType.local])
def test_exists_copy_exists_rglob_remove_exists(path_type: PathType, temp_dir_with_files, clean_remote_dir):
    """End-to-end flow: copy a local dir to remote, list it, then remove it."""
    remote_base_dir = clean_remote_dir
    local_dir_path, local_dir_files = temp_dir_with_files
    remote_dir = remote_base_dir + 'test_exists_copy_exists_rglob_remove_exists/'
    source = AnyPath(local_dir_path)
    target = AnyPath(remote_dir)
    assert not target.exists()
    source.copy(target=target, force_overwrite=True)
    assert target.exists()
    # The uploaded tree must contain exactly the local file names.
    expected_names = sorted(entry.name for entry in local_dir_files)
    assert sorted(entry.name for entry in target.rglob('*')) == expected_names
    target.remove()
    assert not target.exists()
24 |
25 |
@pytest.mark.usefixtures("temp_dir_with_files", "clean_remote_dir")
@pytest.mark.parametrize("path_type", [PathType.azure, PathType.s3, PathType.local])
def test_is_dir(path_type: PathType, temp_dir_with_files, clean_remote_dir):
    """is_dir()/is_file() track the full lifecycle of a remote directory."""
    local_dir_path, local_dir_files = temp_dir_with_files
    remote = AnyPath(clean_remote_dir)
    # Nothing uploaded yet: neither a directory nor a file.
    assert not remote.is_dir()
    assert not remote.is_file()
    AnyPath(local_dir_path).copy(target=remote, force_overwrite=True)
    assert remote.is_dir()
    assert not remote.is_file()
    remote.remove()
    assert not remote.exists()
    assert not remote.is_dir()
40 |
41 |
@pytest.mark.usefixtures("temp_dir_with_files", "clean_remote_dir")
@pytest.mark.parametrize("path_type", [PathType.azure, PathType.s3, PathType.local])
def test_is_file(path_type: PathType, temp_dir_with_files, clean_remote_dir):
    """is_file()/is_dir() track the full lifecycle of a single remote file."""
    local_dir_path, local_dir_files = temp_dir_with_files
    local_file = local_dir_files[0]
    remote_dir = clean_remote_dir
    # FIX: clean_remote_dir already ends with '/', so join without adding
    # another separator — f'{remote_dir}/{name}' produced a '//' in the URL,
    # i.e. an empty path segment on object stores.
    remote_file = f'{remote_dir}{local_file.name}'
    remote_file_any_path = AnyPath(remote_file)
    assert not remote_file_any_path.is_dir()
    assert not remote_file_any_path.is_file()
    AnyPath(local_file).copy(target=remote_file_any_path)
    assert not remote_file_any_path.is_dir()
    assert remote_file_any_path.is_file()
    remote_file_any_path.remove()
    assert not remote_file_any_path.exists()
    assert not remote_file_any_path.is_file()
58 |
59 |
@pytest.mark.usefixtures("clean_remote_dir")
@pytest.mark.parametrize("path_type", [PathType.azure, PathType.s3, PathType.local])
@pytest.mark.parametrize("verbose", [True, False])
def test_caching(path_type: PathType, temp_dir_with_files, clean_remote_dir, verbose: bool):
    """Two copies of the same remote dir with force_overwrite=False should land
    in the same (cached) local location."""
    cloud_handler = PATH_TYPE_TO_HANDLER[path_type]
    local_dir_path, local_dir_files = temp_dir_with_files
    remote_dir = clean_remote_dir
    cloud_handler.upload_directory(local_dir=local_dir_path, target_url=remote_dir, verbose=verbose)
    # target=None lets AnyPath choose the download location; with
    # force_overwrite=False the second copy is expected to reuse the first.
    target1 = AnyPath(remote_dir).copy(target=None, force_overwrite=False, verbose=verbose)
    target2 = AnyPath(remote_dir).copy(target=None, force_overwrite=False, verbose=verbose)
    # Clean up before asserting so a failure doesn't leak local files;
    # target2 may already be gone if it is the same path as target1.
    target1.remove()
    if target2.exists():
        target2.remove()
    assert target1.base_path == target2.base_path
74 |
--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 |
5 | from anypathlib.cli import cli
6 | from tests.fixtures_anypath import temp_dir_with_files, cli_runner, temp_local_dir
7 |
8 | FOLDER_NAME = 'folder'
9 |
10 |
@pytest.mark.usefixtures("temp_dir_with_files", 'cli_runner')
def test_copy_command_success(temp_dir_with_files, cli_runner):
    """`copy -i <file> -o <path>` copies the file to the requested target."""
    local_dir_path, local_dir_files = temp_dir_with_files
    input_file = local_dir_files[0]

    output_path = local_dir_path / FOLDER_NAME / input_file.name

    # Consistency fix: use the bound `input_file` (it is the same object as
    # local_dir_files[0], which the original passed directly).
    result = cli_runner.invoke(cli, ['copy', '-i', input_file, '-o', output_path])
    assert result.exit_code == 0
    assert output_path.exists()
21 |
22 |
@pytest.mark.usefixtures("temp_dir_with_files", 'cli_runner')
def test_copy_command_without_output(temp_dir_with_files, cli_runner):
    """Without -o, the copy command prints the destination it picked."""
    local_dir_path, local_dir_files = temp_dir_with_files
    input_file = local_dir_files[0]

    result = cli_runner.invoke(cli, ['copy', '-i', input_file])
    assert result.exit_code == 0
    # The chosen destination is the last whitespace-separated token printed.
    reported_destination = result.output.split(" ")[-1]
    assert Path(reported_destination.strip()).exists()
32 |
33 |
@pytest.mark.usefixtures("temp_dir_with_files", 'cli_runner')
def test_exists_command_true(temp_dir_with_files, cli_runner):
    """`exists -p` reports True for a file that is present."""
    local_dir_path, local_dir_files = temp_dir_with_files
    existing_file = local_dir_files[0]
    result = cli_runner.invoke(cli, ['exists', '-p', existing_file])
    assert result.exit_code == 0
    assert 'True' in result.output
41 |
42 |
@pytest.mark.usefixtures("temp_dir_with_files", 'cli_runner')
def test_exists_command_false(temp_dir_with_files, cli_runner):
    """`exists -p` reports False once the file has been deleted."""
    local_dir_path, local_dir_files = temp_dir_with_files
    missing_file = local_dir_files[0]
    missing_file.unlink()  # delete it so the path genuinely does not exist
    result = cli_runner.invoke(cli, ['exists', '-p', missing_file])
    assert result.exit_code == 0
    assert 'False' in result.output
51 |
52 |
@pytest.mark.usefixtures("temp_dir_with_files", 'cli_runner')
def test_iterdir_command_with_files(temp_dir_with_files, cli_runner):
    """`iterdir -p` prints every file in the directory."""
    local_dir_path, local_dir_files = temp_dir_with_files
    result = cli_runner.invoke(cli, ['iterdir', '-p', local_dir_path])
    assert result.exit_code == 0

    # Each fixture-created file must appear in the printed listing.
    assert all(entry.name in result.output for entry in local_dir_files)
61 |
62 |
@pytest.mark.usefixtures("temp_local_dir", 'cli_runner')
def test_iterdir_command_empty(temp_local_dir, cli_runner):
    """`iterdir -p` on an empty directory prints an empty list."""
    result = cli_runner.invoke(cli, ['iterdir', '-p', temp_local_dir])
    assert result.exit_code == 0
    assert result.output.strip() == '[]'
68 |
69 |
@pytest.mark.usefixtures("temp_dir_with_files", 'cli_runner')
def test_remove_command_success(temp_dir_with_files, cli_runner):
    """`remove -p` deletes the given file from disk."""
    local_dir_path, local_dir_files = temp_dir_with_files
    doomed_file = local_dir_files[0]

    result = cli_runner.invoke(cli, ['remove', '-p', doomed_file])
    assert result.exit_code == 0
    assert not doomed_file.exists()
78 |
79 |
@pytest.mark.usefixtures("temp_dir_with_files", 'cli_runner')
def test_remove_command_non_existing_path(temp_dir_with_files, cli_runner):
    # NOTE(review): despite its name, this test removes a file that DOES exist
    # (the fixture just created it). Unlike test_exists_command_false, nothing
    # unlinks input_file first, so the non-existing-path branch of the CLI is
    # never exercised here — confirm intent and add an unlink() if it should be.
    local_dir_path, local_dir_files = temp_dir_with_files
    input_file = local_dir_files[0]

    result = cli_runner.invoke(cli, ['remove', '-p', input_file])
    assert result.exit_code == 0
87 |
--------------------------------------------------------------------------------
/tests/test_copy_file_to_dir.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from anypathlib import PathType, AnyPath
4 | from tests.tests_urls import PATH_TYPE_TO_HANDLER
5 | from fixtures_anypath import temp_local_dir, temp_dir_with_files, clean_remote_dir
6 |
7 |
@pytest.mark.usefixtures("temp_dir_with_files", "temp_local_dir", "clean_remote_dir")
@pytest.mark.parametrize("path_type", [PathType.azure, PathType.s3, PathType.local])
def test_copy_file_to_dir(path_type: PathType, temp_dir_with_files, temp_local_dir, clean_remote_dir):
    """Copying a single remote file into a local directory keeps its name."""
    path_handler = PATH_TYPE_TO_HANDLER[path_type]
    local_dir_path, local_dir_files = temp_dir_with_files

    remote_dir = clean_remote_dir
    path_handler.upload_directory(local_dir=local_dir_path, target_url=remote_dir, verbose=False)
    remote_file = AnyPath(remote_dir).iterdir()[0]
    downloaded = AnyPath(remote_file).copy(target=temp_local_dir, force_overwrite=True)
    assert downloaded.exists()
    assert downloaded.name == remote_file.name
    assert downloaded.is_file()
    # The file must land directly inside the requested target directory.
    assert downloaded.parent.base_path == AnyPath(temp_local_dir).base_path
23 |
--------------------------------------------------------------------------------
/tests/test_download_from_cloud.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from anypathlib import PathType, AnyPath
4 | from tests.tests_urls import PATH_TYPE_TO_HANDLER
5 | from fixtures_anypath import temp_local_dir, temp_dir_with_files, clean_remote_dir
6 |
7 |
@pytest.mark.usefixtures("temp_dir_with_files", "temp_local_dir", "clean_remote_dir")
@pytest.mark.parametrize("path_type", [PathType.azure, PathType.s3])
def test_copy_to_local_from_cloud(path_type: PathType, temp_dir_with_files, temp_local_dir, clean_remote_dir):
    """Downloading a remote directory reproduces the uploaded file set."""
    cloud_handler = PATH_TYPE_TO_HANDLER[path_type]
    local_dir_path, local_dir_files = temp_dir_with_files

    remote_dir = clean_remote_dir
    cloud_handler.upload_directory(local_dir=local_dir_path, target_url=remote_dir, verbose=False)
    download_dir = AnyPath(remote_dir).copy(target=AnyPath(temp_local_dir), force_overwrite=True)
    remote_names = sorted(entry.name for entry in AnyPath(remote_dir).rglob('*'))
    local_names = sorted(entry.name for entry in download_dir.rglob('*'))
    assert remote_names == local_names
20 |
--------------------------------------------------------------------------------
/tests/test_iterdir_glob_rglob.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from anypathlib import PathType, AnyPath
3 | from tests.tests_urls import PATH_TYPE_TO_HANDLER
4 | from fixtures_anypath import temp_dir_with_files, clean_remote_dir, temp_nested_dir
5 |
6 |
@pytest.mark.usefixtures("temp_nested_dir", "clean_remote_dir")
@pytest.mark.parametrize("path_type", [PathType.local, PathType.azure, PathType.s3])
def test_rglob_glob_iterdir(path_type: PathType, temp_nested_dir, clean_remote_dir):
    """rglob sees the whole tree; glob and iterdir see only the top level."""
    cloud_handler = PATH_TYPE_TO_HANDLER[path_type]
    local_dir_path, local_files_top_level, local_nested_files = temp_nested_dir
    remote_dir = clean_remote_dir
    cloud_handler.upload_directory(local_dir=local_dir_path, target_url=remote_dir, verbose=False)

    def names(entries):
        # Compare by sorted final path components, ignoring listing order.
        return sorted(entry.name for entry in entries)

    assert names(AnyPath(remote_dir).rglob(pattern='*')) == names(local_files_top_level + local_nested_files)
    assert names(AnyPath(remote_dir).glob(pattern='*')) == names(local_files_top_level)
    assert names(AnyPath(remote_dir).iterdir()) == names(local_files_top_level)
22 |
--------------------------------------------------------------------------------
/tests/test_pathlib_properties.py:
--------------------------------------------------------------------------------
1 | from anypathlib import AnyPath
2 |
3 |
def test_pathlib_properties():
    """stem/name/parent behave like pathlib across local, S3 and Azure paths."""
    azure_base = r'https://storage_account.blob.core.windows.net/container/AnyPath'
    # (path, expected stem, expected name, expected parent path)
    cases = [
        (r's3://bucket/AnyPath/tests/', 'tests', 'tests', r's3://bucket/AnyPath/'),
        (r's3://bucket/AnyPath/tests/a.txt', 'a', 'a.txt', r's3://bucket/AnyPath/tests/'),
        (azure_base + '/tests/', 'tests', 'tests', azure_base + '/'),
        (azure_base + '/tests/a.txt', 'a', 'a.txt', azure_base + '/tests/'),
        (r'/tmp/AnyPath/tests/', 'tests', 'tests', r'/tmp/AnyPath'),
        (r'/tmp/AnyPath/tests/a.txt', 'a', 'a.txt', r'/tmp/AnyPath/tests'),
    ]
    for path, expected_stem, expected_name, expected_parent in cases:
        candidate = AnyPath(path)
        assert candidate.stem == expected_stem
        assert candidate.name == expected_name
        assert candidate.parent.base_path == AnyPath(expected_parent).base_path
36 |
--------------------------------------------------------------------------------
/tests/test_str_path_interoperability.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 |
5 | from anypathlib import AnyPath, PathType
6 | from fixtures_anypath import temp_local_dir, temp_dir_with_files, clean_remote_dir
7 | from tests.tests_urls import PATH_TYPE_TO_HANDLER
8 |
9 |
def test_init_anypath_from_anypath(tmpdir):
    """AnyPath built from str, pathlib.Path, or another AnyPath is equivalent."""
    dir_as_str = str(Path(tmpdir.dirname))
    from_str = AnyPath(dir_as_str)
    from_anypath = AnyPath(from_str)
    assert from_anypath.base_path == from_str.base_path
    from_pathlib = AnyPath(Path(dir_as_str))
    assert from_pathlib.base_path == from_str.base_path
17 |
18 |
@pytest.mark.usefixtures("temp_dir_with_files", "temp_local_dir", "clean_remote_dir")
@pytest.mark.parametrize("target_type", [str, Path, AnyPath])
@pytest.mark.parametrize("path_type", [PathType.azure, PathType.s3])
def test_copy_targets(path_type: PathType, target_type, temp_dir_with_files, temp_local_dir, clean_remote_dir):
    """copy() accepts its target as str, pathlib.Path, or AnyPath."""
    cloud_handler = PATH_TYPE_TO_HANDLER[path_type]
    local_dir_path, local_dir_files = temp_dir_with_files
    # Re-express the local target in the parametrized type under test.
    temp_local_dir = target_type(temp_local_dir)
    remote_dir = clean_remote_dir
    cloud_handler.upload_directory(local_dir=local_dir_path, target_url=remote_dir, verbose=False)
    download_dir = AnyPath(remote_dir).copy(target=temp_local_dir, force_overwrite=True)
    remote_names = sorted(entry.name for entry in AnyPath(remote_dir).rglob('*'))
    assert remote_names == sorted(entry.name for entry in download_dir.rglob('*'))
31 |
--------------------------------------------------------------------------------
/tests/test_upload_to_cloud.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from anypathlib import PathType, AnyPath
4 | from tests.tests_urls import PATH_TYPE_TO_HANDLER
5 | from fixtures_anypath import temp_local_dir, temp_dir_with_files, clean_remote_dir
6 |
7 |
@pytest.mark.usefixtures("temp_dir_with_files", "temp_local_dir", "clean_remote_dir")
@pytest.mark.parametrize("path_type", [PathType.azure, PathType.s3])
def test_copy_from_local_to_cloud(path_type: PathType, temp_dir_with_files, temp_local_dir, clean_remote_dir):
    """Uploading a local directory puts every file into the remote directory."""
    cloud_handler = PATH_TYPE_TO_HANDLER[path_type]
    local_dir_path, local_dir_files = temp_dir_with_files
    remote_dir = clean_remote_dir
    AnyPath(local_dir_path).copy(target=AnyPath(remote_dir))
    remote_dir_files = cloud_handler.rglob(remote_dir, pattern='*')
    cloud_handler.remove(remote_dir)
    # The handler's rglob returns raw URLs, so compare final path components.
    uploaded_names = sorted(url.split('/')[-1] for url in remote_dir_files)
    assert uploaded_names == sorted(local_file.name for local_file in local_dir_files)
20 |
--------------------------------------------------------------------------------
/tests/tests_urls.py:
--------------------------------------------------------------------------------
import os

from anypathlib import PathType
from anypathlib.path_handlers.azure_handler import AzureHandler
from anypathlib.path_handlers.local_handler import LocalPathHandler
from anypathlib.path_handlers.s3_handler import S3Handler

# Base test directory URL per backend, read from the environment.
# os.environ[...] (not .get) is deliberate: the suite fails fast at import
# time with a KeyError when a test URL is not configured.
PATH_TYPE_TO_BASE_TEST_PATH = {PathType.s3: os.environ['ANYPATH_S3_TEST_URL'],
                               PathType.azure: os.environ['ANYPATH_AZURE_TEST_URL'],
                               PathType.local: os.environ['ANYPATH_LOCAL_TEST_URL']}

# Maps each PathType to the handler class that implements it.
PATH_TYPE_TO_HANDLER = {PathType.s3: S3Handler, PathType.azure: AzureHandler, PathType.local: LocalPathHandler}
13 |
--------------------------------------------------------------------------------