├── .github └── workflows │ ├── check_reqs.yaml │ ├── ci.yaml │ └── pypi.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── azurebatchload ├── __init__.py ├── checks.py ├── core.py ├── download.py ├── tests │ ├── __init__.py │ ├── test_download.py │ ├── test_upload.py │ └── test_utils.py ├── upload.py └── utils.py ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── scripts ├── check_setupcfg_and_requirementst_equal.py └── generate_requirements_from_setup.py └── setup.cfg /.github/workflows/check_reqs.yaml: -------------------------------------------------------------------------------- 1 | name: Check requirements equal 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'never' 7 | - '!main' 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: checkout repo content 15 | uses: actions/checkout@v2 16 | - name: setup python 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: 3.8 20 | - name: execute py script # run the run.py to get the latest data 21 | run: | 22 | python ./scripts/check_setupcfg_and_requirementst_equal.py 23 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: Azure Batch 2 | on: 3 | push: 4 | branches: 5 | - '*' 6 | - '!main' 7 | jobs: 8 | ci: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | python-version: ['3.10'] 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install pre-commit setuptools>=61.0.0 23 | - name: Run pre-commit 24 | run: | 25 | pre-commit run --all-files 26 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Upload Package to PyPi 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.x' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine build 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 24 | run: | 25 | python3 -m build 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .idea/ 132 | 133 | .DS_Store 134 | 135 | pdfs/ 136 | 137 | data/ 138 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: check-added-large-files 9 | - repo: https://github.com/pycqa/flake8 10 | rev: 6.0.0 11 | hooks: 12 | - id: flake8 13 | - repo: https://github.com/psf/black 14 | rev: 23.3.0 15 | hooks: 16 | - id: black 17 | args: [--line-length=120] 18 | - repo: https://github.com/PyCQA/isort 19 | rev: 5.12.0 20 | hooks: 21 | - id: isort 22 | args: ["--profile", "black", --line-length=120] 23 | - repo: local 24 | hooks: 25 | - id: pip-to-conda 26 | name: Check requirements 27 | description: Check if requirements in setup.cfg and requirements.txt are equal 28 | language: python 29 | entry: python scripts/check_setupcfg_and_requirementst_equal.py 30 | pass_filenames: false 31 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Changelog 2 | === 3 | 4 | 5 | # 0.6.0, 21-08-2021 6 | 7 | - Added `folder` argument in upload, so users can upload to specific folder in Azure Storage. 8 | - Removed `pattern` flag in the CLI version of upload, reason for that is the new way of doing upload to a specific 9 | folder in the Azure CLI is [`az storage fs directory`](https://docs.microsoft.com/en-us/cli/azure/storage/fs/directory?view=azure-cli-latest#az_storage_fs_directory_upload) 10 | and this command does not have a pattern option opposed to 11 | [`az storage blob upload-batch`](https://docs.microsoft.com/en-us/cli/azure/storage/blob?view=azure-cli-latest#az_storage_blob_upload_batch). 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | logo 3 |

4 | 5 | [![Downloads](https://pepy.tech/badge/azurebatchload)](https://pepy.tech/project/azurebatchload) 6 | [![PyPi](https://img.shields.io/pypi/v/azurebatchload.svg)](https://pypi.python.org/pypi/azurebatchload) 7 | [![Open Source](https://badges.frapsoft.com/os/v1/open-source.svg?v=103)](https://opensource.org/) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 9 | 10 | # Azure Batch Load 11 | High level Python wrapper for the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/) to download or upload files in batches from or to Azure Blob Storage Containers. 12 | This project aims to be the [missing functionality](https://github.com/Azure/azure-storage-python/issues/554) 13 | in the Python SDK of Azure Storage since there is no possibility to download or upload batches of files from or to containers. 14 | The only option in the Azure Storage Python SDK is downloading file by file, which takes a lot of time. 15 | 16 | Besides doing loads in batches, since version `0.0.5` it's possible to set method to `single` which will use the 17 | [Azure Python SDK](https://github.com/Azure/azure-sdk-for-python/tree/master/sdk/storage/azure-storage-blob) to process files one by one. 18 | 19 | 20 | # Installation 21 | 22 | ```commandline 23 | pip install azurebatchload 24 | ``` 25 | 26 | See [PyPi](https://pypi.org/project/azurebatchload/) for package index. 27 | 28 | **Note**: For batch uploads (`method="batch"`) Azure CLI has to be [installed](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli) 29 | and [configured](https://docs.microsoft.com/en-us/cli/azure/get-started-with-azure-cli). 30 | Check if Azure CLI is installed through terminal: 31 | 32 | ```commandline 33 | az --version 34 | ``` 35 | 36 | # Requirements 37 | 38 | Azure Storage connection string has to be set as environment variable `AZURE_STORAGE_CONNECTION_STRING` or 39 | the seperate environment variables `AZURE_STORAGE_KEY` and `AZURE_STORAGE_NAME` which will be used to create the connection string. 40 | 41 | # Usage 42 | 43 | ## Download 44 | ### 1. Using the standard environment variables 45 | 46 | Azure-batch-load automatically checks for environment variables: `AZURE_STORAGE_CONNECTION_STRING`, 47 | `AZURE_STORAGE_KEY`and `AZURE_STORAGE_ACCOUNT`. 48 | So if the connection_string or storage_key + storage_account are set as environment variables, 49 | we can leave the argument `connection_string`, `account_key` and `account_name` empty: 50 | 51 | ```python 52 | from azurebatchload import Download 53 | 54 | Download( 55 | destination='../pdfs', 56 | source='blobcontainername', 57 | extension='.pdf' 58 | ).download() 59 | ``` 60 | 61 | ### 2. Using `method="single"` 62 | 63 | We can make skip the usage of the `Azure CLI` and just make use Python SDK by setting the `method="single"`: 64 | 65 | ```python 66 | from azurebatchload import Download 67 | 68 | Download( 69 | destination='../pdfs', 70 | source='blobcontainername', 71 | extension='.pdf', 72 | method='single' 73 | ).download() 74 | ``` 75 | 76 | ### 3. Download a specific folder from a container 77 | 78 | We can download a folder by setting the `folder` argument. This works both for `single` and `batch`. 79 | 80 | ```python 81 | from azurebatchload import Download 82 | 83 | Download( 84 | destination='../pdfs', 85 | source='blobcontainername', 86 | folder='uploads/invoices/', 87 | extension='.pdf', 88 | method='single' 89 | ).download() 90 | ``` 91 | 92 | ### 4. Download a given list of files 93 | 94 | We can give a list of files to download with the `list_files` argument. 95 | Note, this only works with `method='single'`. 96 | 97 | ```python 98 | from azurebatchload import Download 99 | 100 | Download( 101 | destination='../pdfs', 102 | source='blobcontainername', 103 | folder='uploads/invoices/', 104 | list_files=["invoice1.pdf", "invoice2.pdf"], 105 | method='single' 106 | ).download() 107 | ``` 108 | 109 | ## Upload: 110 | 111 | ### 1. Using the standard environment variables 112 | 113 | ```python 114 | from azurebatchload import Upload 115 | 116 | Upload( 117 | destination='blobcontainername', 118 | source='../pdf', 119 | extension='*.pdf' 120 | ).upload() 121 | ``` 122 | 123 | ### 2. Using the `method="single"` method which does not require Azure CLI. 124 | 125 | ```python 126 | from azurebatchload import Upload 127 | 128 | Upload( 129 | destination='blobcontainername', 130 | source='../pdf', 131 | extension='*.pdf', 132 | method="single" 133 | ).upload() 134 | ``` 135 | 136 | ### 3. Upload a given list of files with the `list_files` argument. 137 | 138 | ```python 139 | from azurebatchload import Upload 140 | 141 | Upload( 142 | destination='blobcontainername', 143 | source='../pdf', 144 | list_files=["invoice1.pdf", "invoice2.pdf"], 145 | method="single" 146 | ).upload() 147 | ``` 148 | 149 | ## List blobs 150 | 151 | With the `Utils.list_blobs` method we can do advanced listing of blobs in a container or specific folder in a container. 152 | We have several argument we can use to define our scope of information: 153 | 154 | - `name_starts_with`: This can be used to filter files with certain prefix, or to select certain folders: `name_starts_with=folder1/subfolder/lastfolder/` 155 | - `dataframe`: Define if you want a pandas dataframe object returned for your information. 156 | - `extended_info`: Get just the blob names or more extended information like size, creation date, modified date. 157 | 158 | ### 1. List a whole container with just the filenames as a list. 159 | ```python 160 | from azurebatchload import Utils 161 | 162 | list_blobs = Utils(container='containername').list_blobs() 163 | ``` 164 | 165 | ### 2. List a whole container with just the filenames as a dataframe. 166 | ```python 167 | from azurebatchload import Utils 168 | 169 | df_blobs = Utils( 170 | container='containername', 171 | dataframe=True 172 | ).list_blobs() 173 | ``` 174 | 175 | ### 3. List a folder in a container. 176 | ```python 177 | from azurebatchload import Utils 178 | 179 | list_blobs = Utils( 180 | container='containername', 181 | name_starts_with="foldername/" 182 | ).list_blobs() 183 | ``` 184 | 185 | ### 4. Get extended information a folder. 186 | ```python 187 | from azurebatchload import Utils 188 | 189 | dict_blobs = Utils( 190 | container='containername', 191 | name_starts_with="foldername/", 192 | extended_info=True 193 | ).list_blobs() 194 | ``` 195 | 196 | ### 5. Get extended information a folder returned as a pandas dataframe. 197 | ```python 198 | from azurebatchload import Utils 199 | 200 | df_blobs = Utils( 201 | container='containername', 202 | name_starts_with="foldername/", 203 | extended_info=True, 204 | dataframe=True 205 | ).list_blobs() 206 | ``` 207 | -------------------------------------------------------------------------------- /azurebatchload/__init__.py: -------------------------------------------------------------------------------- 1 | from azurebatchload.download import Download 2 | from azurebatchload.upload import Upload 3 | from azurebatchload.utils import Utils 4 | -------------------------------------------------------------------------------- /azurebatchload/checks.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | from subprocess import STDOUT, CalledProcessError, check_output 5 | 6 | 7 | class Checks: 8 | def __init__(self, directory): 9 | self.directory = directory 10 | 11 | @staticmethod 12 | def _create_connection_string(): 13 | """ 14 | When AZURE_STORAGE_ACCOUNT and AZURE_STORAGE_KEY are given, 15 | create an AZURE_STORAGE_CONNECTION_STRING 16 | 17 | Returns 18 | ------- 19 | connection_string 20 | """ 21 | base_string = ( 22 | "DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};" 23 | "EndpointSuffix=core.windows.net" 24 | ) 25 | connection_string = base_string.format( 26 | account_name=os.environ.get("AZURE_STORAGE_ACCOUNT", None), 27 | account_key=os.environ.get("AZURE_STORAGE_KEY", None), 28 | ) 29 | 30 | return connection_string 31 | 32 | def _check_connection_credentials(self): 33 | """ 34 | If connection string is given as env variable return it, 35 | else the connection string is generated from storage key and name. 36 | 37 | If none of the above are given, raise. 38 | 39 | Returns 40 | ------- 41 | AZURE_STORAGE_CONNECTION_STRING 42 | """ 43 | connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING", None) 44 | account_name = os.environ.get("AZURE_STORAGE_ACCOUNT", None) 45 | account_key = os.environ.get("AZURE_STORAGE_KEY", None) 46 | 47 | if connection_string: 48 | account_name, account_key = self._parse_connection_string(connection_string) 49 | return os.environ.get("AZURE_STORAGE_CONNECTION_STRING"), account_name, account_key 50 | elif all([account_name, account_key]): 51 | return self._create_connection_string(), account_name, account_key 52 | else: 53 | # check for env variables else raise 54 | raise ValueError( 55 | "If AZURE_STORAGE_CONNECTION_STRING is not set as env variable " 56 | " AZURE_STORAGE_KEY and AZURE_STORAGE_ACCOUNT have to be set." 57 | ) 58 | 59 | def _check_dir(self): 60 | """ 61 | When downloading files, create the given directory if it does not exist. 62 | 63 | Returns 64 | ------- 65 | None 66 | """ 67 | if not os.path.exists(self.directory): 68 | raise FileNotFoundError(f"Source directory {self.directory} not found") 69 | 70 | def _create_dir(self, directory=None): 71 | """ 72 | 73 | Parameters 74 | ---------- 75 | directory 76 | 77 | Returns 78 | ------- 79 | 80 | """ 81 | if not directory: 82 | directory = self.directory 83 | 84 | if not os.path.exists(directory): 85 | logging.info(f"Destination {directory} does not exist, creating..") 86 | os.makedirs(directory) 87 | 88 | @staticmethod 89 | def _check_azure_cli_installed(): 90 | try: 91 | check_output(["az", "--version"], stderr=STDOUT, shell=True) 92 | return True 93 | except CalledProcessError: 94 | logging.debug("Azure CLI is not installed, automatically setting method to 'single'") 95 | return False 96 | 97 | @staticmethod 98 | def _parse_connection_string(connection_string): 99 | account_name = re.search(r"AccountName=(.*?);", connection_string).group(1) 100 | account_key = re.search(r"AccountKey=(.*?);", connection_string).group(1) 101 | 102 | return account_name, account_key 103 | -------------------------------------------------------------------------------- /azurebatchload/core.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from azure.storage.blob import BlobSasPermissions, generate_blob_sas 4 | 5 | from azurebatchload.checks import Checks 6 | 7 | 8 | class Base(Checks): 9 | def __init__( 10 | self, 11 | destination, 12 | folder, 13 | extension=None, 14 | modified_since=None, 15 | method="batch", 16 | list_files=None, 17 | expiry_download_links=7, 18 | ): 19 | super().__init__(directory=folder) 20 | 21 | self.destination = destination 22 | self.folder = folder 23 | self.extension = extension 24 | self.modified_since = modified_since 25 | if not self._check_azure_cli_installed(): 26 | self.method = "single" 27 | else: 28 | self.method = method 29 | self.list_files = list_files 30 | credentials = self._check_connection_credentials() 31 | self.connection_string = credentials[0] 32 | self.account_name = credentials[1] 33 | self.account_key = credentials[2] 34 | self.expiry_download_links = expiry_download_links 35 | 36 | def checks(self): 37 | allowed_methods = ("batch", "single") 38 | if self.method not in allowed_methods: 39 | raise ValueError(f"Method {self.method} is not a valid method. Choose from {' or '.join(allowed_methods)}.") 40 | 41 | if self.list_files and self.method == "batch": 42 | raise ValueError("list_files is only allowed with method='single'.") 43 | 44 | if self.list_files and not isinstance(self.list_files, list): 45 | raise ValueError(f"Argument list_files was set, but is not of type list, but type {type(self.list_files)}") 46 | 47 | def create_blob_link(self, blob_folder, blob_name) -> str: 48 | if blob_folder: 49 | full_path_blob = f"{blob_folder}/{blob_name}" 50 | else: 51 | full_path_blob = blob_name 52 | url = f"https://{self.account_name}.blob.core.windows.net/{self.destination}/{full_path_blob}" 53 | sas_token = generate_blob_sas( 54 | account_name=self.account_name, 55 | account_key=self.account_key, 56 | container_name=self.destination, 57 | blob_name=full_path_blob, 58 | permission=BlobSasPermissions(read=True, delete_previous_version=False), 59 | expiry=datetime.utcnow() + timedelta(days=self.expiry_download_links), 60 | ) 61 | 62 | url_with_sas = f"{url}?{sas_token}" 63 | return url_with_sas 64 | 65 | @staticmethod 66 | def create_not_case_sensitive_extension(extension): 67 | """ 68 | We create in-case sensitive fnmatch 69 | .pdf -> .[Pp][Dd][Ff] 70 | .csv -> .[Cc][Ss][Vv] 71 | """ 72 | new_extension = "" 73 | for letter in extension: 74 | if not letter.isalpha(): 75 | new_extension += letter 76 | else: 77 | new_extension += f"[{letter.upper()}{letter}]" 78 | 79 | if not new_extension.startswith("*"): 80 | new_extension = "*" + new_extension 81 | 82 | return new_extension 83 | 84 | def define_pattern(self): 85 | self.extension = self.create_not_case_sensitive_extension(self.extension) 86 | if self.folder and not self.extension: 87 | if self.folder.endswith("/"): 88 | pattern = self.folder + "*" 89 | else: 90 | pattern = self.folder + "/*" 91 | elif self.folder and self.extension: 92 | pattern = self.folder.rstrip("/") + "/" + "*" + self.extension 93 | elif not self.folder and self.extension: 94 | pattern = "*" + self.extension 95 | else: 96 | pattern = None 97 | 98 | return pattern 99 | -------------------------------------------------------------------------------- /azurebatchload/download.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from azure.storage.blob import BlobServiceClient 5 | 6 | from azurebatchload.core import Base 7 | 8 | 9 | class Download(Base): 10 | def __init__( 11 | self, 12 | destination, 13 | source, 14 | folder=None, 15 | extension=None, 16 | method="batch", 17 | modified_since=None, 18 | create_dir=True, 19 | list_files=None, 20 | ): 21 | super(Download, self).__init__( 22 | destination=destination, 23 | folder=folder, 24 | extension=extension, 25 | modified_since=modified_since, 26 | method=method, 27 | list_files=list_files, 28 | ) 29 | self.checks() 30 | self.source = source 31 | if create_dir: 32 | if self.folder: 33 | self._create_dir(os.path.join(self.destination, self.folder)) 34 | else: 35 | self._create_dir(self.destination) 36 | 37 | def _download_batch(self): 38 | pattern = self.define_pattern() 39 | 40 | cmd = f"az storage blob download-batch " f"-d {self.destination} " f"-s {self.source}" 41 | non_default = { 42 | "--connection-string": self.connection_string, 43 | "--pattern": pattern, 44 | } 45 | 46 | for flag, value in non_default.items(): 47 | if value: 48 | cmd = f"{cmd} {flag} '{value}'" 49 | 50 | os.system(cmd) 51 | 52 | def _download_single(self): 53 | blob_service_client = BlobServiceClient.from_connection_string(self.connection_string) 54 | container_client = blob_service_client.get_container_client(container=self.source) 55 | blob_list = container_client.list_blobs(name_starts_with=self.folder) 56 | 57 | n_files = 0 58 | for blob in blob_list: 59 | if self.extension and not blob.name.lower().endswith(self.extension.lower()): 60 | continue 61 | 62 | file_path, file_name = os.path.split(blob.name) 63 | 64 | if self.list_files and file_name not in self.list_files: 65 | continue 66 | blob_client = container_client.get_blob_client(blob=blob.name) 67 | directory = os.path.join(self.destination, file_path) 68 | directory = os.path.abspath(directory) 69 | self._create_dir(directory) 70 | logging.debug(f"Downloading file {blob.name}") 71 | with open(os.path.join(self.destination, blob.name), "wb") as download_file: 72 | download_file.write(blob_client.download_blob().readall()) 73 | 74 | n_files += 1 75 | 76 | logging.info(f"Downloaded total of {n_files} files") 77 | 78 | def download(self): 79 | # for batch load we use the Azure CLI 80 | if self.method == "batch": 81 | self._download_batch() 82 | 83 | # for single load we use Python SDK 84 | else: 85 | self._download_single() 86 | -------------------------------------------------------------------------------- /azurebatchload/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zypp-io/azure-batch-load/c076bb267d0a43e1d3423d6e0d1642c410b9227c/azurebatchload/tests/__init__.py -------------------------------------------------------------------------------- /azurebatchload/tests/test_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zypp-io/azure-batch-load/c076bb267d0a43e1d3423d6e0d1642c410b9227c/azurebatchload/tests/test_download.py -------------------------------------------------------------------------------- /azurebatchload/tests/test_upload.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zypp-io/azure-batch-load/c076bb267d0a43e1d3423d6e0d1642c410b9227c/azurebatchload/tests/test_upload.py -------------------------------------------------------------------------------- /azurebatchload/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from dotenv import load_dotenv 5 | 6 | from azurebatchload import Utils 7 | 8 | load_dotenv() 9 | 10 | 11 | def test_utils_download_links(): 12 | logging.info("starting test for Utils - download links") 13 | utils = Utils(container=os.environ.get("CONTAINER"), create_download_links=True).list_blobs() 14 | logging.info(f"finished test with {utils.shape[0]} records") 15 | 16 | 17 | if __name__ == "__main__": 18 | test_utils_download_links() 19 | -------------------------------------------------------------------------------- /azurebatchload/upload.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from azure.storage.blob import BlobServiceClient 5 | 6 | from azurebatchload.core import Base 7 | 8 | 9 | class Upload(Base): 10 | def __init__( 11 | self, 12 | destination, 13 | source, 14 | folder=None, 15 | extension=None, 16 | method="batch", 17 | modified_since=None, 18 | overwrite=False, 19 | list_files=None, 20 | create_download_links=False, 21 | expiry_download_links=7, 22 | ): 23 | super(Upload, self).__init__( 24 | destination=destination, 25 | folder=source, 26 | extension=extension, 27 | modified_since=modified_since, 28 | method=method, 29 | list_files=list_files, 30 | expiry_download_links=expiry_download_links, 31 | ) 32 | self.blob_folder = folder 33 | self.overwrite = overwrite 34 | self.create_download_links = create_download_links 35 | 36 | def upload_batch(self): 37 | cmd = f"az storage fs directory upload " f"-f {self.destination} " f"-s {self.folder} -r" 38 | 39 | non_default = {"-d": self.blob_folder, "--connection-string": self.connection_string} 40 | 41 | for flag, value in non_default.items(): 42 | if value: 43 | cmd = f"{cmd} {flag} '{value}'" 44 | 45 | os.system(cmd) 46 | 47 | def upload_single(self): 48 | blob_service_client = BlobServiceClient.from_connection_string(self.connection_string) 49 | download_links = {} 50 | 51 | for root, dirs, files in os.walk(self.folder): 52 | for file in files: 53 | full_path = os.path.join(root, file) 54 | 55 | # ignore hidden files 56 | if file.startswith("."): 57 | continue 58 | 59 | # if list_files is given, only upload matched files 60 | if self.list_files and file not in self.list_files: 61 | continue 62 | 63 | # if extension is given only upload if extension is matched 64 | if self.extension and os.path.isfile(full_path) and not file.lower().endswith(self.extension.lower()): 65 | continue 66 | 67 | blob_folder = root.replace(self.folder, "").lstrip("/") 68 | 69 | if self.blob_folder: 70 | # we only want to append blob_folder if it actually is a path or folder 71 | # blob_folder can be empty string "" 72 | if blob_folder: 73 | blob_folder = os.path.join(self.blob_folder, blob_folder) 74 | else: 75 | blob_folder = self.blob_folder 76 | 77 | # if no folder is given, just upload to the container root path 78 | if not blob_folder: 79 | container = self.destination 80 | else: 81 | container = os.path.join(self.destination, blob_folder) 82 | container_client = blob_service_client.get_container_client(container=container) 83 | 84 | with open(full_path, "rb") as data: 85 | logging.debug(f"Uploading blob {full_path}") 86 | container_client.upload_blob(data=data, name=file, overwrite=self.overwrite) 87 | 88 | if self.create_download_links: 89 | download_links[file] = self.create_blob_link(blob_folder=blob_folder, blob_name=file) 90 | 91 | return download_links 92 | 93 | def upload(self): 94 | self.checks() 95 | 96 | logging.info(f"Uploading to container {self.destination} with method = '{self.method}'.") 97 | if self.method == "batch": 98 | return self.upload_batch() 99 | else: 100 | return self.upload_single() 101 | -------------------------------------------------------------------------------- /azurebatchload/utils.py: -------------------------------------------------------------------------------- 1 | from ntpath import basename 2 | 3 | from azure.storage.blob import BlobServiceClient 4 | from pandas import DataFrame 5 | 6 | from azurebatchload.core import Base 7 | 8 | 9 | class Utils(Base): 10 | def __init__( 11 | self, 12 | container, 13 | name_starts_with=None, 14 | dataframe=False, 15 | extended_info=False, 16 | create_download_links=False, 17 | expiry_download_links=7, 18 | ): 19 | super(Utils, self).__init__( 20 | destination=container, 21 | folder=name_starts_with, 22 | expiry_download_links=expiry_download_links, 23 | ) 24 | 25 | self.container = container 26 | self.name_starts_with = name_starts_with 27 | self.dataframe = dataframe 28 | self.extended_info = extended_info 29 | self.connection_string = self._check_connection_credentials()[0] 30 | self.create_download_links = create_download_links 31 | self._blob_service_client = BlobServiceClient.from_connection_string(self.connection_string) 32 | self._container_client = self._blob_service_client.get_container_client(self.container) 33 | 34 | ################### 35 | # private methods # 36 | ################### 37 | 38 | def _get_files(self): 39 | files = self._container_client.list_blobs(name_starts_with=self.name_starts_with) 40 | return files 41 | 42 | @staticmethod 43 | def _get_file_names_simple(files): 44 | return [file.get("name") for file in files] 45 | 46 | def _list_blobs_not_extended(self, files): 47 | file_names = self._get_file_names_simple(files) 48 | # 1. dataframe = False, return just a list of file names 49 | if not self.dataframe: 50 | return file_names 51 | # 2. dataframe = True, dataframe with one column filenames 52 | else: 53 | return DataFrame({"filename": file_names}) 54 | 55 | def _list_blobs_extended(self, files): 56 | included_info = ("name", "container", "last_modified", "creation_time", "size") 57 | new_file_list = [] 58 | for file in files: 59 | new_dict = {} 60 | for key, value in file.items(): 61 | if key in included_info: 62 | new_dict[key] = file[key] 63 | new_file_list.append(new_dict) 64 | # 3. dataframe = False, return just a list of dicts 65 | if not self.dataframe: 66 | return new_file_list 67 | # 4. dataframe = True, return dataframe 68 | else: 69 | df = DataFrame(new_file_list) 70 | df = df.reindex(columns=included_info) 71 | # convert size to mb 72 | df["size"] = (df["size"] / 1_000_000).round(2) 73 | df = df.rename(columns={"size": "size_mb"}) 74 | return df 75 | 76 | ################## 77 | # public methods # 78 | ################## 79 | 80 | def list_blobs(self): 81 | files = self._get_files() 82 | if not self.extended_info: 83 | return self._list_blobs_not_extended(files) 84 | else: 85 | return self._list_blobs_extended(files) 86 | 87 | def create_blob_links(self): 88 | files = self._get_file_names_simple(self._get_files()) 89 | url_list = [] 90 | for file in files: 91 | url = self.create_blob_link(blob_folder=file.replace("/" + basename(file), ""), blob_name=basename(file)) 92 | url_list.append({"filename": file, "url": url}) 93 | 94 | if self.dataframe: 95 | return DataFrame(url_list) 96 | else: 97 | return url_list 98 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pre-commit 2 | python-dotenv 3 | setuptools>=61.0.0 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | azure-storage-blob>=12.8.1 2 | pandas>=1.4.1 3 | -------------------------------------------------------------------------------- /scripts/check_setupcfg_and_requirementst_equal.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | from setuptools.config.setupcfg import read_configuration 5 | 6 | 7 | def get_config(): 8 | repo_path = pathlib.Path(__file__).parent.parent.absolute() 9 | config_setup = read_configuration(os.path.join(repo_path, "setup.cfg")) 10 | config_requirements = config_setup["options"]["install_requires"] 11 | 12 | return config_requirements, repo_path 13 | 14 | 15 | def check(): 16 | config_requirements, repo_path = get_config() 17 | 18 | with open(os.path.join(repo_path, "requirements.txt")) as f: 19 | requirements_txt = f.read().splitlines() 20 | 21 | assert sorted(config_requirements) == sorted(requirements_txt), "Requirements are not equal" 22 | print("Requirements and setup.cfg and both are equal") 23 | 24 | 25 | if __name__ == "__main__": 26 | check() 27 | -------------------------------------------------------------------------------- /scripts/generate_requirements_from_setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | from setuptools.config import read_configuration 5 | 6 | 7 | def get_config(): 8 | repo_path = pathlib.Path(__file__).parent.parent.absolute() 9 | config_setup = read_configuration(os.path.join(repo_path, "setup.cfg")) 10 | config_requirements = config_setup["options"]["install_requires"] 11 | 12 | return config_requirements, repo_path 13 | 14 | 15 | def generate_requirements(): 16 | config_requirements, repo_path = get_config() 17 | 18 | with open(os.path.join(repo_path, "requirements.txt"), "w") as f: 19 | f.write("\n".join(config_requirements)) 20 | 21 | print( 22 | "Generated requirements.txt from setup.cfg, with the following requirements\n", "\n".join(config_requirements) 23 | ) 24 | 25 | 26 | if __name__ == "__main__": 27 | generate_requirements() 28 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = azurebatchload 3 | version = 0.6.3 4 | author = Erfan Nariman, Melvin Folkers 5 | author_email = hello@zypp.io 6 | description = Download and upload files in batches from Azure Blob Storage Containers 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | keywords = python, azure, blob, download, upload, batch 10 | url = https://github.com/zypp-io/azure-batch-load 11 | project_urls = 12 | Bug Tracker = https://github.com/zypp-io/azure-batch-load/issues 13 | Source = https://github.com/zypp-io/azure-batch-load 14 | classifiers = 15 | Programming Language :: Python :: 3 16 | License :: OSI Approved :: MIT License 17 | Operating System :: OS Independent 18 | 19 | [options] 20 | packages = azurebatchload 21 | python_requires = >=3.7 22 | install_requires = 23 | azure-storage-blob>=12.8.1 24 | pandas>=1.4.1 25 | 26 | [flake8] 27 | statistics = True 28 | count = True 29 | max-complexity=12 30 | max-line-length=120 31 | per-file-ignores=__init__.py: F401 32 | --------------------------------------------------------------------------------