├── .github
└── workflows
│ ├── check_reqs.yaml
│ ├── ci.yaml
│ └── pypi.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── LICENSE
├── README.md
├── azurebatchload
├── __init__.py
├── checks.py
├── core.py
├── download.py
├── tests
│ ├── __init__.py
│ ├── test_download.py
│ ├── test_upload.py
│ └── test_utils.py
├── upload.py
└── utils.py
├── pyproject.toml
├── requirements-dev.txt
├── requirements.txt
├── scripts
├── check_setupcfg_and_requirementst_equal.py
└── generate_requirements_from_setup.py
└── setup.cfg
/.github/workflows/check_reqs.yaml:
--------------------------------------------------------------------------------
1 | name: Check requirements equal
2 |
3 | on:
4 | push:
5 | branches:
6 | - 'never'
7 | - '!main'
8 |
9 | jobs:
10 | build:
11 | runs-on: ubuntu-latest
12 |
13 | steps:
14 | - name: checkout repo content
15 | uses: actions/checkout@v2
16 | - name: setup python
17 | uses: actions/setup-python@v2
18 | with:
19 | python-version: 3.8
20 | - name: execute py script # run the run.py to get the latest data
21 | run: |
22 | python ./scripts/check_setupcfg_and_requirementst_equal.py
23 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | name: Azure Batch
2 | on:
3 | push:
4 | branches:
5 | - '*'
6 | - '!main'
7 | jobs:
8 | ci:
9 | runs-on: ubuntu-latest
10 | strategy:
11 | matrix:
12 | python-version: ['3.10']
13 | steps:
14 | - uses: actions/checkout@v3
15 | - name: Set up Python ${{ matrix.python-version }}
16 | uses: actions/setup-python@v4
17 | with:
18 | python-version: ${{ matrix.python-version }}
19 | - name: Install dependencies
20 | run: |
21 | python -m pip install --upgrade pip
22 | pip install pre-commit setuptools>=61.0.0
23 | - name: Run pre-commit
24 | run: |
25 | pre-commit run --all-files
26 |
--------------------------------------------------------------------------------
/.github/workflows/pypi.yaml:
--------------------------------------------------------------------------------
1 | name: Upload Package to PyPi
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | jobs:
8 | deploy:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v3
12 | - name: Set up Python
13 | uses: actions/setup-python@v4
14 | with:
15 | python-version: '3.x'
16 | - name: Install dependencies
17 | run: |
18 | python -m pip install --upgrade pip
19 | pip install setuptools wheel twine build
20 | - name: Build and publish
21 | env:
22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
24 | run: |
25 | python3 -m build
26 | twine upload dist/*
27 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | .idea/
132 |
133 | .DS_Store
134 |
135 | pdfs/
136 |
137 | data/
138 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v4.4.0
4 | hooks:
5 | - id: trailing-whitespace
6 | - id: end-of-file-fixer
7 | - id: check-yaml
8 | - id: check-added-large-files
9 | - repo: https://github.com/pycqa/flake8
10 | rev: 6.0.0
11 | hooks:
12 | - id: flake8
13 | - repo: https://github.com/psf/black
14 | rev: 23.3.0
15 | hooks:
16 | - id: black
17 | args: [--line-length=120]
18 | - repo: https://github.com/PyCQA/isort
19 | rev: 5.12.0
20 | hooks:
21 | - id: isort
22 | args: ["--profile", "black", --line-length=120]
23 | - repo: local
24 | hooks:
25 | - id: pip-to-conda
26 | name: Check requirements
27 | description: Check if requirements in setup.cfg and requirements.txt are equal
28 | language: python
29 | entry: python scripts/check_setupcfg_and_requirementst_equal.py
30 | pass_filenames: false
31 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | Changelog
2 | ===
3 |
4 |
5 | # 0.6.0, 21-08-2021
6 |
7 | - Added `folder` argument in upload, so users can upload to specific folder in Azure Storage.
8 | - Removed `pattern` flag in the CLI version of upload, reason for that is the new way of doing upload to a specific
9 | folder in the Azure CLI is [`az storage fs directory`](https://docs.microsoft.com/en-us/cli/azure/storage/fs/directory?view=azure-cli-latest#az_storage_fs_directory_upload)
10 | and this command does not have a pattern option opposed to
11 | [`az storage blob upload-batch`](https://docs.microsoft.com/en-us/cli/azure/storage/blob?view=azure-cli-latest#az_storage_blob_upload_batch).
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | [](https://pepy.tech/project/azurebatchload)
6 | [](https://pypi.python.org/pypi/azurebatchload)
7 | [](https://opensource.org/)
8 | [](https://github.com/psf/black)
9 |
10 | # Azure Batch Load
11 | High level Python wrapper for the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/) to download or upload files in batches from or to Azure Blob Storage Containers.
12 | This project aims to be the [missing functionality](https://github.com/Azure/azure-storage-python/issues/554)
13 | in the Python SDK of Azure Storage since there is no possibility to download or upload batches of files from or to containers.
14 | The only option in the Azure Storage Python SDK is downloading file by file, which takes a lot of time.
15 |
16 | Besides doing loads in batches, since version `0.0.5` it's possible to set method to `single` which will use the
17 | [Azure Python SDK](https://github.com/Azure/azure-sdk-for-python/tree/master/sdk/storage/azure-storage-blob) to process files one by one.
18 |
19 |
20 | # Installation
21 |
22 | ```commandline
23 | pip install azurebatchload
24 | ```
25 |
26 | See [PyPi](https://pypi.org/project/azurebatchload/) for package index.
27 |
28 | **Note**: For batch uploads (`method="batch"`) Azure CLI has to be [installed](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli)
29 | and [configured](https://docs.microsoft.com/en-us/cli/azure/get-started-with-azure-cli).
30 | Check if Azure CLI is installed through terminal:
31 |
32 | ```commandline
33 | az --version
34 | ```
35 |
36 | # Requirements
37 |
38 | Azure Storage connection string has to be set as environment variable `AZURE_STORAGE_CONNECTION_STRING` or
39 | the separate environment variables `AZURE_STORAGE_KEY` and `AZURE_STORAGE_ACCOUNT` which will be used to create the connection string.
40 |
41 | # Usage
42 |
43 | ## Download
44 | ### 1. Using the standard environment variables
45 |
46 | Azure-batch-load automatically checks for environment variables: `AZURE_STORAGE_CONNECTION_STRING`,
47 | `AZURE_STORAGE_KEY` and `AZURE_STORAGE_ACCOUNT`.
48 | So if the connection_string or storage_key + storage_account are set as environment variables,
49 | we can leave the argument `connection_string`, `account_key` and `account_name` empty:
50 |
51 | ```python
52 | from azurebatchload import Download
53 |
54 | Download(
55 | destination='../pdfs',
56 | source='blobcontainername',
57 | extension='.pdf'
58 | ).download()
59 | ```
60 |
61 | ### 2. Using `method="single"`
62 |
63 | We can skip the usage of the `Azure CLI` and just use the Python SDK by setting `method="single"`:
64 |
65 | ```python
66 | from azurebatchload import Download
67 |
68 | Download(
69 | destination='../pdfs',
70 | source='blobcontainername',
71 | extension='.pdf',
72 | method='single'
73 | ).download()
74 | ```
75 |
76 | ### 3. Download a specific folder from a container
77 |
78 | We can download a folder by setting the `folder` argument. This works both for `single` and `batch`.
79 |
80 | ```python
81 | from azurebatchload import Download
82 |
83 | Download(
84 | destination='../pdfs',
85 | source='blobcontainername',
86 | folder='uploads/invoices/',
87 | extension='.pdf',
88 | method='single'
89 | ).download()
90 | ```
91 |
92 | ### 4. Download a given list of files
93 |
94 | We can give a list of files to download with the `list_files` argument.
95 | Note, this only works with `method='single'`.
96 |
97 | ```python
98 | from azurebatchload import Download
99 |
100 | Download(
101 | destination='../pdfs',
102 | source='blobcontainername',
103 | folder='uploads/invoices/',
104 | list_files=["invoice1.pdf", "invoice2.pdf"],
105 | method='single'
106 | ).download()
107 | ```
108 |
109 | ## Upload:
110 |
111 | ### 1. Using the standard environment variables
112 |
113 | ```python
114 | from azurebatchload import Upload
115 |
116 | Upload(
117 | destination='blobcontainername',
118 | source='../pdf',
119 | extension='*.pdf'
120 | ).upload()
121 | ```
122 |
123 | ### 2. Using the `method="single"` method which does not require Azure CLI.
124 |
125 | ```python
126 | from azurebatchload import Upload
127 |
128 | Upload(
129 | destination='blobcontainername',
130 | source='../pdf',
131 | extension='*.pdf',
132 | method="single"
133 | ).upload()
134 | ```
135 |
136 | ### 3. Upload a given list of files with the `list_files` argument.
137 |
138 | ```python
139 | from azurebatchload import Upload
140 |
141 | Upload(
142 | destination='blobcontainername',
143 | source='../pdf',
144 | list_files=["invoice1.pdf", "invoice2.pdf"],
145 | method="single"
146 | ).upload()
147 | ```
148 |
149 | ## List blobs
150 |
151 | With the `Utils.list_blobs` method we can do advanced listing of blobs in a container or specific folder in a container.
152 | We have several arguments we can use to define our scope of information:
153 |
154 | - `name_starts_with`: This can be used to filter files with certain prefix, or to select certain folders: `name_starts_with=folder1/subfolder/lastfolder/`
155 | - `dataframe`: Define if you want a pandas dataframe object returned for your information.
156 | - `extended_info`: Get just the blob names or more extended information like size, creation date, modified date.
157 |
158 | ### 1. List a whole container with just the filenames as a list.
159 | ```python
160 | from azurebatchload import Utils
161 |
162 | list_blobs = Utils(container='containername').list_blobs()
163 | ```
164 |
165 | ### 2. List a whole container with just the filenames as a dataframe.
166 | ```python
167 | from azurebatchload import Utils
168 |
169 | df_blobs = Utils(
170 | container='containername',
171 | dataframe=True
172 | ).list_blobs()
173 | ```
174 |
175 | ### 3. List a folder in a container.
176 | ```python
177 | from azurebatchload import Utils
178 |
179 | list_blobs = Utils(
180 | container='containername',
181 | name_starts_with="foldername/"
182 | ).list_blobs()
183 | ```
184 |
185 | ### 4. Get extended information of a folder.
186 | ```python
187 | from azurebatchload import Utils
188 |
189 | dict_blobs = Utils(
190 | container='containername',
191 | name_starts_with="foldername/",
192 | extended_info=True
193 | ).list_blobs()
194 | ```
195 |
196 | ### 5. Get extended information of a folder returned as a pandas dataframe.
197 | ```python
198 | from azurebatchload import Utils
199 |
200 | df_blobs = Utils(
201 | container='containername',
202 | name_starts_with="foldername/",
203 | extended_info=True,
204 | dataframe=True
205 | ).list_blobs()
206 | ```
207 |
--------------------------------------------------------------------------------
/azurebatchload/__init__.py:
--------------------------------------------------------------------------------
1 | from azurebatchload.download import Download
2 | from azurebatchload.upload import Upload
3 | from azurebatchload.utils import Utils
4 |
--------------------------------------------------------------------------------
/azurebatchload/checks.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import re
4 | from subprocess import STDOUT, CalledProcessError, check_output
5 |
6 |
class Checks:
    """Validation helpers for Azure Storage credentials and local directories."""

    def __init__(self, directory):
        # Directory used by _check_dir/_create_dir; may be None when the
        # subclass does not operate on a local directory.
        self.directory = directory

    @staticmethod
    def _create_connection_string():
        """
        Build an AZURE_STORAGE_CONNECTION_STRING from the environment
        variables AZURE_STORAGE_ACCOUNT and AZURE_STORAGE_KEY.

        Returns
        -------
        str
            The formatted connection string.
        """
        base_string = (
            "DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};"
            "EndpointSuffix=core.windows.net"
        )
        connection_string = base_string.format(
            account_name=os.environ.get("AZURE_STORAGE_ACCOUNT", None),
            account_key=os.environ.get("AZURE_STORAGE_KEY", None),
        )

        return connection_string

    def _check_connection_credentials(self):
        """
        Resolve storage credentials from the environment.

        If AZURE_STORAGE_CONNECTION_STRING is set it is returned as-is and the
        account name/key are parsed out of it. Otherwise a connection string is
        generated from AZURE_STORAGE_ACCOUNT and AZURE_STORAGE_KEY.

        Returns
        -------
        tuple
            (connection_string, account_name, account_key)

        Raises
        ------
        ValueError
            If neither the connection string nor both the account name and
            account key are set as environment variables.
        """
        connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING", None)
        account_name = os.environ.get("AZURE_STORAGE_ACCOUNT", None)
        account_key = os.environ.get("AZURE_STORAGE_KEY", None)

        if connection_string:
            account_name, account_key = self._parse_connection_string(connection_string)
            return connection_string, account_name, account_key
        elif all([account_name, account_key]):
            return self._create_connection_string(), account_name, account_key
        else:
            # check for env variables else raise
            raise ValueError(
                "If AZURE_STORAGE_CONNECTION_STRING is not set as env variable "
                " AZURE_STORAGE_KEY and AZURE_STORAGE_ACCOUNT have to be set."
            )

    def _check_dir(self):
        """
        Validate that the configured source directory exists.

        Note: this method only checks existence — it never creates the
        directory (that is _create_dir's job); the old docstring claimed
        otherwise.

        Raises
        ------
        FileNotFoundError
            If self.directory does not exist.
        """
        if not os.path.exists(self.directory):
            raise FileNotFoundError(f"Source directory {self.directory} not found")

    def _create_dir(self, directory=None):
        """
        Create a directory (including parents) if it does not exist yet.

        Parameters
        ----------
        directory : str, optional
            Directory to create; defaults to self.directory.

        Returns
        -------
        None
        """
        if not directory:
            directory = self.directory

        if not os.path.exists(directory):
            logging.info(f"Destination {directory} does not exist, creating..")
            os.makedirs(directory)

    @staticmethod
    def _check_azure_cli_installed():
        """
        Check whether the Azure CLI ("az") is available.

        Returns
        -------
        bool
            True if "az --version" runs successfully, False otherwise.
        """
        try:
            # Pass a single command string with shell=True: the original
            # passed a list, which on POSIX hands "--version" to the shell
            # instead of to "az".
            check_output("az --version", stderr=STDOUT, shell=True)
            return True
        except (CalledProcessError, OSError):
            # OSError additionally covers environments where no shell or
            # command can be spawned at all.
            logging.debug("Azure CLI is not installed, automatically setting method to 'single'")
            return False

    @staticmethod
    def _parse_connection_string(connection_string):
        """
        Extract the account name and key from a connection string.

        Uses "[^;]*" so a trailing "AccountKey=..." without a terminating
        semicolon (a common, valid format) parses as well; the original
        pattern "(.*?);" required the semicolon and raised AttributeError.

        Parameters
        ----------
        connection_string : str

        Returns
        -------
        tuple
            (account_name, account_key)
        """
        account_name = re.search(r"AccountName=([^;]*)", connection_string).group(1)
        account_key = re.search(r"AccountKey=([^;]*)", connection_string).group(1)

        return account_name, account_key
103 |
--------------------------------------------------------------------------------
/azurebatchload/core.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 |
3 | from azure.storage.blob import BlobSasPermissions, generate_blob_sas
4 |
5 | from azurebatchload.checks import Checks
6 |
7 |
class Base(Checks):
    """Shared configuration and helpers for batch/single up- and downloads."""

    def __init__(
        self,
        destination,
        folder,
        extension=None,
        modified_since=None,
        method="batch",
        list_files=None,
        expiry_download_links=7,
    ):
        """
        Parameters
        ----------
        destination : str
            Target container (upload) or local directory (download).
        folder : str
            Folder/prefix to operate on; also passed to Checks as directory.
        extension : str, optional
            File extension filter, e.g. ".pdf".
        modified_since : str, optional
            Only process files modified since this date.
        method : str
            "batch" (Azure CLI) or "single" (Python SDK). Automatically
            falls back to "single" when the Azure CLI is not installed.
        list_files : list, optional
            Explicit list of file names (only valid with method="single").
        expiry_download_links : int
            Number of days generated SAS download links stay valid.
        """
        super().__init__(directory=folder)

        self.destination = destination
        self.folder = folder
        self.extension = extension
        self.modified_since = modified_since
        if not self._check_azure_cli_installed():
            self.method = "single"
        else:
            self.method = method
        self.list_files = list_files
        credentials = self._check_connection_credentials()
        self.connection_string = credentials[0]
        self.account_name = credentials[1]
        self.account_key = credentials[2]
        self.expiry_download_links = expiry_download_links

    def checks(self):
        """
        Validate the method/list_files combination.

        Raises
        ------
        ValueError
            On an unknown method, on list_files combined with
            method="batch", or on a non-list list_files.
        """
        allowed_methods = ("batch", "single")
        if self.method not in allowed_methods:
            raise ValueError(f"Method {self.method} is not a valid method. Choose from {' or '.join(allowed_methods)}.")

        if self.list_files and self.method == "batch":
            raise ValueError("list_files is only allowed with method='single'.")

        if self.list_files and not isinstance(self.list_files, list):
            raise ValueError(f"Argument list_files was set, but is not of type list, but type {type(self.list_files)}")

    def create_blob_link(self, blob_folder, blob_name) -> str:
        """
        Create a read-only SAS download URL for a blob.

        Parameters
        ----------
        blob_folder : str
            Folder inside the container; falsy means the container root.
        blob_name : str
            Name of the blob.

        Returns
        -------
        str
            Full https URL including the SAS token, valid for
            self.expiry_download_links days.
        """
        full_path_blob = f"{blob_folder}/{blob_name}" if blob_folder else blob_name
        url = f"https://{self.account_name}.blob.core.windows.net/{self.destination}/{full_path_blob}"
        sas_token = generate_blob_sas(
            account_name=self.account_name,
            account_key=self.account_key,
            container_name=self.destination,
            blob_name=full_path_blob,
            permission=BlobSasPermissions(read=True, delete_previous_version=False),
            expiry=datetime.utcnow() + timedelta(days=self.expiry_download_links),
        )

        return f"{url}?{sas_token}"

    @staticmethod
    def create_not_case_sensitive_extension(extension):
        """
        Make an fnmatch extension pattern case-insensitive:
        .pdf -> *.[Pp][Dd][Ff]
        .csv -> *.[Cc][Ss][Vv]
        A leading "*" is prepended when missing.
        """
        new_extension = ""
        for letter in extension:
            if not letter.isalpha():
                new_extension += letter
            else:
                new_extension += f"[{letter.upper()}{letter}]"

        if not new_extension.startswith("*"):
            new_extension = "*" + new_extension

        return new_extension

    def define_pattern(self):
        """
        Build the fnmatch pattern from folder and extension.

        Returns
        -------
        str or None
            Pattern such as "folder/*.[Pp][Dd][Ff]", or None when neither
            folder nor extension is set.
        """
        # Only normalize when an extension was given: the original called
        # create_not_case_sensitive_extension unconditionally, raising
        # TypeError ("not iterable") whenever extension was None.
        if self.extension:
            self.extension = self.create_not_case_sensitive_extension(self.extension)

        if self.folder and not self.extension:
            if self.folder.endswith("/"):
                pattern = self.folder + "*"
            else:
                pattern = self.folder + "/*"
        elif self.folder and self.extension:
            # self.extension already starts with "*" after normalization;
            # the original prepended another "*", producing a redundant
            # "**" prefix with identical fnmatch semantics.
            pattern = self.folder.rstrip("/") + "/" + self.extension
        elif not self.folder and self.extension:
            pattern = self.extension
        else:
            pattern = None

        return pattern
99 |
--------------------------------------------------------------------------------
/azurebatchload/download.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | from azure.storage.blob import BlobServiceClient
5 |
6 | from azurebatchload.core import Base
7 |
8 |
class Download(Base):
    """Download blobs from an Azure Storage container.

    method='batch' shells out to the Azure CLI (`az storage blob
    download-batch`); method='single' downloads blob-by-blob via the SDK.
    """

    def __init__(
        self,
        destination,
        source,
        folder=None,
        extension=None,
        method="batch",
        modified_since=None,
        create_dir=True,
        list_files=None,
    ):
        super().__init__(
            destination=destination,
            folder=folder,
            extension=extension,
            modified_since=modified_since,
            method=method,
            list_files=list_files,
        )
        self.checks()
        self.source = source
        if create_dir:
            # make sure the local target directory exists before downloading
            target = os.path.join(self.destination, self.folder) if self.folder else self.destination
            self._create_dir(target)

    def _download_batch(self):
        """Download via the Azure CLI, optionally restricted by a pattern."""
        pattern = self.define_pattern()

        cmd = f"az storage blob download-batch -d {self.destination} -s {self.source}"
        optional_flags = {
            "--connection-string": self.connection_string,
            "--pattern": pattern,
        }

        # append only the flags that actually have a value
        for flag, value in optional_flags.items():
            if value:
                cmd = f"{cmd} {flag} '{value}'"

        os.system(cmd)

    def _download_single(self):
        """Download blobs one by one through the Python SDK, applying the
        extension and list_files filters."""
        service = BlobServiceClient.from_connection_string(self.connection_string)
        container = service.get_container_client(container=self.source)

        downloaded = 0
        for blob in container.list_blobs(name_starts_with=self.folder):
            # skip blobs whose extension does not match (case-insensitive)
            if self.extension and not blob.name.lower().endswith(self.extension.lower()):
                continue

            folder_part, file_name = os.path.split(blob.name)

            # if list_files is given, only download matched file names
            if self.list_files and file_name not in self.list_files:
                continue

            blob_client = container.get_blob_client(blob=blob.name)
            self._create_dir(os.path.abspath(os.path.join(self.destination, folder_part)))
            logging.debug(f"Downloading file {blob.name}")
            with open(os.path.join(self.destination, blob.name), "wb") as fh:
                fh.write(blob_client.download_blob().readall())

            downloaded += 1

        logging.info(f"Downloaded total of {downloaded} files")

    def download(self):
        """Dispatch to the CLI (batch) or SDK (single) implementation."""
        if self.method == "batch":
            self._download_batch()
        else:
            self._download_single()
86 |
--------------------------------------------------------------------------------
/azurebatchload/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zypp-io/azure-batch-load/c076bb267d0a43e1d3423d6e0d1642c410b9227c/azurebatchload/tests/__init__.py
--------------------------------------------------------------------------------
/azurebatchload/tests/test_download.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zypp-io/azure-batch-load/c076bb267d0a43e1d3423d6e0d1642c410b9227c/azurebatchload/tests/test_download.py
--------------------------------------------------------------------------------
/azurebatchload/tests/test_upload.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zypp-io/azure-batch-load/c076bb267d0a43e1d3423d6e0d1642c410b9227c/azurebatchload/tests/test_upload.py
--------------------------------------------------------------------------------
/azurebatchload/tests/test_utils.py:
--------------------------------------------------------------------------------
import logging
import os

from dotenv import load_dotenv

from azurebatchload import Utils

load_dotenv()


def test_utils_download_links():
    """Smoke test: list the blobs of the container named in the CONTAINER env var."""
    logging.info("starting test for Utils - download links")
    utils = Utils(container=os.environ.get("CONTAINER"), create_download_links=True).list_blobs()
    # Bug fix: list_blobs() returns a plain list by default (dataframe=False),
    # which has no .shape attribute. len() works for both a list and a DataFrame.
    logging.info(f"finished test with {len(utils)} records")


if __name__ == "__main__":
    test_utils_download_links()
19 |
--------------------------------------------------------------------------------
/azurebatchload/upload.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | from azure.storage.blob import BlobServiceClient
5 |
6 | from azurebatchload.core import Base
7 |
8 |
class Upload(Base):
    """Upload local files to an Azure Storage container.

    method='batch' shells out to the Azure CLI (`az storage fs directory
    upload`); method='single' walks the source directory and uploads
    file-by-file via the SDK, optionally returning SAS download links.
    """

    def __init__(
        self,
        destination,
        source,
        folder=None,
        extension=None,
        method="batch",
        modified_since=None,
        overwrite=False,
        list_files=None,
        create_download_links=False,
        expiry_download_links=7,
    ):
        # NOTE: the *local* source directory is stored on the base class as
        # `folder` (so self.folder == source), while the *remote* target
        # folder inside the container is kept here as `blob_folder`.
        super(Upload, self).__init__(
            destination=destination,
            folder=source,
            extension=extension,
            modified_since=modified_since,
            method=method,
            list_files=list_files,
            expiry_download_links=expiry_download_links,
        )
        self.blob_folder = folder  # destination folder inside the container
        self.overwrite = overwrite
        self.create_download_links = create_download_links

    def upload_batch(self):
        """Upload the whole source directory recursively via the Azure CLI."""
        cmd = f"az storage fs directory upload " f"-f {self.destination} " f"-s {self.folder} -r"

        # optional flags: only appended when they have a value
        non_default = {"-d": self.blob_folder, "--connection-string": self.connection_string}

        for flag, value in non_default.items():
            if value:
                cmd = f"{cmd} {flag} '{value}'"

        # NOTE(review): the command is built via string interpolation and run
        # through the shell; values containing single quotes would break it —
        # confirm inputs are trusted.
        os.system(cmd)

    def upload_single(self):
        """Walk self.folder and upload each matching file through the SDK.

        Returns:
            dict: file name -> SAS download URL when create_download_links is
            set, otherwise an empty dict.
        """
        blob_service_client = BlobServiceClient.from_connection_string(self.connection_string)
        download_links = {}

        for root, dirs, files in os.walk(self.folder):
            for file in files:
                full_path = os.path.join(root, file)

                # ignore hidden files
                if file.startswith("."):
                    continue

                # if list_files is given, only upload matched files
                if self.list_files and file not in self.list_files:
                    continue

                # if extension is given only upload if extension is matched
                if self.extension and os.path.isfile(full_path) and not file.lower().endswith(self.extension.lower()):
                    continue

                # path of the current file relative to the source directory
                blob_folder = root.replace(self.folder, "").lstrip("/")

                if self.blob_folder:
                    # we only want to append blob_folder if it actually is a path or folder
                    # blob_folder can be empty string ""
                    if blob_folder:
                        blob_folder = os.path.join(self.blob_folder, blob_folder)
                    else:
                        blob_folder = self.blob_folder

                # if no folder is given, just upload to the container root path
                if not blob_folder:
                    container = self.destination
                else:
                    # NOTE(review): os.path.join uses the OS separator; on
                    # Windows this would put backslashes in the container
                    # path — verify if Windows support is needed.
                    container = os.path.join(self.destination, blob_folder)
                container_client = blob_service_client.get_container_client(container=container)

                with open(full_path, "rb") as data:
                    logging.debug(f"Uploading blob {full_path}")
                    container_client.upload_blob(data=data, name=file, overwrite=self.overwrite)

                if self.create_download_links:
                    download_links[file] = self.create_blob_link(blob_folder=blob_folder, blob_name=file)

        return download_links

    def upload(self):
        """Validate the configuration, then dispatch to the batch (CLI) or
        single (SDK) implementation. Only upload_single returns download links."""
        self.checks()

        logging.info(f"Uploading to container {self.destination} with method = '{self.method}'.")
        if self.method == "batch":
            return self.upload_batch()
        else:
            return self.upload_single()
101 |
--------------------------------------------------------------------------------
/azurebatchload/utils.py:
--------------------------------------------------------------------------------
1 | from ntpath import basename
2 |
3 | from azure.storage.blob import BlobServiceClient
4 | from pandas import DataFrame
5 |
6 | from azurebatchload.core import Base
7 |
8 |
class Utils(Base):
    """Utilities for inspecting an Azure Storage container: list blobs
    (optionally as a pandas DataFrame) and create SAS download links."""

    def __init__(
        self,
        container,
        name_starts_with=None,
        dataframe=False,
        extended_info=False,
        create_download_links=False,
        expiry_download_links=7,
    ):
        super(Utils, self).__init__(
            destination=container,
            folder=name_starts_with,
            expiry_download_links=expiry_download_links,
        )

        self.container = container
        self.name_starts_with = name_starts_with  # blob name prefix filter
        self.dataframe = dataframe  # return DataFrames instead of lists
        self.extended_info = extended_info
        self.connection_string = self._check_connection_credentials()[0]
        self.create_download_links = create_download_links
        self._blob_service_client = BlobServiceClient.from_connection_string(self.connection_string)
        self._container_client = self._blob_service_client.get_container_client(self.container)

    ###################
    # private methods #
    ###################

    def _get_files(self):
        """Return an iterator over blobs, filtered by the name prefix."""
        return self._container_client.list_blobs(name_starts_with=self.name_starts_with)

    @staticmethod
    def _get_file_names_simple(files):
        """Reduce a blob iterator to a plain list of blob names."""
        return [file.get("name") for file in files]

    def _list_blobs_not_extended(self, files):
        """Return blob names as a list, or a one-column DataFrame."""
        file_names = self._get_file_names_simple(files)
        # 1. dataframe = False, return just a list of file names
        if not self.dataframe:
            return file_names
        # 2. dataframe = True, dataframe with one column filenames
        return DataFrame({"filename": file_names})

    def _list_blobs_extended(self, files):
        """Return per-blob metadata as a list of dicts, or a DataFrame with
        size converted to megabytes."""
        included_info = ("name", "container", "last_modified", "creation_time", "size")
        # keep only the exposed keys, preserving each blob's own key order
        new_file_list = [{key: value for key, value in file.items() if key in included_info} for file in files]
        # 3. dataframe = False, return just a list of dicts
        if not self.dataframe:
            return new_file_list
        # 4. dataframe = True, return dataframe
        df = DataFrame(new_file_list)
        df = df.reindex(columns=included_info)
        # convert size to mb
        df["size"] = (df["size"] / 1_000_000).round(2)
        return df.rename(columns={"size": "size_mb"})

    ##################
    # public methods #
    ##################

    def list_blobs(self):
        """List blobs; the result's shape depends on self.dataframe and
        self.extended_info (list of names, list of dicts, or DataFrame)."""
        files = self._get_files()
        if not self.extended_info:
            return self._list_blobs_not_extended(files)
        return self._list_blobs_extended(files)

    def create_blob_links(self):
        """Return one SAS download link per blob, as a list of
        {"filename", "url"} dicts or as a DataFrame."""
        url_list = []
        for file in self._get_file_names_simple(self._get_files()):
            # Bug fix: the previous replace("/"+basename, "") left root-level
            # blobs with their own file name as the folder, producing broken
            # "name/name" blob paths. Split on the last "/" instead; the
            # folder is "" for blobs in the container root.
            blob_folder = file.rsplit("/", 1)[0] if "/" in file else ""
            url = self.create_blob_link(blob_folder=blob_folder, blob_name=basename(file))
            url_list.append({"filename": file, "url": url})

        if self.dataframe:
            return DataFrame(url_list)
        return url_list
98 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=42",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
7 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pre-commit
2 | python-dotenv
3 | setuptools>=61.0.0
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | azure-storage-blob>=12.8.1
2 | pandas>=1.4.1
3 |
--------------------------------------------------------------------------------
/scripts/check_setupcfg_and_requirementst_equal.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pathlib
3 |
4 | from setuptools.config.setupcfg import read_configuration
5 |
6 |
def get_config():
    """Read install_requires from the repo's setup.cfg.

    Returns:
        tuple: (list of requirement strings, absolute repo root path).
    """
    repo_path = pathlib.Path(__file__).parent.parent.absolute()
    parsed = read_configuration(os.path.join(repo_path, "setup.cfg"))
    return parsed["options"]["install_requires"], repo_path
13 |
14 |
def check():
    """Assert that requirements.txt matches install_requires in setup.cfg.

    Raises:
        AssertionError: when the two requirement lists differ.
    """
    config_requirements, repo_path = get_config()

    with open(os.path.join(repo_path, "requirements.txt")) as f:
        # ignore blank lines so a trailing newline cannot break the comparison
        requirements_txt = [line for line in f.read().splitlines() if line.strip()]

    assert sorted(config_requirements) == sorted(requirements_txt), "Requirements are not equal"
    # fixed garbled success message ("and setup.cfg and both are equal")
    print("requirements.txt and setup.cfg are equal")
23 |
24 |
if __name__ == "__main__":
    # entry point used by the check_reqs CI workflow; fails with an
    # AssertionError when requirements.txt and setup.cfg diverge
    check()
27 |
--------------------------------------------------------------------------------
/scripts/generate_requirements_from_setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pathlib
3 |
4 | from setuptools.config import read_configuration
5 |
6 |
def get_config():
    """Read install_requires from the repo's setup.cfg.

    Returns:
        tuple: (list of requirement strings, absolute repo root path).
    """
    # Consistency fix: read_configuration moved to setuptools.config.setupcfg
    # in setuptools 61 (the sibling check script already imports it from
    # there). Import locally with a fallback so old setuptools still works;
    # the deprecated module-level import above is shadowed here.
    try:
        from setuptools.config.setupcfg import read_configuration as _read_configuration
    except ImportError:  # setuptools < 61
        from setuptools.config import read_configuration as _read_configuration

    repo_path = pathlib.Path(__file__).parent.parent.absolute()
    config_setup = _read_configuration(os.path.join(repo_path, "setup.cfg"))
    config_requirements = config_setup["options"]["install_requires"]

    return config_requirements, repo_path
13 |
14 |
def generate_requirements():
    """Write the install_requires list from setup.cfg into requirements.txt."""
    requirements, repo_path = get_config()

    target = os.path.join(repo_path, "requirements.txt")
    with open(target, "w") as f:
        f.write("\n".join(requirements))

    print(
        "Generated requirements.txt from setup.cfg, with the following requirements\n", "\n".join(requirements)
    )
24 |
25 |
if __name__ == "__main__":
    # regenerate requirements.txt from setup.cfg when executed directly
    generate_requirements()
28 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = azurebatchload
3 | version = 0.6.3
4 | author = Erfan Nariman, Melvin Folkers
5 | author_email = hello@zypp.io
6 | description = Download and upload files in batches from Azure Blob Storage Containers
7 | long_description = file: README.md
8 | long_description_content_type = text/markdown
9 | keywords = python, azure, blob, download, upload, batch
10 | url = https://github.com/zypp-io/azure-batch-load
11 | project_urls =
12 | Bug Tracker = https://github.com/zypp-io/azure-batch-load/issues
13 | Source = https://github.com/zypp-io/azure-batch-load
14 | classifiers =
15 | Programming Language :: Python :: 3
16 | License :: OSI Approved :: MIT License
17 | Operating System :: OS Independent
18 |
19 | [options]
20 | packages = azurebatchload
21 | python_requires = >=3.7
22 | install_requires =
23 | azure-storage-blob>=12.8.1
24 | pandas>=1.4.1
25 |
26 | [flake8]
27 | statistics = True
28 | count = True
29 | max-complexity=12
30 | max-line-length=120
31 | per-file-ignores=__init__.py: F401
32 |
--------------------------------------------------------------------------------