├── .coveragerc
├── .env
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── pull_request_template.md
│   └── workflows
│       ├── ci.yml
│       └── pypi-publish.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── ROADMAP.md
├── conftest.py
├── docker-compose.yml
├── docker
│   └── raster_loader
│       └── Dockerfile
├── docs
│   ├── .pages
│   ├── Makefile
│   ├── make.bat
│   └── source
│       ├── _static
│       │   ├── carto-logo.png
│       │   └── custom.css
│       ├── conf.py
│       ├── developer_guide
│       │   ├── contribute.md
│       │   └── roadmap.md
│       ├── index.rst
│       └── user_guide
│           ├── cli.rst
│           ├── installation.rst
│           ├── modules
│           │   └── raster_loader.rst
│           └── use_with_python.rst
├── pytest.ini
├── raster_loader
│   ├── __init__.py
│   ├── cli
│   │   ├── __init__.py
│   │   ├── bigquery.py
│   │   ├── databricks.py
│   │   ├── info.py
│   │   └── snowflake.py
│   ├── errors.py
│   ├── geo.py
│   ├── io
│   │   ├── __init__.py
│   │   ├── bigquery.py
│   │   ├── common.py
│   │   ├── databricks.py
│   │   ├── datawarehouse.py
│   │   └── snowflake.py
│   ├── tests
│   │   ├── .env.sample
│   │   ├── __init__.py
│   │   ├── bigquery
│   │   │   ├── __init__.py
│   │   │   ├── test_cli.py
│   │   │   └── test_io.py
│   │   ├── databricks
│   │   │   ├── __init__.py
│   │   │   ├── test_cli.py
│   │   │   └── test_io.py
│   │   ├── fixtures
│   │   │   ├── expected_blocksize_512.pkl
│   │   │   ├── expected_custom_column.npy
│   │   │   ├── expected_custom_column.pkl
│   │   │   ├── expected_custom_multiple_column.npy
│   │   │   ├── expected_custom_multiple_column.pkl
│   │   │   ├── expected_default_column.npy
│   │   │   ├── expected_default_column.pkl
│   │   │   ├── expected_default_multiple_column.npy
│   │   │   ├── expected_multiple_column.pkl
│   │   │   ├── mosaic.tif
│   │   │   ├── mosaic_cog.tif
│   │   │   ├── mosaic_cog_1_1.tif
│   │   │   ├── mosaic_cog_1_2.tif
│   │   │   ├── mosaic_cog_2_1.tif
│   │   │   ├── mosaic_cog_2_2.tif
│   │   │   └── mosaic_cog_512.tif
│   │   ├── mocks.py
│   │   ├── snowflake
│   │   │   ├── __init__.py
│   │   │   ├── test_cli.py
│   │   │   └── test_io.py
│   │   └── test_utils.py
│   └── utils.py
├── requirements-dev.txt
├── requirements.txt
├── setup.cfg
└── setup.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 | conftest.py
4 | */test*
--------------------------------------------------------------------------------
/.env:
--------------------------------------------------------------------------------
1 | # === Raster Loader Environment Variables ===
2 | GOOGLE_APPLICATION_CREDENTIALS=/usr/local/gcloud/credentials.json
3 |
4 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: File a bug report to help improve the project
4 | title: '[BUG] '
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Bug Description**
11 | [A clear and concise description of the bug.]
12 |
13 | **Expected behavior**
14 | [A clear and concise description of what you expected to happen.]
15 |
16 | **Screenshots**
17 | [If applicable, add screenshots to help explain the issue.]
18 |
19 | **System information**
20 | [Run `carto info` in a terminal and add the output here, overwriting the
21 | text below.]
22 |
23 | ```text
24 | Raster Loader version: 0.1.dev32+gd59b0da
25 | Python version: 3.11.0 | packaged by conda-forge |
26 | Platform: Linux-5.10.16.3-microsoft-standard-WSL2-x86_64-with-glibc2.35
27 | System version: Linux 5.10.16.3-microsoft-standard-WSL2
28 | Machine: x86_64
29 | Processor: x86_64
30 | ```
31 |
32 | **Additional context**
33 | [Add any other context about the problem here.]
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | [A clear and concise description of what the problem is you'd like to see
12 | solved.]
13 |
14 | **Describe the solution you'd like**
15 | [A clear and concise description of what you want to happen.]
16 |
17 | **Describe alternatives you've considered**
18 | [A clear and concise description of any alternative solutions or features you've
19 | considered.]
20 |
21 | **Additional context**
22 | [Add any other context or screenshots about the feature request here.]
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | ## Issue
2 |
3 | Fixes #
4 | [Link the issue this PR is based on. All PRs need to be associated to at least
5 | one issue in the issue tracker.]
6 |
7 | ## Proposed Changes
8 |
9 | [Summarize the changes this PR contains. Describe how these changes resolve or
10 | help resolve the issue linked above.]
11 |
12 | ## Pull Request Checklist
13 |
14 | - [ ] I have tested the changes locally
15 | - [ ] I have added tests to cover my changes (if applicable)
16 | - [ ] I have updated the documentation (if applicable)
17 |
18 | ## Additional Information
19 |
20 | [Anything else you'd like to include.]
21 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 |
9 | jobs:
10 |
11 | lint-test:
12 | runs-on: ubuntu-latest
13 | timeout-minutes: 10
14 | strategy:
15 | matrix:
16 | python-version: ["3.9", "3.10", "3.11", "3.12"]
17 | steps:
18 | - uses: actions/checkout@v4
19 | - name: Set up Python ${{ matrix.python-version }}
20 | uses: actions/setup-python@v5
21 | with:
22 | python-version: ${{ matrix.python-version }}
23 | - name: Initialize environment
24 | run: make init
25 | - name: Run linter
26 | run: make lint
27 | - name: Run tests
28 | run: make test
29 | - name: Check docs
30 | run: make test-docs
31 |
--------------------------------------------------------------------------------
/.github/workflows/pypi-publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish package to PyPI
2 | on:
3 | push:
4 | tags:
5 | - '*'
6 | jobs:
7 | publish:
8 | runs-on: ubuntu-20.04
9 | steps:
10 | - uses: actions/checkout@master
11 | - name: Set up Python 3.9
12 | uses: actions/setup-python@v1
13 | with:
14 | python-version: 3.9
15 | - name: Get release version
16 | run: |
17 | echo "CHANGELOG_VERSION=$(cat CHANGELOG.md | grep -oP '(?<=##\s)(.*)(?=\])' | head -n 1 | sed 's/\[/v/g')" >> $GITHUB_ENV
18 | echo "TAG_VERSION=`echo $(git describe --tags --abbrev=0)`" >> $GITHUB_ENV
19 | - name: Check changelog release version
20 | if: ${{ env.TAG_VERSION != env.CHANGELOG_VERSION }}
21 | run: |
22 | echo "CHANGELOG_VERSION($CHANGELOG_VERSION) is different from TAG_VERSION($TAG_VERSION)"
23 | exit 1
24 | - name: Install dependencies
25 | run: |
26 | python -m pip install --upgrade pip
27 | pip install build
28 | - name: Get all git tags
29 | run: git fetch --tags -f
30 | - name: Build package
31 | run: |
32 | python -m build --sdist --wheel
33 | - name: Get package size
34 | run: echo "PKG_SIZE=$(find dist -maxdepth 1 -regex '.*gz' | xargs stat --format='%s')" >> $GITHUB_ENV
35 | - name: Check package size
36 | if: ${{ env.PKG_SIZE > 1e+8 }}
37 | run: |
38 | echo "PKG_SIZE($PKG_SIZE bytes) is greater than 100MB"
39 | exit 1
40 | - name: Publish package
41 | if: startsWith(github.ref, 'refs/tags')
42 | uses: pypa/gh-action-pypi-publish@release/v1
43 | with:
44 | skip-existing: true
45 | password: ${{ secrets.PYPI_API_TOKEN }}
46 |
--------------------------------------------------------------------------------
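
The "Get release version" step above derives the release number from the first heading in CHANGELOG.md and compares it with the pushed git tag. A minimal Python sketch of the same extraction, shown for illustration only (the workflow itself uses grep and sed):

```python
# Mirrors: grep -oP '(?<=##\s)(.*)(?=\])' | head -n 1 | sed 's/\[/v/g'
import re

heading = "## [0.11.0] 2025-03-20"  # first release heading in CHANGELOG.md

# Capture the text after "## " up to (but excluding) the closing "]".
match = re.search(r"(?<=##\s)(.*)(?=\])", heading)
changelog_version = match.group(0).replace("[", "v")

assert changelog_version == "v0.11.0"  # must equal the pushed tag, e.g. v0.11.0
```
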
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env*/
126 | venv*/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | # Images
163 | *.svg
164 |
165 | # Visual Studio Code
166 | .vscode
167 |
168 | # Custom
169 | carto_credentials.json
170 | raster_loader/tests/fixtures/*.tif.aux.xml
171 |
172 | # written by setuptools_scm
173 | */_version.py
174 | .idea/encodings.xml
175 | .idea/misc.xml
176 | .idea/modules.xml
177 | .idea/raster-loader.iml
178 | .idea/vcs.xml
179 | .idea/codeStyles/codeStyleConfig.xml
180 | .idea/codeStyles/Project.xml
181 | .idea/.gitignore
182 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | files: 'raster_loader\/'
2 | repos:
3 | - repo: https://github.com/psf/black
4 | rev: 22.3.0
5 | hooks:
6 | - id: black
7 | language_version: python3
8 | - repo: https://github.com/pycqa/flake8
9 | rev: 7.1.1
10 | hooks:
11 | - id: flake8
12 | language: python_venv
13 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Set the version of Python and other tools you might need
9 | build:
10 | os: ubuntu-22.04
11 | tools:
12 | python: "3.10"
13 |
14 | # Build documentation in the docs/ directory with Sphinx
15 | sphinx:
16 | configuration: docs/source/conf.py
17 |
18 | # If using Sphinx, optionally build your docs in additional formats such as PDF
19 | # formats:
20 | # - pdf
21 |
22 | # Optionally declare the Python requirements required to build your docs
23 | python:
24 | install:
25 | - requirements: requirements.txt
26 | - requirements: requirements-dev.txt
27 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | All notable changes to this project will be documented in this file.
4 |
5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7 |
8 |
9 |
10 | ## [0.11.0] 2025-03-20
11 |
12 | - feat: add support for databricks (#169) ([38f6006](https://github.com/CartoDB/raster-loader/commit/38f6006415d642c13c0a38191f183b6af7fa5c8f) by Valentin de la Cruz Barquero).
13 |
14 | ## [0.10.5] 2025-03-11
15 |
16 | ### Fixed
17 |
18 | - Fix band nodata NaN values (#170) ([a561949](https://github.com/CartoDB/raster-loader/commit/a56194978d531cbe0397088e142357f56b6a0f83) by Juan Ramón).
19 | - fix: matching compressions should be checked when appending rasters (#165) ([0e30d11](https://github.com/CartoDB/raster-loader/commit/0e30d116ba922b6ebc8ab7895eb7371c80d351e6) by Valentin de la Cruz Barquero).
20 |
21 | ## [0.10.4] 2025-03-07
22 |
23 | ### Fixed
24 |
25 | - Fix band nodata value (#167) ([2bf9bb5](https://github.com/CartoDB/raster-loader/commit/2bf9bb594a4e3eabcf5c7245cae8ad22f6cee156) by Juan Ramón).
26 |
27 | ## [0.10.3] 2025-01-17
28 |
29 | [Compare with latest](https://github.com/CartoDB/raster-loader/compare/v0.10.2...v0.10.3)
30 |
31 | ### Fixed
32 |
33 | - fix: most_common_approx duplicated counts definition ([1415d3d](https://github.com/CartoDB/raster-loader/commit/1415d3d2731bd0a1f35054846c598b7f3a839115) by cayetanobv).
34 |
35 | ### Added
36 |
37 | - add new option: compression level (#162) ([fc45f68](https://github.com/CartoDB/raster-loader/commit/fc45f68415929b780d69311e51cef7e954ae87ac) by Cayetano Benavent).
38 |
39 | ## [0.10.2] 2025-01-14
40 |
41 | [Compare with latest](https://github.com/CartoDB/raster-loader/compare/v0.10.1...HEAD)
42 |
43 | ### Fixed
44 |
45 | - Fix: OverflowError when casting approx sum to integer ([46cab53](https://github.com/CartoDB/raster-loader/commit/46cab53bbf71a86a7df784922956eb03f9dbb327) by Roberto Antolín).
46 | - Fix: Compute approximate most common negative values ([f9f5ff5](https://github.com/CartoDB/raster-loader/commit/f9f5ff5010b1aea0d13afbea6d1869d4094fa7d7) by Roberto Antolín).
47 |
48 | ## [0.10.1] 2025-01-13
49 |
50 | [Compare with latest](https://github.com/CartoDB/raster-loader/compare/57d55999704fb003da2947db65d5617e27c5c104...HEAD)
51 |
52 | ### Added
53 |
54 | - Snowflake key pair authentication support (#158)
55 |
56 |
57 |
58 | ## [0.10.0] 2025-01-10
59 |
60 | [Compare with latest](https://github.com/CartoDB/raster-loader/compare/v0.9.2...HEAD)
61 |
62 | ### Added
63 |
64 | - add new option: compression (#160) ([c46dd51](https://github.com/CartoDB/raster-loader/commit/c46dd51bf53847e21de7550e5b826be1a6cda3eb) by Cayetano Benavent).
65 |
66 | ## [0.9.2] 2024-12-11
67 |
68 | [Compare with latest](https://github.com/CartoDB/raster-loader/compare/v0.9.1...HEAD)
69 |
70 | ### Added
71 |
72 | - Add: Compute top values only for integer bands ([6c10cc0](https://github.com/CartoDB/raster-loader/commit/6c10cc025f5691f7841beee560437fb591bddfe9) by Roberto Antolín).
73 |
74 | ### Fixed
75 |
76 | - Fix: Tackle degenerate case of stdev computation ([b112c80](https://github.com/CartoDB/raster-loader/commit/b112c80be7d7c1adfd08f651b43dc591fd54a2ef) by Roberto Antolín).
77 | - Fix: Get count stats from shape of raster band ([c066a30](https://github.com/CartoDB/raster-loader/commit/c066a307ee116598c54ea4871d563f79deebad0b) by Roberto Antolín).
78 | - Fix: Raise error when 0 non-masked samples due to sparse rasters ([dfd89ae](https://github.com/CartoDB/raster-loader/commit/dfd89aef27726a3217843022769600315d8e5b6f) by Roberto Antolín).
79 |
80 | ### Changed
81 |
82 | - Change '--all_stats' flag to '--basic_stats' ([2cb89cc](https://github.com/CartoDB/raster-loader/pull/156/commits/2cb89cca30eb15189c876760c026074e262cc10f) by Roberto Antolín).
83 |
84 | ## [0.9.1] 2024-11-26
85 |
86 | ### Fixed
87 |
88 | - fix: changed default no data for byte data type ([06ad98f](https://github.com/CartoDB/raster-loader/commit/06ad98f3723c44ce847f475887cdca084c6ca571) by volaya).
89 |
90 | ## [0.9.0](https://github.com/CartoDB/raster-loader/releases/tag/v0.9.0) - 2024-11-04
91 |
92 | [Compare with first commit](https://github.com/CartoDB/raster-loader/compare/167c3d69359f9b3abb49a3c1c5aa6249f76c0992...v0.9.0)
93 |
96 | ### Added
97 |
98 | - Added exact stats (#153)
99 |
100 | ## [0.8.2] - 2024-10-07
101 |
102 | ### Bug Fixes
103 |
104 | - Fix casting in quantiles (#151)
105 |
106 | ## [0.8.1] - 2024-09-24
107 |
108 | ### Bug Fixes
109 |
110 | - Fix stats for unmasked rasters
111 |
112 | ## [0.8.0] - 2024-09-17
113 |
114 | ### Added
115 |
116 | - Add metadata for Builder Rasters (#147)
117 |
118 | ## [0.7.1] - 2024-07-05
119 |
120 | ### Bug Fixes
121 |
122 | - fix: support for Python 3.9 / NumPy 2.0 (#145)
123 |
124 | ## [0.7.0] - 2024-07-02
125 |
126 | ### Added
127 |
128 | - Support raster overviews (#140)
129 |
130 | ### Enhancements
131 |
132 | - increase chunk-size to 10000 (#142)
133 |
134 | ### Bug Fixes
135 |
136 | - fix: make the gdalwarp examples consistent (#143)
137 |
138 | ## [0.6.1] - 2024-04-02
139 |
140 | ### Enhancements
141 |
142 | - Add an argument to skip interactive question on upload failure (#138)
143 |
144 | ### Bug Fixes
145 |
146 | - fix: shapely.wkt import (#136)
147 | - fix: update pip commands to make it compatible with zsh (#137)
148 |
149 | ## [0.6.0] - 2024-03-25
150 |
151 | ### Enhancements
152 |
153 | - Add labels to BQ uploaded tables (#131)
154 | - Support input URLs and more connection credential types (#129)
155 |
156 | ### Bug Fixes
157 |
158 | - Fixed using raster files with a block size other than the default value (#130)
159 | - fix: error when bigquery dependencies not installed (#133)
160 |
161 | ## [0.5.0] - 2024-01-05
162 |
163 | ### Enhancements
164 |
165 | - Add support for snowflake (#127)
166 |
167 | ## [0.4.0] - 2023-12-21
168 |
169 | ### Enhancements
170 |
171 | - Update raster-loader to generate new Raster and Metadata table format (#116)
172 | - Add pixel_resolution, rename block_resolution (#123)
173 |
174 | ### Bug Fixes
175 |
176 | - fix: metadata field pixel_resolution as an integer and not allow zooms over 26 (#124, #125)
177 |
178 | ## [0.3.3] - 2023-10-30
179 |
180 | ### Bug Fixes
181 |
182 | - Fixed issue in parsing long json decimals (#117)
183 |
184 | ## [0.3.2] - 2023-09-15
185 |
186 | ### Enhancements
187 |
188 | - Add append option to skip check (#114)
189 |
190 | ## [0.3.1] - 2023-04-21
191 |
192 | ### Enhancements
193 |
194 | - Store raster nodata value in table metadata (#111)
195 | - Add level to raster metadata (#110)
196 |
197 | ### Bug Fixes
198 |
199 | - Fixed issue in metadata when updating table (#112)
200 |
201 | ## [0.3.0] - 2023-03-07
202 |
203 | ### Enhancements
204 |
205 | - Create raster tables with geography and metadata (#105)
206 |
207 | ### Bug Fixes
208 |
209 | - Fixed band in field name (#102)
210 | - Dockerfile - avoid installing GDAL twice (#104)
211 |
212 | ## [0.2.0] - 2023-01-26
213 |
214 | ### Enhancements
215 |
216 | - Updated setup.cfg and readme (#70)
217 | - Bumped wheel from 0.37.1 to 0.38.1 (#63)
218 | - Added a basic docker-compose based dev environment (#80)
219 | - Use quadbin (#72)
220 | - Raise rasterio.errors.CRSError for invalid CRS and Add test error condition (#89)
221 | - Cluster "quadbin raster" table by quadbin (#95)
222 | - Changed the endianness to little endian to accommodate the front end (#97)
223 |
224 | ### Bug Fixes
225 |
226 | - Fixed swapped lon lat (#75)
227 | - Fixed performance regression bug (#77)
228 | - Added small speed hack (#79)
229 |
230 | ### Documentation
231 |
232 | - Added docs badge and readthedocs link to readme (#69)
233 | - Updated contributor guide (#91)
234 | - Updated docs for quadbin (#93)
235 |
236 | ## [0.1.0] - 2023-01-05
237 |
238 | ### Added
239 |
240 | - raster_loader module
241 | - rasterio_to_bigquery function
242 | - bigquery_to_records function
243 | - rasterio_to_bigquery.cli submodule
244 | - upload command
245 | - describe command
246 | - info command
247 | - docs
248 | - tests
249 | - CI/CD with GitHub Actions
250 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributor guide
2 |
3 | Thank you for your interest in contributing to Raster Loader!
4 |
5 | ## Feature requests and bug reports
6 |
7 | Reporting bugs and submitting ideas for new features are great ways to help make Raster
8 | Loader better.
9 |
10 | To report a bug or request a feature, please
11 | [create a new issue in the GitHub repository](https://github.com/CartoDB/raster-loader/issues/new/choose).
12 | The issue tracker gives you the option to choose between a bug report and a feature
13 | request. It also provides templates with more information on how to file your bug report
14 | or feature request.
15 |
16 | ## Contributing code and documentation
17 |
18 | ### Prerequisites
19 |
20 | You will need to sign a Contributor License Agreement (CLA) before making a submission.
21 | [Learn more here](https://carto.com/contributions/).
22 |
23 | Raster Loader uses GitHub and git for version control. If you are new to git, you can
24 | learn more about it [here](https://git-scm.com/book/en/v2/Getting-Started-About-Version-Control).
25 |
26 | Raster Loader uses a Makefile to automate many aspects of the development process.
27 | Using the Makefile requires that you have [GNU Make](https://www.gnu.org/software/make/) installed.
28 |
29 | ### Setting up your environment
30 |
31 | Create a [fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/about-forks)
32 | of [the Raster Loader repository](https://github.com/CartoDB/raster-loader).
33 | Use `git clone` to clone the repo to your local machine.
34 |
35 | Once the repository is cloned, you can use the Makefile to set up your development
36 | environment:
37 |
38 | ```bash
39 | make init
40 | ```
41 |
42 | This will create a virtual environment in the `env` directory and install all
43 | necessary dependencies, including a development version of Raster Loader and the
44 | Raster Loader CLI.
45 |
46 | If you don't have `make` available, you can open the file `Makefile` and run the
47 | commands manually to set up a virtual environment and install the dependencies.
48 |
49 | After creating your environment, you can enter the virtual environment with
50 | `source env/bin/activate` on Linux and macOS or `env\Scripts\Activate.ps1` on Windows
51 | (PowerShell).
52 |
53 | ### Setting up your environment (Docker / Docker-Compose)
54 |
55 | As an alternative to setting up a virtual environment, you can also set up a
56 | development environment using Docker:
57 |
58 | 1. Install Docker and Docker Compose on your system by following the instructions for your operating system from the official Docker website.
59 | 2. Use `git clone` to clone [the Raster Loader repository](https://github.com/CartoDB/raster-loader)
60 | to your local machine.
61 | 3. Navigate to the root directory of the repository in your terminal.
62 | 4. Run `make docker-build` command to build the docker image
63 | 5. Run `make docker-start` to start the development environment. Keep this process running.
64 | 6. Begin your development in a new terminal.
65 | 7. Run `make docker-test` to run the test suite.
66 | 8. Run a targeted test using pytest flags: `make docker-test PYTEST_FLAGS='-s -k array'`
67 | 9. Run `git checkout -b my-new-feature` to start a new feature branch
68 | 10. Consider writing a test in `raster_loader/tests/` to guide your implementation
69 | 11. Drop into `pdb` when a test fails: `make docker-test PYTEST_FLAGS='-s --pdb'`
70 | 12. Run `make docker-enter` to open a terminal inside of the docker container
71 | 13. Run `make docker-stop` to stop the development environment
72 | 14. Run `make docker-remove` to remove docker raster_loader Container/Network/Volume from your system
73 |
74 | _Note: If you want to make changes to library dependencies (e.g., `requirements.txt` or `requirements-dev.txt`) while the container is running, you'll need to rebuild the image with `make docker-build` and restart the container._
75 |
76 | ### Tests and linting
77 |
78 | Before submitting a pull request, you need to make sure your updates pass tests and
79 | linting.
80 |
81 | #### Running linting
82 |
83 | To run linting, use the following command:
84 |
85 | ```bash
86 | make lint
87 | ```
88 |
89 | This runs [flake8](https://flake8.pycqa.org/en/latest/) and
90 | [black](https://black.readthedocs.io/en/stable/). You can also run these tools
91 | individually using the `flake8` or `black` command.
92 |
93 | #### Running tests
94 |
95 | Raster Loader uses [pytest](https://docs.pytest.org/en/stable/) for testing. You can
96 | run the tests with the following command:
97 |
98 | ```bash
99 | make test
100 | ```
101 |
102 | This runs all tests in the `tests` directory. You can also run all tests with the
103 | `pytest` command.
104 |
105 | The test suite includes optional integration tests that require credentials for a
106 | BigQuery account. To run these tests, you need to set the `GOOGLE_APPLICATION_CREDENTIALS`
107 | environment variable to the path of a JSON file containing your BigQuery credentials
108 | (see the [GCP documentation](https://cloud.google.com/docs/authentication/provide-credentials-adc#local-key)
109 | for more information).
110 |
111 | You must also copy `raster_loader/tests/.env.sample` to `raster_loader/tests/.env` and edit it to set a
112 | test project and dataset in which the credentials used have permission to create tables.
113 | 
114 | If you're working on Windows, set the environment variables or the `.env` file manually in your terminal.
115 |
116 | After setting up your credentials and `.env` file, you can enable the integration
117 | tests with the following command:
118 |
119 | ```bash
120 | pytest --runintegration
121 | ```
122 |
123 | #### Updating tests
124 |
125 | All new code needs to be covered by tests. The tests for Raster Loader are located in
126 | the `raster_loader/tests` directory. Each Python module in the package should have its
127 | own test module in this directory.
128 |
129 | The `raster_loader/tests` directory also contains tests for the CLI. To learn more about
130 | writing tests for the CLI, see the
131 | [Click documentation](https://click.palletsprojects.com/en/8.1.x/testing/).
132 |
133 | To only run a specific test file, use the following command:
134 |
135 | ```bash
136 | pytest raster_loader/tests/[test_file_name]
137 | ```
138 |
139 | To only run a specific test, use the following command:
140 |
141 | ```bash
142 | pytest -k "[test_name]"
143 | ```
144 |
145 | ### Updating documentation
146 |
147 | All new features and updates to features need to be documented.
148 |
149 | Raster Loader uses [Sphinx](https://www.sphinx-doc.org/en/master/) to generate
150 | documentation.
151 |
152 | The documentation is located in the `docs` directory. You can build the documentation
153 | with the following command:
154 |
155 | ```bash
156 | make docs
157 | ```
158 |
159 | This will generate the documentation in the `docs/build` directory.
160 |
161 | The documentation follows the
162 | [Google developer documentation style guide](https://developers.google.com/style).
163 |
164 | The documentation also includes a module API reference. This reference is generated
165 | automatically from the docstrings in the code. Please use
166 | [NumPy style](https://numpydoc.readthedocs.io/en/latest/format.html) for all your
167 | docstrings.
168 |
169 | Also included in the documentation is a reference of all available CLI commands.
170 | This reference is generated automatically from the docstrings in the CLI code. See
171 | the [documentation for sphinx-click](https://sphinx-click.readthedocs.io/en/latest/)
172 | for more information on how to document the CLI.
173 |
174 | ### Making pull requests
175 |
176 | All contributions to Raster Loader are made through pull requests to the
177 | [the Raster Loader repository](https://github.com/CartoDB/raster-loader).
178 |
179 | See the [GitHub documentation](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests)
180 | for more information on how to use pull requests.
181 |
182 | All pull requests must reference an issue in the Raster Loader repository (a bug report
183 | or feature request, for example). If you can't find an issue to reference, make
184 | sure to create a new issue before submitting your pull request.
185 |
186 | Pull requests to the Raster Loader repository must pass all automated tests and linting
187 | before they are considered for merging. You can use the ["WIP" label](https://github.com/CartoDB/raster-loader/labels/WIP)
188 | or [mark your pull request as a draft](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests#draft-pull-requests)
189 | to indicate that it is not ready for review.
190 |
191 | Before merging a pull request, the Raster Loader maintainers will review your code and
192 | might request changes. You can make changes to your pull request by pushing new commits.
193 |
194 | ### Commit messages and changelog
195 |
196 | Raster Loader uses Semantic Versioning and Conventional Commits. Format your PR and commit messages accordingly.
197 |
198 | To update the changelog before a new release, run the following in your terminal:
199 |
200 | $ git-changelog --output CHANGELOG.md --in-place --filter-commits "v0.8.2.."
201 |
--------------------------------------------------------------------------------
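
The contributor guide above points to Click's testing utilities for CLI tests. A minimal sketch of such a test, assuming the `carto` entry point is exposed as a Click group named `main` in `raster_loader/cli/__init__.py` (adjust the import to match the actual layout):

```python
from click.testing import CliRunner

from raster_loader.cli import main  # assumption: adjust to the real entry point


def test_info_command_exits_cleanly():
    # CliRunner invokes the command in-process and captures output/exit code.
    runner = CliRunner()
    result = runner.invoke(main, ["info"])
    assert result.exit_code == 0
```
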
/LICENSE:
--------------------------------------------------------------------------------
1 | SPDX short identifier: BSD-3-Clause
2 |
3 | Copyright (c) 2022, CARTO
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
9 |
10 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
11 |
12 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | VENV ?= env
2 | DIST=dist
3 | BUILD=build
4 | BIN=$(VENV)/bin
5 |
6 | .PHONY: docs
7 |
8 | init:
9 | test `command -v python3` || echo Please install python3
10 | [ -d $(VENV) ] || python3 -m venv $(VENV)
11 | $(BIN)/pip install -r requirements-dev.txt
12 | $(BIN)/pre-commit install
13 | $(BIN)/pip install -e .[snowflake,bigquery,databricks]
14 |
15 | lint:
16 | $(BIN)/black raster_loader setup.py
17 | $(BIN)/flake8 raster_loader setup.py
18 |
19 | test:
20 | $(BIN)/pytest raster_loader --cov=raster_loader --verbose
21 |
22 | test-integration:
23 | $(BIN)/pytest raster_loader --cov=raster_loader --verbose --runintegration
24 |
25 | docs:
26 | cd docs; make clean html
27 |
28 | test-docs:
29 | $(BIN)/sphinx-build -a -W --keep-going docs/source/ docs/build/
30 |
31 | publish-pypi:
32 | rm -rf $(DIST) $(BUILD) *.egg-info
33 | $(BIN)/python setup.py sdist bdist_wheel
34 | $(BIN)/twine upload $(DIST)/*
35 |
36 | publish-test-pypi:
37 | rm -rf $(DIST) $(BUILD) *.egg-info
38 | $(BIN)/python setup.py sdist bdist_wheel
39 | $(BIN)/twine upload --repository-url https://test.pypi.org/legacy/ $(DIST)/* --verbose
40 |
41 | clean:
42 | rm -rf $(VENV) $(DIST) $(BUILD) *.egg-info
43 |
44 | ENTER_CONTAINER:=docker-compose exec raster_loader
45 |
46 | .PHONY: docker-build
47 | docker-build: ## Build necessary stuff.
48 | docker-compose build
49 |
50 | .PHONY: docker-start
51 | docker-start: ## Start containers with docker-compose and attach to logs.
52 | docker-compose up --no-build
53 |
54 | .PHONY: docker-test
55 | docker-test: ## Enter the running backend container and run tests.
56 | $(ENTER_CONTAINER) sh -c 'cd raster_loader && pytest $(PYTEST_FLAGS)'
57 |
58 | .PHONY: docker-enter
59 | docker-enter: ## Enter the backend container.
60 | $(ENTER_CONTAINER) bash
61 |
62 | .PHONY: docker-stop
63 | docker-stop: ## Stop all running containers.
64 | docker-compose stop
65 |
66 | .PHONY: docker-remove
67 | docker-remove: ## Remove all containers / volumes
68 | docker-compose down --volumes
69 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # raster-loader
2 |
3 | [![PyPI version](https://badge.fury.io/py/raster-loader.svg)](https://badge.fury.io/py/raster-loader)
4 | [![PyPI downloads](https://img.shields.io/pypi/dm/raster-loader.svg)](https://pypistats.org/packages/raster-loader)
5 | [![CI](https://github.com/cartodb/raster-loader/actions/workflows/ci.yml/badge.svg)](https://github.com/cartodb/raster-loader/actions)
6 | [![Documentation Status](https://readthedocs.org/projects/raster-loader/badge/?version=latest)](https://raster-loader.readthedocs.io/en/latest/?badge=latest)
7 |
8 | Python library for loading GIS raster data to standard cloud-based data warehouses that
9 | don't natively support raster data.
10 |
11 | Raster Loader is currently tested on Python 3.9, 3.10, 3.11 and 3.12.
12 |
13 | ## Documentation
14 |
15 | The Raster Loader documentation is available at [raster-loader.readthedocs.io](https://raster-loader.readthedocs.io).
16 |
17 | ## Install
18 |
19 | ```bash
20 | pip install -U raster-loader
21 | ```
22 |
23 | To install from source:
24 |
25 | ```bash
26 | git clone https://github.com/cartodb/raster-loader
27 | cd raster-loader
28 | pip install -U .
29 | ```
30 |
31 | > **Tip**: In most cases, it is recommended to install Raster Loader in a virtual environment. Use [venv](https://docs.python.org/3/library/venv.html) to create and manage your virtual environment.
32 |
33 | The above will install the dependencies required to work with all cloud providers (BigQuery, Snowflake, Databricks). If you only want to work with one of them, you can install the dependencies for each separately:
34 |
35 | ```bash
36 | pip install -U raster-loader[bigquery]
37 | pip install -U raster-loader[snowflake]
38 | pip install -U raster-loader[databricks]
39 | ```
40 |
41 | For Databricks, you will also need to install the [databricks-connect](https://pypi.org/project/databricks-connect/) package corresponding to your Databricks Runtime Version. For example, if your cluster uses DBR 15.1, install:
42 |
43 | ```bash
44 | pip install databricks-connect==15.1
45 | ```
46 |
47 | You can find your cluster's DBR version in the Databricks UI under Compute > Your Cluster > Configuration > Databricks Runtime version,
48 | or by running the following SQL query from your cluster:
49 |
50 | ```sql
51 | SELECT current_version();
52 | ```
53 |
54 | To verify the installation was successful, run:
55 |
56 | ```bash
57 | carto info
58 | ```
59 |
60 | This command will display system information including the installed Raster Loader version.
61 |
62 | ## Prerequisites
63 |
64 | Before using Raster Loader with each platform, you need to have the following set up:
65 |
66 | **BigQuery:**
67 | - A [GCP project](https://cloud.google.com/resource-manager/docs/creating-managing-projects)
68 | - A [BigQuery dataset](https://cloud.google.com/bigquery/docs/datasets-intro)
69 | - The `GOOGLE_APPLICATION_CREDENTIALS` environment variable set to the path of a JSON file containing your BigQuery credentials. See the [GCP documentation](https://cloud.google.com/docs/authentication/provide-credentials-adc#local-key) for more information.
70 |
71 | **Snowflake:**
72 | - A Snowflake account
73 | - A Snowflake database
74 | - A Snowflake schema
75 |
76 | **Databricks:**
77 | - A [Databricks server hostname](https://docs.databricks.com/aws/en/integrations/compute-details)
78 | - A [Databricks cluster id](https://learn.microsoft.com/en-us/azure/databricks/workspace/workspace-details#cluster-url)
79 | - A [Databricks token](https://docs.databricks.com/aws/en/dev-tools/auth/pat)
80 |
81 | **Raster files**
82 |
83 | The input raster must be a `GoogleMapsCompatible` raster. You can make your raster compatible by converting it with the following GDAL command:
84 |
85 | ```bash
86 | gdalwarp -of COG -co TILING_SCHEME=GoogleMapsCompatible -co COMPRESS=DEFLATE -co OVERVIEWS=IGNORE_EXISTING -co ADD_ALPHA=NO -co RESAMPLING=NEAREST -co BLOCKSIZE=512 <input>.tif <output>.tif
87 | ```
88 |
89 | Your raster file must be in a format that can be [read by GDAL](https://gdal.org/drivers/raster/index.html) and processed with [rasterio](https://rasterio.readthedocs.io/en/latest/).
90 |
91 | ## Usage
92 |
93 | There are two ways you can use Raster Loader:
94 |
95 | * Using the CLI by running `carto` in your terminal
96 | * Using Raster Loader as a Python library (`import raster_loader`)
97 |
98 | ### CLI
99 |
100 | After installing Raster Loader, you can run the CLI by typing `carto` in your terminal.
101 |
102 | Currently, Raster Loader allows you to upload a local raster file to BigQuery, Snowflake, or Databricks tables. You can also download and inspect raster files from these platforms.
103 |
104 | #### Uploading Raster Data
105 |
106 | Examples for each platform:
107 |
108 | **BigQuery:**
109 | ```bash
110 | carto bigquery upload \
111 | --file_path /path/to/my/raster/file.tif \
112 | --project my-gcp-project \
113 | --dataset my-bigquery-dataset \
114 | --table my-bigquery-table \
115 | --overwrite
116 | ```
117 |
118 | **Snowflake:**
119 | ```bash
120 | carto snowflake upload \
121 | --file_path /path/to/my/raster/file.tif \
122 | --database my-snowflake-database \
123 | --schema my-snowflake-schema \
124 | --table my-snowflake-table \
125 | --account my-snowflake-account \
126 | --username my-snowflake-user \
127 | --password my-snowflake-password \
128 | --overwrite
129 | ```
130 |
131 | Note that authentication parameters are explicitly required since they are not set up in the environment.
132 |
133 | **Databricks:**
134 | ```bash
135 | carto databricks upload \
136 | --file_path /path/to/my/raster/file.tif \
137 | --catalog my-databricks-catalog \
138 | --schema my-databricks-schema \
139 | --table my-databricks-table \
140 | --server-hostname my-databricks-server-hostname \
141 | --cluster-id my-databricks-cluster-id \
142 | --token my-databricks-token \
143 | --overwrite
144 | ```
145 |
146 | Note that authentication parameters are explicitly required since they are not set up in the environment.
147 |
148 | Additional features include:
149 | - Specifying bands with `--band` and `--band_name`
150 | - Enabling compression with `--compress` and `--compression-level`
151 | - Chunking large uploads with `--chunk_size`
152 |
153 | #### Inspecting Raster Data
154 |
155 | To inspect a raster file stored in any platform, use the `describe` command:
156 |
157 | **BigQuery:**
158 | ```bash
159 | carto bigquery describe \
160 | --project my-gcp-project \
161 | --dataset my-bigquery-dataset \
162 | --table my-bigquery-table
163 | ```
164 |
165 | **Snowflake:**
166 | ```bash
167 | carto snowflake describe \
168 | --database my-snowflake-database \
169 | --schema my-snowflake-schema \
170 | --table my-snowflake-table \
171 | --account my-snowflake-account \
172 | --username my-snowflake-user \
173 | --password my-snowflake-password
174 | ```
175 |
176 | Note that authentication parameters are explicitly required since they are not set up in the environment.
177 |
178 | **Databricks:**
179 | ```bash
180 | carto databricks describe \
181 | --catalog my-databricks-catalog \
182 | --schema my-databricks-schema \
183 | --table my-databricks-table \
184 | --server-hostname my-databricks-server-hostname \
185 | --cluster-id my-databricks-cluster-id \
186 | --token my-databricks-token
187 | ```
188 |
189 | Note that authentication parameters are explicitly required since they are not set up in the environment.
190 |
191 | For a complete list of options and commands, run `carto --help` or see the [full documentation](https://raster-loader.readthedocs.io/en/latest/user_guide/cli.html).
192 |
193 | ### Using Raster Loader as a Python library
194 |
195 | After installing Raster Loader, you can use it in your Python project.
196 |
197 | First, import the corresponding connection class for your platform:
198 |
199 | ```python
200 | # For BigQuery
201 | from raster_loader import BigQueryConnection
202 |
203 | # For Snowflake
204 | from raster_loader import SnowflakeConnection
205 |
206 | # For Databricks
207 | from raster_loader import DatabricksConnection
208 | ```
209 |
210 | Then, create a connection object with the appropriate parameters:
211 |
212 | ```python
213 | # For BigQuery
214 | connection = BigQueryConnection('my-project')
215 |
216 | # For Snowflake
217 | connection = SnowflakeConnection('my-user', 'my-password', 'my-account', 'my-database', 'my-schema')
218 |
219 | # For Databricks
220 | connection = DatabricksConnection('my-server-hostname', 'my-token', 'my-cluster-id')
221 | ```
222 |
223 | #### Uploading a raster file
224 |
225 | To upload a raster file, use the `upload_raster` function:
226 |
227 | ```python
228 | connection.upload_raster(
229 | file_path = 'path/to/raster.tif',
230 | fqn = 'database.schema.tablename'
231 | )
232 | ```
233 |
234 | This function returns `True` if the upload was successful.
235 |
236 | You can enable compression of the band data to reduce storage size:
237 |
238 | ```python
239 | connection.upload_raster(
240 | file_path = 'path/to/raster.tif',
241 | fqn = 'database.schema.tablename',
242 | compress = True, # Enable gzip compression of band data
243 | compression_level = 3 # Optional: Set compression level (1-9, default=6)
244 | )
245 | ```
246 |
247 | #### Inspecting a raster file
248 |
249 | To access and inspect a raster file stored in any platform, use the `get_records` function:
250 |
251 | ```python
252 | records = connection.get_records(
253 | fqn = 'database.schema.tablename'
254 | )
255 | ```
256 |
257 | This function returns a DataFrame with some samples from the raster table (10 rows by default).
258 |
259 | For more details, see the [full documentation](https://raster-loader.readthedocs.io/en/latest/user_guide/use_with_python.html).
260 |
261 | ## Development
262 |
263 | See [CONTRIBUTING.md](CONTRIBUTING.md) for information on how to contribute to this
264 | project.
265 |
266 | [ROADMAP.md](ROADMAP.md) contains a list of features and improvements planned for future
267 | versions of Raster Loader.
268 |
269 | ## Releasing
270 |
271 | ### 1. Create and merge a release PR updating the CHANGELOG
272 |
273 | - Branch: `release/X.Y.Z`
274 | - Title: `Release vX.Y.Z`
275 | - Description: CHANGELOG release notes
276 |
277 | Example:
278 | ```
279 | ## [0.7.0] - 2024-06-02
280 |
281 | ### Added
282 | - Support raster overviews (#140)
283 |
284 | ### Enhancements
285 | - increase chunk-size to 10000 (#142)
286 |
287 | ### Bug Fixes
288 | - fix: make the gdalwarp examples consistent (#143)
289 | ```
290 |
291 | ### 2. Create and push a tag `vX.Y.Z`
292 |
293 | This will trigger an automatic workflow that will publish the package at https://pypi.org/project/raster-loader.
294 |
295 | ### 3. Create the GitHub release
296 |
297 | Go to the tags page (https://github.com/CartoDB/raster-loader/tags), select the release tag and click on "Create a new release"
298 |
299 | - Title: `vX.Y.Z`
300 | - Description: CHANGELOG release notes
301 |
302 | Example:
303 | ```
304 | ### Added
305 | - Support raster overviews (#140)
306 |
307 | ### Enhancements
308 | - increase chunk-size to 10000 (#142)
309 |
310 | ### Bug Fixes
311 | - fix: make the gdalwarp examples consistent (#143)
312 | ```
313 |
--------------------------------------------------------------------------------
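
As a recap of the Python-library workflow described in the README above, a minimal end-to-end sketch; all project, dataset, and table names below are placeholders:

```python
from raster_loader import BigQueryConnection

connection = BigQueryConnection("my-gcp-project")

# Upload with gzip compression of the band data, using the compress /
# compression_level options documented above.
ok = connection.upload_raster(
    file_path="path/to/raster.tif",
    fqn="my-gcp-project.my_dataset.my_table",
    compress=True,
    compression_level=3,
)
assert ok  # upload_raster returns True on success

# Pull a sample of the uploaded table (10 rows by default).
records = connection.get_records(fqn="my-gcp-project.my_dataset.my_table")
print(records.head())
```
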
/ROADMAP.md:
--------------------------------------------------------------------------------
1 | # Raster Loader roadmap
2 |
3 | ## Vision
4 |
5 | The Raster Loader is an open-source tool for loading GIS raster data to standard
6 | cloud-based data warehouses that don't natively support raster data.
7 |
8 | ## Goals for version 1.0.0
9 |
10 | Goals for version 1.0.0 of Raster Loader include:
11 |
12 | For loading raster data:
13 |
14 | - Support loading raster data from a local file and file archives (ZIP)
15 | - Support loading raster data from a URL
16 | - Support loading raster data from a cloud storage bucket (such as AWS S3)
17 |
18 | For storing raster data in the cloud:
19 |
20 | - Support storing raster data in GCP, AWS, and Azure data warehouses
21 |
22 | ## Goals for future versions
23 |
24 | Goals for future versions include:
25 |
26 | - Add improved error handling
27 | - Make the CLI more user-friendly
28 |
--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
1 | # pytest configuration file
2 | import pytest
3 |
4 |
5 | def pytest_addoption(parser):
6 | parser.addoption("--runslow", action="store_true", help="run slow tests")
7 | parser.addoption("--runintegration",
8 | action="store_true",
9 | help="run integration tests")
10 |
11 |
12 | def pytest_collection_modifyitems(config, items):
13 | if config.getoption("--runslow"):
14 | # --runslow given in cli: do not skip slow tests
15 | return
16 | skip_slow = pytest.mark.skip(reason="need --runslow option to run")
17 | for item in items:
18 | if "slow_test" in item.keywords:
19 | item.add_marker(skip_slow)
20 |
21 | if config.getoption("--runintegration"):
22 | # --runintegration given in cli: do not skip integration tests
23 | return
24 |
25 | skip_integration = pytest.mark.skip(reason="need --runintegration option to run")
26 | for item in items:
27 | if "integration_test" in item.keywords:
28 | item.add_marker(skip_integration)
29 |
--------------------------------------------------------------------------------
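
Given the options registered in conftest.py above, individual tests opt in to being skipped by default via markers; they only run when pytest is invoked with `--runslow` or `--runintegration`. A minimal sketch (test names are hypothetical):

```python
import pytest


@pytest.mark.slow_test
def test_large_mosaic_processing():
    ...  # runs only with: pytest --runslow


@pytest.mark.integration_test
def test_upload_against_real_bigquery():
    ...  # runs only with: pytest --runintegration
```
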
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.8'
2 |
3 | services:
4 | raster_loader:
5 | platform: linux/amd64
6 | image: carto/raster_loader
7 | ports:
8 | - '8888:8888'
9 | build:
10 | context: .
11 | dockerfile: docker/raster_loader/Dockerfile
12 | volumes:
13 | - './:/code'
14 | env_file: ./.env
15 | command: |
16 | sh -c 'tail -f /dev/null'
17 |
--------------------------------------------------------------------------------
/docker/raster_loader/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM osgeo/gdal:3.2.0
2 |
3 | ENV HOMEAPP=/code
4 | ENV PATH=$PATH:$HOMEAPP/.local/bin
5 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
6 |
7 | WORKDIR $HOMEAPP/
8 |
9 | RUN apt-get update \
10 | && apt-get upgrade -y \
11 | && apt-get install --no-install-recommends -y \
12 | bash \
13 | build-essential \
14 | gcc \
15 | git \
16 | libpq-dev \
17 | python3-dev \
18 | postgresql-client \
19 | wget \
20 | python3-pip \
21 | && apt-get clean -y && rm -rf /var/lib/apt/lists/*
22 |
23 | # Using a non-privileged user to own our code
24 | RUN useradd -d $HOMEAPP -N non-privileged
25 |
26 | # Update non-privileged user folder permission
27 | RUN chown -R non-privileged $HOMEAPP
28 |
29 | # Copy the requirements files and the rest of the code into the container
30 | COPY requirements.txt .
31 | COPY requirements-dev.txt .
32 | COPY . .
33 |
34 | # Install the dependencies
35 | RUN pip install --upgrade pip
36 | RUN pip install --no-cache-dir -r requirements.txt
37 | RUN pip install --no-cache-dir -r requirements-dev.txt
38 |
39 | # Switch to the non-privileged user
40 | USER non-privileged
41 |
--------------------------------------------------------------------------------
/docs/.pages:
--------------------------------------------------------------------------------
1 | title: API Reference
2 | nav:
3 | - Overview: README.md
4 | - ...
5 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/_static/carto-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/docs/source/_static/carto-logo.png
--------------------------------------------------------------------------------
/docs/source/_static/custom.css:
--------------------------------------------------------------------------------
1 | #carto dt {
2 | border: none;
3 | border-left: 3px solid #ccc;
4 | background: #f0f0f0;
5 | color: #555;
6 | }
7 |
8 | #carto h3 {
9 | border-top: 3px solid #6ab0de;
10 | background: #e7f2fa;
11 | padding: 6px;
12 | }
13 |
14 | #carto h4 {
15 | font-size: 100%;
16 | border-top: 3px solid #6ab0de;
17 | background: #e7f2fa;
18 | padding: 4px;
19 | }
20 |
21 | #carto > section > section {
22 | margin-left: 8%;
23 | }
24 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # For the full list of built-in configuration values, see the documentation:
4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
5 |
6 | import sphinx_rtd_theme
7 | import datetime as dt
8 |
9 | import raster_loader
10 |
11 | # -- Project information -----------------------------------------------------
12 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
13 |
14 | project = "Raster Loader"
15 | author = "Carto"
16 | copyright = f"{dt.datetime.now().year}, {author}"
17 | 
18 | release = version = raster_loader.__version__
19 | language = "en"
20 |
21 | # -- General configuration ---------------------------------------------------
22 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
23 |
24 | extensions = [
25 | "myst_parser",
26 | "sphinx_click",
27 | "sphinx_rtd_theme",
28 | "sphinx.ext.autodoc",
29 | "sphinx.ext.coverage",
30 | "sphinx.ext.napoleon",
31 | ]
32 |
33 | templates_path = ["_templates"]
34 | exclude_patterns = []
35 |
36 | source_suffix = {
37 | '.rst': 'restructuredtext',
38 | '.txt': 'markdown',
39 | '.md': 'markdown',
40 | }
41 |
42 |
43 | # -- Options for HTML output -------------------------------------------------
44 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
45 |
46 | html_theme = "sphinx_rtd_theme"
47 | html_static_path = ["_static"]
48 | html_logo = "_static/carto-logo.png"
49 | html_theme_options = {
50 | "display_version": True,
51 | }
52 | html_context = {
53 | "display_github": True,
54 | "github_user": "CartoDB",
55 | "github_repo": "raster-loader",
56 | "github_version": "main/docs/",
57 | }
58 | html_favicon = "_static/carto-logo.png"
59 | html_css_files = [
60 | "custom.css",
61 | ]
62 |
--------------------------------------------------------------------------------
/docs/source/developer_guide/contribute.md:
--------------------------------------------------------------------------------
1 | ```{include} ../../../CONTRIBUTING.md
2 | ```
--------------------------------------------------------------------------------
/docs/source/developer_guide/roadmap.md:
--------------------------------------------------------------------------------
1 | ```{include} ../../../ROADMAP.md
2 | ```
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. _docs_index:
2 |
3 | Raster Loader
4 | =============
5 |
6 | The Raster Loader is a tool for loading GIS raster data to standard cloud-based data
7 | warehouses that don't natively support raster data.
8 |
9 | You can use this package in two ways:
10 |
11 | * As a standalone tool using the :ref:`command line interface <cli>`.
12 | * As a Python library that you can :ref:`import and use in your Python projects
13 |   <python>`.
14 |
15 | This documentation contains the following sections:
16 |
17 | .. toctree::
18 | :caption: Using Raster Loader
19 | :maxdepth: 1
20 |
21 | user_guide/installation
22 | user_guide/cli
23 | user_guide/use_with_python
24 | user_guide/modules/raster_loader
25 |
26 | .. toctree::
27 | :caption: Contributing to Raster Loader
28 | :maxdepth: 1
29 |
30 | developer_guide/contribute
31 | developer_guide/roadmap
32 |
--------------------------------------------------------------------------------
/docs/source/user_guide/cli.rst:
--------------------------------------------------------------------------------
1 | .. _cli:
2 |
3 | Using the Raster Loader CLI
4 | ===========================
5 |
6 | Most functions of the Raster Loader are accessible through the ``carto``
7 | command-line interface (CLI). To start the CLI, use the ``carto`` command in a
8 | terminal.
9 |
10 | Currently, Raster Loader allows you to upload a local raster file to a BigQuery, Snowflake, or Databricks table.
11 | You can also download and inspect a raster file from a BigQuery, Snowflake, or Databricks table.
12 |
13 |
14 | Using the Raster Loader with BigQuery
15 | -----------------------------------------
16 |
17 | Before you can upload a raster file, you need to have set up the following in
18 | BigQuery:
19 |
20 | #. A `GCP project`_
21 | #. A `BigQuery dataset`_
22 |
23 | To use the bigquery utilities, use the ``carto bigquery`` command. This command has
24 | several subcommands, which are described below.
25 |
26 | .. note::
27 |
28 | Accessing BigQuery with Raster Loader requires the ``GOOGLE_APPLICATION_CREDENTIALS``
29 | environment variable to be set to the path of a JSON file containing your BigQuery
30 | credentials. See the `GCP documentation`_ for more information.
31 |
32 | Using the Raster Loader with Snowflake
33 | -----------------------------------------
34 |
35 | Before you can upload a raster file, you need to have set up the following in
36 | Snowflake:
37 |
38 | #. A Snowflake account
39 | #. A Snowflake database
40 | #. A Snowflake schema
41 |
42 | To use the snowflake utilities, use the ``carto snowflake`` command. This command has
43 | several subcommands, which are described below.
44 |
45 | Using the Raster Loader with Databricks
46 | -----------------------------------------
47 |
48 | Before you can upload a raster file, you need to have set up the following in
49 | Databricks:
50 |
51 | #. A `Databricks server hostname`_
52 | #. A `Databricks cluster id`_
53 | #. A `Databricks token`_
54 |
55 | To use the databricks utilities, use the ``carto databricks`` command. This command has
56 | several subcommands, which are described below.
57 |
58 | Uploading a raster layer
59 | ------------------------
60 |
61 | To upload a raster file, use the ``carto [bigquery|snowflake|databricks] upload`` command.
62 |
63 | The input raster must be a ``GoogleMapsCompatible`` raster. You can make your raster compatible
64 | by converting it with the following GDAL command:
65 |
66 | .. code-block:: bash
67 |
68 |     gdalwarp -of COG -co TILING_SCHEME=GoogleMapsCompatible -co COMPRESS=DEFLATE -co OVERVIEWS=IGNORE_EXISTING -co ADD_ALPHA=NO -co RESAMPLING=NEAREST -co BLOCKSIZE=512 <input>.tif <output>.tif
69 |
70 | You can optionally create a table in your data warehouse beforehand and upload your
71 | data to it. If you do not specify a table name, Raster Loader automatically generates
72 | one and creates the table for you.
73 |
74 | At a minimum, the ``carto upload`` command requires a ``file_path`` to a local
75 | raster file that can be `read by GDAL`_ and processed with `rasterio`_. It also requires
76 | the ``project`` (the GCP project name) and ``dataset`` (the BigQuery dataset name)
77 | parameters in the case of BigQuery; the ``database`` and ``schema`` parameters in the
78 | case of Snowflake; or the ``catalog`` and ``schema`` parameters in the case of Databricks.
79 |
80 | There are also additional parameters, such as ``table`` (table
81 | name) and ``overwrite`` (to overwrite existing data). For example:
82 |
83 | .. code-block:: bash
84 |
85 | carto bigquery upload \
86 | --file_path /path/to/my/raster/file.tif \
87 | --project my-gcp-project \
88 | --dataset my-bigquery-dataset \
89 | --table my-bigquery-table \
90 | --overwrite
91 |
92 | This command uploads the TIFF file from ``/path/to/my/raster/file.tif`` to a BigQuery
93 | project named ``my-gcp-project``, a dataset named ``my-bigquery-dataset``, and a table
94 | named ``my-bigquery-table``. If the table already contains data, this data will be
95 | overwritten because the ``--overwrite`` flag is set.
96 |
97 | The same operation, performed with Snowflake, would be:
98 |
99 | .. code-block:: bash
100 |
101 | carto snowflake upload \
102 | --file_path /path/to/my/raster/file.tif \
103 | --database my-snowflake-database \
104 | --schema my-snowflake-schema \
105 | --table my-snowflake-table \
106 | --account my-snowflake-account \
107 | --username my-snowflake-user \
108 | --password my-snowflake-password \
109 | --overwrite
110 |
111 | Authentication parameters are explicitly required in this case for Snowflake, since they
112 | are not set up in the environment.
113 |
114 | The same operation, performed with Databricks, would be:
115 |
116 | .. code-block:: bash
117 |
118 | carto databricks upload \
119 | --file_path /path/to/my/raster/file.tif \
120 | --catalog my-databricks-catalog \
121 | --schema my-databricks-schema \
122 | --table my-databricks-table \
123 | --server-hostname my-databricks-server-hostname \
124 | --cluster-id my-databricks-cluster-id \
125 | --token my-databricks-token \
126 | --overwrite
127 |
128 | Authentication parameters are also explicitly required in the case of Databricks, since they
129 | are not set up in the environment.
130 |
131 | If no band is specified, the first band of the raster will be uploaded. If the
132 | ``--band`` flag is set, the specified band will be uploaded. For example, the following
133 | command uploads the second band of the raster:
134 |
135 | .. code-block:: bash
136 |
137 | carto bigquery upload \
138 | --file_path /path/to/my/raster/file.tif \
139 | --project my-gcp-project \
140 | --dataset my-bigquery-dataset \
141 | --table my-bigquery-table \
142 | --band 2
143 |
144 | Band names can be specified with the ``--band_name`` flag. For example, the following
145 | command uploads the ``red`` band of the raster:
146 |
147 | .. code-block:: bash
148 |
149 | carto bigquery upload \
150 | --file_path /path/to/my/raster/file.tif \
151 | --project my-gcp-project \
152 | --dataset my-bigquery-dataset \
153 | --table my-bigquery-table \
154 | --band 2 \
155 | --band_name red
156 |
157 | If the raster contains multiple bands, you can upload multiple bands at once by
158 | specifying a list of bands. For example, the following command uploads the first and
159 | second bands of the raster:
160 |
161 | .. code-block:: bash
162 |
163 | carto bigquery upload \
164 | --file_path /path/to/my/raster/file.tif \
165 | --project my-gcp-project \
166 | --dataset my-bigquery-dataset \
167 | --table my-bigquery-table \
168 | --band 1 \
169 | --band 2
170 |
171 | Or, with band names:
172 |
173 | .. code-block:: bash
174 |
175 | carto bigquery upload \
176 | --file_path /path/to/my/raster/file.tif \
177 | --project my-gcp-project \
178 | --dataset my-bigquery-dataset \
179 | --table my-bigquery-table \
180 | --band 1 \
181 | --band 2 \
182 | --band_name red \
183 | --band_name green
184 |
185 | You can enable compression of the band data using the ``--compress`` flag. This uses gzip compression which can significantly reduce storage size. By default, it uses compression level 6, which provides a good balance between compression ratio and performance. You can adjust this using the ``--compression-level`` parameter (values from 1 to 9, where 1 is fastest but least compressed, and 9 gives maximum compression):
186 |
187 | .. code-block:: bash
188 |
189 | carto bigquery upload \
190 | --file_path /path/to/my/raster/file.tif \
191 | --project my-gcp-project \
192 | --dataset my-bigquery-dataset \
193 | --table my-bigquery-table \
194 | --compress \
195 | --compression-level 3
196 |
197 | The same works for Snowflake:
198 |
199 | .. code-block:: bash
200 |
201 | carto snowflake upload \
202 | --file_path /path/to/my/raster/file.tif \
203 | --database my-snowflake-database \
204 | --schema my-snowflake-schema \
205 | --table my-snowflake-table \
206 | --account my-snowflake-account \
207 | --username my-snowflake-user \
208 | --password my-snowflake-password \
209 | --compress \
210 | --compression-level 3
211 |
212 | And for Databricks:
213 |
214 | .. code-block:: bash
215 |
216 | carto databricks upload \
217 | --file_path /path/to/my/raster/file.tif \
218 | --catalog my-databricks-catalog \
219 | --schema my-databricks-schema \
220 | --table my-databricks-table \
221 | --server-hostname my-databricks-server-hostname \
222 | --cluster-id my-databricks-cluster-id \
223 | --token my-databricks-token \
224 | --compress \
225 | --compression-level 3
226 |
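For intuition on what ``--compression-level`` trades off, here is a standalone
zlib sketch, independent of Raster Loader (the payload is an arbitrary stand-in
for a block of band data):

.. code-block:: python

    import zlib

    payload = bytes(512 * 512)  # a blank 512x512 single-byte band, for illustration
    for level in (1, 6, 9):
        print(level, len(zlib.compress(payload, level)))

Higher levels spend more CPU time for smaller output; Raster Loader's default is
level 6.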
227 | .. seealso::
228 | See the :ref:`cli_details` for a full list of options.
229 |
230 | For large raster files, you can use the ``--chunk_size`` flag to specify the number of
231 | rows to upload at once. This prevents BigQuery from raising an exception like the
232 | following, caused by too many update operations on the destination table:
233 |
234 | .. code-block:: text
235 |
236 |     Exceeded rate limits: too many table update operations for this table. For more information, see https://cloud.google.com/bigquery/troubleshooting-errors
237 |
238 | The default chunk size is 10000 rows.
239 |
240 | For example, the following command uploads the raster in chunks
241 | of 20000 rows:
242 |
243 | .. code-block:: bash
244 |
245 | carto bigquery upload \
246 | --file_path /path/to/my/raster/file.tif \
247 | --project my-gcp-project \
248 | --dataset my-bigquery-dataset \
249 | --table my-bigquery-table \
250 | --chunk_size 20000
251 |
252 | For large raster files in Databricks, you might get the following error:
253 |
254 | .. code-block:: text
255 |
256 |     Error uploading records: Cannot convert pyarrow.lib.ChunkedArray to pyarrow.lib.Array
257 |
258 | This error occurs when the raster file is too large to upload in one go and the
259 | default chunk size is too large. In this case, try reducing the number of rows
260 | uploaded at once with the ``--chunk_size`` flag.
261 |
262 | Inspecting a raster file
263 | ------------------------------------
264 |
265 | You can also use Raster Loader to retrieve information about a raster file stored in a
266 | BigQuery, Snowflake, or Databricks table. This can be useful to make sure a raster file was transferred correctly
267 | or to get information about a raster file's metadata, for example.
268 |
269 | To access a raster file in a BigQuery table, use the ``carto bigquery describe`` command.
270 |
271 | At a minimum, this command requires a `GCP project`_ name, a
272 | `BigQuery dataset`_ name, and a
273 | `BigQuery table`_ name. For example:
274 |
275 | .. code-block:: bash
276 |
277 | carto bigquery describe \
278 | --project my-gcp-project \
279 | --dataset my-bigquery-dataset \
280 | --table my-bigquery-table
281 |
282 | The same operation, performed with Snowflake, would be:
283 |
284 | .. code-block:: bash
285 |
286 | carto snowflake describe \
287 | --database my-snowflake-database \
288 | --schema my-snowflake-schema \
289 | --table my-snowflake-table \
290 | --account my-snowflake-account \
291 | --username my-snowflake-user \
292 | --password my-snowflake-password
293 |
294 | Authentication parameters are explicitly required in this case for Snowflake, since they
295 | are not set up in the environment.
296 |
297 | The same operation, performed with Databricks, would be:
298 |
299 | .. code-block:: bash
300 |
301 | carto databricks describe \
302 | --catalog my-databricks-catalog \
303 | --schema my-databricks-schema \
304 | --table my-databricks-table \
305 | --server-hostname my-databricks-server-hostname \
306 | --cluster-id my-databricks-cluster-id \
307 | --token my-databricks-token
308 |
309 | Authentication parameters are also explicitly required in the case of Databricks, since they
310 | are not set up in the environment.
311 |
312 | .. seealso::
313 | See the :ref:`cli_details` for a full list of options.
314 |
315 | .. _cli_details:
316 |
317 | CLI details
318 | -----------
319 |
320 | The following is a detailed overview of all of the CLI's subcommands and options:
321 |
322 | .. click:: raster_loader.cli:main
323 | :prog: carto
324 | :nested: full
325 |
326 | .. _`GCP documentation`: https://cloud.google.com/docs/authentication/provide-credentials-adc#local-key
327 | .. _`read by GDAL`: https://gdal.org/drivers/raster/index.html
328 | .. _`rasterio`: https://rasterio.readthedocs.io/en/latest/
329 | .. _`GCP project`: https://cloud.google.com/resource-manager/docs/creating-managing-projects
330 | .. _`BigQuery dataset`: https://cloud.google.com/bigquery/docs/datasets-intro
331 | .. _`BigQuery table`: https://cloud.google.com/bigquery/docs/tables-intro
332 | .. _`Databricks server hostname`: https://docs.databricks.com/aws/en/integrations/compute-details
333 | .. _`Databricks cluster id`: https://learn.microsoft.com/en-us/azure/databricks/workspace/workspace-details#cluster-url
334 | .. _`Databricks token`: https://docs.databricks.com/aws/en/dev-tools/auth/pat
--------------------------------------------------------------------------------
/docs/source/user_guide/installation.rst:
--------------------------------------------------------------------------------
1 | .. _installation:
2 |
3 | Installing Raster Loader
4 | ========================
5 |
6 | Raster Loader is available on PyPI_ and can be installed with pip_:
7 |
8 | .. code-block:: bash
9 |
10 | pip install -U raster-loader
11 |
12 | To install from source:
13 |
14 | .. code-block:: bash
15 |
16 | git clone https://github.com/cartodb/raster-loader
17 | cd raster-loader
18 | pip install -U .
19 |
20 | .. tip::
21 |
22 | In most cases, it is recommended to install Raster Loader in a virtual environment.
23 | Use venv_ to create and manage your virtual environment.
24 |
25 | The above installs the dependencies required to work with all supported cloud providers (BigQuery, Snowflake, Databricks). If you only want to work with one of them, you can install its
26 | dependencies separately:
27 |
28 | .. code-block:: bash
29 |
30 | pip install -U raster-loader"[bigquery]"
31 | pip install -U raster-loader"[snowflake]"
32 | pip install -U raster-loader"[databricks]"
33 |
34 | For Databricks, you will additionally need to install the databricks-connect_ package matching your Databricks Runtime version. For example, if your cluster uses DBR 15.1, install:
35 |
36 | .. code-block:: bash
37 |
38 | pip install databricks-connect==15.1
39 |
40 | You can find your cluster's DBR version in the Databricks UI under Compute > Your Cluster > Configuration > Databricks Runtime version.
41 | Or you can run the following SQL query from your cluster:
42 |
43 | .. code-block:: sql
44 |
45 | SELECT current_version();
46 |
47 | After installing the Raster Loader package, you will have access to the
48 | :ref:`carto CLI `. To make sure the installation was successful, run the
49 | following command in your terminal:
50 |
51 | .. code-block:: bash
52 |
53 | carto info
54 |
55 | This command should print some basic system information, including the version of Raster
56 | Loader installed on your system. For example:
57 |
58 | .. code-block:: text
59 |
60 | Raster Loader version: 0.1
61 | Python version: 3.11.0 | packaged by conda-forge |
62 | Platform: Linux-5.10.16.3-microsoft-standard-WSL2-x86_64-with-glibc2.35
63 | System version: Linux 5.10.16.3-microsoft-standard-WSL2
64 | Machine: x86_64
65 | Processor: x86_64
66 | Architecture: 64bit
67 |
68 | .. _PyPI: https://pypi.org/project/raster-loader/
69 | .. _pip: https://pip.pypa.io/en/stable/
70 | .. _venv: https://docs.python.org/3/library/venv.html
71 | .. _databricks-connect: https://pypi.org/project/databricks-connect/
--------------------------------------------------------------------------------
/docs/source/user_guide/modules/raster_loader.rst:
--------------------------------------------------------------------------------
1 | .. _api_reference:
2 |
3 | Module API reference
4 | ====================
5 |
6 | Module contents
7 | ---------------
8 |
9 | .. automodule:: raster_loader
10 | :members:
11 | :undoc-members:
12 | :show-inheritance:
13 |
--------------------------------------------------------------------------------
/docs/source/user_guide/use_with_python.rst:
--------------------------------------------------------------------------------
1 | .. _python:
2 |
3 | Usage with Python projects
4 | ==========================
5 |
6 | After installing Raster Loader, you can use it in your Python project.
7 |
8 | First, import the corresponding connection class from the ``raster_loader`` package.
9 |
10 | For BigQuery, use ``BigQueryConnection``:
11 |
12 | .. code-block:: python
13 |
14 | from raster_loader import BigQueryConnection
15 |
16 | For Snowflake, use ``SnowflakeConnection``:
17 |
18 | .. code-block:: python
19 |
20 | from raster_loader import SnowflakeConnection
21 |
22 | For Databricks, use ``DatabricksConnection``:
23 |
24 | .. code-block:: python
25 |
26 | from raster_loader import DatabricksConnection
27 |
28 | Then, create a connection object with the appropriate parameters.
29 |
30 | For BigQuery:
31 |
32 | .. code-block:: python
33 |
34 | connection = BigQueryConnection('my-project')
35 |
36 | .. note::
37 |
38 | Accessing BigQuery with Raster Loader requires the ``GOOGLE_APPLICATION_CREDENTIALS``
39 | environment variable to be set to the path of a JSON file containing your BigQuery
40 | credentials. See the `GCP documentation`_ for more information.
41 |
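Alternatively, you can authenticate with an OAuth access token. The following is
a minimal sketch mirroring what the CLI's ``--token`` flag does internally via
``AccessTokenCredentials`` (the token value is a placeholder):

.. code-block:: python

    from raster_loader.io.bigquery import AccessTokenCredentials

    credentials = AccessTokenCredentials('my-access-token')
    connection = BigQueryConnection('my-project', credentials)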
42 | For Snowflake:
43 |
44 | .. code-block:: python
45 |
46 | connection = SnowflakeConnection('my-user', 'my-password', 'my-account', 'my-database', 'my-schema')
47 |
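``SnowflakeConnection`` also supports key-pair and token authentication through
the same keyword arguments the CLI uses (the paths and names below are
placeholders; pass ``password=None`` when authenticating with a private key):

.. code-block:: python

    connection = SnowflakeConnection(
        username='my-user',
        password=None,
        private_key_path='/path/to/rsa_key.pem',  # PEM-format private key
        private_key_passphrase='my-passphrase',   # only needed for encrypted keys
        account='my-account',
        database='my-database',
        schema='my-schema',
    )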
48 | For Databricks:
49 |
50 | .. code-block:: python
51 |
52 | connection = DatabricksConnection('my-server-hostname', 'my-token', 'my-cluster-id')
53 |
54 | Uploading a raster file
55 | -----------------------------------
56 |
57 | To upload a raster file, use the ``upload_raster`` function.
58 |
59 |
60 | For example:
61 |
62 | .. code-block:: python
63 |
64 | connection.upload_raster(
65 | file_path = 'path/to/raster.tif',
66 | fqn = 'database.schema.tablename',
67 | )
68 |
69 | This function returns ``True`` if the upload was successful.
70 |
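``upload_raster`` also accepts keyword arguments mirroring the CLI flags. A
sketch based on the ``BigQueryConnection.upload_raster`` signature (the other
connections accept the same core options; all values are placeholders):

.. code-block:: python

    connection.upload_raster(
        file_path = 'path/to/raster.tif',
        fqn = 'database.schema.tablename',
        bands_info = [(1, 'red'), (2, 'green')],  # (band index, column name) pairs
        chunk_size = 20000,  # records per append
        overwrite = True,    # replace existing table contents
    )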
71 | The input raster must be a ``GoogleMapsCompatible`` raster. You can make your raster compatible
72 | by converting it with the following GDAL command:
73 |
74 | .. code-block:: bash
75 |
76 |     gdalwarp -of COG -co TILING_SCHEME=GoogleMapsCompatible -co COMPRESS=DEFLATE -co OVERVIEWS=IGNORE_EXISTING -co ADD_ALPHA=NO -co RESAMPLING=NEAREST -co BLOCKSIZE=512 <input>.tif <output>.tif
77 |
78 | Inspecting a raster file
79 | ------------------------
80 |
81 | You can also access and inspect a raster file located in a BigQuery, Snowflake, or
82 | Databricks table using the :func:`get_records` function.
83 |
84 | For example:
85 |
86 | .. code-block:: python
87 |
88 | records = connection.get_records(
89 | fqn = 'database.schema.tablename',
90 | )
91 |
92 | This function returns a DataFrame with some samples from the raster table
93 | (10 rows by default).
94 |
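To sample more rows, pass a limit as the second argument, as the CLI ``describe``
commands do (the value here is arbitrary):

.. code-block:: python

    records = connection.get_records('database.schema.tablename', 100)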
95 | .. seealso::
96 | See the :ref:`api_reference` for more details.
97 |
98 | .. _`GCP documentation`: https://cloud.google.com/docs/authentication/provide-credentials-adc#local-key
99 |
100 | To enable compression of the band data, which can significantly reduce storage size, use the ``compress`` parameter:
101 |
102 | .. code-block:: python
103 |
104 | connection.upload_raster(
105 | file_path = 'path/to/raster.tif',
106 | fqn = 'database.schema.tablename',
107 | compress = True, # Enable gzip compression of band data
108 | compression_level = 3 # Optional: Set compression level (1-9, default=6)
109 | )
110 |
111 | The compression information will be stored in the metadata of the table, and the data will be automatically decompressed when reading it back.
112 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | markers =
3 | integration_test
4 |
--------------------------------------------------------------------------------
/raster_loader/__init__.py:
--------------------------------------------------------------------------------
1 | from raster_loader._version import __version__
2 |
3 | from raster_loader.io.bigquery import (
4 | BigQueryConnection,
5 | )
6 | from raster_loader.io.snowflake import (
7 | SnowflakeConnection,
8 | )
9 | from raster_loader.io.databricks import (
10 | DatabricksConnection,
11 | )
12 |
13 | __all__ = [
14 | "__version__",
15 | "BigQueryConnection",
16 | "SnowflakeConnection",
17 | "DatabricksConnection",
18 | ]
19 |
--------------------------------------------------------------------------------
/raster_loader/cli/__init__.py:
--------------------------------------------------------------------------------
1 | from pkg_resources import iter_entry_points as entry_points
2 |
3 | import click
4 | from click_plugins import with_plugins
5 |
6 |
7 | @with_plugins(entry_points("raster_loader.cli"))
8 | @click.group(context_settings=dict(help_option_names=["-h", "--help"]))
9 | def main(args=None):
10 | """
11 | The ``carto`` command line interface.
12 | """
13 | pass
14 |
15 |
16 | if __name__ == "__main__": # pragma: no cover
17 | main()
18 |
--------------------------------------------------------------------------------
/raster_loader/cli/bigquery.py:
--------------------------------------------------------------------------------
1 | import os
2 | from urllib.parse import urlparse
3 |
4 | import click
5 | from functools import wraps, partial
6 |
7 | from raster_loader.utils import get_default_table_name
8 | from raster_loader.io.bigquery import BigQueryConnection, AccessTokenCredentials
9 |
10 |
11 | def catch_exception(func=None, *, handle=Exception):
12 | if not func:
13 | return partial(catch_exception, handle=handle)
14 |
15 | @wraps(func)
16 | def wrapper(*args, **kwargs):
17 | try:
18 | return func(*args, **kwargs)
19 | except handle as e:
20 | raise click.ClickException(e)
21 |
22 | return wrapper
23 |
24 |
25 | @click.group(context_settings=dict(help_option_names=["-h", "--help"]))
26 | def bigquery(args=None):
27 | """
28 | Manage Google BigQuery resources.
29 | """
30 | pass
31 |
32 |
33 | @bigquery.command(help="Upload a raster file to Google BigQuery.")
34 | @click.option(
35 | "--file_path", help="The path to the raster file.", required=False, default=None
36 | )
37 | @click.option(
38 |     "--file_url", help="The URL of the raster file.", required=False, default=None
39 | )
40 | @click.option("--project", help="The name of the Google Cloud project.", required=True)
41 | @click.option("--token", help="An access token to authenticate with.", default=None)
42 | @click.option("--dataset", help="The name of the dataset.", required=True)
43 | @click.option("--table", help="The name of the table.", default=None)
44 | @click.option(
45 | "--band",
46 |     help="Band(s) within raster to upload. "
47 |     "Repeat --band to specify multiple bands.",
48 | default=[1],
49 | multiple=True,
50 | )
51 | @click.option(
52 | "--band_name",
53 |     help="Column name(s) used to store band (default: band_ followed by the band number). "
54 |     "Repeat --band_name to specify column names for multiple bands. "
55 |     "The column names must pair with the --band list in the same order.",
56 | default=[None],
57 | multiple=True,
58 | )
59 | @click.option(
60 | "--chunk_size", help="The number of blocks to upload in each chunk.", default=10000
61 | )
62 | @click.option(
63 | "--compress",
64 | help="Compress band data using zlib.",
65 | is_flag=True,
66 | default=False,
67 | )
68 | @click.option(
69 | "--overwrite",
70 | help="Overwrite existing data in the table if it already exists.",
71 | default=False,
72 | is_flag=True,
73 | )
74 | @click.option(
75 | "--append",
76 | help="Append records into a table if it already exists.",
77 | default=False,
78 | is_flag=True,
79 | )
80 | @click.option(
81 | "--cleanup-on-failure",
82 | help="Clean up resources if the upload fails. Useful for non-interactive scripts.",
83 | default=False,
84 | is_flag=True,
85 | )
86 | @click.option(
87 | "--exact_stats",
88 | help="Compute exact statistics for the raster bands.",
89 | default=False,
90 | is_flag=True,
91 | )
92 | @click.option(
93 | "--basic_stats",
94 | help="Compute basic stats and omit quantiles and most frequent values.",
95 | required=False,
96 | is_flag=True,
97 | )
98 | @click.option(
99 | "--compression-level",
100 | help="Compression level (1-9, higher = better compression but slower)",
101 | type=int,
102 | default=6,
103 | )
104 | @catch_exception()
105 | def upload(
106 | file_path,
107 | file_url,
108 | project,
109 | token,
110 | dataset,
111 | table,
112 | band,
113 | band_name,
114 | chunk_size,
115 | compress,
116 | overwrite=False,
117 | append=False,
118 | cleanup_on_failure=False,
119 | exact_stats=False,
120 | basic_stats=False,
121 | compression_level=6,
122 | ):
123 | from raster_loader.io.common import (
124 | get_number_of_blocks,
125 | print_band_information,
126 | get_block_dims,
127 | )
128 |
129 | if file_path is None and file_url is None:
130 | raise ValueError("Either --file_path or --file_url must be provided.")
131 |
132 | if file_path and file_url:
133 | raise ValueError("Only one of --file_path or --file_url must be provided.")
134 |
135 | is_local_file = file_path is not None
136 |
137 | # check that band and band_name are the same length
138 | # if band_name provided
139 | if band_name != (None,):
140 | if len(band) != len(band_name):
141 | raise ValueError("The number of bands must equal the number of band names.")
142 | else:
143 | band_name = [None] * len(band)
144 |
145 | # pair band and band_name in a list of tuple
146 | bands_info = list(zip(band, band_name))
147 |
148 | # create default table name if not provided
149 | if table is None:
150 | table = get_default_table_name(
151 | file_path if is_local_file else urlparse(file_url).path, band
152 | )
153 |
154 | credentials = None
155 | if token is not None:
156 | credentials = AccessTokenCredentials(token)
157 |
158 | connection = BigQueryConnection(project, credentials)
159 |
160 | source = file_path if is_local_file else file_url
161 |
162 | # introspect raster file
163 | num_blocks = get_number_of_blocks(source)
164 | file_size_mb = 0
165 | if is_local_file:
166 | file_size_mb = os.path.getsize(file_path) / 1024 / 1024
167 |
168 | click.echo("Preparing to upload raster file to BigQuery...")
169 | click.echo("File Path: {}".format(source))
170 | click.echo("File Size: {} MB".format(file_size_mb))
171 | print_band_information(source)
172 | click.echo("Source Band: {}".format(band))
173 | click.echo("Band Name: {}".format(band_name))
174 | click.echo("Number of Blocks: {}".format(num_blocks))
175 | click.echo("Block Dims: {}".format(get_block_dims(source)))
176 | click.echo("Project: {}".format(project))
177 | click.echo("Dataset: {}".format(dataset))
178 | click.echo("Table: {}".format(table))
179 | click.echo("Number of Records Per BigQuery Append: {}".format(chunk_size))
180 | click.echo("Compress: {}".format(compress))
181 |
182 | click.echo("Uploading Raster to BigQuery")
183 |
184 | fqn = f"{project}.{dataset}.{table}"
185 | connection.upload_raster(
186 | source,
187 | fqn,
188 | bands_info,
189 | chunk_size,
190 | overwrite=overwrite,
191 | append=append,
192 | cleanup_on_failure=cleanup_on_failure,
193 | exact_stats=exact_stats,
194 | basic_stats=basic_stats,
195 | compress=compress,
196 | compression_level=compression_level,
197 | )
198 |
199 | click.echo("Raster file uploaded to Google BigQuery")
200 | return 0
201 |
202 |
203 | @bigquery.command(help="Load and describe a table from BigQuery")
204 | @click.option("--project", help="The name of the Google Cloud project.", required=True)
205 | @click.option("--dataset", help="The name of the dataset.", required=True)
206 | @click.option("--table", help="The name of the table.", required=True)
207 | @click.option("--limit", help="Limit number of rows returned", default=10)
208 | @click.option(
209 | "--token",
210 | help="An access token to authenticate with.",
211 | required=False,
212 | default=None,
213 | )
214 | def describe(project, dataset, table, limit, token):
215 | credentials = None
216 | if token is not None:
217 | credentials = AccessTokenCredentials(token)
218 |
219 | connection = BigQueryConnection(project, credentials)
220 |
221 | fqn = f"{project}.{dataset}.{table}"
222 | df = connection.get_records(fqn, limit)
223 | print(f"Table: {fqn}")
224 | print(f"Number of rows: {len(df)}")
225 | print(f"Number of columns: {len(df.columns)}")
226 | print(f"Column names: {df.columns}")
227 | print(f"Column types: {df.dtypes}")
228 | print(f"Column head: {df.head()}")
229 |
--------------------------------------------------------------------------------
/raster_loader/cli/databricks.py:
--------------------------------------------------------------------------------
1 | import os
2 | from urllib.parse import urlparse
3 |
4 | import click
5 | from functools import wraps, partial
6 |
7 | from raster_loader.utils import get_default_table_name
8 | from raster_loader.io.databricks import DatabricksConnection
9 |
10 |
11 | def catch_exception(func=None, *, handle=Exception):
12 | if not func:
13 | return partial(catch_exception, handle=handle)
14 |
15 | @wraps(func)
16 | def wrapper(*args, **kwargs):
17 | try:
18 | return func(*args, **kwargs)
19 | except handle as e:
20 | raise click.ClickException(e)
21 |
22 | return wrapper
23 |
24 |
25 | @click.group(context_settings=dict(help_option_names=["-h", "--help"]))
26 | def databricks(args=None):
27 | """
28 | Manage Databricks resources.
29 | """
30 | pass
31 |
32 |
33 | @databricks.command(help="Upload a raster file to Databricks.")
34 | @click.option(
35 | "--server-hostname", help="The Databricks workspace hostname.", required=True
36 | )
37 | @click.option("--token", help="The Databricks access token.", required=True)
38 | @click.option(
39 | "--cluster-id",
40 | help="The Databricks cluster ID for Spark operations.",
41 | required=True,
42 | )
43 | @click.option(
44 | "--file_path", help="The path to the raster file.", required=False, default=None
45 | )
46 | @click.option(
47 |     "--file_url", help="The URL of the raster file.", required=False, default=None
48 | )
49 | @click.option("--catalog", help="The name of the catalog.", required=True)
50 | @click.option("--schema", help="The name of the schema.", required=True)
51 | @click.option("--table", help="The name of the table.", default=None)
52 | @click.option(
53 | "--band",
54 |     help="Band(s) within raster to upload. "
55 |     "Repeat --band to specify multiple bands.",
56 | default=[1],
57 | multiple=True,
58 | )
59 | @click.option(
60 | "--band_name",
61 |     help="Column name(s) used to store band (default: band_ followed by the band number). "
62 |     "Repeat --band_name to specify column names for multiple bands. "
63 |     "The column names must pair with the --band list in the same order.",
64 | default=[None],
65 | multiple=True,
66 | )
67 | @click.option(
68 | "--chunk_size", help="The number of blocks to upload in each chunk.", default=10000
69 | )
70 | @click.option(
71 | "--parallelism",
72 | help="Number of partitions when uploading each chunk.",
73 | default=1000,
74 | type=int,
75 | )
76 | @click.option(
77 | "--overwrite",
78 | help="Overwrite existing data in the table if it already exists.",
79 | default=False,
80 | is_flag=True,
81 | )
82 | @click.option(
83 | "--append",
84 | help="Append records into a table if it already exists.",
85 | default=False,
86 | is_flag=True,
87 | )
88 | @click.option(
89 | "--cleanup-on-failure",
90 | help="Clean up resources if the upload fails. Useful for non-interactive scripts.",
91 | default=False,
92 | is_flag=True,
93 | )
94 | @click.option(
95 | "--exact_stats",
96 | help="Compute exact statistics for the raster bands.",
97 | default=False,
98 | is_flag=True,
99 | )
100 | @click.option(
101 | "--basic_stats",
102 | help="Compute basic stats and omit quantiles and most frequent values.",
103 | required=False,
104 | is_flag=True,
105 | )
106 | @click.option(
107 | "--compress",
108 | help="Compress band data using zlib.",
109 | is_flag=True,
110 | default=False,
111 | )
112 | @click.option(
113 | "--compression-level",
114 | help="Compression level (1-9, higher = better compression but slower)",
115 | type=int,
116 | default=6,
117 | )
118 | @catch_exception()
119 | def upload(
120 | server_hostname,
121 | token,
122 | cluster_id,
123 | file_path,
124 | file_url,
125 | catalog,
126 | schema,
127 | table,
128 | band,
129 | band_name,
130 | chunk_size,
131 | parallelism,
132 | compress,
133 | overwrite=False,
134 | append=False,
135 | cleanup_on_failure=False,
136 | exact_stats=False,
137 | basic_stats=False,
138 | compression_level=6,
139 | ):
140 | from raster_loader.io.common import (
141 | get_number_of_blocks,
142 | print_band_information,
143 | get_block_dims,
144 | )
145 |
146 | if file_path is None and file_url is None:
147 | raise ValueError("Either --file_path or --file_url must be provided.")
148 |
149 | if file_path and file_url:
150 | raise ValueError("Only one of --file_path or --file_url must be provided.")
151 |
152 | is_local_file = file_path is not None
153 |
154 | # check that band and band_name are the same length
155 | # if band_name provided
156 | if band_name != (None,):
157 | if len(band) != len(band_name):
158 | raise ValueError("The number of bands must equal the number of band names.")
159 | else:
160 | band_name = [None] * len(band)
161 |
162 | # pair band and band_name in a list of tuple
163 | bands_info = list(zip(band, band_name))
164 |
165 | # create default table name if not provided
166 | if table is None:
167 | table = get_default_table_name(
168 | file_path if is_local_file else urlparse(file_url).path, band
169 | )
170 |
171 | connection = DatabricksConnection(
172 | server_hostname=server_hostname,
173 | access_token=token,
174 | cluster_id=cluster_id,
175 | )
176 |
177 | source = file_path if is_local_file else file_url
178 |
179 | # introspect raster file
180 | num_blocks = get_number_of_blocks(source)
181 | file_size_mb = 0
182 | if is_local_file:
183 | file_size_mb = os.path.getsize(file_path) / 1024 / 1024
184 |
185 | click.echo("Preparing to upload raster file to Databricks...")
186 | click.echo("File Path: {}".format(source))
187 | click.echo("File Size: {} MB".format(file_size_mb))
188 | print_band_information(source)
189 | click.echo("Source Band: {}".format(band))
190 | click.echo("Band Name: {}".format(band_name))
191 | click.echo("Number of Blocks: {}".format(num_blocks))
192 | click.echo("Block Dims: {}".format(get_block_dims(source)))
193 | click.echo("Catalog: {}".format(catalog))
194 | click.echo("Schema: {}".format(schema))
195 | click.echo("Table: {}".format(table))
196 | click.echo("Number of Records Per Databricks Append: {}".format(chunk_size))
197 | click.echo("Parallelism: {}".format(parallelism))
198 | click.echo("Compress: {}".format(compress))
199 |
200 | click.echo("Uploading Raster to Databricks")
201 |
202 | fqn = f"`{catalog}`.`{schema}`.`{table}`"
203 | connection.upload_raster(
204 | source,
205 | fqn,
206 | bands_info,
207 | chunk_size,
208 | parallelism,
209 | overwrite=overwrite,
210 | append=append,
211 | cleanup_on_failure=cleanup_on_failure,
212 | exact_stats=exact_stats,
213 | basic_stats=basic_stats,
214 | compress=compress,
215 | compression_level=compression_level,
216 | )
217 |
218 | click.echo("Raster file uploaded to Databricks")
219 | return 0
220 |
221 |
222 | @databricks.command(help="Load and describe a table from Databricks")
223 | @click.option(
224 | "--server-hostname", help="The Databricks workspace hostname.", required=True
225 | )
226 | @click.option("--token", help="The Databricks access token.", required=True)
227 | @click.option(
228 | "--cluster-id",
229 | help="The Databricks cluster ID for Spark operations.",
230 | required=True,
231 | )
232 | @click.option("--catalog", help="The name of the catalog.", required=True)
233 | @click.option("--schema", help="The name of the schema.", required=True)
234 | @click.option("--table", help="The name of the table.", required=True)
235 | @click.option("--limit", help="Limit number of rows returned", default=10)
236 | def describe(
237 | server_hostname,
238 | token,
239 | cluster_id,
240 | catalog,
241 | schema,
242 | table,
243 | limit,
244 | ):
245 | fqn = f"`{catalog}`.`{schema}`.`{table}`"
246 | connection = DatabricksConnection(
247 | server_hostname=server_hostname,
248 | access_token=token,
249 | cluster_id=cluster_id,
250 | )
251 |
252 | df = connection.get_records(fqn, limit)
253 | print(f"Table: {fqn}")
254 | print(f"Number of rows: {len(df)}")
255 | print(f"Number of columns: {len(df.columns)}")
256 | print(f"Column names: {df.columns}")
257 | print(f"Column types: {df.dtypes}")
258 | print(f"Column head: {df.head()}")
259 |
--------------------------------------------------------------------------------
/raster_loader/cli/info.py:
--------------------------------------------------------------------------------
1 | """Module for info subcommand."""
2 |
3 | import platform
4 | import sys
5 |
6 | import click
7 |
8 | import raster_loader
9 |
10 |
11 | @click.command(help="Display system information.")
12 | def info():
13 | """Display system information."""
14 | click.echo(f"Raster Loader version: {raster_loader.__version__}")
15 | click.echo(f"Python version: {sys.version.split(' (')[0]}")
16 | click.echo(f"Platform: {platform.platform()}")
17 | click.echo(f"System version: {platform.system()} {platform.release()}")
18 | click.echo(f"Machine: {platform.machine()}")
19 | click.echo(f"Processor: {platform.processor()}")
20 | click.echo(f"Architecture: {platform.architecture()[0]}")
21 |
--------------------------------------------------------------------------------
/raster_loader/cli/snowflake.py:
--------------------------------------------------------------------------------
1 | import os
2 | from urllib.parse import urlparse
3 |
4 | import click
5 | from functools import wraps, partial
6 |
7 | from raster_loader.utils import get_default_table_name, check_private_key
8 | from raster_loader.io.snowflake import SnowflakeConnection
9 |
10 |
11 | def catch_exception(func=None, *, handle=Exception):
12 | if not func:
13 | return partial(catch_exception, handle=handle)
14 |
15 | @wraps(func)
16 | def wrapper(*args, **kwargs):
17 | try:
18 | return func(*args, **kwargs)
19 | except handle as e:
20 | raise click.ClickException(e)
21 |
22 | return wrapper
23 |
24 |
25 | @click.group(context_settings=dict(help_option_names=["-h", "--help"]))
26 | def snowflake(args=None):
27 | """
28 | Manage Snowflake resources.
29 | """
30 | pass
31 |
32 |
33 | @snowflake.command(help="Upload a raster file to Snowflake.")
34 | @click.option("--account", help="The Snowflake account.", required=True)
35 | @click.option("--username", help="The username.", required=False, default=None)
36 | @click.option("--password", help="The password.", required=False, default=None)
37 | @click.option(
38 | "--token",
39 | help="An access token to authenticate with.",
40 | required=False,
41 | default=None,
42 | )
43 | @click.option(
44 | "--private-key-path",
45 | help="The path to the private key file. (PEM format)",
46 | required=False,
47 | default=None,
48 | )
49 | @click.option(
50 | "--private-key-passphrase",
51 | help="The passphrase for the private key.",
52 | required=False,
53 | default=None,
54 | )
55 | @click.option("--role", help="The role to use for the file upload.", default=None)
56 | @click.option("--warehouse", help="Name of the default warehouse to use.", default=None)
57 | @click.option(
58 | "--file_path", help="The path to the raster file.", required=False, default=None
59 | )
60 | @click.option(
61 |     "--file_url", help="The URL of the raster file.", required=False, default=None
62 | )
63 | @click.option("--database", help="The name of the database.", required=True)
64 | @click.option("--schema", help="The name of the schema.", required=True)
65 | @click.option("--table", help="The name of the table.", default=None)
66 | @click.option(
67 | "--band",
68 |     help="Band(s) within raster to upload. "
69 |     "Repeat --band to specify multiple bands.",
70 | default=[1],
71 | multiple=True,
72 | )
73 | @click.option(
74 | "--band_name",
75 |     help="Column name(s) used to store band (default: band_ followed by the band number). "
76 |     "Repeat --band_name to specify column names for multiple bands. "
77 |     "The column names must pair with the --band list in the same order.",
78 | default=[None],
79 | multiple=True,
80 | )
81 | @click.option(
82 | "--chunk_size", help="The number of blocks to upload in each chunk.", default=10000
83 | )
84 | @click.option(
85 | "--overwrite",
86 | help="Overwrite existing data in the table if it already exists.",
87 | default=False,
88 | is_flag=True,
89 | )
90 | @click.option(
91 | "--append",
92 | help="Append records into a table if it already exists.",
93 | default=False,
94 | is_flag=True,
95 | )
96 | @click.option(
97 | "--cleanup-on-failure",
98 | help="Clean up resources if the upload fails. Useful for non-interactive scripts.",
99 | default=False,
100 | is_flag=True,
101 | )
102 | @click.option(
103 | "--exact_stats",
104 | help="Compute exact statistics for the raster bands.",
105 | default=False,
106 | is_flag=True,
107 | )
108 | @click.option(
109 | "--basic_stats",
110 | help="Compute basic stats and omit quantiles and most frequent values.",
111 | required=False,
112 | is_flag=True,
113 | )
114 | @click.option(
115 | "--compress",
116 | help="Compress band data using zlib.",
117 | is_flag=True,
118 | default=False,
119 | )
120 | @click.option(
121 | "--compression-level",
122 | help="Compression level (1-9, higher = better compression but slower)",
123 | type=int,
124 | default=6,
125 | )
126 | @catch_exception()
127 | def upload(
128 | account,
129 | username,
130 | password,
131 | token,
132 | private_key_path,
133 | private_key_passphrase,
134 | role,
135 | warehouse,
136 | file_path,
137 | file_url,
138 | database,
139 | schema,
140 | table,
141 | band,
142 | band_name,
143 | chunk_size,
144 | compress,
145 | overwrite=False,
146 | append=False,
147 | cleanup_on_failure=False,
148 | exact_stats=False,
149 | basic_stats=False,
150 | compression_level=6,
151 | ):
152 | from raster_loader.io.common import (
153 | get_number_of_blocks,
154 | print_band_information,
155 | get_block_dims,
156 | )
157 |
158 |     if not (  # exactly one of: token | username+password | username+private key
159 | (token is not None and username is None)
160 | or (
161 | token is None
162 | and username is not None
163 | and password is not None
164 | and private_key_path is None
165 | )
166 | or (
167 | token is None
168 | and username is not None
169 | and password is None
170 | and private_key_path is not None
171 | )
172 | ):
173 | raise ValueError(
174 | "Either (--token) or (--username and --private-key-path) or"
175 | " (--username and --password) must be provided."
176 | )
177 |
178 | if private_key_path is not None:
179 | check_private_key(private_key_path, private_key_passphrase)
180 | if username is None:
181 | raise ValueError("--username must be provided when using a private key.")
182 |
183 | if file_path is None and file_url is None:
184 | raise ValueError("Either --file_path or --file_url must be provided.")
185 |
186 | if file_path and file_url:
187 | raise ValueError("Only one of --file_path or --file_url must be provided.")
188 |
189 | is_local_file = file_path is not None
190 |
191 | # check that band and band_name are the same length
192 | # if band_name provided
193 | if band_name != (None,):
194 | if len(band) != len(band_name):
195 | raise ValueError("The number of bands must equal the number of band names.")
196 | else:
197 | band_name = [None] * len(band)
198 |
199 | # pair band and band_name in a list of tuple
200 | bands_info = list(zip(band, band_name))
201 |
202 | # create default table name if not provided
203 | if table is None:
204 | table = get_default_table_name(
205 | file_path if is_local_file else urlparse(file_url).path, band
206 | )
207 |
208 | connection = SnowflakeConnection(
209 | username=username,
210 | password=password,
211 | private_key_path=private_key_path,
212 | private_key_passphrase=private_key_passphrase,
213 | token=token,
214 | account=account,
215 | database=database,
216 | schema=schema,
217 | role=role,
218 | warehouse=warehouse,
219 | )
220 |
221 | source = file_path if is_local_file else file_url
222 |
223 | # introspect raster file
224 | num_blocks = get_number_of_blocks(source)
225 | file_size_mb = 0
226 | if is_local_file:
227 | file_size_mb = os.path.getsize(file_path) / 1024 / 1024
228 |
229 | click.echo("Preparing to upload raster file to Snowflake...")
230 | click.echo("File Path: {}".format(source))
231 | click.echo("File Size: {} MB".format(file_size_mb))
232 | print_band_information(source)
233 | click.echo("Source Band: {}".format(band))
234 | click.echo("Band Name: {}".format(band_name))
235 | click.echo("Number of Blocks: {}".format(num_blocks))
236 | click.echo("Block Dims: {}".format(get_block_dims(source)))
237 | click.echo("Database: {}".format(database))
238 | click.echo("Schema: {}".format(schema))
239 | click.echo("Table: {}".format(table))
240 | click.echo("Number of Records Per Snowflake Append: {}".format(chunk_size))
241 | click.echo("Compress: {}".format(compress))
242 |
243 | click.echo("Uploading Raster to Snowflake")
244 |
245 | fqn = f"{database}.{schema}.{table}"
246 | connection.upload_raster(
247 | source,
248 | fqn,
249 | bands_info,
250 | chunk_size,
251 | overwrite=overwrite,
252 | append=append,
253 | cleanup_on_failure=cleanup_on_failure,
254 | exact_stats=exact_stats,
255 | basic_stats=basic_stats,
256 | compress=compress,
257 | compression_level=compression_level,
258 | )
259 |
260 | click.echo("Raster file uploaded to Snowflake")
261 | return 0
262 |
263 |
264 | @snowflake.command(help="Load and describe a table from Snowflake")
265 | @click.option("--account", help="The Snowflake account.", required=True)
266 | @click.option("--username", help="The username.", required=False, default=None)
267 | @click.option("--password", help="The password.", required=False, default=None)
268 | @click.option(
269 | "--token",
270 | help="An access token to authenticate with.",
271 | required=False,
272 | default=None,
273 | )
274 | @click.option(
275 | "--private-key-path",
276 | help="The path to the private key file. (PEM format)",
277 | required=False,
278 | default=None,
279 | )
280 | @click.option(
281 | "--private-key-passphrase",
282 | help="The passphrase for the private key.",
283 | required=False,
284 | default=None,
285 | )
286 | @click.option("--role", help="The role to use for the file upload.", default=None)
287 | @click.option("--warehouse", help="Name of the default warehouse to use.", default=None)
288 | @click.option("--database", help="The name of the database.", required=True)
289 | @click.option("--schema", help="The name of the schema.", required=True)
290 | @click.option("--table", help="The name of the table.", required=True)
291 | @click.option("--limit", help="Limit number of rows returned", default=10)
292 | def describe(
293 | account,
294 | username,
295 | password,
296 | token,
297 | private_key_path,
298 | private_key_passphrase,
299 | role,
300 | warehouse,
301 | database,
302 | schema,
303 | table,
304 | limit,
305 | ):
306 |
307 |     if not (  # exactly one of: token | username+password | username+private key
308 | (token is not None and username is None)
309 | or (
310 | token is None
311 | and username is not None
312 | and password is not None
313 | and private_key_path is None
314 | )
315 | or (
316 | token is None
317 | and username is not None
318 | and password is None
319 | and private_key_path is not None
320 | )
321 | ):
322 | raise ValueError(
323 | "Either (--token) or (--username and --private-key-path) or"
324 | " (--username and --password) must be provided."
325 | )
326 |
327 | if private_key_path is not None:
328 | check_private_key(private_key_path, private_key_passphrase)
329 | if username is None:
330 | raise ValueError("--username must be provided when using a private key.")
331 |
332 | fqn = f"{database}.{schema}.{table}"
333 | connection = SnowflakeConnection(
334 | username=username,
335 | password=password,
336 | private_key_path=private_key_path,
337 | private_key_passphrase=private_key_passphrase,
338 | token=token,
339 | account=account,
340 | database=database,
341 | schema=schema,
342 | role=role,
343 | warehouse=warehouse,
344 | )
345 | df = connection.get_records(fqn, limit)
346 | print(f"Table: {fqn}")
347 | print(f"Number of rows: {len(df)}")
348 | print(f"Number of columns: {len(df.columns)}")
349 | print(f"Column names: {df.columns}")
350 | print(f"Column types: {df.dtypes}")
351 | print(f"Column head: {df.head()}")
352 |
--------------------------------------------------------------------------------
/raster_loader/errors.py:
--------------------------------------------------------------------------------
1 | def import_error_bigquery(): # pragma: no cover
2 | msg = (
3 | "Google Cloud BigQuery client is not installed.\n"
4 | "Please install Google Cloud BigQuery dependencies to use this function.\n"
5 | 'run `pip install -U raster-loader"[bigquery]"` to install from pypi.'
6 | )
7 | raise ImportError(msg)
8 |
9 |
10 | def import_error_snowflake(): # pragma: no cover
11 | msg = (
12 | "Snowflake client is not installed.\n"
13 | "Please install Snowflake dependencies to use this function.\n"
14 | 'run `pip install -U raster-loader"[snowflake]"` to install from pypi.'
15 | )
16 | raise ImportError(msg)
17 |
18 |
19 | def import_error_databricks():
20 | raise ImportError(
21 | "The databricks-connect package is required and must match your "
22 | "Databricks Runtime version.\n"
23 | "For example, if your cluster uses DBR 15.1, "
24 | "run `pip install databricks-connect==15.1`\n"
25 | "You can find your cluster's DBR version in the Databricks UI "
26 | "under Compute > Your Cluster > Configuration > Databricks Runtime version.\n"
27 | "Or you can run the following SQL query from your cluster:\n"
28 | "`SELECT current_version()`"
29 | )
30 |
31 |
32 | class IncompatibleRasterException(Exception):
33 | def __init__(self):
34 | self.message = (
35 | "The input raster must be a GoogleMapsCompatible raster.\n"
36 | "You can make your raster compatible "
37 | "by converting it using the following command:\n"
38 | "gdalwarp -of COG -co TILING_SCHEME=GoogleMapsCompatible "
39 | "-co COMPRESS=DEFLATE -co OVERVIEWS=IGNORE_EXISTING -co ADD_ALPHA=NO "
40 | "-co RESAMPLING=NEAREST -co BLOCKSIZE=512 "
41 | "<input>.tif <output>.tif"
42 | )
43 |
44 |
45 | def error_not_google_compatible(): # pragma: no cover
46 | raise IncompatibleRasterException()
47 |
--------------------------------------------------------------------------------
/raster_loader/geo.py:
--------------------------------------------------------------------------------
1 | import math
2 | import functools
3 | import json
4 |
5 |
6 | def coord_range(start_x, start_y, end_x, end_y, num_subdivisions):
7 | num_subdivisions = max(num_subdivisions, 1)
8 | return [
9 | [
10 | start_x + (end_x - start_x) * i / num_subdivisions,
11 | start_y + (end_y - start_y) * i / num_subdivisions,
12 | ]
13 | for i in range(0, num_subdivisions + 1)
14 | ]
15 |
16 |
17 | def norm_lon(x):
18 | return x - 360.0 if x > 180.0 else x + 360.0 if x <= -180.0 else x
19 |
20 |
21 | def norm_coords(coords):
22 | return [[norm_lon(point[0]), point[1]] for point in coords]
23 |
24 |
25 | def polygon_geography(rings, format, normalize_coords):
26 | if normalize_coords:
27 | rings = [norm_coords(coords) for coords in rings]
28 |
29 | if format == "wkt":
30 | return polygon_wkt(rings)
31 | elif format == "geojson":
32 | return polygon_geojson(rings)
33 | else:
34 | raise ValueError(f"Invalid geography format {format}")
35 |
36 |
37 | def polygon_wkt(rings):
38 | return (
39 | "POLYGON("
40 | + ",".join(
41 | [
42 | "("
43 | + ",".join(
44 | [" ".join([str(coord) for coord in point]) for point in coords]
45 | )
46 | + ")"
47 | for coords in rings
48 | ]
49 | )
50 | + ")"
51 | )
52 |
53 |
54 | def polygon_geojson(rings):
55 | return json.dumps({"type": "Polygon", "coordinates": rings})
56 |
57 |
58 | def coords_to_geography(coords, format, whole_earth):
59 |     # remove coordinates that are too close together because they cause errors
60 | # in BigQuery's ST_GEOGFROMGEOJSON
61 | def are_too_close(point1, point2):
62 | return (
63 | math.fabs(point1[0] - point2[0]) <= 1e-13
64 | and math.fabs(point1[1] - point2[1]) <= 1e-13
65 | )
66 |
67 | def filter_near_points(coords, point):
68 | previous = None if not coords else coords[-1]
69 | if not previous or not are_too_close(previous, point):
70 | coords.append(point)
71 | return coords
72 |
73 | coords = functools.reduce(filter_near_points, coords, [])
74 |
75 | # now let's make sure the initial and final points are exactly the same
76 | if coords[0] != coords[-1]:
77 |         # replace the last point with the first; they are already nearly identical
78 | coords[-1] = coords[0]
79 | return polygon_geography([coords], format, not whole_earth)
80 |
81 |
82 | def raster_bounds(raster_dataset, transformer, format):
83 | min_x = 0
84 | min_y = 0
85 | max_x = raster_dataset.width
86 | max_y = raster_dataset.height
87 |
88 | x_subdivisions = math.ceil((max_x - min_x) / 64.0)
89 | y_subdivisions = math.ceil((max_y - min_y) / 64.0)
90 | pixel_coords = (
91 | # SW -> SE
92 | coord_range(min_x, max_y, max_x, max_y, x_subdivisions)
93 | # SE -> NE
94 | + coord_range(max_x, max_y, max_x, min_y, y_subdivisions)
95 | # NE -> NW
96 | + coord_range(max_x, min_y, min_x, min_y, x_subdivisions)
97 | # NW -> SW
98 | + coord_range(min_x, min_y, min_x, max_y, y_subdivisions)
99 | )
100 | coords = [
101 | transformer.transform(*(raster_dataset.transform * (x, y)))
102 | for x, y in pixel_coords
103 | ]
104 | lon_NW, _ = transformer.transform(*(raster_dataset.transform * (min_x, min_y)))
105 | lon_NE, _ = transformer.transform(*(raster_dataset.transform * (max_x, min_y)))
106 | lon_SW, _ = transformer.transform(*(raster_dataset.transform * (min_x, max_y)))
107 | lon_SE, _ = transformer.transform(*(raster_dataset.transform * (max_x, max_y)))
108 | whole_earth = (
109 | math.fabs(lon_NW - lon_NE) >= 360.0 and math.fabs(lon_SW - lon_SE) >= 360
110 | )
111 |
112 | return coords_to_geography(coords, format, whole_earth)
113 |
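# Illustrative sketch (not part of the library API): exercising the serializers
# above with an arbitrary closed ring.
if __name__ == "__main__":  # pragma: no cover
    ring = [[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]]
    print(polygon_wkt([ring]))      # POLYGON((0.0 0.0,1.0 0.0,1.0 1.0,0.0 0.0))
    print(polygon_geojson([ring]))  # {"type": "Polygon", "coordinates": [[[0.0, 0.0], ...]]}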
--------------------------------------------------------------------------------
/raster_loader/io/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/io/__init__.py
--------------------------------------------------------------------------------
/raster_loader/io/bigquery.py:
--------------------------------------------------------------------------------
1 | import time
2 | import json
3 | import pandas as pd
4 | import rasterio
5 | import re
6 |
7 | from itertools import chain
8 | from raster_loader import __version__
9 | from raster_loader.errors import import_error_bigquery, IncompatibleRasterException
10 | from raster_loader.utils import ask_yes_no_question, batched
11 | from raster_loader.io.common import (
12 | check_metadata_is_compatible,
13 | get_number_of_blocks,
14 | get_number_of_overviews_blocks,
15 | rasterio_metadata,
16 | rasterio_overview_to_records,
17 | rasterio_windows_to_records,
18 | update_metadata,
19 | )
20 |
21 | from typing import Iterable, List, Tuple
22 | from functools import partial
23 |
24 | try:
25 | from google.cloud import bigquery
26 | from google.auth.credentials import Credentials
27 |
28 | except ImportError: # pragma: no cover
29 | _has_bigquery = False
30 | else:
31 | _has_bigquery = True
32 |
33 | from raster_loader.io.datawarehouse import DataWarehouseConnection
34 |
35 | if _has_bigquery:
36 |
37 | class AccessTokenCredentials(Credentials):
38 | def __init__(self, access_token):
39 | super(AccessTokenCredentials, self).__init__()
40 | self._access_token = access_token
41 |
42 | def refresh(self, request):
43 | pass
44 |
45 | def apply(self, headers, token=None):
46 | headers["Authorization"] = f"Bearer {self._access_token}"
47 |
48 | else:
49 |
50 | class Credentials:
51 | def __init__(self):
52 | import_error_bigquery()
53 |
54 | class AccessTokenCredentials:
55 | def __init__(self, access_token):
56 | import_error_bigquery()
57 |
58 |
59 | class BigQueryConnection(DataWarehouseConnection):
60 | def __init__(self, project, credentials: Credentials = None):
61 | if not _has_bigquery: # pragma: no cover
62 | import_error_bigquery()
63 |
64 | self.client = bigquery.Client(project=project, credentials=credentials)
65 |
66 | def execute(self, sql):
67 | return self.client.query(sql).result()
68 |
69 | def execute_to_dataframe(self, sql):
70 | return self.client.query(sql).to_dataframe()
71 |
72 | def quote(self, q):
73 | if isinstance(q, str):
74 | q = q.replace("\\", "\\\\")
75 | return f"'''{q}'''"
76 | return str(q)
77 |
78 | def quote_name(self, name):
79 | return f"`{name}`"
80 |
81 | def upload_records(self, records: Iterable, fqn):
82 | records = list(records)
83 |
84 | data_df = pd.DataFrame(records)
85 |
86 | job_config = bigquery.LoadJobConfig(
87 | schema=[
88 | bigquery.SchemaField("block", bigquery.enums.SqlTypeNames.INT64),
89 | bigquery.SchemaField("metadata", bigquery.enums.SqlTypeNames.STRING),
90 | ],
91 | clustering_fields=["block"],
92 | )
93 |
94 | return self.client.load_table_from_dataframe(
95 | dataframe=data_df,
96 | destination=fqn,
97 | job_id_prefix=f"{fqn.split('.')[-1]}_",
98 | job_config=job_config,
99 | )
100 |
101 | def upload_raster(
102 | self,
103 | file_path: str,
104 | fqn: str,
105 | bands_info: List[Tuple[int, str]] = [(1, None)],
106 | chunk_size: int = None,
107 | overwrite: bool = False,
108 | append: bool = False,
109 | cleanup_on_failure: bool = False,
110 | exact_stats: bool = False,
111 | basic_stats: bool = False,
112 | compress: bool = False,
113 | compression_level: int = 6,
114 | ):
115 | """Write a raster file to a BigQuery table."""
116 | print("Loading raster file to BigQuery...")
117 |
118 | append_records = False
119 |
120 | try:
121 | if self.check_if_table_exists(fqn) and not self.check_if_table_is_empty(
122 | fqn
123 | ):
124 | if overwrite:
125 | self.delete_bigquery_table(fqn)
126 | else:
127 | append_records = append or ask_yes_no_question(
128 | f"Table {fqn} already exists "
129 | "and is not empty. Append records? [yes/no] "
130 | )
131 |
132 | if not append_records:
133 | exit()
134 |
135 | metadata = rasterio_metadata(
136 | file_path,
137 | bands_info,
138 | self.band_rename_function,
139 | exact_stats,
140 | basic_stats,
141 | compress=compress,
142 | )
143 |
144 | overviews_records_gen = rasterio_overview_to_records(
145 | file_path,
146 | self.band_rename_function,
147 | bands_info,
148 | compress=compress,
149 | compression_level=compression_level,
150 | )
151 |
152 | windows_records_gen = rasterio_windows_to_records(
153 | file_path,
154 | self.band_rename_function,
155 | bands_info,
156 | compress=compress,
157 | compression_level=compression_level,
158 | )
159 | records_gen = chain(overviews_records_gen, windows_records_gen)
160 |
161 | if append_records:
162 | old_metadata = self.get_metadata(fqn)
163 | check_metadata_is_compatible(metadata, old_metadata)
164 | update_metadata(metadata, old_metadata)
165 |
166 | number_of_blocks = get_number_of_blocks(file_path)
167 | number_of_overview_tiles = get_number_of_overviews_blocks(file_path)
168 | total_blocks = number_of_blocks + number_of_overview_tiles
169 |
170 | if chunk_size is None:
171 | job = self.upload_records(records_gen, fqn)
172 | # raise error if job went wrong (blocking call)
173 | job.result()
174 | else:
175 | from tqdm.auto import tqdm
176 |
177 | jobs = []
178 | errors = []
179 | print(
180 | f"Writing {number_of_blocks} blocks and {number_of_overview_tiles} "
181 | "overview tiles to BigQuery..."
182 | )
183 | with tqdm(total=total_blocks) as pbar:
184 | if total_blocks < chunk_size:
185 | chunk_size = total_blocks
186 |
187 | def done_callback(job):
188 | pbar.update(job.num_records or 0)
189 | try:
190 | job.result()
191 | except Exception as e:
192 | errors.append(e)
193 | try:
194 | jobs.remove(job)
195 | except ValueError:
196 |                             # job was already removed because it failed
197 | pass
198 |
199 | processed_blocks = 0
200 | for records in batched(records_gen, chunk_size):
201 | job = self.upload_records(records, fqn)
202 | job.num_records = len(records)
203 | processed_blocks += len(records)
204 |
205 |                         job.add_done_callback(done_callback)
206 | jobs.append(job)
207 |
208 | # do not continue to schedule jobs if there are errors
209 | if len(errors):
210 | raise Exception(errors)
211 |
212 | # wait for end of jobs or any error
213 | while len(jobs) > 0 and len(errors) == 0:
214 | time.sleep(1)
215 |
216 | if len(errors):
217 | raise Exception(errors)
218 |
219 | empty_blocks = total_blocks - processed_blocks
220 | pbar.update(empty_blocks)
221 |
222 | print("Number of empty blocks: ", empty_blocks)
223 |
224 | print("Writing metadata to BigQuery...")
225 | self.write_metadata(metadata, append_records, fqn)
226 |
227 | print("Updating labels...")
228 | self.update_labels(fqn, self.get_labels(__version__))
229 |
230 | except IncompatibleRasterException as e:
231 | raise IOError("Error uploading to BigQuery: {}".format(e.message))
232 |
233 | except KeyboardInterrupt:
234 | delete = cleanup_on_failure or ask_yes_no_question(
235 | "Would you like to delete the partially uploaded table? [yes/no] "
236 | )
237 |
238 | if delete:
239 | self.delete_table(fqn)
240 |
241 | raise KeyboardInterrupt
242 |
243 | except rasterio.errors.CRSError as e:
244 | raise e
245 |
246 | except Exception as e:
247 | delete = cleanup_on_failure or ask_yes_no_question(
248 | (
249 | "Error uploading to BigQuery. "
250 | "Would you like to delete the partially uploaded table? [yes/no] "
251 | )
252 | )
253 |
254 | if delete:
255 | self.delete_table(fqn)
256 |
257 | import traceback
258 |
259 |             traceback.print_exc()
260 | raise IOError("Error uploading to BigQuery: {}".format(e))
261 |
262 | print("Done.")
263 | return True
264 |
265 | def delete_bigquery_table(self, fqn: str):
266 | try:
267 | self.client.delete_table(fqn, not_found_ok=True)
268 | return True
269 | except Exception:
270 | return False
271 |
272 | def check_if_table_exists(self, fqn: str):
273 | try:
274 | self.client.get_table(fqn)
275 | return True
276 | except Exception:
277 | return False
278 |
279 | def check_if_table_is_empty(self, fqn: str):
280 | table = self.client.get_table(fqn)
281 | return table.num_rows == 0
282 |
283 | def get_metadata(self, fqn):
284 | rows = self.execute(
285 | f"""
286 | SELECT metadata FROM {self.quote_name(fqn)} WHERE block = 0
287 | """
288 | )
289 |
290 | rows = list(rows)
291 | if len(rows) == 0:
292 | return None
293 |
294 | return json.loads(rows[0]["metadata"])
295 |
296 | def get_labels(self, version: str):
297 | return {
298 | "raster_loader": re.sub(r"[^a-z0-9_-]", "_", version.lower()),
299 | }
300 |
301 | def update_labels(self, fqn, labels):
302 | table = self.client.get_table(fqn)
303 | table.labels = labels
304 | table = self.client.update_table(table, ["labels"])
305 |
306 | def write_metadata(
307 | self,
308 | metadata,
309 | append_records,
310 | fqn,
311 | ):
312 | if append_records:
313 | self.execute(
314 | f"""
315 | UPDATE {self.quote_name(fqn)}
316 | SET metadata = (
317 | SELECT TO_JSON_STRING(
318 | PARSE_JSON(
319 | {self.quote(json.dumps(metadata))},
320 | wide_number_mode=>'round'
321 | )
322 | )
323 | ) WHERE block = 0
324 | """
325 | )
326 |
327 | return True
328 | else:
329 | return self.insert_in_table(
330 | [
331 | {
332 | # store metadata in the record with this block number
333 | "block": 0,
334 | "metadata": json.dumps(metadata),
335 | }
336 | ],
337 | fqn,
338 | )
339 |
--------------------------------------------------------------------------------
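Usage sketch for the connection above (not part of the repository; the project, dataset, table, and token values are hypothetical placeholders):

from raster_loader.io.bigquery import AccessTokenCredentials, BigQueryConnection

credentials = AccessTokenCredentials("<oauth-access-token>")
connection = BigQueryConnection(project="my-project", credentials=credentials)
connection.upload_raster(
    "mosaic_cog.tif",
    "my-project.my_dataset.my_table",
    chunk_size=1000,  # batch the blocks and report progress with tqdm
    overwrite=True,
)

Omitting chunk_size uploads everything in a single blocking load job; passing it schedules one load job per batch and tracks them asynchronously, as implemented above.
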
/raster_loader/io/databricks.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 | import rasterio
4 |
5 | from itertools import chain
6 | from raster_loader.errors import import_error_databricks, IncompatibleRasterException
7 | from raster_loader.utils import ask_yes_no_question, batched
8 | from raster_loader.io.common import (
9 | check_metadata_is_compatible,
10 | get_number_of_blocks,
11 | get_number_of_overviews_blocks,
12 | rasterio_metadata,
13 | rasterio_overview_to_records,
14 | rasterio_windows_to_records,
15 | update_metadata,
16 | )
17 |
18 | from typing import Iterable, List, Tuple
19 |
20 | try:
21 | from databricks.connect import DatabricksSession
22 | except ImportError: # pragma: no cover
23 | _has_databricks = False
24 | else:
25 | _has_databricks = True
26 |
27 | from raster_loader.io.datawarehouse import DataWarehouseConnection
28 |
29 |
30 | class DatabricksConnection(DataWarehouseConnection):
31 | def __init__(self, server_hostname, access_token, cluster_id, parallelism=200):
32 | # Validate required parameters
33 | if not server_hostname:
34 | raise ValueError("server_hostname cannot be null or empty")
35 | if not access_token:
36 | raise ValueError("access_token cannot be null or empty")
37 | if not cluster_id:
38 | raise ValueError("cluster_id cannot be null or empty")
39 | if not _has_databricks:
40 | import_error_databricks()
41 |
42 | # Normalize server_hostname by removing any 'https://' prefix
43 | self.server_hostname = server_hostname.replace("https://", "")
44 | self.access_token = access_token
45 | self.cluster_id = cluster_id
46 | self.parallelism = parallelism
47 |
48 | try:
49 | self.spark = DatabricksSession.builder.remote(
50 | host=f"https://{self.server_hostname}",
51 | token=access_token,
52 | cluster_id=cluster_id,
53 | ).getOrCreate()
54 | except Exception as e:
55 | print(f"Error initializing Spark session: {e}")
56 | raise
57 |
58 | def execute(self, query: str) -> list:
59 | """Execute a SQL query and return results as a list of rows"""
60 | try:
61 | return self.spark.sql(query).collect()
62 | except Exception as e:
63 | print(f"Error executing query: {e}")
64 | raise
65 |
66 | def execute_to_dataframe(self, query: str) -> "pd.DataFrame":
67 | """Execute a SQL query and return results as a pandas DataFrame"""
68 | try:
69 | return self.spark.sql(query).toPandas()
70 | except Exception as e:
71 | print(f"Error executing query: {e}")
72 | raise
73 |
74 | def quote(self, q):
75 | if isinstance(q, str):
76 | q = q.replace("'", "''")
77 | return f"'{q}'"
78 | return str(q)
79 |
80 | def quote_name(self, name):
81 | """Quote a table name with proper escaping for Databricks."""
82 | parts = name.replace("`", "").split(".")
83 | return ".".join(f"`{part}`" for part in parts)
84 |
85 | def upload_records(
86 | self,
87 | records: Iterable,
88 | fqn: str,
89 | overwrite: bool = False,
90 | parallelism: int = 1000,
91 | ):
92 | # Convert to Pandas DataFrame
93 | data_df = pd.DataFrame(records)
94 |
95 | # Drop metadata column if it exists
96 | if "metadata" in data_df.columns:
97 | data_df = data_df.drop(columns=["metadata"])
98 |
99 | if data_df.empty:
100 | print("No records to upload.")
101 | return True
102 |
103 | try:
104 | # Convert Pandas DataFrame to Spark DataFrame
105 | spark_df = self.spark.createDataFrame(data_df)
106 |
107 | # Write to Delta table
108 | write_mode = "overwrite" if overwrite else "append"
109 | (
110 | spark_df.repartition(parallelism)
111 | .write.format("delta")
112 | .mode(write_mode)
113 | .saveAsTable(fqn)
114 | )
115 |
116 | return True
117 | except Exception as e:
118 | print(f"Error uploading records: {str(e)}")
119 | return False
120 |
121 | def wait_for_cluster(self):
122 | """Wait for the Databricks cluster to be ready."""
123 | print("Waiting for Databricks cluster to be ready...")
124 | try:
125 | # Execute a simple SQL query that doesn't affect any tables
126 | self.execute("SELECT 1")
127 | except Exception as e:
128 | raise RuntimeError(f"Failed to connect to Databricks cluster: {str(e)}")
129 |
130 | def upload_raster(
131 | self,
132 | file_path: str,
133 | fqn: str,
134 | bands_info: List[Tuple[int, str]] = None,
135 | chunk_size: int = None,
136 | parallelism: int = 1000,
137 | overwrite: bool = False,
138 | append: bool = False,
139 | cleanup_on_failure: bool = False,
140 | exact_stats: bool = False,
141 | basic_stats: bool = False,
142 | compress: bool = False,
143 | compression_level: int = 6,
144 | ):
145 | """Write a raster file to a Databricks table."""
146 | # Wait for cluster to be ready before starting the upload
147 | self.wait_for_cluster()
148 |
149 | print("Loading raster file to Databricks...")
150 |
151 | bands_info = bands_info or [(1, None)]
152 | append_records = False
153 | fqn = self.quote_name(fqn)
154 |
155 | try:
156 | if self.check_if_table_exists(fqn) and not self.check_if_table_is_empty(
157 | fqn
158 | ):
159 | if overwrite:
160 | self.delete_table(fqn)
161 | else:
162 | append_records = append or ask_yes_no_question(
163 | f"Table {fqn} already exists "
164 | "and is not empty. Append records? [yes/no] "
165 | )
166 |
167 | if not append_records:
168 | exit()
169 |
170 | metadata = rasterio_metadata(
171 | file_path,
172 | bands_info,
173 | self.band_rename_function,
174 | exact_stats,
175 | basic_stats,
176 | compress=compress,
177 | )
178 |
179 | overviews_records_gen = rasterio_overview_to_records(
180 | file_path,
181 | self.band_rename_function,
182 | bands_info,
183 | compress=compress,
184 | compression_level=compression_level,
185 | )
186 |
187 | windows_records_gen = rasterio_windows_to_records(
188 | file_path,
189 | self.band_rename_function,
190 | bands_info,
191 | compress=compress,
192 | compression_level=compression_level,
193 | )
194 |
195 | records_gen = chain(overviews_records_gen, windows_records_gen)
196 |
197 | if append_records:
198 | old_metadata = self.get_metadata(fqn)
199 | check_metadata_is_compatible(metadata, old_metadata)
200 | update_metadata(metadata, old_metadata)
201 |
202 | number_of_blocks = get_number_of_blocks(file_path)
203 | number_of_overview_tiles = get_number_of_overviews_blocks(file_path)
204 | total_blocks = number_of_blocks + number_of_overview_tiles
205 |
206 | if chunk_size is None:
207 | success = self.upload_records(records_gen, fqn, overwrite, parallelism)
208 | if not success:
209 | raise IOError("Error uploading to Databricks.")
210 | else:
211 | from tqdm.auto import tqdm
212 |
213 | processed_blocks = 0
214 | print(
215 | f"Writing {number_of_blocks} blocks and {number_of_overview_tiles} "
216 | "overview tiles to Databricks..."
217 | )
218 | with tqdm(total=total_blocks) as pbar:
219 | if total_blocks < chunk_size:
220 | chunk_size = total_blocks
221 |                     is_first_batch = True
222 |
223 | for records in batched(records_gen, chunk_size):
224 | ret = self.upload_records(
225 |                             records, fqn, overwrite and is_first_batch, parallelism
226 | )
227 |
228 | num_records = len(records)
229 | processed_blocks += num_records
230 | pbar.update(num_records)
231 | if not ret:
232 | raise IOError("Error uploading to Databricks.")
233 |                         is_first_batch = False
234 |
235 | empty_blocks = total_blocks - processed_blocks
236 | pbar.update(empty_blocks)
237 |
238 | print("Number of empty blocks: ", empty_blocks)
239 |
240 | print("Writing metadata to Databricks...")
241 | self.write_metadata(metadata, append_records, fqn)
242 |
243 | except IncompatibleRasterException as e:
244 | raise IOError("Error uploading to Databricks: {}".format(e.message))
245 |
246 | except KeyboardInterrupt:
247 | delete = cleanup_on_failure or ask_yes_no_question(
248 | "Would you like to delete the partially uploaded table? [yes/no] "
249 | )
250 |
251 | if delete:
252 | self.delete_table(fqn)
253 |
254 | raise KeyboardInterrupt
255 |
256 | except rasterio.errors.CRSError as e:
257 | raise e
258 |
259 | except Exception as e:
260 | delete = cleanup_on_failure or ask_yes_no_question(
261 | "Error uploading to Databricks. "
262 | "Would you like to delete the partially uploaded table? [yes/no] "
263 | )
264 |
265 | if delete:
266 | self.delete_table(fqn)
267 |
268 | import traceback
269 |
270 |             traceback.print_exc()
271 | raise IOError("Error uploading to Databricks: {}".format(e))
272 |
273 | print("Done.")
274 | return True
275 |
276 | def check_if_table_exists(self, fqn: str):
277 | try:
278 | self.execute(f"DESCRIBE TABLE {fqn}")
279 | return True
280 | except Exception:
281 | return False
282 |
283 | def check_if_table_is_empty(self, fqn: str):
284 | try:
285 | result = self.execute(f"SELECT COUNT(*) FROM {fqn}")
286 | return result[0][0] == 0
287 | except Exception:
288 | return True
289 |
290 | def get_metadata(self, fqn):
291 | rows = self.execute(
292 | f"""
293 | SELECT metadata FROM {fqn} WHERE block = 0
294 | """
295 | )
296 | if not rows:
297 | return None
298 | return json.loads(rows[0][0])
299 |
300 | def write_metadata(self, metadata, append_records, fqn):
301 | if append_records:
302 | self.execute(
303 | f"""
304 | UPDATE {fqn}
305 | SET metadata = {self.quote(json.dumps(metadata))}
306 | WHERE block = 0
307 | """
308 | )
309 | else:
310 | self.execute(
311 | f"""
312 | ALTER TABLE {fqn}
313 | ADD COLUMN metadata STRING;
314 | """
315 | )
316 | return self.insert_in_table(
317 | [
318 | {
319 | # store metadata in the record with this block number
320 | "block": 0,
321 | "metadata": json.dumps(metadata),
322 | }
323 | ],
324 | fqn,
325 | )
326 |
--------------------------------------------------------------------------------
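Usage sketch for the connection above (not part of the repository; the hostname, token, cluster id, and table identifiers are hypothetical placeholders):

from raster_loader.io.databricks import DatabricksConnection

connection = DatabricksConnection(
    server_hostname="adb-1234567890123456.7.azuredatabricks.net",
    access_token="<personal-access-token>",
    cluster_id="0123-456789-abcdefgh",
)
connection.upload_raster(
    "mosaic_cog.tif",
    "my_catalog.my_schema.my_table",
    chunk_size=1000,  # batch the blocks; the first batch honors overwrite
    overwrite=True,
)
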
/raster_loader/io/datawarehouse.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from typing import List
4 |
5 |
6 | class DataWarehouseConnection:
7 | def __init__(self, *args, **kwargs):
8 | pass
9 |
10 | def execute(self, sql):
11 | """Execute a SQL query."""
12 | raise NotImplementedError
13 |
14 | def execute_to_dataframe(self, sql):
15 | """Execute a SQL query and return the result as a pandas dataframe.
16 | Parameters
17 | ----------
18 | sql : str
19 | SQL query to execute.
20 | Returns
21 | -------
22 | pandas.DataFrame
23 | Result of the query.
24 | """
25 | raise NotImplementedError
26 |
27 | def create_table(self, fqn):
28 | """Create a table.
29 | Parameters
30 | ----------
31 | fqn : str
32 | Fully qualified name of the table.
33 | """
34 |         self.execute(f"CREATE TABLE IF NOT EXISTS {self.quote_name(fqn)}")
35 |
36 | def delete_table(self, fqn):
37 | """Delete a table.
38 | Parameters
39 | ----------
40 | fqn : str
41 | Fully qualified name of the table.
42 | """
43 | self.execute(f"DROP TABLE IF EXISTS {self.quote_name(fqn)}")
44 |
45 | def quote(self, value):
46 | """Quote a value.
47 | Parameters
48 | ----------
49 | value : str
50 | Value to quote.
51 | Returns
52 | -------
53 | str
54 | Quoted value.
55 | """
56 | if isinstance(value, str):
57 |             value = value.replace("\\", "\\\\").replace("'", "''")
58 | return f"'{value}'"
59 | return str(value)
60 |
61 | def upload_raster(
62 | self,
63 | file_path: str,
64 | fqn: str,
65 | band: int = 1,
66 | band_name: str = None,
67 | chunk_size: int = 10000,
68 | overwrite: bool = False,
69 | append: bool = False,
70 | ):
71 | """Upload a raster file to the data warehouse.
72 | Parameters
73 | ----------
74 | file_path : str
75 | Path to the raster file.
76 | fqn : str
77 | Fully qualified name of the table.
78 | band : int, optional
79 | Band to upload, by default 1
80 | band_name : str, optional
81 | Name of the band
82 | chunk_size : int, optional
83 | Number of blocks to upload in each chunk, by default 10000
84 | overwrite : bool, optional
85 | Overwrite existing data in the table if it already exists, by default False
86 | append : bool, optional
87 | Append records into a table if it already exists, by default False
88 | """
89 | raise NotImplementedError
90 |
91 | def get_records(self, fqn: str, limit=10) -> pd.DataFrame:
92 | """Get records from a table.
93 | Parameters
94 | ----------
95 | fqn : str
96 | Fully qualified name of the table.
97 | limit : int, optional
98 | Maximum number of records to return, by default 10
99 | Returns
100 | -------
101 | pandas.DataFrame
102 | Records from the table.
103 | """
104 | query = f"SELECT * FROM {self.quote_name(fqn)} LIMIT {limit}"
105 | return self.execute_to_dataframe(query)
106 |
107 | def band_rename_function(self, band_name: str):
108 | return band_name
109 |
110 | def insert_in_table(
111 | self,
112 | rows: List[dict],
113 | fqn: str,
114 | ) -> bool:
115 | """Insert records into a table.
116 | Parameters
117 | ----------
118 | rows : List[dict]
119 | Records to insert.
120 | fqn : str
121 | Fully qualified name of the table.
122 | Returns
123 | -------
124 | bool
125 | True if the insertion was successful, False otherwise.
126 | """
127 | columns = rows[0].keys()
128 | values = ",".join(
129 | [
130 | "(" + ",".join([self.quote(row[column]) for column in columns]) + ")"
131 | for row in rows
132 | ]
133 | )
134 | query = f"""
135 | INSERT INTO {self.quote_name(fqn)}({','.join(columns)})
136 | VALUES {values}
137 | """
138 | self.execute(query)
139 |
140 | return True
141 |
142 | def write_metadata(
143 | self,
144 | metadata,
145 | append_records,
146 | fqn,
147 | ):
148 | """Write metadata to a table.
149 | Parameters
150 | ----------
151 | metadata : dict
152 | Metadata to write.
153 | append_records : bool
154 | Whether to update the metadata of an existing table or insert a new record.
155 | fqn : str
156 | Fully qualified name of the table.
157 | Returns
158 | -------
159 | bool
160 | True if the insertion was successful, False otherwise.
161 | """
162 | raise NotImplementedError
163 |
164 | def get_metadata(self, fqn):
165 | """Get metadata from a table.
166 | Parameters
167 | ----------
168 | fqn : str
169 | Fully qualified name of the table.
170 | Returns
171 | -------
172 | dict
173 | Metadata from the table.
174 | """
175 | raise NotImplementedError
176 |
177 | def quote_name(self, name):
178 | """Quote a table name.
179 | Parameters
180 | ----------
181 | name : str
182 | Name to quote.
183 | Returns
184 | -------
185 | str
186 | Quoted name.
187 | """
188 |         return name
189 |
--------------------------------------------------------------------------------
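A minimal sketch of a concrete subclass, backed by an in-memory SQLite database, just to illustrate which methods the base class expects; it is not part of the package. Only execute and execute_to_dataframe are needed for the inherited helpers (insert_in_table, get_records, delete_table) to work:

import sqlite3

import pandas as pd

from raster_loader.io.datawarehouse import DataWarehouseConnection


class SQLiteConnection(DataWarehouseConnection):
    """Smallest possible concrete subclass, for illustration only."""

    def __init__(self, path=":memory:"):
        self.conn = sqlite3.connect(path)

    def execute(self, sql):
        cursor = self.conn.execute(sql)
        self.conn.commit()
        return cursor.fetchall()

    def execute_to_dataframe(self, sql):
        return pd.read_sql_query(sql, self.conn)


connection = SQLiteConnection()
connection.execute("CREATE TABLE t (block INTEGER, metadata TEXT)")
connection.insert_in_table([{"block": 0, "metadata": "{}"}], "t")
print(connection.get_records("t"))
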
/raster_loader/io/snowflake.py:
--------------------------------------------------------------------------------
1 | import json
2 | import rasterio
3 | import pandas as pd
4 |
5 | from itertools import chain
6 | from typing import Iterable, List, Tuple
7 |
8 | from raster_loader.errors import (
9 | IncompatibleRasterException,
10 | import_error_snowflake,
11 | )
12 |
13 | from raster_loader.utils import ask_yes_no_question, batched
14 |
15 | from raster_loader.io.common import (
16 | rasterio_metadata,
17 | rasterio_overview_to_records,
18 | rasterio_windows_to_records,
19 | get_number_of_blocks,
20 | get_number_of_overviews_blocks,
21 | check_metadata_is_compatible,
22 | update_metadata,
23 | )
24 | from raster_loader.io.datawarehouse import DataWarehouseConnection
25 |
26 | try:
27 | from snowflake.connector.pandas_tools import write_pandas
28 | import snowflake.connector
29 | except ImportError: # pragma: no cover
30 | _has_snowflake = False
31 | else:
32 | _has_snowflake = True
33 |
34 |
35 | class SnowflakeConnection(DataWarehouseConnection):
36 | def __init__(
37 | self,
38 | username,
39 | password,
40 | account,
41 | database,
42 | schema,
43 | token,
44 | private_key_path,
45 | private_key_passphrase,
46 | role,
47 | warehouse,
48 | ):
49 | if not _has_snowflake:
50 | import_error_snowflake()
51 |
52 | # TODO: Write a proper static factory for this
53 | if token is not None:
54 | self.client = snowflake.connector.connect(
55 | authenticator="oauth",
56 | token=token,
57 | account=account,
58 | database=database.upper(),
59 | schema=schema.upper(),
60 | role=role.upper() if role is not None else None,
61 | warehouse=warehouse,
62 | )
63 | elif private_key_path is not None:
64 | self.client = snowflake.connector.connect(
65 | authenticator="snowflake_jwt",
66 | user=username,
67 | private_key_file=private_key_path,
68 | private_key_file_pwd=private_key_passphrase,
69 | account=account,
70 | database=database.upper(),
71 | schema=schema.upper(),
72 | role=role.upper() if role is not None else None,
73 | warehouse=warehouse,
74 | )
75 | else:
76 | self.client = snowflake.connector.connect(
77 | user=username,
78 | password=password,
79 | account=account,
80 | database=database.upper(),
81 | schema=schema.upper(),
82 | role=role.upper() if role is not None else None,
83 | warehouse=warehouse,
84 | )
85 |
86 | def band_rename_function(self, band_name: str):
87 | return band_name
88 |
89 | def write_metadata(
90 | self,
91 | metadata,
92 | append_records,
93 | fqn,
94 | ):
95 | fqn = fqn.upper()
96 | if append_records:
97 | query = f"""
98 | UPDATE {fqn}
99 | SET metadata = (
100 | SELECT TO_JSON(
101 | PARSE_JSON(
102 | {self.quote(json.dumps(metadata))}
103 | )
104 | )
105 | ) WHERE block = 0
106 | """
107 |
108 | self.execute(query)
109 |
110 | return True
111 | else:
112 | self.execute(
113 | f"""
114 | ALTER TABLE {fqn}
115 | ADD COLUMN metadata STRING;
116 | """
117 | )
118 | return self.insert_in_table(
119 | [
120 | {
121 | # store metadata in the record with this block number
122 | "BLOCK": 0,
123 | "METADATA": json.dumps(metadata),
124 | }
125 | ],
126 | fqn,
127 | )
128 |
129 | def get_metadata(self, fqn: str):
130 | query = f"""
131 | SELECT metadata
132 | FROM {fqn.upper()}
133 | WHERE block = 0
134 | """
135 | result = self.execute(query)
136 | if len(result) == 0:
137 | return None
138 | return json.loads(result[0][0])
139 |
140 | def upload_records(
141 | self,
142 | records: Iterable,
143 | fqn: str,
144 | overwrite: bool,
145 | ):
146 | # Convert to Pandas DataFrame
147 | data_df = pd.DataFrame(records)
148 |
149 | # Drop metadata column if it exists
150 | if "METADATA" in data_df.columns:
151 | data_df = data_df.drop(columns=["METADATA"])
152 |
153 | database, schema, table = fqn.upper().split(".")
154 |
155 | return write_pandas(
156 | conn=self.client,
157 | df=data_df,
158 | table_name=table,
159 | database=database,
160 | schema=schema,
161 | chunk_size=10000,
162 | auto_create_table=True,
163 | overwrite=overwrite,
164 | )[0]
165 |
166 | def execute(self, sql):
167 | return self.client.cursor().execute(sql).fetchall()
168 |
169 | def execute_to_dataframe(self, sql):
170 | return self.client.cursor().execute(sql).fetch_pandas_all()
171 |
172 | def check_if_table_exists(self, fqn: str): # pragma: no cover
173 | database, schema, table = fqn.upper().split(".")
174 | query = f"""
175 | SELECT *
176 | FROM {database}.INFORMATION_SCHEMA.TABLES
177 | WHERE TABLE_SCHEMA = '{schema}'
178 | AND TABLE_NAME = '{table}';
179 | """
180 | res = self.execute(query)
181 |
182 | return len(res) > 0
183 |
184 | def check_if_table_is_empty(
185 | self,
186 | fqn: str,
187 | ): # pragma: no cover
188 | database, schema, table = fqn.split(".")
189 | query = f"""
190 | SELECT ROW_COUNT
191 | FROM {database}.INFORMATION_SCHEMA.TABLES
192 | WHERE TABLE_SCHEMA = '{schema.upper()}'
193 | AND TABLE_NAME = '{table.upper()}';
194 | """
195 | res = self.execute(query)
196 |         return res[0][0] == 0
197 |
198 | def upload_raster(
199 | self,
200 | file_path: str,
201 | fqn: str,
202 | bands_info: List[Tuple[int, str]] = None,
203 | chunk_size: int = None,
204 | overwrite: bool = False,
205 | append: bool = False,
206 | cleanup_on_failure: bool = False,
207 | exact_stats: bool = False,
208 | basic_stats: bool = False,
209 | compress: bool = False,
210 | compression_level: int = 6,
211 | ) -> bool:
212 | """Write a raster file to a Snowflake table."""
213 |
214 | def band_rename_function(x):
215 | return x.upper()
216 |
217 | print("Loading raster file to Snowflake...")
218 |
219 | bands_info = bands_info or [(1, None)]
220 |
221 | append_records = False
222 |
223 | fqn = fqn.upper()
224 |
225 | try:
226 | if (
227 | self.check_if_table_exists(fqn)
228 | and not self.check_if_table_is_empty(fqn)
229 | and not overwrite
230 | ):
231 | append_records = append or ask_yes_no_question(
232 | f"Table {fqn} already exists "
233 | "and is not empty. Append records? [yes/no] "
234 | )
235 |
236 | if not append_records:
237 | exit()
238 |
239 | metadata = rasterio_metadata(
240 | file_path,
241 | bands_info,
242 | band_rename_function,
243 | exact_stats,
244 | basic_stats,
245 | compress,
246 | )
247 |
248 | overviews_records_gen = rasterio_overview_to_records(
249 | file_path,
250 | band_rename_function,
251 | bands_info,
252 | compress=compress,
253 | compression_level=compression_level,
254 | )
255 | windows_records_gen = rasterio_windows_to_records(
256 | file_path,
257 | band_rename_function,
258 | bands_info,
259 | compress=compress,
260 | compression_level=compression_level,
261 | )
262 |
263 | records_gen = chain(overviews_records_gen, windows_records_gen)
264 |
265 | number_of_blocks = get_number_of_blocks(file_path)
266 | number_of_overview_tiles = get_number_of_overviews_blocks(file_path)
267 | total_blocks = number_of_blocks + number_of_overview_tiles
268 |
269 | if chunk_size is None:
270 | ret = self.upload_records(records_gen, fqn, overwrite)
271 | if not ret:
272 | raise IOError("Error uploading to Snowflake.")
273 | else:
274 | from tqdm.auto import tqdm
275 |
276 | processed_blocks = 0
277 | print(
278 | f"Writing {number_of_blocks} blocks and {number_of_overview_tiles} "
279 | "overview tiles to Snowflake..."
280 | )
281 | with tqdm(total=total_blocks) as pbar:
282 | if total_blocks < chunk_size:
283 | chunk_size = total_blocks
284 |                     is_first_batch = True
285 |
286 | for records in batched(records_gen, chunk_size):
287 | ret = self.upload_records(
288 |                             records, fqn, overwrite and is_first_batch
289 | )
290 | num_records = len(records)
291 | processed_blocks += num_records
292 | pbar.update(num_records)
293 |
294 | if not ret:
295 | raise IOError("Error uploading to Snowflake.")
296 |                         is_first_batch = False
297 |
298 | empty_blocks = total_blocks - processed_blocks
299 | pbar.update(empty_blocks)
300 |
301 | print("Number of empty blocks: ", empty_blocks)
302 |
303 | print("Writing metadata to Snowflake...")
304 | if append_records:
305 | old_metadata = self.get_metadata(fqn)
306 | check_metadata_is_compatible(metadata, old_metadata)
307 | update_metadata(metadata, old_metadata)
308 |
309 | self.write_metadata(metadata, append_records, fqn)
310 |
311 | except IncompatibleRasterException as e:
312 | raise IOError("Error uploading to Snowflake: {}".format(e.message))
313 |
314 | except KeyboardInterrupt:
315 | delete = cleanup_on_failure or ask_yes_no_question(
316 | "Would you like to delete the partially uploaded table? [yes/no] "
317 | )
318 |
319 | if delete:
320 | self.delete_table(fqn)
321 |
322 | raise KeyboardInterrupt
323 |
324 | except rasterio.errors.CRSError as e:
325 | raise e
326 |
327 | except Exception as e:
328 | delete = cleanup_on_failure or ask_yes_no_question(
329 | (
330 | "Error uploading to Snowflake. "
331 | "Would you like to delete the partially uploaded table? [yes/no] "
332 | )
333 | )
334 |
335 | if delete:
336 | self.delete_table(fqn)
337 |
338 | raise IOError("Error uploading to Snowflake: {}".format(e))
339 |
340 | print("Done.")
341 | return True
342 |
--------------------------------------------------------------------------------
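Usage sketch for the connection above (not part of the repository; the account, database, and warehouse identifiers are hypothetical placeholders). Password auth is shown; passing token or private_key_path instead selects the OAuth or key-pair branch of the constructor:

from raster_loader.io.snowflake import SnowflakeConnection

connection = SnowflakeConnection(
    username="my_user",
    password="<secret>",
    account="orgname-account",
    database="my_db",
    schema="my_schema",
    token=None,
    private_key_path=None,
    private_key_passphrase=None,
    role=None,
    warehouse="my_wh",
)
connection.upload_raster(
    "mosaic_cog.tif",
    "my_db.my_schema.my_table",
    chunk_size=1000,
)
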
/raster_loader/tests/.env.sample:
--------------------------------------------------------------------------------
1 | BQ_PROJECT_ID=carto-raster-loader
2 | BQ_DATASET_ID=brendan_dev
3 |
4 | SF_USERNAME=your_username
5 | SF_PASSWORD=your_password
6 | SF_ACCOUNT=your_account
7 | SF_DATABASE=your_database
8 | SF_SCHEMA=your_schema
9 | SF_ROLE=your_role
10 |
11 | DB_SERVER_HOSTNAME=your_server_hostname
12 | DB_TOKEN=your_token
13 | DB_CLUSTER_ID=your_cluster_id
14 | DB_CATALOG=your_catalog
15 | DB_SCHEMA=your_schema
--------------------------------------------------------------------------------
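A minimal sketch of loading this file into the process environment, mirroring the loader used in the test modules below (the path is an assumption; adjust it to wherever you copied the sample):

import os

env_path = os.path.join("raster_loader", "tests", ".env")
if os.path.isfile(env_path):
    with open(env_path) as env_file:
        for line in env_file:
            line = line.strip()
            if line and not line.startswith("#"):
                key, value = line.split("=", 1)
                os.environ.setdefault(key, value)
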
/raster_loader/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/__init__.py
--------------------------------------------------------------------------------
/raster_loader/tests/bigquery/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/bigquery/__init__.py
--------------------------------------------------------------------------------
/raster_loader/tests/bigquery/test_cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest.mock import patch
3 |
4 | from click.testing import CliRunner
5 | import pandas as pd
6 |
7 | from raster_loader.cli import main
8 |
9 |
10 | here = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
11 | fixtures = os.path.join(here, "fixtures")
12 | tiff = os.path.join(fixtures, "mosaic_cog.tif")
13 |
14 |
15 | @patch("raster_loader.cli.bigquery.BigQueryConnection.upload_raster", return_value=None)
16 | @patch("raster_loader.cli.bigquery.BigQueryConnection.__init__", return_value=None)
17 | def test_bigquery_upload(*args, **kwargs):
18 | runner = CliRunner()
19 | result = runner.invoke(
20 | main,
21 | [
22 | "bigquery",
23 | "upload",
24 | "--file_path",
25 | f"{tiff}",
26 | "--project",
27 | "project",
28 | "--dataset",
29 | "dataset",
30 | "--table",
31 | "table",
32 | "--chunk_size",
33 | 1,
34 | "--band",
35 | 1,
36 | ],
37 | )
38 | assert result.exit_code == 0
39 |
40 |
41 | @patch("raster_loader.cli.bigquery.BigQueryConnection.upload_raster", return_value=None)
42 | @patch("raster_loader.cli.bigquery.BigQueryConnection.__init__", return_value=None)
43 | def test_bigquery_upload_with_basic_stats(*args, **kwargs):
44 | runner = CliRunner()
45 | result = runner.invoke(
46 | main,
47 | [
48 | "bigquery",
49 | "upload",
50 | "--file_path",
51 | f"{tiff}",
52 | "--project",
53 | "project",
54 | "--dataset",
55 | "dataset",
56 | "--table",
57 | "table",
58 | "--chunk_size",
59 | 1,
60 | "--band",
61 | 1,
62 | "--basic_stats",
63 | ],
64 | )
65 | assert result.exit_code == 0
66 |
67 |
68 | @patch("raster_loader.cli.bigquery.BigQueryConnection.upload_raster", return_value=None)
69 | @patch("raster_loader.cli.bigquery.BigQueryConnection.__init__", return_value=None)
70 | def test_bigquery_upload_with_all_stats(*args, **kwargs):
71 | runner = CliRunner()
72 | result = runner.invoke(
73 | main,
74 | [
75 | "bigquery",
76 | "upload",
77 | "--file_path",
78 | f"{tiff}",
79 | "--project",
80 | "project",
81 | "--dataset",
82 | "dataset",
83 | "--table",
84 | "table",
85 | "--chunk_size",
86 | 1,
87 | "--band",
88 | 1,
89 | ],
90 | )
91 | assert result.exit_code == 0
92 |
93 |
94 | @patch("raster_loader.cli.bigquery.BigQueryConnection.upload_raster", return_value=None)
95 | @patch("raster_loader.cli.bigquery.BigQueryConnection.__init__", return_value=None)
96 | def test_bigquery_upload_with_exact_stats(*args, **kwargs):
97 | runner = CliRunner()
98 | result = runner.invoke(
99 | main,
100 | [
101 | "bigquery",
102 | "upload",
103 | "--file_path",
104 | f"{tiff}",
105 | "--project",
106 | "project",
107 | "--dataset",
108 | "dataset",
109 | "--table",
110 | "table",
111 | "--chunk_size",
112 | 1,
113 | "--band",
114 | 1,
115 | "--exact_stats",
116 | ],
117 | )
118 | assert result.exit_code == 0
119 |
120 |
121 | @patch("raster_loader.cli.bigquery.BigQueryConnection.upload_raster", return_value=None)
122 | @patch("raster_loader.cli.bigquery.BigQueryConnection.__init__", return_value=None)
123 | def test_bigquery_file_path_or_url_check(*args, **kwargs):
124 | runner = CliRunner()
125 | result = runner.invoke(
126 | main,
127 | [
128 | "bigquery",
129 | "upload",
130 | "--project",
131 | "project",
132 | "--dataset",
133 | "dataset",
134 | "--table",
135 | "table",
136 | "--chunk_size",
137 | 1,
138 | "--band",
139 | 1,
140 | ],
141 | )
142 | assert result.exit_code == 1
143 | assert "Either --file_path or --file_url must be provided" in result.output
144 |
145 | result = runner.invoke(
146 | main,
147 | [
148 | "bigquery",
149 | "upload",
150 | "--file_path",
151 | f"{tiff}",
152 | "--file_url",
153 | "http://example.com/raster.tif",
154 | "--project",
155 | "project",
156 | "--dataset",
157 | "dataset",
158 | "--table",
159 | "table",
160 | "--chunk_size",
161 | 1,
162 | "--band",
163 | 1,
164 | ],
165 | )
166 | assert result.exit_code == 1
167 | assert "Only one of --file_path or --file_url must be provided" in result.output
168 |
169 |
170 | @patch("raster_loader.cli.bigquery.BigQueryConnection.upload_raster", return_value=None)
171 | @patch("raster_loader.cli.bigquery.BigQueryConnection.__init__", return_value=None)
172 | def test_bigquery_upload_multiple_bands(*args, **kwargs):
173 | runner = CliRunner()
174 | result = runner.invoke(
175 | main,
176 | [
177 | "bigquery",
178 | "upload",
179 | "--file_path",
180 | f"{tiff}",
181 | "--project",
182 | "project",
183 | "--dataset",
184 | "dataset",
185 | "--table",
186 | "table",
187 | "--chunk_size",
188 | 1,
189 | "--band",
190 | 1,
191 | "--band",
192 | 2,
193 | ],
194 | )
195 | assert result.exit_code == 0
196 |
197 |
198 | @patch("raster_loader.cli.bigquery.BigQueryConnection.__init__", return_value=None)
199 | def test_bigquery_fail_upload_multiple_bands_misaligned_with_band_names(
200 | *args, **kwargs
201 | ):
202 | runner = CliRunner()
203 | result = runner.invoke(
204 | main,
205 | [
206 | "bigquery",
207 | "upload",
208 | "--file_path",
209 | f"{tiff}",
210 | "--project",
211 | "project",
212 | "--dataset",
213 | "dataset",
214 | "--table",
215 | "table",
216 | "--chunk_size",
217 | 1,
218 | "--band",
219 | 1,
220 | "--band_name",
221 | "band_1",
222 | "--band",
223 | 2,
224 | ],
225 | )
226 | assert result.exit_code == 1
227 |
228 | assert "The number of bands must equal the number of band names." in result.output
229 |
230 |
231 | @patch("raster_loader.cli.bigquery.BigQueryConnection.upload_raster", return_value=None)
232 | @patch("raster_loader.cli.bigquery.BigQueryConnection.__init__", return_value=None)
233 | def test_bigquery_upload_multiple_bands_aligned_with_band_names(*args, **kwargs):
234 | runner = CliRunner()
235 | result = runner.invoke(
236 | main,
237 | [
238 | "bigquery",
239 | "upload",
240 | "--file_path",
241 | f"{tiff}",
242 | "--project",
243 | "project",
244 | "--dataset",
245 | "dataset",
246 | "--table",
247 | "table",
248 | "--chunk_size",
249 | 1,
250 | "--band",
251 | 1,
252 | "--band_name",
253 | "band_1",
254 | "--band_name",
255 | "band_2",
256 | "--band",
257 | 2,
258 | ],
259 | )
260 | assert result.exit_code == 0
261 |
262 |
263 | @patch("raster_loader.cli.bigquery.BigQueryConnection.upload_raster", return_value=None)
264 | @patch("raster_loader.cli.bigquery.BigQueryConnection.__init__", return_value=None)
265 | def test_bigquery_upload_no_table_name(*args, **kwargs):
266 | runner = CliRunner()
267 | result = runner.invoke(
268 | main,
269 | [
270 | "bigquery",
271 | "upload",
272 | "--file_path",
273 | f"{tiff}",
274 | "--project",
275 | "project",
276 | "--dataset",
277 | "dataset",
278 | "--chunk_size",
279 | 1,
280 | "--band",
281 | 1,
282 | ],
283 | )
284 | assert result.exit_code == 0
285 | assert "Table: mosaic_cog_band__1___" in result.output
286 |
287 |
288 | @patch(
289 | "raster_loader.io.bigquery.BigQueryConnection.get_records",
290 | return_value=pd.DataFrame.from_dict({"col_1": [1, 2], "col_2": ["a", "b"]}),
291 | )
292 | @patch("raster_loader.cli.bigquery.BigQueryConnection.__init__", return_value=None)
293 | def test_bigquery_describe(*args, **kwargs):
294 | runner = CliRunner()
295 | result = runner.invoke(
296 | main,
297 | [
298 | "bigquery",
299 | "describe",
300 | "--project",
301 | "project",
302 | "--dataset",
303 | "dataset",
304 | "--table",
305 | "table",
306 | ],
307 | )
308 | assert result.exit_code == 0
309 |
310 |
311 | def test_info(*args, **kwargs):
312 | runner = CliRunner()
313 | result = runner.invoke(main, ["info"])
314 |
315 | assert result.exit_code == 0
316 | assert "Raster Loader version" in result.output
317 | assert "Python version" in result.output
318 | assert "Platform" in result.output
319 | assert "System version" in result.output
320 | assert "Machine" in result.output
321 | assert "Processor" in result.output
322 | assert "Architecture" in result.output
323 |
--------------------------------------------------------------------------------
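A sketch of running the test suite programmatically while skipping the integration tests, which need real warehouse credentials (this assumes the integration_test marker is registered in the repository's pytest.ini):

import sys

import pytest

# deselect tests marked with @pytest.mark.integration_test
sys.exit(pytest.main(["-m", "not integration_test", "raster_loader/tests"]))
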
/raster_loader/tests/databricks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/databricks/__init__.py
--------------------------------------------------------------------------------
/raster_loader/tests/databricks/test_cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest.mock import patch
3 |
4 | from click.testing import CliRunner
5 | import pandas as pd
6 |
7 | from raster_loader.cli import main
8 |
9 |
10 | here = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
11 | fixtures = os.path.join(here, "fixtures")
12 | tiff = os.path.join(fixtures, "mosaic_cog.tif")
13 |
14 |
15 | @patch(
16 | "raster_loader.io.databricks.DatabricksConnection.upload_raster", return_value=None
17 | )
18 | @patch("raster_loader.io.databricks.DatabricksConnection.__init__", return_value=None)
19 | def test_databricks_upload(*args, **kwargs):
20 | runner = CliRunner()
21 | result = runner.invoke(
22 | main,
23 | [
24 | "databricks",
25 | "upload",
26 | "--server-hostname",
27 | "test.cloud.databricks.com",
28 | "--token",
29 | "test-token",
30 | "--cluster-id",
31 | "test-cluster-id",
32 | "--file_path",
33 | f"{tiff}",
34 | "--catalog",
35 | "catalog",
36 | "--schema",
37 | "schema",
38 | "--table",
39 | "table",
40 | "--chunk_size",
41 | 1,
42 | "--band",
43 | 1,
44 | ],
45 | )
46 | assert result.exit_code == 0
47 |
48 |
49 | @patch(
50 | "raster_loader.io.databricks.DatabricksConnection.upload_raster", return_value=None
51 | )
52 | @patch("raster_loader.io.databricks.DatabricksConnection.__init__", return_value=None)
53 | def test_databricks_upload_with_basic_stats(*args, **kwargs):
54 | runner = CliRunner()
55 | result = runner.invoke(
56 | main,
57 | [
58 | "databricks",
59 | "upload",
60 | "--server-hostname",
61 | "test.cloud.databricks.com",
62 | "--token",
63 | "test-token",
64 | "--cluster-id",
65 | "test-cluster-id",
66 | "--file_path",
67 | f"{tiff}",
68 | "--catalog",
69 | "catalog",
70 | "--schema",
71 | "schema",
72 | "--table",
73 | "table",
74 | "--chunk_size",
75 | 1,
76 | "--band",
77 | 1,
78 | "--basic_stats",
79 | ],
80 | )
81 | assert result.exit_code == 0
82 |
83 |
84 | @patch(
85 | "raster_loader.io.databricks.DatabricksConnection.upload_raster", return_value=None
86 | )
87 | @patch("raster_loader.io.databricks.DatabricksConnection.__init__", return_value=None)
88 | def test_databricks_upload_with_all_stats(*args, **kwargs):
89 | runner = CliRunner()
90 | result = runner.invoke(
91 | main,
92 | [
93 | "databricks",
94 | "upload",
95 | "--server-hostname",
96 | "test.cloud.databricks.com",
97 | "--token",
98 | "test-token",
99 | "--cluster-id",
100 | "test-cluster-id",
101 | "--file_path",
102 | f"{tiff}",
103 | "--catalog",
104 | "catalog",
105 | "--schema",
106 | "schema",
107 | "--table",
108 | "table",
109 | "--chunk_size",
110 | 1,
111 | "--band",
112 | 1,
113 | ],
114 | )
115 | assert result.exit_code == 0
116 |
117 |
118 | @patch(
119 | "raster_loader.io.databricks.DatabricksConnection.upload_raster", return_value=None
120 | )
121 | @patch("raster_loader.io.databricks.DatabricksConnection.__init__", return_value=None)
122 | def test_databricks_upload_with_exact_stats(*args, **kwargs):
123 | runner = CliRunner()
124 | result = runner.invoke(
125 | main,
126 | [
127 | "databricks",
128 | "upload",
129 | "--server-hostname",
130 | "test.cloud.databricks.com",
131 | "--token",
132 | "test-token",
133 | "--cluster-id",
134 | "test-cluster-id",
135 | "--file_path",
136 | f"{tiff}",
137 | "--catalog",
138 | "catalog",
139 | "--schema",
140 | "schema",
141 | "--table",
142 | "table",
143 | "--chunk_size",
144 | 1,
145 | "--band",
146 | 1,
147 | "--exact_stats",
148 | ],
149 | )
150 | assert result.exit_code == 0
151 |
152 |
153 | @patch("raster_loader.io.databricks.DatabricksConnection.__init__", return_value=None)
154 | def test_databricks_credentials_validation(*args, **kwargs):
155 | runner = CliRunner()
156 | result = runner.invoke(
157 | main,
158 | [
159 | "databricks",
160 | "upload",
161 | "--server-hostname",
162 | "test.cloud.databricks.com",
163 | "--token",
164 | "test-token",
165 | "--file_path",
166 | f"{tiff}",
167 | "--catalog",
168 | "catalog",
169 | "--schema",
170 | "schema",
171 | "--table",
172 | "table",
173 | "--chunk_size",
174 | 1,
175 | "--band",
176 | 1,
177 | ],
178 | )
179 | assert result.exit_code == 2
180 | assert "Missing option '--cluster-id'" in result.output
181 |
182 |
183 | @patch("raster_loader.io.databricks.DatabricksConnection.__init__", return_value=None)
184 | def test_databricks_file_path_or_url_check(*args, **kwargs):
185 | runner = CliRunner()
186 | result = runner.invoke(
187 | main,
188 | [
189 | "databricks",
190 | "upload",
191 | "--server-hostname",
192 | "test.cloud.databricks.com",
193 | "--token",
194 | "test-token",
195 | "--cluster-id",
196 | "test-cluster-id",
197 | "--catalog",
198 | "catalog",
199 | "--schema",
200 | "schema",
201 | "--table",
202 | "table",
203 | "--chunk_size",
204 | 1,
205 | "--band",
206 | 1,
207 | ],
208 | )
209 | assert result.exit_code == 1
210 | assert "Either --file_path or --file_url must be provided" in result.output
211 |
212 | result = runner.invoke(
213 | main,
214 | [
215 | "databricks",
216 | "upload",
217 | "--file_path",
218 | f"{tiff}",
219 | "--file_url",
220 | "http://example.com/raster.tif",
221 | "--server-hostname",
222 | "test.cloud.databricks.com",
223 | "--token",
224 | "test-token",
225 | "--cluster-id",
226 | "test-cluster-id",
227 | "--catalog",
228 | "catalog",
229 | "--schema",
230 | "schema",
231 | "--table",
232 | "table",
233 | "--chunk_size",
234 | 1,
235 | "--band",
236 | 1,
237 | ],
238 | )
239 | assert result.exit_code == 1
240 | assert "Only one of --file_path or --file_url must be provided" in result.output
241 |
242 |
243 | @patch(
244 | "raster_loader.io.databricks.DatabricksConnection.upload_raster", return_value=None
245 | )
246 | @patch("raster_loader.io.databricks.DatabricksConnection.__init__", return_value=None)
247 | def test_databricks_upload_multiple_bands(*args, **kwargs):
248 | runner = CliRunner()
249 | result = runner.invoke(
250 | main,
251 | [
252 | "databricks",
253 | "upload",
254 | "--server-hostname",
255 | "test.cloud.databricks.com",
256 | "--token",
257 | "test-token",
258 | "--cluster-id",
259 | "test-cluster-id",
260 | "--file_path",
261 | f"{tiff}",
262 | "--catalog",
263 | "catalog",
264 | "--schema",
265 | "schema",
266 | "--table",
267 | "table",
268 | "--chunk_size",
269 | 1,
270 | "--band",
271 | 1,
272 | "--band",
273 | 2,
274 | ],
275 | )
276 | assert result.exit_code == 0
277 |
278 |
279 | def test_databricks_fail_upload_multiple_bands_misaligned_with_band_names(
280 | *args, **kwargs
281 | ):
282 | runner = CliRunner()
283 | result = runner.invoke(
284 | main,
285 | [
286 | "databricks",
287 | "upload",
288 | "--server-hostname",
289 | "test.cloud.databricks.com",
290 | "--token",
291 | "test-token",
292 | "--cluster-id",
293 | "test-cluster-id",
294 | "--file_path",
295 | f"{tiff}",
296 | "--catalog",
297 | "catalog",
298 | "--schema",
299 | "schema",
300 | "--table",
301 | "table",
302 | "--chunk_size",
303 | 1,
304 | "--band",
305 | 1,
306 | "--band_name",
307 | "band_1",
308 | "--band",
309 | 2,
310 | ],
311 | )
312 | assert result.exit_code == 1
313 | assert "The number of bands must equal the number of band names." in result.output
314 |
315 |
316 | @patch(
317 | "raster_loader.io.databricks.DatabricksConnection.upload_raster", return_value=None
318 | )
319 | @patch("raster_loader.io.databricks.DatabricksConnection.__init__", return_value=None)
320 | def test_databricks_upload_multiple_bands_aligned_with_band_names(*args, **kwargs):
321 | runner = CliRunner()
322 | result = runner.invoke(
323 | main,
324 | [
325 | "databricks",
326 | "upload",
327 | "--server-hostname",
328 | "test.cloud.databricks.com",
329 | "--token",
330 | "test-token",
331 | "--cluster-id",
332 | "test-cluster-id",
333 | "--file_path",
334 | f"{tiff}",
335 | "--catalog",
336 | "catalog",
337 | "--schema",
338 | "schema",
339 | "--table",
340 | "table",
341 | "--chunk_size",
342 | 1,
343 | "--band",
344 | 1,
345 | "--band_name",
346 | "band_1",
347 | "--band_name",
348 | "band_2",
349 | "--band",
350 | 2,
351 | ],
352 | )
353 | assert result.exit_code == 0
354 |
355 |
356 | @patch(
357 | "raster_loader.io.databricks.DatabricksConnection.upload_raster", return_value=None
358 | )
359 | @patch("raster_loader.io.databricks.DatabricksConnection.__init__", return_value=None)
360 | def test_databricks_upload_no_table_name(*args, **kwargs):
361 | runner = CliRunner()
362 | result = runner.invoke(
363 | main,
364 | [
365 | "databricks",
366 | "upload",
367 | "--server-hostname",
368 | "test.cloud.databricks.com",
369 | "--token",
370 | "test-token",
371 | "--cluster-id",
372 | "test-cluster-id",
373 | "--file_path",
374 | f"{tiff}",
375 | "--catalog",
376 | "catalog",
377 | "--schema",
378 | "schema",
379 | "--chunk_size",
380 | 1,
381 | "--band",
382 | 1,
383 | ],
384 | )
385 | assert result.exit_code == 0
386 | assert "Table: mosaic_cog_band__1___" in result.output
387 |
388 |
389 | @patch(
390 | "raster_loader.io.databricks.DatabricksConnection.execute_to_dataframe",
391 | return_value=pd.DataFrame.from_dict({"col_1": [1, 2], "col_2": ["a", "b"]}),
392 | )
393 | @patch("raster_loader.io.databricks.DatabricksConnection.__init__", return_value=None)
394 | def test_databricks_describe(*args, **kwargs):
395 | runner = CliRunner()
396 | result = runner.invoke(
397 | main,
398 | [
399 | "databricks",
400 | "describe",
401 | "--server-hostname",
402 | "test.cloud.databricks.com",
403 | "--token",
404 | "test-token",
405 | "--cluster-id",
406 | "test-cluster-id",
407 | "--catalog",
408 | "catalog",
409 | "--schema",
410 | "schema",
411 | "--table",
412 | "table",
413 | ],
414 | )
415 | assert result.exit_code == 0
416 |
--------------------------------------------------------------------------------
/raster_loader/tests/databricks/test_io.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import sys
4 | import json
5 | from unittest.mock import patch
6 |
7 | import pandas as pd
8 | import pytest
9 |
10 | from raster_loader.tests import mocks
11 | from raster_loader.io.databricks import DatabricksConnection
12 |
13 |
14 | HERE = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15 | fixtures_dir = os.path.join(HERE, "fixtures")
16 |
17 |
18 | should_swap = {"=": sys.byteorder != "little", "<": False, ">": True, "|": False}
19 |
20 | env_filename = os.path.join(HERE, ".env")
21 | if os.path.isfile(env_filename):
22 | with open(env_filename) as env_file:
23 | for line in env_file:
24 | line = line.strip()
25 | if line:
26 |                 var, value = line.split("=", 1)
27 | os.environ[var] = value
28 |
29 | DB_SERVER_HOSTNAME = os.environ.get("DB_SERVER_HOSTNAME")
30 | DB_TOKEN = os.environ.get("DB_TOKEN")
31 | DB_CLUSTER_ID = os.environ.get("DB_CLUSTER_ID")
32 | DB_CATALOG = os.environ.get("DB_CATALOG")
33 | DB_SCHEMA = os.environ.get("DB_SCHEMA")
34 |
35 |
36 | def check_integration_config():
37 | if not all([DB_SERVER_HOSTNAME, DB_TOKEN, DB_CLUSTER_ID, DB_CATALOG, DB_SCHEMA]):
38 |         raise Exception(
39 |             "You need to copy tests/.env.sample to tests/.env and set your "
40 |             "configuration before running the tests"
41 |         )
42 |
43 |
44 | @pytest.mark.integration_test
45 | def test_rasterio_to_databricks_with_raster_default_band_name():
46 | check_integration_config()
47 |
48 | table_name = "test_mosaic_1"
49 | fqn = f"`{DB_CATALOG}`.`{DB_SCHEMA}`.`{table_name}`"
50 |
51 | connection = DatabricksConnection(
52 | server_hostname=DB_SERVER_HOSTNAME,
53 | access_token=DB_TOKEN,
54 | cluster_id=DB_CLUSTER_ID,
55 | )
56 |
57 | connection.upload_raster(
58 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
59 | fqn,
60 | overwrite=True,
61 | )
62 |
63 | result = connection.execute_to_dataframe(f"SELECT * FROM {fqn} LIMIT 20")
64 |
65 | expected_dataframe = pd.read_pickle(
66 | os.path.join(fixtures_dir, "expected_default_column.pkl")
67 | )
68 | expected_dataframe = expected_dataframe.sort_values("block")
69 |
70 | assert sorted(result.columns) == sorted(expected_dataframe.columns)
71 | assert sorted(
72 | list(result.block), key=lambda x: x if x is not None else -math.inf
73 | ) == sorted(
74 | list(expected_dataframe.block), key=lambda x: x if x is not None else -math.inf
75 | )
76 | assert sorted(
77 | list(result.metadata), key=lambda x: x if x is not None else ""
78 | ) == sorted(
79 | list(expected_dataframe.metadata), key=lambda x: x if x is not None else ""
80 | )
81 | assert sorted(
82 | list(result.band_1), key=lambda x: x if x is not None else b""
83 | ) == sorted(
84 | list(expected_dataframe.band_1), key=lambda x: x if x is not None else b""
85 | )
86 |
87 |
88 | @pytest.mark.integration_test
89 | def test_rasterio_to_databricks_appending_rows():
90 | check_integration_config()
91 |
92 | table_name = "test_mosaic_append_rows"
93 | fqn = f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}"
94 |
95 | connection = DatabricksConnection(
96 | server_hostname=DB_SERVER_HOSTNAME,
97 | access_token=DB_TOKEN,
98 | cluster_id=DB_CLUSTER_ID,
99 | )
100 |
101 | connection.upload_raster(
102 | os.path.join(fixtures_dir, "mosaic_cog_1_1.tif"),
103 | fqn,
104 | overwrite=True,
105 | )
106 |
107 | result = connection.get_records(fqn, 20)
108 |
109 | metadata = json.loads([x for x in list(result.metadata) if x][0])
110 |
111 | assert metadata == {
112 | "pixel_resolution": 13,
113 | "block_resolution": 5,
114 | "minresolution": 5,
115 | "maxresolution": 5,
116 | "nodata": None,
117 | "bands": [{"type": "uint8", "name": "band_1"}],
118 | "bounds": [
119 | 11.249999999997055,
120 | 40.979898069622585,
121 | 22.49999999999707,
122 | 48.92249926376037,
123 | ],
124 | "center": [16.874999999997062, 44.951198666691475, 5],
125 | "width": 256,
126 | "height": 256,
127 | "block_width": 256,
128 | "block_height": 256,
129 | "num_blocks": 1,
130 | "num_pixels": 65536,
131 | }
132 |
133 | connection.upload_raster(
134 | os.path.join(fixtures_dir, "mosaic_cog_1_2.tif"),
135 | fqn,
136 | append=True,
137 | )
138 |
139 | result = connection.get_records(fqn, 20)
140 |
141 | metadata = json.loads([x for x in list(result.metadata) if x][0])
142 |
143 | assert metadata == {
144 | "bands": [{"name": "band_1", "type": "uint8"}],
145 | "block_height": 256,
146 | "block_width": 256,
147 | "bounds": [
148 | 11.249999999997055,
149 | 40.979898069622585,
150 | 33.74999999999708,
151 | 48.92249926376037,
152 | ],
153 | "center": [22.499999999997065, 44.95119866669148, 5],
154 | "height": 256,
155 | "maxresolution": 5,
156 | "minresolution": 5,
157 | "nodata": None,
158 | "num_blocks": 2,
159 | "num_pixels": 131072,
160 | "block_resolution": 5,
161 | "pixel_resolution": 13,
162 | "width": 512,
163 | }
164 |
165 | assert len(result) == 3
166 |
167 |
168 | @pytest.mark.integration_test
169 | def test_rasterio_to_databricks_with_raster_custom_band_column():
170 | check_integration_config()
171 |
172 | table_name = "test_mosaic_custom_band_column_1"
173 | fqn = f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}"
174 |
175 | connection = DatabricksConnection(
176 | server_hostname=DB_SERVER_HOSTNAME,
177 | access_token=DB_TOKEN,
178 | cluster_id=DB_CLUSTER_ID,
179 | )
180 |
181 | connection.upload_raster(
182 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
183 | fqn,
184 | overwrite=True,
185 | bands_info=[(1, "customband")],
186 | )
187 |
188 | result = connection.get_records(fqn, 20)
189 |
190 | # sort value because return query can vary the order of rows
191 | result = result.sort_values("block")
192 |
193 | expected_dataframe = pd.read_pickle(
194 | os.path.join(fixtures_dir, "expected_custom_column.pkl")
195 | )
196 | expected_dataframe = expected_dataframe.sort_values("block")
197 |
198 | assert sorted(result.columns) == sorted(expected_dataframe.columns)
199 | assert sorted(
200 | list(result.block), key=lambda x: x if x is not None else -math.inf
201 | ) == sorted(
202 | list(expected_dataframe.block), key=lambda x: x if x is not None else -math.inf
203 | )
204 | assert sorted(
205 | list(result.metadata), key=lambda x: x if x is not None else ""
206 | ) == sorted(
207 | list(expected_dataframe.metadata), key=lambda x: x if x is not None else ""
208 | )
209 | assert sorted(
210 | list(result.customband), key=lambda x: x if x is not None else b""
211 | ) == sorted(
212 | list(expected_dataframe.customband), key=lambda x: x if x is not None else b""
213 | )
214 |
215 |
216 | @pytest.mark.integration_test
217 | def test_rasterio_to_databricks_with_raster_multiple_default():
218 | check_integration_config()
219 |
220 | table_name = "test_mosaic_multiple_default_bands"
221 | fqn = f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}"
222 |
223 | connection = DatabricksConnection(
224 | server_hostname=DB_SERVER_HOSTNAME,
225 | access_token=DB_TOKEN,
226 | cluster_id=DB_CLUSTER_ID,
227 | )
228 |
229 | connection.upload_raster(
230 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
231 | fqn,
232 | overwrite=True,
233 | bands_info=[(1, None), (2, None)],
234 | )
235 |
236 | result = connection.get_records(fqn, 20)
237 |
238 |     # sort values because the query can return rows in any order
239 | result = result.sort_values("block")
240 |
241 | expected_dataframe = pd.read_pickle(
242 | os.path.join(fixtures_dir, "expected_multiple_column.pkl")
243 | )
244 | expected_dataframe = expected_dataframe.sort_values("block")
245 |
246 | assert sorted(result.columns) == sorted(expected_dataframe.columns)
247 | assert sorted(
248 | list(result.block), key=lambda x: x if x is not None else -math.inf
249 | ) == sorted(
250 | list(expected_dataframe.block), key=lambda x: x if x is not None else -math.inf
251 | )
252 | assert sorted(
253 | list(result.metadata), key=lambda x: x if x is not None else ""
254 | ) == sorted(
255 | list(expected_dataframe.metadata), key=lambda x: x if x is not None else ""
256 | )
257 | assert sorted(
258 | list(result.band_1), key=lambda x: x if x is not None else b""
259 | ) == sorted(
260 | list(expected_dataframe.band_1), key=lambda x: x if x is not None else b""
261 | )
262 | assert sorted(
263 | list(result.band_2), key=lambda x: x if x is not None else b""
264 | ) == sorted(
265 | list(expected_dataframe.band_2), key=lambda x: x if x is not None else b""
266 | )
267 |
268 |
269 | @pytest.mark.integration_test
270 | def test_rasterio_to_databricks_with_raster_multiple_custom():
271 | check_integration_config()
272 |
273 | table_name = "test_mosaic_multiple_custom_bands"
274 | fqn = f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}"
275 |
276 | connection = DatabricksConnection(
277 | server_hostname=DB_SERVER_HOSTNAME,
278 | access_token=DB_TOKEN,
279 | cluster_id=DB_CLUSTER_ID,
280 | )
281 |
282 | connection.upload_raster(
283 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
284 | fqn,
285 | overwrite=True,
286 | bands_info=[(1, "custom_band_1"), (2, "custom_band_2")],
287 | )
288 |
289 | result = connection.get_records(fqn, 20)
290 |
291 |     # sort values because the query can return rows in any order
292 | result = result.sort_values("block")
293 |
294 | expected_dataframe = pd.read_pickle(
295 | os.path.join(fixtures_dir, "expected_custom_multiple_column.pkl")
296 | )
297 | expected_dataframe = expected_dataframe.sort_values("block")
298 |
299 | assert sorted(result.columns) == sorted(expected_dataframe.columns)
300 | assert sorted(
301 | list(result.block), key=lambda x: x if x is not None else -math.inf
302 | ) == sorted(
303 | list(expected_dataframe.block), key=lambda x: x if x is not None else -math.inf
304 | )
305 | assert sorted(
306 | list(result.metadata), key=lambda x: x if x is not None else ""
307 | ) == sorted(
308 | list(expected_dataframe.metadata), key=lambda x: x if x is not None else ""
309 | )
310 | assert sorted(
311 | list(result.custom_band_1), key=lambda x: x if x is not None else b""
312 | ) == sorted(
313 | list(expected_dataframe.custom_band_1),
314 | key=lambda x: x if x is not None else b"",
315 | )
316 | assert sorted(
317 | list(result.custom_band_2), key=lambda x: x if x is not None else b""
318 | ) == sorted(
319 | list(expected_dataframe.custom_band_2),
320 | key=lambda x: x if x is not None else b"",
321 | )
322 |
323 |
324 | @patch("raster_loader.io.databricks.ask_yes_no_question", return_value=False)
325 | def test_rasterio_to_table_wrong_band_name_metadata(*args, **kwargs):
326 | table_name = "test_mosaic_custom_band_column_1"
327 | connection = mocks.MockDatabricksConnection()
328 |
329 | with pytest.raises(IOError):
330 | connection.upload_raster(
331 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
332 | f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}",
333 | overwrite=True,
334 | bands_info=[(1, "metadata"), (2, "custom_band_2")],
335 | )
336 |
337 |
338 | @patch("raster_loader.io.databricks.ask_yes_no_question", return_value=False)
339 | def test_rasterio_to_table_wrong_band_name_block(*args, **kwargs):
340 | table_name = "test_mosaic_custom_band_column_1"
341 | connection = mocks.MockDatabricksConnection()
342 |
343 | with pytest.raises(IOError):
344 | connection.upload_raster(
345 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
346 | f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}",
347 | overwrite=True,
348 | bands_info=[(1, "block"), (2, "custom_band_2")],
349 | )
350 |
351 |
352 | @patch(
353 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_exists",
354 | return_value=False,
355 | )
356 | @patch("raster_loader.io.databricks.ask_yes_no_question", return_value=False)
357 | def test_rasterio_to_table(*args, **kwargs):
358 | table_name = "test_mosaic_custom_band_column_1"
359 | connection = mocks.MockDatabricksConnection()
360 | success = connection.upload_raster(
361 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
362 | f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}",
363 | )
364 | assert success
365 |
366 |
367 | # Define the standard metadata that will be used for the tests
368 | STANDARD_METADATA = {
369 | "bounds": [0, 0, 0, 0],
370 | "block_resolution": 5,
371 | "nodata": 0,
372 | "block_width": 256,
373 | "block_height": 256,
374 | "bands": [
375 | {
376 | "type": "uint8",
377 | "name": "band_1",
378 | "colorinterp": "red",
379 | "stats": {
380 | "min": 0.0,
381 | "max": 255.0,
382 | "mean": 28.66073989868164,
383 | "stddev": 41.5693439511935,
384 | "count": 100000,
385 | "sum": 2866073.989868164,
386 | "sum_squares": 1e15,
387 | "approximated_stats": False,
388 | "top_values": [1, 2, 3],
389 | "version": "0.0.3",
390 | },
391 | "nodata": "0",
392 |
393 | "colortable": None,
394 | }
395 | ],
396 | "num_blocks": 1,
397 | "num_pixels": 1,
398 | }
399 |
400 |
401 | @patch(
402 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_exists",
403 | return_value=True,
404 | )
405 | @patch(
406 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_is_empty",
407 | return_value=True,
408 | )
409 | @patch(
410 | "raster_loader.io.databricks.DatabricksConnection.delete_table", return_value=None
411 | )
412 | @patch("raster_loader.io.common.rasterio_windows_to_records", return_value={})
413 | @patch("raster_loader.io.common.rasterio_metadata", return_value={})
414 | @patch("raster_loader.io.common.get_number_of_blocks", return_value=1)
415 | @patch(
416 | "raster_loader.io.databricks.DatabricksConnection.write_metadata", return_value=None
417 | )
418 | def test_rasterio_to_table_overwrite(*args, **kwargs):
419 | table_name = "test_mosaic_custom_band_column_1"
420 | connection = mocks.MockDatabricksConnection()
421 | success = connection.upload_raster(
422 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
423 | f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}",
424 | overwrite=True,
425 | )
426 | assert success
427 |
428 |
429 | @patch(
430 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_exists",
431 | return_value=True,
432 | )
433 | @patch(
434 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_is_empty",
435 | return_value=False,
436 | )
437 | @patch(
438 | "raster_loader.io.databricks.DatabricksConnection.delete_table", return_value=None
439 | )
440 | @patch("raster_loader.io.databricks.ask_yes_no_question", return_value=True)
441 | @patch(
442 | "raster_loader.io.databricks.DatabricksConnection.get_metadata",
443 | return_value=STANDARD_METADATA,
444 | )
445 | def test_rasterio_to_table_is_not_empty_append(*args, **kwargs):
446 | table_name = "test_mosaic_custom_band_column_1"
447 | connection = mocks.MockDatabricksConnection()
448 | success = connection.upload_raster(
449 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
450 | f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}",
451 | )
452 | assert success
453 |
454 |
455 | @patch(
456 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_exists",
457 | return_value=True,
458 | )
459 | @patch(
460 | "raster_loader.io.databricks.DatabricksConnection.delete_table", return_value=None
461 | )
462 | @patch(
463 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_is_empty",
464 | return_value=False,
465 | )
466 | @patch("raster_loader.io.databricks.ask_yes_no_question", return_value=False)
467 | def test_rasterio_to_table_is_not_empty_dont_append(*args, **kwargs):
468 | table_name = "test_mosaic_custom_band_column_1"
469 | connection = mocks.MockDatabricksConnection()
470 | with pytest.raises(SystemExit):
471 | connection.upload_raster(
472 | os.path.join(fixtures_dir, "mosaic.tif"),
473 | f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}",
474 | )
475 |
476 |
477 | @patch(
478 | "raster_loader.io.databricks.DatabricksConnection.upload_records",
479 | side_effect=Exception(),
480 | )
481 | @patch(
482 | "raster_loader.io.databricks.DatabricksConnection.delete_table", return_value=True
483 | )
484 | @patch("raster_loader.io.databricks.ask_yes_no_question", return_value=True)
485 | def test_rasterio_to_table_uploading_error(*args, **kwargs):
486 | table_name = "test_mosaic_custom_band_column_1"
487 | connection = mocks.MockDatabricksConnection()
488 | with pytest.raises(IOError):
489 | connection.upload_raster(
490 | os.path.join(fixtures_dir, "mosaic.tif"),
491 | f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}",
492 | )
493 |
494 |
495 | @patch(
496 | "raster_loader.io.databricks.DatabricksConnection.upload_records",
497 | side_effect=KeyboardInterrupt(),
498 | )
499 | @patch(
500 | "raster_loader.io.databricks.DatabricksConnection.delete_table", return_value=True
501 | )
502 | @patch("raster_loader.io.databricks.ask_yes_no_question", return_value=True)
503 | @patch(
504 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_exists",
505 | return_value=False,
506 | )
507 | def test_rasterio_to_table_keyboard_interrupt(*args, **kwargs):
508 | table_name = "test_mosaic_custom_band_column_1"
509 | connection = mocks.MockDatabricksConnection()
510 | with pytest.raises(KeyboardInterrupt):
511 | connection.upload_raster(
512 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
513 | f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}",
514 | )
515 |
516 |
517 | @patch(
518 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_exists",
519 | return_value=False,
520 | )
521 | def test_rasterio_to_table_with_chunk_size(*args, **kwargs):
522 | table_name = "test_mosaic_custom_band_column_1"
523 | connection = mocks.MockDatabricksConnection()
524 | success = connection.upload_raster(
525 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
526 | f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}",
527 | chunk_size=10000,
528 | )
529 | assert success
530 |
531 |
532 | @patch(
533 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_exists",
534 | return_value=False,
535 | )
536 | def test_rasterio_to_table_with_one_chunk_size(*args, **kwargs):
537 | table_name = "test_mosaic_custom_band_column_1"
538 | connection = mocks.MockDatabricksConnection()
539 | success = connection.upload_raster(
540 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
541 | f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}",
542 | chunk_size=1,
543 | )
544 | assert success
545 |
546 |
547 | @patch(
548 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_exists",
549 | return_value=False,
550 | )
551 | def test_rasterio_to_table_invalid_raster(*args, **kwargs):
552 | table_name = "test_mosaic_custom_band_column_1"
553 | connection = mocks.MockDatabricksConnection()
554 | with pytest.raises(OSError):
555 | connection.upload_raster(
556 | os.path.join(fixtures_dir, "mosaic.tif"),
557 | f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}",
558 | chunk_size=10000,
559 | )
560 |
561 |
562 | @patch(
563 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_exists",
564 | return_value=True,
565 | )
566 | @patch(
567 | "raster_loader.io.databricks.DatabricksConnection.delete_table", return_value=None
568 | )
569 | @patch(
570 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_is_empty",
571 | return_value=False,
572 | )
573 | @patch("raster_loader.io.databricks.ask_yes_no_question", return_value=True)
574 | @patch(
575 | "raster_loader.io.databricks.DatabricksConnection.get_metadata",
576 | return_value=STANDARD_METADATA,
577 | )
578 | def test_rasterio_to_databricks_valid_raster(*args, **kwargs):
579 | table_name = "test_mosaic_valid_raster"
580 | connection = mocks.MockDatabricksConnection()
581 | success = connection.upload_raster(
582 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
583 | f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}",
584 | )
585 | assert success
586 |
587 |
588 | @patch(
589 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_exists",
590 | return_value=True,
591 | )
592 | @patch(
593 | "raster_loader.io.databricks.DatabricksConnection.delete_table", return_value=None
594 | )
595 | @patch(
596 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_is_empty",
597 | return_value=False,
598 | )
599 | @patch("raster_loader.io.databricks.ask_yes_no_question", return_value=True)
600 | @patch(
601 | "raster_loader.io.databricks.DatabricksConnection.get_metadata",
602 | return_value={"bounds": [0, 0, 0, 0], "block_resolution": 1},
603 | )
604 | def test_append_with_different_resolution(*args, **kwargs):
605 | table_name = "test_different_resolution"
606 | connection = mocks.MockDatabricksConnection()
607 | with pytest.raises(OSError):
608 | connection.upload_raster(
609 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
610 | f"{DB_CATALOG}.{DB_SCHEMA}.{table_name}",
611 | )
612 |
613 |
614 | @patch(
615 | "raster_loader.io.databricks.DatabricksConnection.get_metadata",
616 | return_value={
617 | **STANDARD_METADATA,
618 | "compression": "gzip",
619 | },
620 | )
621 | @patch(
622 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_exists",
623 | return_value=True,
624 | )
625 | @patch(
626 | "raster_loader.io.databricks.DatabricksConnection.delete_table", return_value=None
627 | )
628 | @patch(
629 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_is_empty",
630 | return_value=False,
631 | )
632 | @patch("raster_loader.io.databricks.ask_yes_no_question", return_value=True)
633 | @patch(
634 | "raster_loader.io.databricks.DatabricksConnection.write_metadata", return_value=None
635 | )
636 | def test_rasterio_to_databricks_with_compression(*args, **kwargs):
637 | connection = mocks.MockDatabricksConnection()
638 | success = connection.upload_raster(
639 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
640 | f"`{DB_CATALOG}`.`{DB_SCHEMA}`.`test_table`",
641 | compress=True,
642 | )
643 | assert success
644 |
645 |
646 | @patch(
647 | "raster_loader.io.databricks.DatabricksConnection.get_metadata",
648 | return_value={
649 | **STANDARD_METADATA,
650 | "compression": "gzip",
651 | },
652 | )
653 | @patch(
654 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_exists",
655 | return_value=True,
656 | )
657 | @patch(
658 | "raster_loader.io.databricks.DatabricksConnection.delete_table", return_value=None
659 | )
660 | @patch(
661 | "raster_loader.io.databricks.DatabricksConnection.check_if_table_is_empty",
662 | return_value=False,
663 | )
664 | @patch("raster_loader.io.databricks.ask_yes_no_question", return_value=True)
665 | @patch(
666 | "raster_loader.io.databricks.DatabricksConnection.write_metadata", return_value=None
667 | )
668 | def test_rasterio_to_databricks_with_compression_level(*args, **kwargs):
669 | connection = mocks.MockDatabricksConnection()
670 | success = connection.upload_raster(
671 | os.path.join(fixtures_dir, "mosaic_cog.tif"),
672 | f"`{DB_CATALOG}`.`{DB_SCHEMA}`.`test_table`",
673 | compress=True,
674 | compression_level=3,
675 | )
676 | assert success
677 |
--------------------------------------------------------------------------------
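The integration tests above run only against a live workspace: `check_integration_config()` (defined near the top of this test module) gates every `integration_test`-marked case on the Databricks connection settings, which the suite reads from the environment (see `raster_loader/tests/.env.sample`). A minimal sketch of what such a guard can look like; the variable names and skip behavior are assumptions, not the repo's exact implementation:

```python
# Hedged sketch of an integration guard: skip the test unless every
# Databricks connection setting is present in the environment.
import os

import pytest

REQUIRED_VARS = (
    "DB_SERVER_HOSTNAME",
    "DB_TOKEN",
    "DB_CLUSTER_ID",
    "DB_CATALOG",
    "DB_SCHEMA",
)


def check_integration_config():
    if not all(os.environ.get(var) for var in REQUIRED_VARS):
        pytest.skip("Databricks integration settings are not configured")
```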
/raster_loader/tests/fixtures/expected_blocksize_512.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/expected_blocksize_512.pkl
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/expected_custom_column.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/expected_custom_column.npy
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/expected_custom_column.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/expected_custom_column.pkl
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/expected_custom_multiple_column.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/expected_custom_multiple_column.npy
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/expected_custom_multiple_column.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/expected_custom_multiple_column.pkl
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/expected_default_column.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/expected_default_column.npy
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/expected_default_column.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/expected_default_column.pkl
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/expected_default_multiple_column.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/expected_default_multiple_column.npy
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/expected_multiple_column.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/expected_multiple_column.pkl
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/mosaic.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/mosaic.tif
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/mosaic_cog.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/mosaic_cog.tif
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/mosaic_cog_1_1.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/mosaic_cog_1_1.tif
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/mosaic_cog_1_2.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/mosaic_cog_1_2.tif
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/mosaic_cog_2_1.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/mosaic_cog_2_1.tif
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/mosaic_cog_2_2.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/mosaic_cog_2_2.tif
--------------------------------------------------------------------------------
/raster_loader/tests/fixtures/mosaic_cog_512.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/fixtures/mosaic_cog_512.tif
--------------------------------------------------------------------------------
/raster_loader/tests/mocks.py:
--------------------------------------------------------------------------------
1 | from threading import Timer
2 | from functools import partial
3 |
4 | from raster_loader.io.bigquery import BigQueryConnection
5 | from raster_loader.io.snowflake import SnowflakeConnection
6 | from raster_loader.io.databricks import DatabricksConnection
7 |
8 |
9 | def bigquery_client(load_error=False):
10 | class BigQueryClient:
11 | def __init__(self, load_error):
12 | self.load_error = load_error
13 |
14 | def load_table_from_dataframe(self, *args, **kwargs):
15 | if load_error: # pragma: no cover
16 | raise Exception
17 |
18 | class job(object):
19 | def result():
20 | return True
21 |
22 | def add_done_callback(callback):
23 |                 # Simulate the async behavior of a BigQuery load job:
24 |                 # fire the done callback shortly after, as if the chunk
25 |                 # upload had just completed.
26 |                 Timer(0.2, partial(callback, job)).start()
27 |
28 | return job
29 |
30 | def query(self, query):
31 | class job(object):
32 | def result():
33 | return True
34 |
35 | return job
36 |
37 | def create_table(self, table):
38 | return True
39 |
40 | return BigQueryClient(load_error=load_error)
41 |
42 |
43 | class MockBigQueryConnection(BigQueryConnection):
44 | def __init__(self, *args, **kwargs):
45 | self.client = bigquery_client()
46 |
47 |
48 | def snowflake_client(load_error=False):
49 | class SnowflakeClient:
50 | def __init__(self, load_error):
51 | self.load_error = load_error
52 |
53 | def cursor(self):
54 | return self
55 |
56 | def execute(self, *args, **kwargs):
57 | return self
58 |
59 | def fetchall(self):
60 | return [[1]]
61 |
62 | def fetchone(self):
63 | return [1]
64 |
65 | def close(self):
66 | return True
67 |
68 | def _log_telemetry_job_data(self, *args, **kwargs):
69 | return True
70 |
71 | return SnowflakeClient(load_error=load_error)
72 |
73 |
74 | class MockSnowflakeConnection(SnowflakeConnection):
75 | def __init__(self, *args, **kwargs):
76 | self.client = snowflake_client()
77 |
78 |
79 | def databricks_session():
80 | class SparkSession:
81 | def sql(self, query):
82 | class DataFrame:
83 | def toPandas(self):
84 | import pandas as pd
85 |
86 | return pd.DataFrame({"col_1": [1, 2], "col_2": ["a", "b"]})
87 |
88 | def collect(self):
89 | return [[1]]
90 |
91 | def repartition(self, n):
92 | return self
93 |
94 |             @property
95 |             def write(self):
96 |                 return self
97 | def format(self, fmt):
98 | return self
99 |
100 | def mode(self, mode):
101 | return self
102 |
103 | def saveAsTable(self, table_name):
104 | return True
105 |
106 | return DataFrame()
107 |
108 | def createDataFrame(self, data, schema=None):
109 | class DataFrame:
110 | def repartition(self, n):
111 | return self
112 |
113 | @property
114 | def write(self):
115 | return self
116 |
117 | def format(self, fmt):
118 | return self
119 |
120 | def mode(self, mode):
121 | return self
122 |
123 | def saveAsTable(self, table_name):
124 | return True
125 |
126 | return DataFrame()
127 |
128 | return SparkSession()
129 |
130 |
131 | class MockDatabricksConnection(DatabricksConnection):
132 | def __init__(self, *args, **kwargs):
133 | self.server_hostname = "test.cloud.databricks.com"
134 | self.access_token = "test-token"
135 | self.cluster_id = "test-cluster"
136 | self.parallelism = 1000
137 | self.spark = databricks_session()
138 |
--------------------------------------------------------------------------------
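The nested `DataFrame` stubs in `databricks_session()` exist to absorb the fluent PySpark write chain that `DatabricksConnection` drives; every step returns the stub itself, so the chain completes without a cluster. A short sketch of the call shape the mock supports (the exact chain lives in `raster_loader/io/databricks.py`, so treat the `format`/`mode` arguments here as assumptions):

```python
# Drive the stubbed Spark session the way the loader presumably does;
# each call returns the stub, and saveAsTable terminates the chain.
from raster_loader.tests.mocks import databricks_session

spark = databricks_session()
df = spark.createDataFrame([{"block": 1, "metadata": None, "band_1": b"\x00"}])
df.repartition(1000).write.format("delta").mode("append").saveAsTable(
    "catalog.schema.table"
)
```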
/raster_loader/tests/snowflake/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CartoDB/raster-loader/c04cb56bcff9a22a64d820e4bc4a68d90e680694/raster_loader/tests/snowflake/__init__.py
--------------------------------------------------------------------------------
/raster_loader/tests/snowflake/test_cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest.mock import patch
3 |
4 | from click.testing import CliRunner
5 | import pandas as pd
6 |
7 | from raster_loader.cli import main
8 |
9 |
10 | here = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
11 | fixtures = os.path.join(here, "fixtures")
12 | tiff = os.path.join(fixtures, "mosaic_cog.tif")
13 |
14 |
15 | @patch(
16 | "raster_loader.io.snowflake.SnowflakeConnection.upload_raster", return_value=None
17 | )
18 | @patch("raster_loader.io.snowflake.SnowflakeConnection.__init__", return_value=None)
19 | def test_snowflake_upload(*args, **kwargs):
20 | runner = CliRunner()
21 | result = runner.invoke(
22 | main,
23 | [
24 | "snowflake",
25 | "upload",
26 | "--file_path",
27 | f"{tiff}",
28 | "--database",
29 | "database",
30 | "--schema",
31 | "schema",
32 | "--table",
33 | "table",
34 | "--account",
35 | "account",
36 | "--username",
37 | "username",
38 | "--password",
39 | "password",
40 | "--chunk_size",
41 | 1,
42 | "--band",
43 | 1,
44 | ],
45 | )
46 | assert result.exit_code == 0
47 |
48 |
49 | @patch(
50 | "raster_loader.io.snowflake.SnowflakeConnection.upload_raster", return_value=None
51 | )
52 | @patch("raster_loader.io.snowflake.SnowflakeConnection.__init__", return_value=None)
53 | def test_snowflake_upload_with_basic_stats(*args, **kwargs):
54 | runner = CliRunner()
55 | result = runner.invoke(
56 | main,
57 | [
58 | "snowflake",
59 | "upload",
60 | "--file_path",
61 | f"{tiff}",
62 | "--database",
63 | "database",
64 | "--schema",
65 | "schema",
66 | "--table",
67 | "table",
68 | "--account",
69 | "account",
70 | "--username",
71 | "username",
72 | "--password",
73 | "password",
74 | "--chunk_size",
75 | 1,
76 | "--band",
77 | 1,
78 | "--basic_stats",
79 | ],
80 | )
81 | assert result.exit_code == 0
82 |
83 |
84 | @patch(
85 | "raster_loader.io.snowflake.SnowflakeConnection.upload_raster", return_value=None
86 | )
87 | @patch("raster_loader.io.snowflake.SnowflakeConnection.__init__", return_value=None)
88 | def test_snowflake_upload_with_all_stats(*args, **kwargs):
89 | runner = CliRunner()
90 | result = runner.invoke(
91 | main,
92 | [
93 | "snowflake",
94 | "upload",
95 | "--file_path",
96 | f"{tiff}",
97 | "--database",
98 | "database",
99 | "--schema",
100 | "schema",
101 | "--table",
102 | "table",
103 | "--account",
104 | "account",
105 | "--username",
106 | "username",
107 | "--password",
108 | "password",
109 | "--chunk_size",
110 | 1,
111 | "--band",
112 | 1,
113 | ],
114 | )
115 | assert result.exit_code == 0
116 |
117 |
118 | @patch(
119 | "raster_loader.io.snowflake.SnowflakeConnection.upload_raster", return_value=None
120 | )
121 | @patch("raster_loader.io.snowflake.SnowflakeConnection.__init__", return_value=None)
122 | def test_snowflake_upload_with_exact_stats(*args, **kwargs):
123 | runner = CliRunner()
124 | result = runner.invoke(
125 | main,
126 | [
127 | "snowflake",
128 | "upload",
129 | "--file_path",
130 | f"{tiff}",
131 | "--database",
132 | "database",
133 | "--schema",
134 | "schema",
135 | "--table",
136 | "table",
137 | "--account",
138 | "account",
139 | "--username",
140 | "username",
141 | "--password",
142 | "password",
143 | "--chunk_size",
144 | 1,
145 | "--band",
146 | 1,
147 | "--exact_stats",
148 | ],
149 | )
150 | assert result.exit_code == 0
151 |
152 |
153 | @patch(
154 | "raster_loader.io.snowflake.SnowflakeConnection.upload_raster", return_value=None
155 | )
156 | @patch("raster_loader.io.snowflake.SnowflakeConnection.__init__", return_value=None)
157 | def test_snowflake_credentials_validation(*args, **kwargs):
158 | runner = CliRunner()
159 | result = runner.invoke(
160 | main,
161 | [
162 | "snowflake",
163 | "upload",
164 | "--file_path",
165 | f"{tiff}",
166 | "--database",
167 | "database",
168 | "--schema",
169 | "schema",
170 | "--table",
171 | "table",
172 | "--account",
173 | "account",
174 | "--username",
175 | "username",
176 | "--chunk_size",
177 | 1,
178 | "--band",
179 | 1,
180 | ],
181 | )
182 | assert result.exit_code == 1
183 | assert (
184 | "Either (--token) or (--username and --private-key-path) or"
185 | " (--username and --password) must be provided." in result.output
186 | )
187 |
188 | result = runner.invoke(
189 | main,
190 | [
191 | "snowflake",
192 | "upload",
193 | "--file_path",
194 | f"{tiff}",
195 | "--database",
196 | "database",
197 | "--schema",
198 | "schema",
199 | "--table",
200 | "table",
201 | "--account",
202 | "account",
203 | "--username",
204 | "username",
205 | "--password",
206 | "password",
207 | "--token",
208 | "token",
209 | "--chunk_size",
210 | 1,
211 | "--band",
212 | 1,
213 | ],
214 | )
215 | assert result.exit_code == 1
216 | assert (
217 | "Either (--token) or (--username and --private-key-path) or"
218 | " (--username and --password) must be provided." in result.output
219 | )
220 |
221 |
222 | @patch(
223 | "raster_loader.io.snowflake.SnowflakeConnection.upload_raster", return_value=None
224 | )
225 | @patch("raster_loader.io.snowflake.SnowflakeConnection.__init__", return_value=None)
226 | def test_snowflake_file_path_or_url_check(*args, **kwargs):
227 | runner = CliRunner()
228 | result = runner.invoke(
229 | main,
230 | [
231 | "snowflake",
232 | "upload",
233 | "--database",
234 | "database",
235 | "--schema",
236 | "schema",
237 | "--table",
238 | "table",
239 | "--account",
240 | "account",
241 | "--username",
242 | "username",
243 | "--password",
244 | "password",
245 | "--chunk_size",
246 | 1,
247 | "--band",
248 | 1,
249 | ],
250 | )
251 | assert result.exit_code == 1
252 | assert "Either --file_path or --file_url must be provided" in result.output
253 |
254 | result = runner.invoke(
255 | main,
256 | [
257 | "snowflake",
258 | "upload",
259 | "--file_path",
260 | f"{tiff}",
261 | "--file_url",
262 | "http://example.com/raster.tif",
263 | "--database",
264 | "database",
265 | "--schema",
266 | "schema",
267 | "--table",
268 | "table",
269 | "--account",
270 | "account",
271 | "--username",
272 | "username",
273 | "--password",
274 | "password",
275 | "--chunk_size",
276 | 1,
277 | "--band",
278 | 1,
279 | ],
280 | )
281 | assert result.exit_code == 1
282 | assert "Only one of --file_path or --file_url must be provided" in result.output
283 |
284 |
285 | @patch(
286 | "raster_loader.io.snowflake.SnowflakeConnection.upload_raster", return_value=None
287 | )
288 | @patch("raster_loader.io.snowflake.SnowflakeConnection.__init__", return_value=None)
289 | def test_snowflake_upload_multiple_bands(*args, **kwargs):
290 | runner = CliRunner()
291 | result = runner.invoke(
292 | main,
293 | [
294 | "snowflake",
295 | "upload",
296 | "--file_path",
297 | f"{tiff}",
298 | "--database",
299 | "database",
300 | "--schema",
301 | "schema",
302 | "--table",
303 | "table",
304 | "--account",
305 | "account",
306 | "--username",
307 | "username",
308 | "--password",
309 | "password",
310 | "--chunk_size",
311 | 1,
312 | "--band",
313 | 1,
314 | "--band",
315 | 2,
316 | ],
317 | )
318 | assert result.exit_code == 0
319 |
320 |
321 | def test_snowflake_fail_upload_multiple_bands_misaligned_with_band_names(
322 | *args, **kwargs
323 | ):
324 | runner = CliRunner()
325 | result = runner.invoke(
326 | main,
327 | [
328 | "snowflake",
329 | "upload",
330 | "--file_path",
331 | f"{tiff}",
332 | "--database",
333 | "database",
334 | "--schema",
335 | "schema",
336 | "--table",
337 | "table",
338 | "--account",
339 | "account",
340 | "--username",
341 | "username",
342 | "--password",
343 | "password",
344 | "--chunk_size",
345 | 1,
346 | "--band",
347 | 1,
348 | "--band_name",
349 | "band_1",
350 | "--band",
351 | 2,
352 | ],
353 | )
354 | assert result.exit_code == 1
355 |
356 | assert "The number of bands must equal the number of band names." in result.output
357 |
358 |
359 | @patch(
360 | "raster_loader.io.snowflake.SnowflakeConnection.upload_raster", return_value=None
361 | )
362 | @patch("raster_loader.io.snowflake.SnowflakeConnection.__init__", return_value=None)
363 | def test_snowflake_upload_multiple_bands_aligned_with_band_names(*args, **kwargs):
364 | runner = CliRunner()
365 | result = runner.invoke(
366 | main,
367 | [
368 | "snowflake",
369 | "upload",
370 | "--file_path",
371 | f"{tiff}",
372 | "--database",
373 | "database",
374 | "--schema",
375 | "schema",
376 | "--table",
377 | "table",
378 | "--account",
379 | "account",
380 | "--username",
381 | "username",
382 | "--password",
383 | "password",
384 | "--chunk_size",
385 | 1,
386 | "--band",
387 | 1,
388 | "--band_name",
389 | "band_1",
390 | "--band_name",
391 | "band_2",
392 | "--band",
393 | 2,
394 | ],
395 | )
396 | assert result.exit_code == 0
397 |
398 |
399 | @patch(
400 | "raster_loader.io.snowflake.SnowflakeConnection.upload_raster", return_value=None
401 | )
402 | @patch("raster_loader.io.snowflake.SnowflakeConnection.__init__", return_value=None)
403 | def test_snowflake_upload_no_table_name(*args, **kwargs):
404 | runner = CliRunner()
405 | result = runner.invoke(
406 | main,
407 | [
408 | "snowflake",
409 | "upload",
410 | "--file_path",
411 | f"{tiff}",
412 | "--database",
413 | "database",
414 | "--schema",
415 | "schema",
416 | "--account",
417 | "account",
418 | "--username",
419 | "username",
420 | "--password",
421 | "password",
422 | "--chunk_size",
423 | 1,
424 | "--band",
425 | 1,
426 | ],
427 | )
428 | assert result.exit_code == 0
429 | assert "Table: mosaic_cog_band__1___" in result.output
430 |
431 |
432 | @patch(
433 | "raster_loader.io.snowflake.SnowflakeConnection.get_records",
434 | return_value=pd.DataFrame.from_dict({"col_1": [1, 2], "col_2": ["a", "b"]}),
435 | )
436 | @patch("raster_loader.io.snowflake.SnowflakeConnection.__init__", return_value=None)
437 | def test_snowflake_describe(*args, **kwargs):
438 | runner = CliRunner()
439 | result = runner.invoke(
440 | main,
441 | [
442 | "snowflake",
443 | "describe",
444 | "--database",
445 | "database",
446 | "--schema",
447 | "schema",
448 | "--table",
449 | "table",
450 | "--account",
451 | "account",
452 | "--username",
453 | "username",
454 | "--password",
455 | "password",
456 | ],
457 | )
458 | assert result.exit_code == 0
459 |
--------------------------------------------------------------------------------
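The two credential assertions in `test_snowflake_credentials_validation` pin down one rule: exactly one authentication method must be supplied. A hedged reconstruction of that check (the real validation presumably lives in `raster_loader/cli/snowflake.py`; the function and parameter names here are illustrative only):

```python
# Illustrative only: accept exactly one of the three credential combinations
# named in the error message asserted in the tests above.
def credentials_are_valid(token=None, username=None, password=None,
                          private_key_path=None):
    methods = [
        token is not None,
        username is not None and private_key_path is not None,
        username is not None and password is not None,
    ]
    return sum(methods) == 1
```

Passing `--username` alone matches no method, and adding `--token` on top of `--username`/`--password` matches two; both therefore fail, exactly as the tests assert.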
/raster_loader/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import builtins
2 | from unittest.mock import patch
3 |
4 | from raster_loader.utils import ask_yes_no_question
5 |
6 |
7 | def test_ask_yes_no_question_answer_yes():
8 | with patch.object(builtins, "input", lambda _: "yes"):
9 | assert ask_yes_no_question("Test?") is True
10 |
11 |
12 | def test_ask_yes_no_question_answer_no():
13 | with patch.object(builtins, "input", lambda _: "no"):
14 | assert ask_yes_no_question("Test?") is False
15 |
--------------------------------------------------------------------------------
/raster_loader/utils.py:
--------------------------------------------------------------------------------
1 | from itertools import islice
2 | import os
3 | import re
4 | import uuid
5 | import warnings
6 |
7 |
8 | def ask_yes_no_question(question: str) -> bool:
9 | """Ask a yes or no question and return True or False."""
10 | yes_choices = ["yes", "y"]
11 | no_choices = ["no", "n"]
12 |
13 | while True:
14 | user_input = input(question)
15 | if user_input.lower() in yes_choices:
16 | return True
17 | elif user_input.lower() in no_choices:
18 | return False
19 | else: # pragma: no cover
20 | print("Type yes or no")
21 | continue
22 |
23 |
24 | def batched(iterable, n):
25 | "Batch data into tuples of length n. The last batch may be shorter."
26 | # batched('ABCDEFG', 3) --> ABC DEF G
27 | if n < 1: # pragma: no cover
28 | raise ValueError("n must be at least one")
29 | it = iter(iterable)
30 | while batch := tuple(islice(it, n)): # noqa
31 | yield batch
32 |
33 |
34 | def get_default_table_name(base_path: str, band):
35 | table = os.path.basename(base_path).split(".")[0]
36 | table = "_".join([table, "band", str(band), str(uuid.uuid4())])
37 | return re.sub(r"[^a-zA-Z0-9_-]", "_", table)
38 |
39 |
40 | def check_private_key(private_key_path: str, private_key_passphrase: str):
41 | # Check that the private key file exists
42 | if not os.path.exists(private_key_path):
43 | raise ValueError(f"Private key file {private_key_path} not found")
44 |
45 | with open(private_key_path, "r") as f:
46 | private_key = f.read()
47 | if (
48 | private_key.startswith("-----BEGIN ENCRYPTED PRIVATE KEY-----")
49 | and private_key_passphrase is None
50 | ):
51 | raise ValueError(
52 | "The private key file is encrypted. Please provide a passphrase."
53 | )
54 |
55 |
56 | # Patch warnings.WarningMessage.__init__ so self.line = "" instead of None
57 | def new_init(
58 | self, message, category, filename, lineno, file=None, line=None, source=None
59 | ):
60 | self.message = message
61 | self.category = category
62 | self.filename = filename
63 | self.lineno = lineno
64 | self.file = file
65 | self.line = ""
66 | self.source = source
67 | self._category_name = category.__name__.upper() if category else None
68 |
69 |
70 | warnings.WarningMessage.__init__ = new_init
71 |
--------------------------------------------------------------------------------
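`batched()` is presumably the chunking primitive behind the `chunk_size` option exercised throughout the test suite: it yields fixed-size tuples and lets the final one run short. A quick demonstration:

```python
from raster_loader.utils import batched

# Chunk an iterable into tuples of at most three items; the tail is shorter.
print(list(batched("ABCDEFG", 3)))
# [('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]
```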
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | black==24.3.0
2 | flake8>=7.0.0
3 | importlib-metadata<5
4 | ipython>=7.8.0
5 | lazydocs==0.4.8
6 | myst-parser==0.18.1
7 | pre-commit==2.20.0
8 | pydocstyle==6.1.1
9 | pytest-cov==3.0.0
10 | pytest-mock==3.8.2
11 | pytest==7.1.2
12 | requests-mock==1.9.3
13 | setuptools>=75.3.0
14 | sphinx-click==4.3.0
15 | sphinx-rtd-theme==1.1.1
16 | sphinx==5.3.0
17 | tokenize-rt>=3.2.0
18 | twine==4.0.0
19 | wheel==0.38.1
20 | git-changelog==2.5.2
21 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | -e .
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = raster-loader
3 | description = Python library for loading GIS raster data to standard cloud-based data warehouses that don't natively support raster data.
4 | long_description = file: README.md
5 | long_description_content_type = text/markdown
6 | keywords =
7 | carto
8 | raster
9 | gis
10 | data warehouse
11 | bigquery
12 | snowflake
13 | databricks
14 | author = CARTO
15 | url = https://github.com/cartodb/raster-loader
16 | license = BSD 3-Clause
17 | classifiers =
18 | Development Status :: 5 - Production/Stable
19 | Intended Audience :: Developers
20 | License :: OSI Approved :: BSD License
21 | Natural Language :: English
22 | Programming Language :: Python :: 3
23 | Programming Language :: Python :: 3.9
24 | Programming Language :: Python :: 3.10
25 | Programming Language :: Python :: 3.11
26 | Programming Language :: Python :: 3.12
27 |
28 | [options]
29 | packages = find:
30 | python_requires = >=3.9
31 | setup_requires = setuptools_scm
32 | install_requires =
33 | click-plugins>=1.1.1
34 | click>=8.1.3
35 | db-dtypes>=1.0.5
36 | pandas>=1.3.4
37 | pyarrow>=10.0.1
38 | pyproj>=3.2.1
39 | rasterio>=1.3a3
40 | rio-cogeo>=3.5.0
41 | shapely>=1.7.1
42 | quadbin>=0.2.0
43 | tqdm>=4.64.1
44 | zip_safe = False
45 |
46 | [options.entry_points]
47 | console_scripts =
48 | carto = raster_loader.cli:main
49 | raster_loader.cli =
50 | bigquery = raster_loader.cli.bigquery:bigquery
51 | snowflake = raster_loader.cli.snowflake:snowflake
52 | databricks = raster_loader.cli.databricks:databricks
53 | info = raster_loader.cli.info:info
54 |
55 | [options.extras_require]
56 | test =
57 | pytest>=7.1.2
58 | pytest-mock>=3.8.2
59 | pytest-cov>=3.0.0
60 | bigquery =
61 | google-cloud-bigquery>=3.13.0
62 | google-auth>=2.28.0
63 | snowflake =
64 | snowflake-connector-python>=2.6.0
65 | databricks =
66 | all =
67 | %(bigquery)s
68 | %(snowflake)s
69 | %(databricks)s
70 |
71 | [flake8]
72 | max-line-length = 88
73 | ignore = E203 W503
74 |
75 |
--------------------------------------------------------------------------------
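The `console_scripts` entry registers `carto` as the CLI, and the `raster_loader.cli` plugin group wires each warehouse subcommand into it. The same Click group is importable directly, which is how the CLI tests drive it with `CliRunner`:

```python
# The Click group installed as the `carto` console script; calling it
# programmatically mirrors what CliRunner does in the test suite.
from raster_loader.cli import main

if __name__ == "__main__":
    main()  # equivalent to running `carto` in a shell
```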
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 |
4 | use_scm = {"write_to": "raster_loader/_version.py"}
5 | setup(use_scm_version=use_scm)
6 |
--------------------------------------------------------------------------------
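`use_scm_version` delegates versioning to setuptools_scm, which derives the version string from git tags and, per `write_to`, generates `raster_loader/_version.py` at build time. A sketch of reading it back, assuming a built or installed package:

```python
# _version.py is generated by setuptools_scm at build time; it does not
# exist in a fresh checkout until the package is built or installed.
from raster_loader._version import __version__

print(__version__)  # a tag-derived version, e.g. "X.Y.devN+g<sha>"
```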