├── .dockerignore ├── .env ├── .eslintrc.cjs ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── build.python.yml │ ├── build.ts.yml │ └── codeql-analysis.yml ├── .gitignore ├── .kodiak.toml ├── .prettierrc.cjs ├── .pylintrc ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── README.md ├── VERSION ├── cdk.json ├── conftest.py ├── infra └── src │ ├── batch-monitor.ts │ ├── batch.ts │ ├── index.ts │ ├── lambda-code │ └── index.js │ └── submit.ts ├── package.json ├── poetry.lock ├── pyproject.toml ├── scripts └── version.bump.sh ├── test_data ├── historical_aerial_photos_metadata.csv ├── historical_aerial_photos_metadata.gpkg ├── historical_aerial_photos_metadata_error.csv ├── historical_survey_footprint_metadata.csv ├── manifest.json ├── manifest_duplicate.json ├── schemas │ └── README.md └── tiffs │ ├── SURVEY_1 │ ├── CONTROL.tiff │ ├── MULTIPLE_ASSET.his │ ├── MULTIPLE_ASSET.tiff │ ├── WRONG_PHOTO_TYPE.tiff │ └── WRONG_SURVEY.tiff │ └── SURVEY_2 │ └── CONTROL_2.tif ├── topo_processor ├── __init__.py ├── cli │ ├── __init__.py │ ├── geostore │ │ ├── __init__.py │ │ ├── add.py │ │ ├── delete.py │ │ ├── list.py │ │ └── status.py │ ├── tests │ │ ├── __init__.py │ │ └── upload_test.py │ ├── upload.py │ └── validate.py ├── cog │ ├── __init__.py │ ├── create_cog.py │ └── tests │ │ ├── __init__.py │ │ └── create_cog_test.py ├── data │ ├── __init__.py │ └── data_transformers │ │ ├── __init__.py │ │ ├── data_transformer.py │ │ ├── data_transformer_imagery_historic.py │ │ └── data_transformer_repo.py ├── file_system │ ├── __init__.py │ ├── assets.py │ ├── file_searcher.py │ ├── get_fs.py │ ├── get_path_with_protocol.py │ ├── manifest.py │ ├── tests │ │ ├── __init__.py │ │ ├── assets_test.py │ │ ├── file_searcher_test.py │ │ ├── get_fs_test.py │ │ ├── get_path_with_protocol_test.py │ │ ├── transfer_test.py │ │ └── write_json_test.py │ ├── transfer.py │ └── write_json.py ├── geostore │ ├── invoke.py │ └── tests │ │ └── invoke_test.py ├── metadata │ ├── __init__.py │ ├── csv_loader │ │ ├── csv_loader.py │ │ └── tests │ │ │ └── csv_loader_test.py │ ├── data_type.py │ ├── lds_cache │ │ ├── lds_cache.py │ │ └── tests │ │ │ └── lds_cache_test.py │ ├── metadata_loaders │ │ ├── __init__.py │ │ ├── metadata_loader.py │ │ ├── metadata_loader_imagery_historic.py │ │ ├── metadata_loader_repo.py │ │ ├── metadata_loader_tiff.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── metadata_loader_imagery_historic_test.py │ │ │ └── metadata_loader_tiff_test.py │ └── metadata_validators │ │ ├── __init__.py │ │ ├── metadata_validator.py │ │ ├── metadata_validator_repo.py │ │ ├── metadata_validator_stac.py │ │ ├── metadata_validator_tiff.py │ │ └── tests │ │ ├── __init__.py │ │ ├── metadata_validator_stac_test.py │ │ └── metadata_validator_tiff_test.py ├── stac │ ├── __init__.py │ ├── asset.py │ ├── asset_key.py │ ├── collection.py │ ├── item.py │ ├── item_factory.py │ ├── iter_errors_validator.py │ ├── linz_provider.py │ ├── providers.py │ ├── stac_extensions.py │ ├── store.py │ ├── tests │ │ ├── __init__.py │ │ ├── asset_test.py │ │ ├── collection_test.py │ │ ├── file_extension_test.py │ │ ├── iter_errors_validator_test.py │ │ └── validate_report_test.py │ ├── validate_report.py │ └── validation.py └── util │ ├── __init__.py │ ├── aws_credentials.py │ ├── aws_files.py │ ├── checksum.py │ ├── command.py │ ├── configuration.py │ ├── conversions.py │ ├── execution.py │ ├── file_converter.py │ ├── file_extension.py │ ├── 
files.py │ ├── gzip.py │ ├── s3.py │ ├── tests │ ├── __init__.py │ ├── aws_credentials_test.py │ ├── aws_files_test.py │ ├── checksum_test.py │ ├── command_test.py │ ├── conversions_test.py │ ├── files_test.py │ ├── time_test.py │ └── transfer_collection_test.py │ ├── time.py │ ├── transfer_collection.py │ └── valid.py ├── tsconfig.json ├── upload └── yarn.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | infra/ 2 | README.md 3 | .gitignore 4 | .pylintrc 5 | __pycache__ 6 | .github 7 | .mypy_cache 8 | .pytest_cache 9 | .vscode 10 | .kodiak.toml 11 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | LINZ_CACHE_BUCKET=linz-lds-cache 2 | LINZ_HISTORICAL_IMAGERY_BUCKET=linz-historical-imagery-staging 3 | LINZ_SSM_BUCKET_CONFIG_NAME=BucketConfig 4 | -------------------------------------------------------------------------------- /.eslintrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | ...require("@linzjs/style/.eslintrc.js"), 3 | }; 4 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @linz/li-topo-data-engineering 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | ### Bug Description 8 | A clear and concise description of what the bug is and what you expected to happen. 9 | 10 | #### Steps to Reproduce 11 | Steps to reproduce the behavior: 12 | 1. Go to '...' 13 | 2. Click on '....' 14 | 3. Scroll down to '....' 15 | 4. See error 16 | 17 | #### Desktop 18 | - Environment: [e.g. Windows / DaaS / Ubuntu] 19 | - Relevant Software Versions [e.g. QGIS 2.18.21] 20 | 21 | #### Screenshots 22 | If applicable, add screenshots to help explain your problem. 23 | 24 | **Add an _Assignee_, _Milestone_, _Release_ and any relevant _Labels_.** 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | ### User Story 8 | 9 | In order to [accomplish goal] as a [role] I want [capability] 10 | (optional: instead of [existing behaviour]). 11 | 12 | #### Acceptance Criteria 13 | - [ ] ... 14 | - [ ] ... 15 | - [ ] ... 16 | 17 | **Add an _Assignee_, _Milestone_, _Release_ and any relevant _Labels_.** 18 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes: # 2 | _Add new line for each issue that was fixed_ 3 | 4 | ### Change Description: 5 | 6 | ... 7 | 8 | ### Notes for Testing: 9 | 10 | ... 
11 | 12 | #### Source Code Documentation Tasks: 13 | - [ ] README updated (where applicable) 14 | - [ ] CHANGELOG (Unreleased section) updated 15 | - [ ] Docstrings / comments included to help explain code 16 | 17 | #### User Documentation Tasks: 18 | - [ ] Confluence updated (where applicable) 19 | 20 | #### Testing Tasks: 21 | - [ ] Added tests that fail without this change 22 | - [ ] All tests are passing in development environment 23 | - [ ] Reviewers assigned 24 | - [ ] Linked to main issue for ZenHub board 25 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | - package-ecosystem: npm 8 | directory: "/" 9 | schedule: 10 | interval: daily 11 | open-pull-requests-limit: 10 12 | - package-ecosystem: pip 13 | directory: "/" 14 | schedule: 15 | interval: daily 16 | open-pull-requests-limit: 10 17 | -------------------------------------------------------------------------------- /.github/workflows/build.python.yml: -------------------------------------------------------------------------------- 1 | name: Build Python 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | 14 | - name: Use Python "3.8" 15 | uses: actions/setup-python@v3 16 | with: 17 | python-version: "3.8" 18 | - name: Install 19 | run: | 20 | pip install poetry 21 | poetry install 22 | - name: Format 23 | run: | 24 | poetry run black . --check --diff 25 | - name: Lint 26 | run: | 27 | poetry run pylint topo_processor/ --exit-zero 28 | - name: Import Sorting 29 | run: | 30 | poetry run isort -rc . --check --diff 31 | - name: Test 32 | run: | 33 | poetry run pytest --slow --cov topo_processor 34 | - name: Mypy 35 | run: | 36 | poetry run mypy . 
37 | -------------------------------------------------------------------------------- /.github/workflows/build.ts.yml: -------------------------------------------------------------------------------- 1 | name: Build Typescript 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | id-token: write 13 | contents: write 14 | steps: 15 | - name: Build and test 16 | uses: linz/action-typescript@v1 17 | 18 | - name: (Prod) Setup git config 19 | if: github.ref == 'refs/heads/master' && startsWith(github.event.head_commit.message, 'release:') 20 | run: | 21 | git config user.name "github-actions[bot]" 22 | git config user.email "41898282+github-actions[bot]@users.noreply.github.com" 23 | 24 | - name: (Prod) Create tag 25 | if: github.ref == 'refs/heads/master' && startsWith(github.event.head_commit.message, 'release:') 26 | run: | 27 | CURRENT_VERSION=$(node -p "require('./package.json').version") 28 | git tag v${CURRENT_VERSION} -m v${CURRENT_VERSION} || true 29 | git push --tags 30 | 31 | - name: (Prod) Create github release 32 | if: github.ref == 'refs/heads/master' && startsWith(github.event.head_commit.message, 'release:') 33 | run: npx conventional-github-releaser -p angular 34 | env: 35 | CONVENTIONAL_GITHUB_RELEASER_TOKEN: ${{secrets.GITHUB_TOKEN}} 36 | 37 | - name: (NonProd) Configure AWS Credentials 38 | if: github.ref == 'refs/heads/master' 39 | uses: aws-actions/configure-aws-credentials@v1 40 | with: 41 | aws-region: 'ap-southeast-2' 42 | role-to-assume: ${{ secrets.AWS_ROLE_NON_PROD }} 43 | 44 | - name: (NonProd) Deploy to NonProd 45 | if: github.ref == 'refs/heads/master' 46 | run: | 47 | npx cdk deploy --all -y --require-approval never 48 | env: 49 | AWS_ORG_ID: ${{secrets.AWS_ORG_ID}} 50 | 51 | - name: (Prod) Configure AWS Credentials 52 | if: github.ref == 'refs/heads/master' && startsWith(github.event.head_commit.message, 'release:') 53 | uses: aws-actions/configure-aws-credentials@v1 54 | with: 55 | aws-region: 'ap-southeast-2' 56 | role-to-assume: ${{ secrets.AWS_ROLE_PROD }} 57 | 58 | - name: (Prod) Deploy to Prod 59 | if: github.ref == 'refs/heads/master' && startsWith(github.event.head_commit.message, 'release:') 60 | run: | 61 | npx cdk deploy --all -y --require-approval never 62 | env: 63 | AWS_ORG_ID: ${{secrets.AWS_ORG_ID}} 64 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [master, ] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: [master] 9 | schedule: 10 | - cron: '0 0 * * 1' 11 | 12 | jobs: 13 | analyse: 14 | name: Analyse 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v3 20 | with: 21 | # We must fetch at least the immediate parents so that if this is 22 | # a pull request then we can checkout the head. 23 | fetch-depth: 2 24 | 25 | # If this run was triggered by a pull request event, then checkout 26 | # the head of the pull request instead of the merge commit. 27 | - run: git checkout HEAD^2 28 | if: ${{ github.event_name == 'pull_request' }} 29 | 30 | # Initializes the CodeQL tools for scanning. 
31 | - name: Initialize CodeQL 32 | uses: github/codeql-action/init@v2 33 | # Override language selection by uncommenting this and choosing your languages 34 | # with: 35 | # languages: go, javascript, csharp, python, cpp, java 36 | 37 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 38 | # If this step fails, then you should remove it and run the build manually (see below) 39 | - name: Autobuild 40 | uses: github/codeql-action/autobuild@v2 41 | 42 | # ℹ️ Command-line programs to run using the OS shell. 43 | # 📚 https://git.io/JvXDl 44 | 45 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 46 | # and modify them (or add more) to build your code if your project 47 | # uses a compiled language 48 | 49 | #- run: | 50 | # make bootstrap 51 | # make release 52 | 53 | - name: Perform CodeQL Analysis 54 | uses: github/codeql-action/analyze@v2 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .pytest_cache 3 | *.pyc 4 | .venv 5 | .vscode 6 | Thumbs.db 7 | build/ 8 | .coverage 9 | batch/roles.json 10 | node_modules 11 | cdk.out 12 | cdk.context.json 13 | -------------------------------------------------------------------------------- /.kodiak.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | 4 | [merge] 5 | method = "squash" 6 | automerge_label = "automerge 🚀" 7 | -------------------------------------------------------------------------------- /.prettierrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | ...require('@linzjs/style/.prettierrc.js'), 3 | }; 4 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | disable = 3 | bad-continuation, 4 | invalid-name, 5 | missing-class-docstring, 6 | missing-module-docstring, 7 | missing-function-docstring 8 | [FORMAT] 9 | max-line-length=127 10 | [MISCELLANEOUS] 11 | notes=FIXME,XXX 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM osgeo/gdal:ubuntu-small-3.5.0 2 | 3 | # Install Poetry 4 | RUN apt-get update 5 | RUN apt-get install python3-pip -y 6 | RUN pip install poetry 7 | 8 | # Set environment variable to prevent GDAL running in Docker 9 | ENV IS_DOCKER=true 10 | 11 | WORKDIR /app 12 | # Add Poetry config and scripts 13 | COPY poetry.lock pyproject.toml VERSION /app/ 14 | 15 | RUN poetry config virtualenvs.create false \ 16 | && poetry install --no-dev --no-interaction --no-ansi 17 | 18 | COPY ./topo_processor /app/topo_processor 19 | COPY ./upload /app/ 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Land Information New Zealand 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, 
and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Topo Processor 2 | 3 | [![GitHub Actions Status](https://github.com/linz/topo-processor/workflows/Build/badge.svg)](https://github.com/linz/topo-processor/actions) 4 | [![Alerts](https://badgen.net/lgtm/alerts/g/linz/topo-processor?icon=lgtm&labelColor=2e3a44&label=Alerts&color=3dc64b)](https://lgtm.com/projects/g/linz/topo-processor/context:python) 5 | [![Dependabot Status](https://badgen.net/dependabot/linz/topo-processor?icon=dependabot&labelColor=2e3a44&color=blue)](https://dependabot.com) 6 | [![License](https://badgen.net/github/license/linz/processor-aerial-imagery?labelColor=2e3a44&label=License)](https://github.com/linz/topo-processor/blob/master/LICENSE) 7 | [![Conventional Commits](https://badgen.net/badge/Commits/conventional?labelColor=2e3a44&color=EC5772)](https://conventionalcommits.org) 8 | [![Code Style](https://badgen.net/badge/Code%20Style/black?labelColor=2e3a44&color=000000)](https://github.com/psf/black) 9 | 10 | ## Description 11 | 12 | The Topo Processor is a collection of small components that can be combined together to create a pipeline. It can be run on a local workstation or using AWS Batch. 13 | 14 | These components include transforming data into cloud optimised formats like [COG](https://www.cogeo.org/) and the creation of [STAC](http://stacspec.org/) metadata. 15 | 16 | ## Installation 17 | 18 | ### Requirements to run Topo Processor locally: 19 | 20 | #### Poetry 21 | 22 | Follow the [Poetry installation guide](https://python-poetry.org/docs/). 23 | 24 | #### Docker 25 | 26 | Follow the [Docker Engine installation guide (Ubuntu)](https://docs.docker.com/engine/install/ubuntu/). 27 | 28 | ### Recommended 29 | 30 | - [node](https://nodejs.org/en/about/) 31 | - [pretty-json-log](https://npmjs.com/package/pretty-json-log) 32 | 33 | ### Use poetry to install 34 | 35 | ```shell 36 | poetry shell 37 | 38 | poetry install 39 | ``` 40 | 41 | ## Configuration 42 | 43 | The global user configuration is defined by environment variables, example environment variables are found in the `.env` file. 
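Below is a minimal, illustrative sketch of how these environment variables could be read in Python with `python-dotenv` (a dependency declared in `pyproject.toml`). It is not the project's actual configuration code (the real lookup lives in `topo_processor/util/configuration.py`, not shown here), and the `load_settings` helper name is hypothetical.

```python
# Illustrative sketch only: reading the .env variables with python-dotenv.
# The variable names come from the repository's .env file; the helper name
# `load_settings` is hypothetical and not part of the Topo Processor API.
import os
from typing import Dict

from dotenv import load_dotenv


def load_settings() -> Dict[str, str]:
    load_dotenv()  # loads key=value pairs from a local .env file, if present
    return {
        "cache_bucket": os.environ["LINZ_CACHE_BUCKET"],
        "historical_imagery_bucket": os.environ["LINZ_HISTORICAL_IMAGERY_BUCKET"],
        "ssm_bucket_config_name": os.environ["LINZ_SSM_BUCKET_CONFIG_NAME"],
    }


if __name__ == "__main__":
    print(load_settings())
```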
44 | 45 | ### Requirements to run Topo Processor using AWS Batch: 46 | 47 | #### Software 48 | 49 | ```shell 50 | yarn 51 | 52 | yarn build 53 | ``` 54 | 55 | #### AWS Batch Stack deployment 56 | 57 | **_NOTE:_** [AWS deployment is done automatically through GitHub Actions.](#aws-deployment--ci--cd) 58 | 59 | To deploy the Batch via CDK locally: 60 | 61 | On the AWS account you are logged into: 62 | 63 | ```shell 64 | yarn build 65 | 66 | npx cdk deploy 67 | ``` 68 | 69 | ### AWS Roles 70 | 71 | To allow the system to perform cross-account AWS requests, you'll need to configure AWS roles inside an AWS SSM parameter. 72 | 73 | This configuration parameter can be referenced via `$LINZ_SSM_BUCKET_CONFIG_NAME`. 74 | 75 | ## Usage 76 | 77 | ### AWS Batch Job Submission 78 | 79 | **_NOTE:_** Only the `upload` command is implemented to run on AWS Batch. Currently the job submission is restricted to only one job per survey. 80 | 81 | **_NOTE:_** You may need to set the `AWS_REGION` environment variable to your region. 82 | 83 | ```shell 84 | # Passing survey IDs as argument 85 | node ./build/infra/src/submit.js surveyId1 surveyId3 [...] 86 | 87 | # Passing S3 folder as argument 88 | node ./build/infra/src/submit.js s3://my-bucket/backup2/surveyId1/ s3://my-bucket/backup4/surveyId3/ [...] 89 | ``` 90 | 91 | ### `upload` 92 | 93 | **_NOTE:_** The `upload` command is restricted to one run per `survey` and only for the `Historical Imagery` layer. To run multiple surveys, please refer to `AWS Batch` described above. 94 | 95 | | Argument | Description | 96 | | --------------------------- | :---------------------------------------------------------------------------------------------: | 97 | | `-s` or `--source` | The source of the data to import. Can be a `survey ID` or a path (local or `s3`) to the survey. | 98 | | `-d` or `--datatype` | The datatype of the upload. _Only `imagery.historic` is available at the moment._ | 99 | | `-t` or `--target` | The target local directory path or `s3` path of the upload. | 100 | | `-cid` or `--correlationid` | OPTIONAL. The `correlation ID` of the batch job. _`AWS Batch` only._ | 101 | | `-m` or `--metadata` | OPTIONAL. The metadata file (local or `s3`) path. | 102 | | `-f` or `--footprint` | TESTING PURPOSE. The footprint metadata file (local or `s3`) path. | 103 | | `--force` | Flag to force the upload even if some data are invalid (some items might not be uploaded). | 104 | | `-v` or `--verbose` | Flag to display trace logs. | 105 | 106 | The user has to specify the survey ID or path (where the data is) as the `--source`, and it will be validated against the latest version of the metadata. A metadata file path can also be specified by using `--metadata` if the LDS cache version is not wanted. The `--datatype` has to be `imagery.historic`. The user also has to specify a target folder for the output. 
107 | 108 | ```shell 109 | # Run in a virtual environment (poetry shell): 110 | ./upload --source source_path --datatype data.type --target target_folder 111 | ``` 112 | 113 | ```shell 114 | # For help: 115 | ./upload --help 116 | ``` 117 | 118 | ```shell 119 | # To see all logs in a tidy format, use pretty-json-log: 120 | ./upload --source source_path --datatype data.type --target target_folder --verbose | pjl 121 | ``` 122 | 123 | The following source and target combinations can be used: 124 | 125 | | Source | Target | 126 | | ------ | :----: | 127 | | s3 | s3 | 128 | | s3 | local | 129 | | local | local | 130 | | local | s3 | 131 | 132 | ### `add` (Geostore) 133 | 134 | This command allows you to add a survey to the [Geostore](https://github.com/linz/geostore) by using the [Geostore API](https://github.com/linz/geostore/blob/master/USAGE.md). 135 | 136 | **_Prerequisites:_** The survey has to be processed by the `upload` command first. The output files of the `upload` command are what will be exported to the `Geostore`. 137 | 138 | | Argument | Description | 139 | | --------------------- | :----------------------------------------------------: | 140 | | `-s`, `--source` TEXT | The s3 path to the survey to export [required] | 141 | | `-r`, `--role` TEXT | The ARN role to access to the source bucket [required] | 142 | | `-c`, `--commit` | Use this flag to commit the creation of the dataset | 143 | | `-v`, `--verbose` | Use verbose to display debug logs | 144 | 145 | ```bash 146 | poetry run add -s "s3://bucket/survey-path/" -r "arn:aws:iam::123456789:role/read-role" 147 | ``` 148 | 149 | ### `status` (Geostore) 150 | 151 | This command follows the current status of an upload to the `Geostore` for a particular `dataset` version. You may have to run it several times as the status gets updated. 152 | 153 | | Argument | Description | 154 | | ---------------------------- | :------------------------------------------------------------------------------: | 155 | | `-a`, `--execution-arn` TEXT | The execution ARN received from the Geostore after invoking an upload [required] | 156 | | `-v`, `--verbose` | Use verbose to display debug logs | 157 | 158 | **_NOTE:_** The command to run is given in the logs after successfully calling the `add` command: 159 | 160 | ```json 161 | "info": "To check the import status, run the following command 'poetry run status -a arn:aws:states:ap-southeast-2:632223577832:execution:ABCD'" 162 | ``` 163 | 164 | ### `list` (Geostore) 165 | 166 | It gives you the information for one or all of the datasets created on the `Geostore`. 167 | 168 | | Argument | Description | 169 | | -------------------- | :-----------------------------------------------------------------------------------: | 170 | | `-t`, `--title` TEXT | The Geostore title of the survey to filter e.g. historical-aerial-imagery-survey-2660 | 171 | | `-v`, `--verbose` | Use verbose to display debug logs | 172 | 173 | ```bash 174 | poetry run list [-t historical-aerial-imagery-survey-2660] 175 | ``` 176 | 177 | ### `delete` (Geostore) 178 | 179 | Delete a dataset from the `Geostore`. This is only possible if the dataset does not contain any version. To delete a dataset which contains a version, contact the **Geostore** support. 180 | 181 | | Argument | Description | 182 | | ------------------------- | :-----------------------------------------------------: | 183 | | `-d`, `--dataset-id` TEXT | The dataset id to delete [required] | 184 | | `-c`, `--commit` | Use this flag to commit the suppression of the dataset. 
| 185 | `-v`, `--verbose` | Use verbose to display debug logs | 186 | 187 | ```bash 188 | poetry run delete -d ID123ABC [--commit] 189 | ``` 190 | 191 | ### `validate` 192 | 193 | **_NOTE:_** This command is currently only implemented for `Historical Imagery`. Other layers will come later. 194 | 195 | This command runs a validation against a layer. It gets the latest version of the layer metadata and generates the corresponding STAC objects on the fly. Then, it runs a JSON schema validation (using [jsonschema-rs](https://github.com/Stranger6667/jsonschema-rs)) for the `Items` and `Collections`. It outputs the errors and their recurrences grouped by JSON schemas as: 196 | 197 | ```json 198 | "errors": {"https://stac.linz.govt.nz/v0.0.11/aerial-photo/schema.json": {"'aerial-photo:run' is a required property": 4, "'aerial-photo:sequence_number' is a required property": 10}} 199 | ``` 200 | 201 | To validate a version other than the latest one, specify the metadata CSV file to validate by using the `--metadata` argument. 202 | 203 | The following commands have to be run in a virtual environment (poetry shell): 204 | 205 | ```shell 206 | # Run default: 207 | poetry run validate 208 | ``` 209 | 210 | ```shell 211 | # Run against a specific version (can be an s3 or local file): 212 | poetry run validate --metadata s3://bucket/layer_id/metadata_file.csv 213 | ``` 214 | 215 | ```shell 216 | # Run against the `Items` only: 217 | poetry run validate --item 218 | ``` 219 | 220 | ```shell 221 | # Run against the `Collections` only: 222 | poetry run validate --collection 223 | ``` 224 | 225 | ```shell 226 | # For help: 227 | poetry run validate --help 228 | ``` 229 | 230 | ```shell 231 | # To see all logs in a tidy format, use pretty-json-log: 232 | poetry run validate --verbose | pjl 233 | ``` 234 | 235 | ```shell 236 | # To record the output in an external file: 237 | poetry run validate | tee output.file 238 | ``` 239 | 240 | ## AWS Deployment / CI / CD 241 | 242 | CI/CD is used to deploy into AWS. To trigger a deployment, create a new "release:" commit and merge it to master. 243 | 244 | A helpful utility script, `./scripts/version.bump.sh`, is available to automate this process: 245 | 246 | ```bash 247 | ./scripts/version.bump.sh 248 | # Push branch release/v:versionNumber 249 | git push 250 | # Create the pull request 251 | gh pr create 252 | # Merge to master 253 | ``` 254 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | v0.15.0 2 | -------------------------------------------------------------------------------- /cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "node build/infra/src/index.js" 3 | } 4 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from tempfile import mkdtemp 3 | from typing import Generator 4 | 5 | import pystac 6 | import pytest 7 | 8 | from topo_processor.stac.iter_errors_validator import IterErrorsValidator 9 | 10 | 11 | @pytest.fixture(autouse=True) 12 | def set_iter_errors_validator() -> None: 13 | pystac.validation.set_validator(IterErrorsValidator()) 14 | 15 | 16 | def pytest_addoption(parser) -> None: # type: ignore 17 | parser.addoption("--slow", action="store_true", default=False, help="run slow tests") 18 | 19 | 20 | def 
pytest_runtest_setup(item) -> None: # type: ignore 21 | if "slow" in item.keywords and not item.config.getoption("--slow"): 22 | pytest.skip("need --slow option to run this test") 23 | 24 | 25 | @pytest.fixture(autouse=True) 26 | def setup() -> Generator[str, None, None]: 27 | """ 28 | This function creates a temporary directory and deletes it after each test. 29 | See following link for details: 30 | https://docs.pytest.org/en/stable/fixture.html#yield-fixtures-recommended 31 | """ 32 | target = mkdtemp() 33 | yield target 34 | shutil.rmtree(target) 35 | -------------------------------------------------------------------------------- /infra/src/batch-monitor.ts: -------------------------------------------------------------------------------- 1 | import { Stack, StackProps } from 'aws-cdk-lib'; 2 | import * as events from 'aws-cdk-lib/aws-events'; 3 | import * as evtTargets from 'aws-cdk-lib/aws-events-targets'; 4 | import * as lf from 'aws-cdk-lib/aws-lambda'; 5 | import { Code } from 'aws-cdk-lib/aws-lambda'; 6 | import { Construct } from 'constructs'; 7 | import * as path from 'path'; 8 | 9 | export class AwsBatchMonitor extends Stack { 10 | public constructor(scope: Construct, id: string, props: StackProps) { 11 | super(scope, id, props); 12 | 13 | const rule = new events.Rule(this, 'BatchEventRule', { 14 | eventPattern: { 15 | source: ['aws.batch'], 16 | detailType: ['Batch Job State Change'], 17 | }, 18 | }); 19 | 20 | const lambda = new lf.Function(this, 'BatchLog', { 21 | runtime: lf.Runtime.NODEJS_14_X, 22 | handler: 'index.handler', 23 | code: Code.fromAsset(path.join(process.cwd(), 'infra', 'src', 'lambda-code')), 24 | }); 25 | 26 | rule.addTarget(new evtTargets.LambdaFunction(lambda)); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /infra/src/batch.ts: -------------------------------------------------------------------------------- 1 | import { CfnOutput, Duration, RemovalPolicy, Stack, StackProps } from 'aws-cdk-lib'; 2 | import { DockerImageAsset } from 'aws-cdk-lib/aws-ecr-assets'; 3 | import { ContainerImage } from 'aws-cdk-lib/aws-ecs'; 4 | import { 5 | Role, 6 | CompositePrincipal, 7 | ServicePrincipal, 8 | CfnInstanceProfile, 9 | ManagedPolicy, 10 | PolicyStatement, 11 | } from 'aws-cdk-lib/aws-iam'; 12 | import { Vpc, InstanceClass, InstanceType, InstanceSize } from 'aws-cdk-lib/aws-ec2'; 13 | import { ComputeResourceType, ComputeEnvironment, JobDefinition, JobQueue } from '@aws-cdk/aws-batch-alpha'; 14 | import { BlockPublicAccess, Bucket } from 'aws-cdk-lib/aws-s3'; 15 | import { StringParameter } from 'aws-cdk-lib/aws-ssm'; 16 | import { Construct } from 'constructs'; 17 | 18 | interface BatchStackProps extends StackProps { 19 | container: string; 20 | } 21 | 22 | export class AwsBatchStack extends Stack { 23 | public constructor(scope: Construct, id: string, props: BatchStackProps) { 24 | super(scope, id, props); 25 | 26 | const container = new DockerImageAsset(this, 'BatchContainer', { directory: props.container }); 27 | const image = ContainerImage.fromDockerImageAsset(container); 28 | 29 | const vpc = Vpc.fromLookup(this, 'Vpc', { tags: { BaseVPC: 'true' } }); 30 | const instanceRole = new Role(this, 'BatchInstanceRole', { 31 | assumedBy: new CompositePrincipal( 32 | new ServicePrincipal('ec2.amazonaws.com'), 33 | new ServicePrincipal('ecs.amazonaws.com'), 34 | ), 35 | }); 36 | instanceRole.addManagedPolicy( 37 | ManagedPolicy.fromAwsManagedPolicyName('service-role/AmazonEC2ContainerServiceforEC2Role'), 38 | ); 
39 | instanceRole.addManagedPolicy(ManagedPolicy.fromAwsManagedPolicyName('AmazonSSMManagedInstanceCore')); 40 | 41 | instanceRole.addToPrincipalPolicy( 42 | new PolicyStatement({ resources: ['*'], actions: ['sts:AssumeRole', 'cloudformation:DescribeStacks'] }), 43 | ); 44 | 45 | const tempBucket = new Bucket(this, 'TempBucket', { 46 | removalPolicy: RemovalPolicy.RETAIN, 47 | blockPublicAccess: BlockPublicAccess.BLOCK_ALL, 48 | lifecycleRules: [{ expiration: Duration.days(90) }], 49 | }); 50 | 51 | const roRole = Role.fromRoleName(this, 'LINZReadRole', 'internal-user-read'); 52 | tempBucket.grantRead(roRole); 53 | tempBucket.grantReadWrite(instanceRole); 54 | StringParameter.fromStringParameterName(this, 'BucketConfig', 'BucketConfig').grantRead(instanceRole); 55 | 56 | new CfnInstanceProfile(this, 'BatchInstanceProfile', { 57 | instanceProfileName: instanceRole.roleName, 58 | roles: [instanceRole.roleName], 59 | }); 60 | 61 | const computeEnvironment = new ComputeEnvironment(this, 'BatchCompute', { 62 | managed: true, 63 | computeResources: { 64 | instanceRole: instanceRole.roleName, 65 | vpc, 66 | type: ComputeResourceType.SPOT, 67 | maxvCpus: 100, 68 | minvCpus: 0, 69 | instanceTypes: [ 70 | InstanceType.of(InstanceClass.C5, InstanceSize.LARGE), 71 | InstanceType.of(InstanceClass.C5, InstanceSize.XLARGE), 72 | InstanceType.of(InstanceClass.C5, InstanceSize.XLARGE2), 73 | InstanceType.of(InstanceClass.C5, InstanceSize.XLARGE4), 74 | ], 75 | }, 76 | }); 77 | 78 | const job = new JobDefinition(this, 'BatchJob', { container: { image } }); 79 | const queue = new JobQueue(this, 'BatchQueue', { computeEnvironments: [{ computeEnvironment, order: 1 }] }); 80 | 81 | new CfnOutput(this, 'BatchJobArn', { value: job.jobDefinitionArn }); 82 | new CfnOutput(this, 'BatchQueueArn', { value: queue.jobQueueArn }); 83 | new CfnOutput(this, 'BatchEc2InstanceRole', { value: instanceRole.roleArn }); 84 | new CfnOutput(this, 'TempBucketName', { value: tempBucket.bucketName }); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /infra/src/index.ts: -------------------------------------------------------------------------------- 1 | import { App } from 'aws-cdk-lib'; 2 | import { AwsBatchStack } from './batch'; 3 | import { AwsBatchMonitor } from './batch-monitor'; 4 | 5 | const app = new App(); 6 | new AwsBatchStack(app, 'TopoProcessorBatch', { 7 | env: { 8 | region: 'ap-southeast-2', 9 | account: process.env['CDK_DEFAULT_ACCOUNT'], 10 | }, 11 | container: './', 12 | }); 13 | new AwsBatchMonitor(app, 'TopoProcessorBatchMon', { 14 | env: { 15 | region: 'ap-southeast-2', 16 | account: process.env['CDK_DEFAULT_ACCOUNT'], 17 | }, 18 | }); 19 | -------------------------------------------------------------------------------- /infra/src/lambda-code/index.js: -------------------------------------------------------------------------------- 1 | function handler(event, ctx, cb) { 2 | console.log(JSON.stringify({ event })); 3 | cb(null, 'done'); 4 | } 5 | module.exports = { handler }; 6 | -------------------------------------------------------------------------------- /infra/src/submit.ts: -------------------------------------------------------------------------------- 1 | import * as sdk from 'aws-sdk'; 2 | import * as ulid from 'ulid'; 3 | import CloudFormation from 'aws-sdk/clients/cloudformation.js'; 4 | 5 | const batch = new sdk.Batch(); 6 | 7 | const cloudFormation = new CloudFormation({ region: 'ap-southeast-2' }); 8 | 9 | async function main(): Promise<void> { 10 | const 
correlationId = ulid.ulid(); 11 | console.log({ correlationId }); 12 | 13 | const environment = [ 14 | { name: 'AWS_DEFAULT_REGION', value: 'ap-southeast-2' }, 15 | { name: 'LINZ_CACHE_BUCKET', value: 'linz-lds-cache' }, 16 | { name: 'LINZ_CORRELATION_ID', value: correlationId }, 17 | { name: 'LINZ_HISTORICAL_IMAGERY_BUCKET', value: 'linz-historical-imagery-staging' }, 18 | { name: 'LINZ_SSM_BUCKET_CONFIG_NAME', value: 'BucketConfig' }, 19 | ]; 20 | 21 | const stackInfo = await cloudFormation.describeStacks({ StackName: 'TopoProcessorBatch' }).promise(); 22 | const stackOutputs = stackInfo.Stacks?.[0].Outputs; 23 | 24 | const JobDefinitionArn = stackOutputs?.find((f) => f.OutputKey === 'BatchJobArn')?.OutputValue; 25 | if (JobDefinitionArn == null) throw new Error('Unable to find CfnOutput "BatchJobArn"'); 26 | const JobQueueArn = stackOutputs?.find((f) => f.OutputKey === 'BatchQueueArn')?.OutputValue; 27 | if (JobQueueArn == null) throw new Error('Unable to find CfnOutput "BatchQueueArn"'); 28 | const TempBucketName = stackOutputs?.find((f) => f.OutputKey === 'TempBucketName')?.OutputValue; 29 | if (TempBucketName == null) throw new Error('Unable to find CfnOutput "TempBucketName"'); 30 | 31 | if (process.argv.length > 2) { 32 | for (let i = 2; i < process.argv.length; i++) { 33 | const res = await batch 34 | .submitJob({ 35 | jobName: ['Job', correlationId].join('-'), 36 | jobQueue: JobQueueArn, 37 | jobDefinition: JobDefinitionArn, 38 | containerOverrides: { 39 | resourceRequirements: [{ type: 'MEMORY', value: '3600' }], 40 | command: buildCommandArguments(correlationId, TempBucketName, process.argv[i]), 41 | environment, 42 | }, 43 | }) 44 | .promise(); 45 | console.log({ source: process.argv[i] }, '\n', res); 46 | } 47 | } else { 48 | console.log( 49 | 'You need to provide a source (a list of S3 bucket folders or a list of survey ID to process. Check the README for more information.', 50 | ); 51 | } 52 | } 53 | 54 | function buildCommandArguments(correlationId: string, tempBucket: string, source: string): string[] { 55 | const command: string[] = []; 56 | command.push('./upload'); 57 | command.push('--correlationid'); 58 | command.push(correlationId); 59 | command.push('--source'); 60 | command.push(source); 61 | command.push('--target'); 62 | command.push('s3://' + tempBucket + '/' + correlationId + '/'); 63 | command.push('--datatype'); 64 | command.push('imagery.historic'); 65 | command.push('-v'); 66 | 67 | return command; 68 | } 69 | 70 | main().catch(console.error); 71 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "@aws-cdk/aws-batch-alpha": "2.9.0-alpha.0", 4 | "@linzjs/style": "^3.9.0", 5 | "aws-cdk-lib": "^2.25.0", 6 | "aws-sdk": "^2.1140.0", 7 | "constructs": "^10.1.15", 8 | "conventional-changelog-cli": "^2.1.1", 9 | "ulid": "^2.3.0" 10 | }, 11 | "devDependencies": { 12 | "@types/node": "^17.0.21", 13 | "aws-cdk": "2.25.0", 14 | "conventional-github-releaser": "^3.1.5" 15 | }, 16 | "version": "0.15.0", 17 | "scripts": { 18 | "build": "tsc", 19 | "version": "conventional-changelog -p angular -i CHANGELOG.md -s && git add CHANGELOG.md", 20 | "lint": "npx eslint . 
--quiet --fix --report-unused-disable-directives --ignore-path .gitignore", 21 | "test": "echo No tests yet", 22 | "submit": "node build/src/infra/submit.js" 23 | }, 24 | "publishConfig": { 25 | "access": "public" 26 | }, 27 | "files": [ 28 | "build/src/**" 29 | ] 30 | } 31 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "topo-processor" 3 | version = "0.15.0" 4 | description = "" 5 | authors = ["Blayne Chard ", "Paul Fouquet ", "Megan Davidson "] 6 | 7 | [tool.poetry.dependencies] 8 | python = "^3.8" 9 | linz-logger= "^0.6.0" 10 | py-multihash = "^2.0.1" 11 | rasterio = "^1.2.10" 12 | click = "^8.1.3" 13 | boto3 = "^1.23.5" 14 | python-ulid = "^1.1.0" 15 | fsspec = {extras = ["s3"], version = "^2022.5.0"} 16 | pystac = "^1.4.0" 17 | jsonschema = "^4.5.1" 18 | jsonschema-rs = "^0.13.1" 19 | Shapely = "^1.8.2" 20 | requests = "^2.26.0" 21 | aiohttp = "^3.8.1" 22 | python-dotenv = "^0.20.0" 23 | 24 | [tool.poetry.dev-dependencies] 25 | black = "^22.3" 26 | boto3-stubs = {version = "*", extras = ["lambda","sts"]} 27 | isort = "^5.10.1" 28 | pylint = "^2.13.9" 29 | pytest = "^7.1.2" 30 | rope = "^1.0.0" 31 | pytest-cov = "^3.0.0" 32 | pytest-mock = "^3.6.1" 33 | mypy = "*" 34 | moto = "^3.1.4" 35 | mypy-boto3-lambda = "^1.24.0" 36 | 37 | [tool.poetry.scripts] 38 | add = 'topo_processor.cli.geostore.add:main' 39 | status = 'topo_processor.cli.geostore.status:main' 40 | list = 'topo_processor.cli.geostore.list:main' 41 | delete = 'topo_processor.cli.geostore.delete:main' 42 | validate = 'topo_processor.cli.validate:main' 43 | 44 | [build-system] 45 | requires = ["poetry-core>=1.0.0"] 46 | build-backend = "poetry.core.masonry.api" 47 | 48 | [tool.black] 49 | line-length = 127 50 | 51 | [tool.isort] 52 | line_length = 127 53 | case_sensitive = true 54 | profile = "black" 55 | 56 | [tool.mypy] 57 | show_error_codes = true 58 | strict = true 59 | disable_error_code = [ 60 | "import", 61 | ] 62 | 63 | [[tool.mypy.overrides]] 64 | module = [ 65 | "linz_logger", 66 | ] 67 | ignore_missing_imports = true 68 | 69 | [tool.pytest.ini_options] 70 | markers = [ 71 | "slow: marks tests as slow", 72 | ] 73 | testpaths = ["topo_processor"] 74 | -------------------------------------------------------------------------------- /scripts/version.bump.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Version bump the repo and create a branch ready for pull request 4 | # 5 | set -e 6 | 7 | git checkout master 8 | git pull --rebase 9 | 10 | # Validate that there are actually changes to be made, this will fail if nothing needs publishing 11 | npm version -m 'release: %s' minor 12 | 13 | # Set the version environment variable 14 | CURRENT_VERSION=$(node -p "require('./package.json').version") 15 | 16 | # Bump the version in Poetry pyproject.toml file 17 | poetry version ${CURRENT_VERSION} 18 | 19 | # Write version to a file for Topo Processor to use 20 | echo v${CURRENT_VERSION} | tee VERSION 21 | 22 | # Commit the changed files 23 | git commit -a --amend --no-edit 24 | 25 | # Checkout a new release branch 26 | git checkout -b release/v${CURRENT_VERSION} 27 | 28 | # This tag will be created once the pull request is merged 29 | git tag -d v${CURRENT_VERSION} 30 | -------------------------------------------------------------------------------- /test_data/historical_aerial_photos_metadata.csv: 
-------------------------------------------------------------------------------- 1 | WKT,sufi,survey,run,photo_no,alternate_survey_name,camera,camera_sequence_no,nominal_focal_length,altitude,scale,photocentre_lat,photocentre_lon,date,film,film_sequence_no,photo_type,format,source,physical_film_condition,image_anomalies,scanned,raw_filename,released_filename,when_scanned,photo_version 2 | "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))",72358,SURVEY_1,E,48,,EAGLE IV,89556,508,11000,6600,-45.8079,170.5496,1952-04-23T00:00:00.000,731,114,B&W,18cm x 23cm,ORIGINAL ,Film scratched,,Y,WRONG_PHOTO_TYPE,CROWN_731_114,2018/Q2,1 3 | "POLYGON ((170.550411567673 -45.8023873533434,170.569928799273 -45.802784811616,170.569485879683 -45.8134678833323,170.549964961905 -45.8130703891654,170.550411567673 -45.8023873533434))",72360,SURVEY_1,E,50,,EAGLE IV,89554,508,11000,6600,-45.8079,170.5599,1952-04-23T00:00:00.000,731,112,B&W,18cm x 23cm,ORIGINAL ,Metadata manually populated,,Y,MULTIPLE_ASSET,CROWN_731_112,2020/Q1,1 4 | "POLYGON ((170.545239253866 -45.8023714719313,170.564756441893 -45.8027698029763,170.564312545088 -45.813452864901,170.544791670795 -45.8130544977075,170.545239253866 -45.8023714719313))",72359,SURVEY_1,E,49,,EAGLE IV,89555,,11000,6600,-45.8079,170.5548,1952-04-23T00:00:00.000,731,113,B&W,18cm x 23cm,ORIGINAL ,Not Film 222,,Y,CONTROL,CROWN_731_113,2014/Q3,1 5 | "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))",72352,SURVEY_3,E,48,,EAGLE IV,89556,508,11000,6600,-45.8079,170.5496,1952-04-23T00:00:00.000,731,114,B&W,18cm x 23cm,ORIGINAL ,,,Y,WRONG_SURVEY,CROWN_731_114,2020/Q4,1 6 | "POLYGON ((172.625388669748 -41.762347842565,172.666876525333 -41.7624796188097,172.666710116737 -41.793542284359,172.625202147726 -41.7934103066203,172.625388669748 -41.762347842565))",29659,SURVEY_2,A,2,,ZEISS RMK,279,210,12750,15000,-41.7779,172.646,1982-02-16T00:00:00.000,C2559,100,COLOUR,23cm x 23cm,ORIGINAL,Film scratched,,Y,CONTROL_2,CROWN_2559c_100,,1 7 | -------------------------------------------------------------------------------- /test_data/historical_aerial_photos_metadata.gpkg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/historical_aerial_photos_metadata.gpkg -------------------------------------------------------------------------------- /test_data/historical_aerial_photos_metadata_error.csv: -------------------------------------------------------------------------------- 1 | WKT,sufi,survey,run,photo_no,alternate_survey_name,camera,camera_sequence_no,nominal_focal_length,altitude,scale,photocentre_lat,photocentre_lon,date,film,film_sequence_no,photo_type,format,source,physical_film_condition,image_anomalies,scanned,raw_filename,released_filename,when_scanned,photo_version 2 | "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))",72358,SURVEY_1,E,48,,EAGLE IV,89556,508,11000,6600,-45.8079,170.5496,1952-04-23T00:00:00.000,731,114,COLOUR,18cm x 23cm,ORIGINAL ,Film scratched,,Y,WRONG_PHOTO_TYPE,CROWN_731_114,2018/Q2,1 3 | "POLYGON ((170.550411567673 
-45.8023873533434,170.569928799273 -45.802784811616,170.569485879683 -45.8134678833323,170.549964961905 -45.8130703891654,170.550411567673 -45.8023873533434))",72360,SURVEY_1,E,50,,EAGLE IV,89554,508,11000,6600,-45.8079,170.5599,"ERROR",731,112,B&W,18cm x 23cm,ORIGINAL ,Metadata manually populated,,Y,MULTIPLE_ASSET,CROWN_731_112,2020/Q1,1 4 | "POLYGON ((170.545239253866 -45.8023714719313,170.564756441893 -45.8027698029763,170.564312545088 -45.813452864901,170.544791670795 -45.8130544977075,170.545239253866 -45.8023714719313))",72359,SURVEY_1,E,49,,EAGLE IV,89555,,11000,6600,-45.8079,170.5548,1952-04-23T00:00:00.000,731,113,B&W,18cm x 23cm,ORIGINAL ,Not Film 222,,Y,CONTROL,CROWN_731_113,2014/Q3,1 5 | "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))",72352,SURVEY_3,E,48,,EAGLE IV,89556,508,11000,6600,-45.8079,170.5496,1952-04-23T00:00:00.000,731,114,B&W,18cm x 23cm,ORIGINAL ,,,Y,WRONG_SURVEY,CROWN_731_114,2020/Q4,1 6 | "POLYGON ((172.625388669748 -41.762347842565,172.666876525333 -41.7624796188097,172.666710116737 -41.793542284359,172.625202147726 -41.7934103066203,172.625388669748 -41.762347842565))",29659,SURVEY_2,A,2,,ZEISS RMK,279,210,12750,15000,-41.7779,172.646,1982-02-16T00:00:00.000,C2559,100,COLOUR,23cm x 23cm,ORIGINAL,Film scratched,,Y,CONTROL_2,CROWN_2559c_100,,1 7 | -------------------------------------------------------------------------------- /test_data/historical_survey_footprint_metadata.csv: -------------------------------------------------------------------------------- 1 | WKT,SURVEY,COUNTRY,FILM_TYPE,COPYRIGHT,CONTRACTOR,NAME,COMMENTS,AREV,COORD_SYS,CHECKED,DONE 2 | "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))",SURVEY_1,NEW ZEALAND (MAINLAND),BLACK AND WHITE,CROWN,AERIAL SURVEYS (NELSON),TE KUITI 1,,f,NEW ZEALAND MAP GRID,t,1 3 | "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))",SURVEY_3,NEW ZEALAND (MAINLAND),BLACK AND WHITE,CROWN,AERIAL SURVEYS (NELSON),AUCKLAND 1,,f,NEW ZEALAND MAP GRID,t,1 4 | "POLYGON ((172.625388669748 -41.762347842565,172.666876525333 -41.7624796188097,172.666710116737 -41.793542284359,172.625202147726 -41.7934103066203,172.625388669748 -41.762347842565))",SURVEY_2,NEW ZEALAND (MAINLAND),BLACK AND WHITE,CROWN,AERIAL SURVEYS (NELSON),WELLINGTON 2,,f,NEW ZEALAND MAP GRID,t,1 5 | "POLYGON ((172.625388669748 -41.762347842565,172.666876525333 -41.7624796188097,172.666710116737 -41.793542284359,172.625202147726 -41.7934103066203,172.625388669748 -41.762347842565))",SURVEY_NO_NAME,NEW ZEALAND (MAINLAND),BLACK AND WHITE,CROWN,AERIAL SURVEYS (NELSON),,,f,NEW ZEALAND MAP GRID,t,1 6 | -------------------------------------------------------------------------------- /test_data/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "path": "test_data", 3 | "time": 1644961117023, 4 | "files": [ 5 | { "path": "/tiffs/SURVEY_1/WRONG_PHOTO_TYPE.tif" }, 6 | { "path": "/tiffs/SURVEY_1/MULTIPLE_ASSET.tif" }, 7 | { "path": "/tiffs/SURVEY_1/CONTROL.tif" } 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /test_data/manifest_duplicate.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "path": "test_data", 3 | "time": 1644961117023, 4 | "files": [ 5 | { "path": "/tiffs/SURVEY_1/WRONG_PHOTO_TYPE.tif" }, 6 | { "path": "/tiffs/SURVEY_1/MULTIPLE_ASSET.tif" }, 7 | { "path": "/tiffs/SURVEY_3/MULTIPLE_ASSET.tif" }, 8 | { "path": "/tiffs/SURVEY_1/CONTROL.tif" } 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /test_data/schemas/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Notes for Developers 3 | 4 | You can put local schemas here for testing against. See the example tests in: 5 | `topo_processor/metadata/metadata_validators/tests/metadata_validator_stac_test.py` 6 | 7 | For LINZ STAC extensions you will need to alter the stac_extensions stanza, e.g. change 8 | 9 | 10 | ``` 11 | "definitions": { 12 | "stac_extensions": { 13 | "type": "object", 14 | "required": ["stac_extensions"], 15 | "properties": { 16 | "stac_extensions": { 17 | "type": "array", 18 | "contains": { 19 | "const": "https://stac.linz.govt.nz/_STAC_VERSION_/film/schema.json" 20 | } 21 | } 22 | } 23 | }, 24 | ``` 25 | 26 | to reference the local path of the schema: 27 | 28 | ``` 29 | "definitions": { 30 | "stac_extensions": { 31 | "type": "object", 32 | "required": ["stac_extensions"], 33 | "properties": { 34 | "stac_extensions": { 35 | "type": "array", 36 | "contains": { 37 | "const": "file:///home/your_username/dev/topo-processor/test_data/schemas/film.json" 38 | } 39 | } 40 | } 41 | }, 42 | ``` 43 | -------------------------------------------------------------------------------- /test_data/tiffs/SURVEY_1/CONTROL.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/tiffs/SURVEY_1/CONTROL.tiff -------------------------------------------------------------------------------- /test_data/tiffs/SURVEY_1/MULTIPLE_ASSET.his: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/tiffs/SURVEY_1/MULTIPLE_ASSET.his -------------------------------------------------------------------------------- /test_data/tiffs/SURVEY_1/MULTIPLE_ASSET.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/tiffs/SURVEY_1/MULTIPLE_ASSET.tiff -------------------------------------------------------------------------------- /test_data/tiffs/SURVEY_1/WRONG_PHOTO_TYPE.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/tiffs/SURVEY_1/WRONG_PHOTO_TYPE.tiff -------------------------------------------------------------------------------- /test_data/tiffs/SURVEY_1/WRONG_SURVEY.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/tiffs/SURVEY_1/WRONG_SURVEY.tiff -------------------------------------------------------------------------------- /test_data/tiffs/SURVEY_2/CONTROL_2.tif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/tiffs/SURVEY_2/CONTROL_2.tif -------------------------------------------------------------------------------- /topo_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/__init__.py -------------------------------------------------------------------------------- /topo_processor/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/cli/__init__.py -------------------------------------------------------------------------------- /topo_processor/cli/geostore/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/cli/geostore/__init__.py -------------------------------------------------------------------------------- /topo_processor/cli/geostore/add.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | from typing import Any, Dict, List 5 | from urllib.parse import urlparse 6 | 7 | import boto3 8 | import click 9 | from linz_logger import LogLevel, get_log, set_level 10 | 11 | from topo_processor.geostore.invoke import invoke_import_status, invoke_lambda 12 | from topo_processor.stac.stac_extensions import StacExtensions 13 | from topo_processor.util.aws_credentials import Credentials 14 | from topo_processor.util.aws_files import s3_download 15 | from topo_processor.util.configuration import temp_folder 16 | from topo_processor.util.file_extension import is_tiff 17 | from topo_processor.util.s3 import is_s3_path 18 | from topo_processor.util.time import time_in_ms 19 | 20 | 21 | @click.command() 22 | @click.option( 23 | "-s", 24 | "--source", 25 | required=True, 26 | help="The s3 path to the survey to export", 27 | ) 28 | @click.option( 29 | "-r", 30 | "--role", 31 | required=True, 32 | help="The ARN role to access to the source bucket", 33 | ) 34 | @click.option( 35 | "-c", 36 | "--commit", 37 | is_flag=True, 38 | help="Use this flag to commit the creation of the dataset", 39 | ) 40 | @click.option( 41 | "-v", 42 | "--verbose", 43 | is_flag=True, 44 | help="Use verbose to display debug logs", 45 | ) 46 | def main(source: str, role: str, commit: bool, verbose: bool) -> None: 47 | """Create or add a new version of an existing dataset to the Geostore for the source (survey) passed as argument.""" 48 | start_time = time_in_ms() 49 | logger = get_log() 50 | logger.info("geostore_add_started", source=source) 51 | 52 | if not verbose: 53 | set_level(LogLevel.info) 54 | 55 | try: 56 | source_role_arn = role 57 | client_sts = boto3.client("sts") 58 | assumed_role = client_sts.assume_role(RoleArn=source_role_arn, RoleSessionName="read-session") 59 | # Get Collection information 60 | collection_local_path = os.path.join(temp_folder, "collection.json") 61 | 62 | if is_s3_path(source): 63 | try: 64 | credentials = Credentials( 65 | assumed_role["Credentials"]["AccessKeyId"], 66 | assumed_role["Credentials"]["SecretAccessKey"], 67 | assumed_role["Credentials"]["SessionToken"], 68 | ) 69 | s3_download(os.path.join(source, "collection.json"), collection_local_path, 
credentials) 70 | except Exception as e: 71 | logger.error("geostore_export_failed", source=source, error=e) 72 | return 73 | else: 74 | raise Exception("The source has to be a survey in a S3 bucket.") 75 | 76 | with open(collection_local_path) as collection_file: 77 | collection_json: Dict[str, Any] = json.load(collection_file) 78 | 79 | # Get survey id for dataset id, collection.title for Description, and datatype prefix 80 | survey_id = collection_json["summaries"]["mission"][0] 81 | if not survey_id: 82 | raise Exception("No survey ID found in collection.json") 83 | if StacExtensions.historical_imagery.value in collection_json["stac_extensions"]: 84 | title_prefix = "historical-aerial-imagery-survey-" 85 | else: 86 | raise Exception("No match for data type in collection.json stac_extensions.") 87 | title = collection_json["title"] 88 | 89 | prefixed_survey_id = title_prefix + survey_id 90 | 91 | if commit: 92 | # Check if a dataset for this survey already exists 93 | list_parameters = {"title": prefixed_survey_id} 94 | dataset_list = invoke_lambda("datasets", "GET", list_parameters) 95 | if len(dataset_list["body"]) == 1 and dataset_list["body"][0]["title"] == prefixed_survey_id: 96 | # A dataset already exists 97 | if click.confirm( 98 | f"A dataset for the survey {prefixed_survey_id} already exists. A new version will be created. Do you want to continue?", 99 | abort=True, 100 | ): 101 | # Create a new version 102 | dataset_id = dataset_list["body"][0]["id"] 103 | click.echo("A new version will be created.") 104 | else: 105 | # Create a dataset 106 | logger.info("create_new_dataset", surveyId=prefixed_survey_id, surveyTitle=title) 107 | create_dataset_parameters = {"title": prefixed_survey_id, "description": title} 108 | dataset_response_payload = invoke_lambda("datasets", "POST", create_dataset_parameters) 109 | dataset_id = dataset_response_payload["body"]["id"] 110 | if not dataset_id: 111 | raise Exception(f"No dataset ID found in datasets Lambda function response: {dataset_response_payload}") 112 | 113 | # Upload data 114 | upload_data_parameters = { 115 | "id": dataset_id, 116 | "metadata_url": os.path.join(source, "collection.json"), 117 | "s3_role_arn": source_role_arn, 118 | } 119 | version_response_payload = invoke_lambda("dataset-versions", "POST", upload_data_parameters) 120 | execution_arn = version_response_payload["body"]["execution_arn"] 121 | 122 | # Check import status 123 | import_status = invoke_import_status(execution_arn) 124 | 125 | logger.info( 126 | "geostore_add_invoked", 127 | info=f"To check the import status, run the following command 'poetry run status -a {execution_arn}'", 128 | ) 129 | 130 | logger.debug( 131 | "geostore_add_details", 132 | source=source, 133 | datasetId=dataset_id, 134 | executionArn=execution_arn, 135 | currentImportStatus=import_status, 136 | duration=time_in_ms() - start_time, 137 | ) 138 | else: 139 | source_parse = urlparse(source, allow_fragments=False) 140 | bucket_name = source_parse.netloc 141 | prefix = source_parse.path[1:].replace("collection.json", "") 142 | logger.debug("no_commit", action="list_objects", bucket=bucket_name, prefix=prefix) 143 | file_list: List[str] = [] 144 | s3 = boto3.client( 145 | "s3", 146 | aws_access_key_id=credentials.access_key, 147 | aws_secret_access_key=credentials.secret_key, 148 | aws_session_token=credentials.token, 149 | ) 150 | paginator = s3.get_paginator("list_objects_v2") 151 | response_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix) 152 | for response in 
response_iterator: 153 | for contents_data in response["Contents"]: 154 | key = contents_data["Key"] 155 | if is_tiff(key): 156 | file_list.append(key) 157 | logger.info( 158 | "The change won't be commit since the --commit flag has not been specified.", 159 | sourceFiles=file_list, 160 | surveyId=prefixed_survey_id, 161 | surveyTitle=title, 162 | ) 163 | 164 | except Exception as e: 165 | logger.error("geostore_add_failed", err=e) 166 | finally: 167 | shutil.rmtree(temp_folder) 168 | -------------------------------------------------------------------------------- /topo_processor/cli/geostore/delete.py: -------------------------------------------------------------------------------- 1 | import click 2 | from linz_logger import LogLevel, get_log, set_level 3 | 4 | from topo_processor.geostore.invoke import invoke_lambda 5 | from topo_processor.util.time import time_in_ms 6 | 7 | 8 | @click.command() 9 | @click.option( 10 | "-d", 11 | "--dataset-id", 12 | required=True, 13 | help="The dataset id to delete", 14 | ) 15 | @click.option( 16 | "-c", 17 | "--commit", 18 | is_flag=True, 19 | help="Use this flag to commit the suppression of the dataset.", 20 | ) 21 | @click.option( 22 | "-v", 23 | "--verbose", 24 | is_flag=True, 25 | help="Use verbose to display debug logs", 26 | ) 27 | def main(dataset_id: str, commit: bool, verbose: str) -> None: 28 | start_time = time_in_ms() 29 | logger = get_log() 30 | logger.info("delete_datasets_start", dataset_id=dataset_id) 31 | 32 | if not verbose: 33 | set_level(LogLevel.info) 34 | 35 | try: 36 | delete_parameters = {"title": dataset_id} 37 | operation = "GET" 38 | if commit: 39 | operation = "DELETE" 40 | 41 | response = invoke_lambda("datasets", operation, delete_parameters) 42 | if not commit: 43 | logger.info( 44 | f"You are about to delete the following dataset: {response['body']}. Run the command again with the --commit flag to confirm." 
45 | ) 46 | else: 47 | logger.info("delete_dataset_success", deleted_id=dataset_id, duration=time_in_ms() - start_time) 48 | except Exception as e: 49 | logger.error("delete_dataset_failed", err=e) 50 | -------------------------------------------------------------------------------- /topo_processor/cli/geostore/list.py: -------------------------------------------------------------------------------- 1 | import click 2 | from linz_logger import LogLevel, get_log, set_level 3 | 4 | from topo_processor.geostore.invoke import invoke_lambda 5 | from topo_processor.util.time import time_in_ms 6 | 7 | 8 | @click.command() 9 | @click.option( 10 | "-t", 11 | "--title", 12 | required=False, 13 | help="The Geostore title of the survey to filter", 14 | ) 15 | @click.option( 16 | "-v", 17 | "--verbose", 18 | is_flag=True, 19 | help="Use verbose to display debug logs", 20 | ) 21 | def main(title: str, verbose: bool) -> None: 22 | start_time = time_in_ms() 23 | logger = get_log() 24 | logger.info("list_datasets_start", title=title) 25 | 26 | if not verbose: 27 | set_level(LogLevel.info) 28 | 29 | try: 30 | list_parameters = {} 31 | if title: 32 | list_parameters = {"title": title} 33 | dataset_list = invoke_lambda("datasets", "GET", list_parameters) 34 | 35 | logger.info("list_datasets_end", dataset_list=dataset_list, duration=time_in_ms() - start_time) 36 | except Exception as e: 37 | logger.error("list_datasets_failed", err=e) 38 | -------------------------------------------------------------------------------- /topo_processor/cli/geostore/status.py: -------------------------------------------------------------------------------- 1 | import click 2 | from linz_logger import LogLevel, get_log, set_level 3 | 4 | from topo_processor.geostore.invoke import invoke_import_status 5 | from topo_processor.util.time import time_in_ms 6 | 7 | 8 | @click.command() 9 | @click.option( 10 | "-a", 11 | "--execution-arn", 12 | required=True, 13 | help="The execution arn received from the Geostore after invoking an upload", 14 | ) 15 | @click.option( 16 | "-v", 17 | "--verbose", 18 | is_flag=True, 19 | help="Use verbose to display debug logs", 20 | ) 21 | def main(execution_arn: str, verbose: bool) -> None: 22 | start_time = time_in_ms() 23 | logger = get_log() 24 | logger.info("check_import_status_start", arn=execution_arn) 25 | 26 | if not verbose: 27 | set_level(LogLevel.info) 28 | 29 | try: 30 | import_status = invoke_import_status(execution_arn) 31 | 32 | logger.info( 33 | "check_import_status", 34 | current_import_status=import_status, 35 | ) 36 | 37 | logger.debug( 38 | "check_export_status_end", 39 | duration=time_in_ms() - start_time, 40 | ) 41 | 42 | except Exception as e: 43 | logger.error("check_import_status_failed", err=e) 44 | -------------------------------------------------------------------------------- /topo_processor/cli/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/cli/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/cli/tests/upload_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import subprocess 4 | 5 | import pytest 6 | 7 | from topo_processor.stac.stac_extensions import StacExtensions 8 | 9 | 10 | @pytest.mark.slow 11 | def test_upload_local(setup: str) -> None: 12 | target = setup 13 | source 
= os.path.abspath(os.path.join(os.getcwd(), "test_data", "tiffs")) 14 | metadata_path = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata.csv")) 15 | footprint_metadata = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_survey_footprint_metadata.csv")) 16 | command = os.path.join(os.getcwd(), "upload") 17 | subprocess.run( 18 | [command, "-s", source, "-d", "imagery.historic", "-t", target, "-m", metadata_path, "-f", footprint_metadata], 19 | check=True, 20 | ) 21 | 22 | assert os.path.isfile(os.path.join(target, "SURVEY_3", "72352.json")) 23 | assert os.path.isfile(os.path.join(target, "SURVEY_3", "72352.tiff")) 24 | assert os.path.isfile(os.path.join(target, "SURVEY_3", "collection.json")) 25 | 26 | assert os.path.isfile(os.path.join(target, "SURVEY_2", "29659.json")) 27 | assert os.path.isfile(os.path.join(target, "SURVEY_2", "29659.tif")) 28 | assert os.path.isfile(os.path.join(target, "SURVEY_2", "collection.json")) 29 | 30 | assert os.path.isfile(os.path.join(target, "SURVEY_1", "72360.json")) 31 | assert os.path.isfile(os.path.join(target, "SURVEY_1", "72360.tiff")) 32 | assert os.path.isfile(os.path.join(target, "SURVEY_1", "collection.json")) 33 | 34 | with open(os.path.join(target, "SURVEY_1", "72359.json")) as item_json_file: 35 | item_metadata = json.load(item_json_file) 36 | assert item_metadata["properties"]["camera:sequence_number"] == 89555 37 | assert StacExtensions.camera.value in item_metadata["stac_extensions"] 38 | assert "camera:nominal_focal_length" not in item_metadata["properties"].keys() 39 | 40 | with open(os.path.join(target, "SURVEY_3", "72352.json")) as item_json_file: 41 | item_metadata = json.load(item_json_file) 42 | assert item_metadata["properties"]["mission"] == "SURVEY_3" 43 | assert item_metadata["id"] == "72352" 44 | assert ( 45 | item_metadata["assets"]["visual"]["file:checksum"] 46 | == "1220e3e67b095835c5ae8d7b311af25606d3dc0915219f34838e1f0c78b980697ca4" 47 | ) 48 | assert (item_metadata["assets"]["visual"]["href"]) == "./72352.tiff" 49 | assert len(item_metadata["links"]) == 3 50 | for link in item_metadata["links"]: 51 | assert link["rel"] != "self" 52 | assert link["href"] == "./collection.json" 53 | 54 | with open(os.path.join(target, "SURVEY_3", "collection.json")) as collection_json_file: 55 | collection_metadata = json.load(collection_json_file) 56 | 57 | assert len(collection_metadata["links"]) == 2 58 | for link in collection_metadata["links"]: 59 | assert link["rel"] != "self" 60 | if link["rel"] == "root": 61 | assert link["href"] == "./collection.json" 62 | if link["rel"] == "item": 63 | assert link["href"] == "./72352.json" 64 | 65 | assert item_metadata["properties"]["camera:sequence_number"] == 89556 66 | assert item_metadata["properties"]["camera:nominal_focal_length"] == 508 67 | assert StacExtensions.camera.value in item_metadata["stac_extensions"] 68 | 69 | 70 | @pytest.mark.slow 71 | def test_upload_local_fail(setup: str) -> None: 72 | target = setup 73 | source = os.path.abspath(os.path.join(os.getcwd(), "test_data", "tiffs")) 74 | metadata_path = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata_error.csv")) 75 | footprint_metadata = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_survey_footprint_metadata.csv")) 76 | command = os.path.join(os.getcwd(), "upload") 77 | 78 | with pytest.raises(Exception) as e: 79 | subprocess.run( 80 | [command, "-s", source, "-d", "imagery.historic", "-t", target, "-m", 
metadata_path, "-f", footprint_metadata], 81 | check=True, 82 | ) 83 | assert "process is stopped" in str(e.value).lower() 84 | 85 | 86 | @pytest.mark.slow 87 | def test_upload_local_forced(setup: str) -> None: 88 | target = setup 89 | source = os.path.abspath(os.path.join(os.getcwd(), "test_data", "tiffs")) 90 | metadata_path = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata_error.csv")) 91 | footprint_metadata = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_survey_footprint_metadata.csv")) 92 | command = os.path.join(os.getcwd(), "upload") 93 | 94 | subprocess.run( 95 | [ 96 | command, 97 | "-s", 98 | source, 99 | "-d", 100 | "imagery.historic", 101 | "-t", 102 | target, 103 | "-m", 104 | metadata_path, 105 | "-f", 106 | footprint_metadata, 107 | "--force", 108 | ], 109 | check=True, 110 | ) 111 | 112 | assert os.path.isfile(os.path.join(target, "SURVEY_3", "72352.json")) 113 | assert os.path.isfile(os.path.join(target, "SURVEY_3", "72352.tiff")) 114 | assert os.path.isfile(os.path.join(target, "SURVEY_3", "collection.json")) 115 | 116 | assert os.path.isfile(os.path.join(target, "SURVEY_2", "29659.json")) 117 | assert os.path.isfile(os.path.join(target, "SURVEY_2", "29659.tif")) 118 | assert os.path.isfile(os.path.join(target, "SURVEY_2", "collection.json")) 119 | -------------------------------------------------------------------------------- /topo_processor/cli/upload.py: -------------------------------------------------------------------------------- 1 | import click 2 | import pystac 3 | from linz_logger import LogLevel, get_log, set_level 4 | 5 | from topo_processor.metadata.data_type import DataType 6 | from topo_processor.metadata.lds_cache.lds_cache import get_metadata 7 | from topo_processor.stac.item_factory import process_source 8 | from topo_processor.stac.iter_errors_validator import IterErrorsValidator 9 | from topo_processor.stac.store import collection_store 10 | from topo_processor.util.s3 import is_s3_path 11 | from topo_processor.util.time import time_in_ms 12 | from topo_processor.util.transfer_collection import transfer_collection 13 | 14 | 15 | @click.command() 16 | @click.option( 17 | "-s", 18 | "--source", 19 | required=True, 20 | help="The source of the data to import", 21 | ) 22 | @click.option( 23 | "-d", 24 | "--datatype", 25 | required=True, 26 | type=click.Choice([data_type for data_type in DataType], case_sensitive=True), 27 | help="The datatype of the upload", 28 | ) 29 | @click.option( 30 | "-t", 31 | "--target", 32 | required=True, 33 | help="The target directory path or bucket name of the upload", 34 | ) 35 | @click.option( 36 | "-c", 37 | "--correlationid", 38 | required=False, 39 | help="The correlation ID of the batch job", 40 | ) 41 | @click.option( 42 | "-m", 43 | "--metadata", 44 | required=False, 45 | help="The metadata file path", 46 | ) 47 | @click.option( 48 | "-v", 49 | "--verbose", 50 | is_flag=True, 51 | help="Use verbose to display trace logs", 52 | ) 53 | @click.option( 54 | "-f", 55 | "--footprint", 56 | required=False, 57 | help="The survey footprint metadata path", 58 | ) 59 | @click.option( 60 | "--force", 61 | is_flag=True, 62 | help="Force the upload even if all the data is not valid", 63 | ) 64 | def main( 65 | source: str, datatype: str, correlationid: str, target: str, metadata: str, verbose: str, footprint: str, force: bool 66 | ) -> None: 67 | get_log().info("upload_start", correlationId=correlationid, source=source, target=target, dataType=datatype, force=force) 
68 | try: 69 | pystac.validation.set_validator(IterErrorsValidator()) 70 | 71 | if verbose: 72 | set_level(LogLevel.trace) 73 | 74 | start_time = time_in_ms() 75 | data_type = DataType(datatype) 76 | 77 | # Caching the metadata required by the user. 78 | if metadata: 79 | get_metadata(data_type, None, metadata) 80 | if not is_s3_path(metadata): 81 | if not footprint: 82 | get_log().error( 83 | "survey_footprint_metadata_not_given", 84 | msg="You have to provide a local path for the survey footprint metadata", 85 | ) 86 | raise Exception("survey footprint metadata not given") 87 | else: 88 | if data_type == DataType.IMAGERY_HISTORIC: 89 | get_metadata(DataType.SURVEY_FOOTPRINT_HISTORIC, None, footprint) 90 | else: 91 | raise Exception("Not yet implemented") 92 | 93 | process_source(source, data_type, metadata, force) 94 | 95 | for collection in collection_store.values(): 96 | transfer_collection(collection, target, data_type, force) 97 | 98 | get_log().debug( 99 | "Job Completed", 100 | source=source, 101 | location=target, 102 | correlationid=correlationid, 103 | data_type=data_type, 104 | duration=time_in_ms() - start_time, 105 | ) 106 | except Exception as e: 107 | get_log().error("Job Failed", error=e, source=source, correlationid=correlationid, data_type=datatype) 108 | finally: 109 | for collection in collection_store.values(): 110 | collection.delete_temp_dir() 111 | -------------------------------------------------------------------------------- /topo_processor/cli/validate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | import click 5 | from linz_logger import LogLevel, get_log, set_level 6 | 7 | from topo_processor.stac.validation import validate_stac 8 | from topo_processor.util.configuration import temp_folder 9 | from topo_processor.util.s3 import is_s3_path 10 | from topo_processor.util.time import time_in_ms 11 | 12 | 13 | @click.command() 14 | @click.option( 15 | "-i", 16 | "--item", 17 | is_flag=True, 18 | help="Use item to validate items only.", 19 | ) 20 | @click.option( 21 | "-c", 22 | "--collection", 23 | is_flag=True, 24 | help="Use collection to validate collections only.", 25 | ) 26 | @click.option( 27 | "-m", 28 | "--metadata", 29 | required=False, 30 | help="(OPTIONAL) The path of the metadata csv file to validate.", 31 | ) 32 | @click.option( 33 | "-v", 34 | "--verbose", 35 | is_flag=True, 36 | help="Use verbose to display trace logs (it might be slower).", 37 | ) 38 | def main(item: bool, collection: bool, metadata: str, verbose: str) -> None: 39 | if verbose: 40 | set_level(LogLevel.trace) 41 | else: 42 | set_level(LogLevel.info) 43 | 44 | start_time = time_in_ms() 45 | 46 | if metadata: 47 | if not is_s3_path(metadata): 48 | metadata = os.path.abspath(metadata) 49 | 50 | if item == collection: 51 | validate_stac(metadata) 52 | else: 53 | validate_stac(metadata, item, collection) 54 | 55 | # Cleanup 56 | shutil.rmtree(temp_folder) 57 | 58 | get_log().info( 59 | "validate completed", 60 | duration=time_in_ms() - start_time, 61 | ) 62 | -------------------------------------------------------------------------------- /topo_processor/cog/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/cog/__init__.py -------------------------------------------------------------------------------- /topo_processor/cog/create_cog.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from topo_processor.util.aws_credentials import Credentials, get_credentials_from_bucket 4 | from topo_processor.util.command import Command 5 | from topo_processor.util.s3 import bucket_name_from_path, is_s3_path 6 | 7 | 8 | def create_cog(input_path: str, output_path: str) -> Command: 9 | is_s3 = is_s3_path(input_path) 10 | if is_s3: 11 | credentials: Credentials = get_credentials_from_bucket(bucket_name_from_path(input_path)) 12 | input_path = f"/vsis3/{input_path.replace('s3://', '')}" 13 | if os.environ.get("IS_DOCKER") == "true": 14 | cmd = Command("gdal_translate") 15 | if is_s3: 16 | os.environ["AWS_ACCESS_KEY_ID"] = credentials.access_key 17 | os.environ["AWS_SECRET_ACCESS_KEY"] = credentials.secret_key 18 | os.environ["AWS_SESSION_TOKEN"] = credentials.token 19 | else: 20 | cmd = Command("gdal_translate", {"container": "osgeo/gdal", "tag": "ubuntu-small-3.5.0"}) 21 | if is_s3: 22 | cmd.env(f"AWS_ACCESS_KEY_ID={credentials.access_key}") 23 | cmd.env(f"AWS_SECRET_ACCESS_KEY={credentials.secret_key}") 24 | cmd.env(f"AWS_SESSION_TOKEN={credentials.token}") 25 | 26 | cmd.env("GDAL_DISABLE_READDIR_ON_OPEN=EMPTY_DIR") 27 | cmd.mount(input_path) 28 | cmd.mount(os.path.dirname(output_path)) 29 | cmd.arg(input_path) 30 | cmd.arg("-of", "COG") 31 | cmd.arg("-co", "COMPRESS=LZW") 32 | cmd.arg("-co", "NUM_THREADS=ALL_CPUS") 33 | cmd.arg("-co", "PREDICTOR=2") 34 | cmd.arg("-co", "OVERVIEW_COMPRESS=JPEG") 35 | cmd.arg("-co", "BIGTIFF=YES") 36 | cmd.arg("-co", "OVERVIEW_RESAMPLING=LANCZOS") 37 | cmd.arg("-co", "BLOCKSIZE=512") 38 | cmd.arg("-co", "OVERVIEW_QUALITY=90") 39 | cmd.arg("-co", "SPARSE_OK=TRUE") 40 | cmd.arg(output_path) 41 | return cmd 42 | -------------------------------------------------------------------------------- /topo_processor/cog/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/cog/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/cog/tests/create_cog_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.cog.create_cog import create_cog 4 | 5 | 6 | def test_cog_command() -> None: 7 | input_path = "fake_input_dir/fake_input.tiff" 8 | output_path = "fake_input_dir/fake_output.tiff" 9 | 10 | cmd = create_cog(input_path, output_path) 11 | assert cmd.to_full_command() == [ 12 | "gdal_translate", 13 | "fake_input_dir/fake_input.tiff", 14 | "-of", 15 | "COG", 16 | "-co", 17 | "COMPRESS=LZW", 18 | "-co", 19 | "NUM_THREADS=ALL_CPUS", 20 | "-co", 21 | "PREDICTOR=2", 22 | "-co", 23 | "OVERVIEW_COMPRESS=JPEG", 24 | "-co", 25 | "BIGTIFF=YES", 26 | "-co", 27 | "OVERVIEW_RESAMPLING=LANCZOS", 28 | "-co", 29 | "BLOCKSIZE=512", 30 | "-co", 31 | "OVERVIEW_QUALITY=90", 32 | "-co", 33 | "SPARSE_OK=TRUE", 34 | "fake_input_dir/fake_output.tiff", 35 | ] 36 | -------------------------------------------------------------------------------- /topo_processor/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/data/__init__.py -------------------------------------------------------------------------------- 
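A minimal usage sketch for `create_cog` above, with hypothetical local paths: the function only assembles the `gdal_translate` command (run directly when `IS_DOCKER=true`, otherwise via the `osgeo/gdal` container), and nothing executes until `run()` is called.

```python
# Hedged sketch, hypothetical paths: create_cog() builds the gdal_translate
# Command; .run() is what actually executes it (directly, or via the
# osgeo/gdal container when not running inside Docker).
from topo_processor.cog.create_cog import create_cog

cmd = create_cog("fake_input_dir/fake_input.tiff", "fake_input_dir/fake_output.tiff")
cmd.run()
```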
/topo_processor/data/data_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_transformer_imagery_historic import DataTransformerImageryHistoric 2 | from .data_transformer_repo import DataTransformerRepository 3 | 4 | data_transformer_repo = DataTransformerRepository() 5 | data_transformer_repo.append(DataTransformerImageryHistoric()) 6 | -------------------------------------------------------------------------------- /topo_processor/data/data_transformers/data_transformer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import TYPE_CHECKING 5 | 6 | if TYPE_CHECKING: 7 | from topo_processor.stac.item import Item 8 | 9 | 10 | class DataTransformer(ABC): 11 | @property 12 | @abstractmethod 13 | def name(self) -> str: 14 | pass 15 | 16 | @abstractmethod 17 | def is_applicable(self, item: Item) -> bool: 18 | pass 19 | 20 | @abstractmethod 21 | def transform_data(self, item: Item) -> None: 22 | pass 23 | -------------------------------------------------------------------------------- /topo_processor/data/data_transformers/data_transformer_imagery_historic.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from typing import TYPE_CHECKING 5 | 6 | import pystac 7 | import ulid 8 | from linz_logger import get_log 9 | 10 | from topo_processor.cog.create_cog import create_cog 11 | from topo_processor.stac.asset import Asset 12 | from topo_processor.util.file_extension import is_tiff 13 | from topo_processor.util.time import time_in_ms 14 | 15 | from .data_transformer import DataTransformer 16 | 17 | if TYPE_CHECKING: 18 | from topo_processor.stac.item import Item 19 | 20 | 21 | class DataTransformerImageryHistoric(DataTransformer): 22 | name = "data.transformer.imagery.historic" 23 | 24 | def is_applicable(self, item: Item) -> bool: 25 | for asset in item.assets: 26 | if is_tiff(asset.source_path): 27 | return True 28 | return False 29 | 30 | def transform_data(self, item: Item) -> None: 31 | cog_asset_list = [] 32 | for asset in item.assets: 33 | if not is_tiff(asset.source_path): 34 | continue 35 | start_time = time_in_ms() 36 | if not item.collection: 37 | get_log().warning("Item has no collection", item_id=item.id) 38 | return 39 | output_path = os.path.join(item.collection.get_temp_dir(), f"{ulid.ULID()}.tiff") 40 | 41 | try: 42 | create_cog(asset.source_path, output_path).run() 43 | except Exception as e: 44 | raise Exception( 45 | f"COG creation failed for item {item.id} with source path {asset.source_path} and output path {output_path}." 
46 | ) from e 47 | 48 | get_log().debug("Created COG", output_path=output_path, duration=time_in_ms() - start_time) 49 | 50 | asset.needs_upload = False 51 | 52 | cog_asset = Asset(output_path) 53 | cog_asset.content_type = pystac.MediaType.COG 54 | cog_asset.key_name = asset.key_name 55 | cog_asset.target = asset.target 56 | cog_asset.properties = asset.properties 57 | cog_asset.set_output_asset_dates(output_path) 58 | cog_asset_list.append(cog_asset) 59 | 60 | for asset in cog_asset_list: 61 | item.add_asset(asset) 62 | -------------------------------------------------------------------------------- /topo_processor/data/data_transformers/data_transformer_repo.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, List 4 | 5 | from linz_logger import get_log 6 | 7 | from topo_processor.util.time import time_in_ms 8 | 9 | from .data_transformer import DataTransformer 10 | 11 | if TYPE_CHECKING: 12 | from topo_processor.stac.item import Item 13 | 14 | 15 | class DataTransformerRepository: 16 | transformers: List[DataTransformer] = [] 17 | 18 | def append(self, transformers: DataTransformer) -> None: 19 | self.transformers.append(transformers) 20 | 21 | def transform_data(self, item: Item) -> None: 22 | for transformer in self.transformers: 23 | if transformer.is_applicable(item): 24 | start_time = time_in_ms() 25 | try: 26 | transformer.transform_data(item) 27 | except Exception as e: 28 | item.add_error(str(e), transformer.name, e) 29 | get_log().error("Data Transform Failed. Process is stopped.", transformers=transformer.name, error=e) 30 | raise Exception(e) 31 | get_log().debug( 32 | "Data Transformed", 33 | duration=time_in_ms() - start_time, 34 | ) 35 | -------------------------------------------------------------------------------- /topo_processor/file_system/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/file_system/__init__.py -------------------------------------------------------------------------------- /topo_processor/file_system/assets.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | 4 | from topo_processor.file_system.file_searcher import get_file_path_from_survey 5 | from topo_processor.file_system.get_fs import get_fs 6 | from topo_processor.file_system.get_path_with_protocol import get_path_with_protocol 7 | from topo_processor.metadata.data_type import DataType 8 | from topo_processor.stac.asset import Asset 9 | from topo_processor.stac.store import get_asset 10 | from topo_processor.util.aws_files import build_s3_path 11 | from topo_processor.util.file_extension import FILE_EXTENSIONS, is_extension 12 | from topo_processor.util.s3 import bucket_name_from_stack, is_s3_path 13 | 14 | 15 | def get_assets(source: str, data_type: str, metadata_path: str = "") -> List[Asset]: 16 | if os.path.isdir(os.path.dirname(source)) or is_s3_path(source): 17 | return _get_assets_from_directory(source, data_type) 18 | else: 19 | if data_type == DataType.IMAGERY_HISTORIC: 20 | return _get_historical_imagery_assets(source, data_type, metadata_path) 21 | raise Exception(f"Source is neither Directory or Imagery Historic datatype, source= {source}") 22 | 23 | 24 | def _get_assets_from_directory(source: str, data_type: str) -> List[Asset]: 25 | 
assets_list: List[Asset] = [] 26 | if not is_s3_path(source): 27 | source = os.path.abspath(source) 28 | fs = get_fs(source) 29 | for (path, _, files) in fs.walk(source): 30 | if not files: 31 | continue 32 | for file_ in files: 33 | if not is_extension(file_, FILE_EXTENSIONS[data_type]): 34 | continue 35 | asset_path = get_path_with_protocol(source, fs, path) 36 | asset = get_asset(f"{asset_path}/{file_}") 37 | assets_list.append(asset) 38 | return assets_list 39 | 40 | 41 | def _get_historical_imagery_assets(source: str, data_type: str, metadata_path: str = "") -> List[Asset]: 42 | assets_list: List[Asset] = [] 43 | manifest_bucket = bucket_name_from_stack("TopoProcessorBatch") 44 | manifest_path = build_s3_path(manifest_bucket, "manifest.json") 45 | asset_path_list: List[str] = get_file_path_from_survey(source, manifest_path, metadata_path) 46 | for path in asset_path_list: 47 | if not is_extension(path, FILE_EXTENSIONS[data_type]): 48 | continue 49 | asset = get_asset(path) 50 | assets_list.append(asset) 51 | return assets_list 52 | -------------------------------------------------------------------------------- /topo_processor/file_system/file_searcher.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from linz_logger import get_log 4 | 5 | from topo_processor.file_system.manifest import get_file_path_from_manifest, load_manifest 6 | from topo_processor.metadata.data_type import DataType 7 | from topo_processor.metadata.lds_cache.lds_cache import get_metadata 8 | from topo_processor.util.aws_files import build_s3_path 9 | from topo_processor.util.configuration import historical_imagery_bucket 10 | 11 | 12 | def get_file_path_from_survey(survey_id: str, manifest_path: str, metadata_path: str = "") -> List[str]: 13 | list_file_path: List[str] = [] 14 | criteria = {"survey": survey_id} 15 | metadata = get_metadata(DataType.IMAGERY_HISTORIC, criteria, metadata_path, True) 16 | manifest = load_manifest(manifest_path) 17 | 18 | for metadata_row in metadata.values(): 19 | file_name_lower = str(metadata_row["raw_filename"]).lower() 20 | tmp_list = get_file_path_from_manifest(manifest, ("/" + file_name_lower + ".tif", "/" + file_name_lower + ".tiff")) 21 | if len(tmp_list) > 1: 22 | raise Exception( 23 | f"Duplicate files found for file name: {file_name_lower}. 
Duplicate path: {', '.join([duplicate for duplicate in tmp_list])}" 24 | ) 25 | elif len(tmp_list) == 1: 26 | path = build_s3_path(historical_imagery_bucket, tmp_list[0]) 27 | list_file_path.append(path) 28 | else: 29 | get_log().warn( 30 | "file_not_found", 31 | msg="No file found with this name.", 32 | file_name=file_name_lower, 33 | ) 34 | 35 | return list_file_path 36 | -------------------------------------------------------------------------------- /topo_processor/file_system/get_fs.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from fsspec.implementations.local import LocalFileSystem 4 | from s3fs import S3FileSystem 5 | 6 | from topo_processor.util.aws_credentials import Credentials, get_credentials_from_bucket 7 | from topo_processor.util.s3 import bucket_name_from_path, is_s3_path 8 | 9 | 10 | def get_fs(path: str) -> Any: 11 | if is_s3_path(path): 12 | credentials: Credentials = get_credentials_from_bucket(bucket_name_from_path(path)) 13 | return S3FileSystem(secret=credentials.secret_key, token=credentials.token, key=credentials.access_key) 14 | return LocalFileSystem(auto_mkdir="True") 15 | -------------------------------------------------------------------------------- /topo_processor/file_system/get_path_with_protocol.py: -------------------------------------------------------------------------------- 1 | from s3fs import S3FileSystem 2 | 3 | 4 | def get_path_with_protocol(source_dir: str, source_fs: S3FileSystem, path: str) -> str: 5 | source_dir = source_dir.rstrip("/") 6 | trimmed_source_dir = source_fs._strip_protocol(source_dir) 7 | output_path = f"{source_dir}{path[(len(trimmed_source_dir)):]}" 8 | return output_path 9 | -------------------------------------------------------------------------------- /topo_processor/file_system/manifest.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict, List, Tuple 3 | 4 | from topo_processor.util.aws_files import create_s3_manifest, s3_download 5 | from topo_processor.util.configuration import temp_folder 6 | from topo_processor.util.s3 import is_s3_path 7 | 8 | 9 | def load_manifest(manifest_path: str) -> Dict[str, Any]: 10 | if is_s3_path(manifest_path): 11 | create_s3_manifest(manifest_path) 12 | s3_download(manifest_path, f"{temp_folder}/manifest.json") 13 | manifest_path = f"{temp_folder}/manifest.json" 14 | 15 | with open(manifest_path) as manifest_json_file: 16 | manifest: Dict[str, Any] = json.load(manifest_json_file) 17 | 18 | return manifest 19 | 20 | 21 | def get_file_path_from_manifest(manifest: Dict[str, Any], file_names: Tuple[str, ...]) -> List[str]: 22 | list_str: List[str] = [] 23 | 24 | for manifest_file in manifest["files"]: 25 | if manifest_file["path"].lower().endswith(file_names): 26 | list_str.append(manifest_file["path"]) 27 | 28 | return list_str 29 | -------------------------------------------------------------------------------- /topo_processor/file_system/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/file_system/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/file_system/tests/assets_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from 
topo_processor.file_system.assets import _get_assets_from_directory 6 | from topo_processor.metadata.data_type import DataType 7 | 8 | 9 | def test_get_assets_from_directory() -> None: 10 | source = os.path.abspath(os.path.join(os.getcwd(), "test_data", "tiffs")) 11 | assets_list = _get_assets_from_directory(source, DataType.IMAGERY_HISTORIC) 12 | 13 | assert len(assets_list) == 5 14 | -------------------------------------------------------------------------------- /topo_processor/file_system/tests/file_searcher_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | 4 | import pytest 5 | 6 | from topo_processor.file_system.file_searcher import get_file_path_from_survey 7 | from topo_processor.file_system.manifest import get_file_path_from_manifest, load_manifest 8 | 9 | 10 | def test_get_file_path_from_manifest() -> None: 11 | assert_list: List[str] = [] 12 | assert_list.append("/tiffs/SURVEY_1/CONTROL.tif") 13 | 14 | result_list: List[str] = [] 15 | manifest = load_manifest(os.path.join(os.getcwd(), "test_data", "manifest.json")) 16 | result_list = get_file_path_from_manifest(manifest, ("control.tif", "control.tiff")) 17 | 18 | assert assert_list == result_list 19 | 20 | 21 | def test_get_file_path_from_survey() -> None: 22 | assert_list: List[str] = [] 23 | assert_list.append("s3://linz-historical-imagery-staging/tiffs/SURVEY_1/WRONG_PHOTO_TYPE.tif") 24 | assert_list.append("s3://linz-historical-imagery-staging/tiffs/SURVEY_1/MULTIPLE_ASSET.tif") 25 | assert_list.append("s3://linz-historical-imagery-staging/tiffs/SURVEY_1/CONTROL.tif") 26 | 27 | result_list: List[str] = get_file_path_from_survey( 28 | "SURVEY_1", os.path.join(os.getcwd(), "test_data", "manifest.json"), "test_data/historical_aerial_photos_metadata.csv" 29 | ) 30 | 31 | assert result_list == assert_list 32 | 33 | 34 | def test_get_file_path_from_survey_duplicate() -> None: 35 | assert_list: List[str] = [] 36 | assert_list.append("s3://linz-historical-imagery-staging/tiffs/SURVEY_1/WRONG_PHOTO_TYPE.tif") 37 | assert_list.append("s3://linz-historical-imagery-staging/tiffs/SURVEY_1/MULTIPLE_ASSET.tif") 38 | assert_list.append("s3://linz-historical-imagery-staging/tiffs/SURVEY_1/CONTROL.tif") 39 | 40 | with pytest.raises(Exception) as e: 41 | get_file_path_from_survey( 42 | "SURVEY_1", 43 | os.path.join(os.getcwd(), "test_data", "manifest_duplicate.json"), 44 | "test_data/historical_aerial_photos_metadata.csv", 45 | ) 46 | assert "Duplicate files found" in str(e.value) 47 | -------------------------------------------------------------------------------- /topo_processor/file_system/tests/get_fs_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from fsspec.implementations.local import LocalFileSystem 3 | from s3fs.core import S3FileSystem 4 | 5 | from topo_processor.file_system.get_fs import get_fs 6 | 7 | 8 | @pytest.mark.skip(reason="Skip this test for now before refactoring get_credentials") 9 | def test_get_fs_s3() -> None: 10 | path = "s3://testbucket" 11 | assert isinstance(get_fs(path), S3FileSystem) 12 | 13 | 14 | def test_get_fs_local() -> None: 15 | path = "./test" 16 | assert isinstance(get_fs(path), LocalFileSystem) 17 | path = "/home/test/location" 18 | assert isinstance(get_fs(path), LocalFileSystem) 19 | -------------------------------------------------------------------------------- /topo_processor/file_system/tests/get_path_with_protocol_test.py: 
-------------------------------------------------------------------------------- 1 | from fsspec.implementations.local import LocalFileSystem 2 | from s3fs import S3FileSystem 3 | 4 | from topo_processor.file_system.get_path_with_protocol import get_path_with_protocol 5 | 6 | 7 | def test_get_path_with_protocol_aws() -> None: 8 | source_dir_with_forwardslash = "s3://bucketname/folder/" 9 | path = "bucketname/folder/subfolder/subfolder2" 10 | fs = S3FileSystem() 11 | full_path = get_path_with_protocol(source_dir=source_dir_with_forwardslash, source_fs=fs, path=path) 12 | assert full_path == "s3://bucketname/folder/subfolder/subfolder2" 13 | source_dir_without_forwardslash = "s3://bucketname/folder" 14 | full_path = get_path_with_protocol(source_dir=source_dir_without_forwardslash, source_fs=fs, path=path) 15 | assert full_path == "s3://bucketname/folder/subfolder/subfolder2" 16 | 17 | 18 | def test_get_path_with_protocol_local() -> None: 19 | source_dir_with_forwardslash = "/home/username/dev/topo-processor/test_data/tiffs/" 20 | path = "/home/username/dev/topo-processor/test_data/tiffs/SURVEY_1" 21 | fs = LocalFileSystem(auto_mkdir="True") 22 | full_path = get_path_with_protocol(source_dir=source_dir_with_forwardslash, source_fs=fs, path=path) 23 | assert full_path == "/home/username/dev/topo-processor/test_data/tiffs/SURVEY_1" 24 | source_dir_without_forwardslash = "/home/username/dev/topo-processor/test_data/tiffs" 25 | full_path = get_path_with_protocol(source_dir=source_dir_without_forwardslash, source_fs=fs, path=path) 26 | assert full_path == "/home/username/dev/topo-processor/test_data/tiffs/SURVEY_1" 27 | -------------------------------------------------------------------------------- /topo_processor/file_system/tests/transfer_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from topo_processor.file_system.get_fs import get_fs 6 | from topo_processor.file_system.transfer import transfer_file 7 | 8 | 9 | def test_transfer_local(setup: str) -> None: 10 | dest_path = f"{setup}/test.tiff" 11 | input_path = os.path.join(os.getcwd(), "test_data/tiffs/SURVEY_1/CONTROL.tiff") 12 | transfer_file(input_path, "fakechecksum", "image/tiff", dest_path) 13 | assert get_fs(input_path).info(input_path)["size"] == get_fs(dest_path).info(dest_path)["size"] 14 | -------------------------------------------------------------------------------- /topo_processor/file_system/tests/write_json_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from topo_processor.file_system.write_json import write_json 6 | 7 | 8 | def test_write_json(setup: str) -> None: 9 | my_dict = {"foo": "foo", "bar": 1} 10 | target = setup + "/test.json" 11 | write_json(my_dict, target) 12 | assert os.path.isfile(target) 13 | -------------------------------------------------------------------------------- /topo_processor/file_system/transfer.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from linz_logger import get_log 4 | 5 | from topo_processor.util.time import time_in_ms 6 | 7 | from .get_fs import get_fs 8 | 9 | 10 | def transfer_file(source_file: str, checksum: str, content_type: Union[str, None], target_file: str) -> None: 11 | start_time = time_in_ms() 12 | with get_fs(source_file).open(source_file, "rb") as f1: 13 | data = f1.read() 14 | with get_fs(target_file).open(target_file, "wb", 
ContentType=content_type, Metadata={"hash": checksum}) as f2: 15 | f2.write(data) 16 | get_log().debug( 17 | "File transferred", 18 | source_file=source_file, 19 | target_file=target_file, 20 | checksum=checksum, 21 | duration=time_in_ms() - start_time, 22 | ) 23 | -------------------------------------------------------------------------------- /topo_processor/file_system/write_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | import pystac 5 | from linz_logger import get_log 6 | 7 | from topo_processor.util.time import time_in_ms 8 | 9 | from .get_fs import get_fs 10 | 11 | 12 | def write_json(dictionary: Dict[str, Any], target_json: str) -> None: 13 | start_time = time_in_ms() 14 | with get_fs(target_json).open(target_json, "w", encoding="utf8", ContentType=pystac.MediaType.JSON) as f1: 15 | f1.write(json.dumps(dictionary, indent=4, ensure_ascii=False)) 16 | get_log().debug("JSON Written", target_json=target_json, duration=time_in_ms() - start_time) 17 | -------------------------------------------------------------------------------- /topo_processor/geostore/invoke.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | import boto3 5 | from linz_logger import get_log 6 | 7 | from topo_processor.util.aws_credentials import Credentials 8 | 9 | ROLE_ARN = "arn:aws:iam::715898075157:role/api-users" 10 | logger = get_log() 11 | 12 | 13 | def invoke_lambda(name: str, http_method: str, parameters: Dict[str, str]) -> Dict[str, Any]: 14 | 15 | client_sts = boto3.client("sts") 16 | 17 | assumed_role = client_sts.assume_role(RoleArn=ROLE_ARN, RoleSessionName="invoke-geostore") 18 | credentials = Credentials( 19 | assumed_role["Credentials"]["AccessKeyId"], 20 | assumed_role["Credentials"]["SecretAccessKey"], 21 | assumed_role["Credentials"]["SessionToken"], 22 | ) 23 | client_lambda = boto3.client( 24 | "lambda", 25 | aws_access_key_id=credentials.access_key, 26 | aws_secret_access_key=credentials.secret_key, 27 | aws_session_token=credentials.token, 28 | ) 29 | payload = build_lambda_payload(http_method, parameters) 30 | logger.debug("invoke_lambda_function", name=name, payload=payload) 31 | 32 | raw_response = client_lambda.invoke( 33 | FunctionName=name, 34 | InvocationType="RequestResponse", 35 | LogType="Tail", 36 | Payload=json.dumps(payload).encode(), 37 | ) 38 | payload_response: Dict[str, Any] = json.loads(raw_response["Payload"].read()) 39 | 40 | if not is_response_ok(payload_response): 41 | raise Exception("invoke_lambda_function_error", payload_response) 42 | 43 | logger.debug("response_lambda_function", name=name, response=payload_response) 44 | return payload_response 45 | 46 | 47 | def build_lambda_payload(http_method: str, parameters: Dict[str, str]) -> Dict[str, Any]: 48 | payload: Dict[str, Any] = {} 49 | payload["http_method"] = http_method 50 | payload["body"] = {} 51 | if parameters: 52 | payload["body"] = parameters 53 | 54 | return payload 55 | 56 | 57 | def invoke_import_status(execution_arn: str) -> Dict[str, Any]: 58 | """Return the current status of the dataset version import process in the Geostore identified by 'execution_arn'""" 59 | import_status_parameters = {"execution_arn": execution_arn} 60 | import_status_response_payload = invoke_lambda("import-status", "GET", import_status_parameters) 61 | 62 | import_status: Dict[str, Any] = import_status_response_payload["body"] 63 | return 
import_status 64 | 65 | 66 | def is_response_ok(response: Dict[str, Any]) -> bool: 67 | try: 68 | if 200 <= response["status_code"] <= 299: 69 | return True 70 | return False 71 | except Exception as e: 72 | raise Exception("There is an issue with the response") from e 73 | -------------------------------------------------------------------------------- /topo_processor/geostore/tests/invoke_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.geostore.invoke import build_lambda_payload, is_response_ok 4 | 5 | 6 | def test_build_lambda_payload() -> None: 7 | payload = { 8 | "http_method": "GET", 9 | "body": {"id": "123", "metadata_url": "s3://my-bucket/my-survey/metadata.csv", "s3_role_arn": "arn:my-arn:1234567"}, 10 | } 11 | payload_param = { 12 | "id": "123", 13 | "metadata_url": "s3://my-bucket/my-survey/metadata.csv", 14 | "s3_role_arn": "arn:my-arn:1234567", 15 | } 16 | 17 | payload_built = build_lambda_payload("GET", payload_param) 18 | assert payload == payload_built 19 | 20 | 21 | def test_is_response_ok() -> None: 22 | response_ko = {"status_code": 404, "body": {"message": "Not Found: dataset '1234' does not exist"}} 23 | assert is_response_ok(response_ko) is False 24 | response_ok = { 25 | "status_code": 200, 26 | "body": { 27 | "created_at": "2022-03-23T02:41:53.940795+0000", 28 | "pk": "DATASET#01FYTADW8MSCNR8D68EX7APMD3", 29 | "title": "test_title", 30 | "updated_at": "2022-03-23T02:41:53.940911+0000", 31 | "id": "01FYTADW8MSCNR8D68EX7APMD3", 32 | }, 33 | } 34 | assert is_response_ok(response_ok) 35 | -------------------------------------------------------------------------------- /topo_processor/metadata/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/metadata/__init__.py -------------------------------------------------------------------------------- /topo_processor/metadata/csv_loader/csv_loader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from curses import meta 4 | from typing import Any, Dict, List 5 | 6 | from linz_logger import get_log 7 | 8 | 9 | def read_csv(metadata_file_path: str, key: str, alternative_key: str = "", columns: List[str] = []) -> Dict[str, Any]: 10 | metadata: Dict[str, Any] = {} 11 | 12 | csv_path = os.path.join(os.getcwd(), metadata_file_path) 13 | if not os.path.isfile(csv_path): 14 | raise Exception(f'Cannot find "{csv_path}"') 15 | 16 | with open(csv_path, "r") as csv_text: 17 | reader = csv.DictReader(csv_text, delimiter=",") 18 | for row in reader: 19 | filtered_row: Dict[str, str] = {} 20 | if columns: 21 | for col in columns: 22 | filtered_row[col] = row[col] 23 | else: 24 | filtered_row = row 25 | 26 | if row[key]: 27 | key_value = row[key] 28 | if key_value in metadata: 29 | if filtered_row == metadata[key_value]: 30 | raise Exception(f'Duplicate "{key_value}" found in "{metadata_file_path}"') 31 | elif alternative_key and row[alternative_key]: 32 | metadata[row[alternative_key]] = filtered_row 33 | metadata[key_value] = filtered_row 34 | elif alternative_key and row[alternative_key]: 35 | metadata[row[alternative_key]] = filtered_row 36 | else: 37 | get_log().debug("read_csv_key_not_found", key=key, alternative_key=alternative_key) 38 | 39 | return metadata 40 | 
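A minimal sketch of `read_csv` above, mirroring how the metadata loaders call it: rows keyed by `raw_filename` with `sufi` as the fallback key, and optionally restricted to selected columns. Paths assume the bundled `test_data` fixtures.

```python
# Hedged sketch using the bundled test_data fixtures: index the photo CSV by
# "raw_filename" (falling back to "sufi"), and the footprint CSV by "SURVEY"
# keeping only the "NAME" column.
import os

from topo_processor.metadata.csv_loader.csv_loader import read_csv

photos = read_csv(
    os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata.csv"),
    "raw_filename",
    "sufi",
)
footprints = read_csv(
    os.path.join(os.getcwd(), "test_data", "historical_survey_footprint_metadata.csv"),
    "SURVEY",
    columns=["NAME"],
)
print(list(photos.keys()), footprints.get("SURVEY_1"))
```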
-------------------------------------------------------------------------------- /topo_processor/metadata/csv_loader/tests/csv_loader_test.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import tempfile 4 | 5 | import pytest 6 | 7 | from topo_processor.metadata.csv_loader.csv_loader import read_csv 8 | 9 | 10 | def test_read_csv() -> None: 11 | metadata_path = os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata.csv") 12 | metadata = read_csv(metadata_path, "raw_filename", "sufi") 13 | 14 | assert len(metadata) == 5 15 | assert list(metadata.keys()) == ["WRONG_PHOTO_TYPE", "MULTIPLE_ASSET", "CONTROL", "WRONG_SURVEY", "CONTROL_2"] 16 | 17 | 18 | def test_error_on_wrong_file_name() -> None: 19 | metadata_path = "./data/historical_aerial_photos_metadata.csv" 20 | 21 | with pytest.raises(Exception, match=r"^Cannot find "): 22 | read_csv(metadata_path, "raw_filename", "sufi") 23 | 24 | 25 | def test_error_on_duplicate_file() -> None: 26 | temp_file = tempfile.NamedTemporaryFile() 27 | header = [ 28 | "WKT", 29 | "sufi", 30 | "survey", 31 | "run", 32 | "photo_no", 33 | "alternate_survey_name", 34 | "camera", 35 | "camera_sequence_no", 36 | "nominal_focal_length", 37 | "altitude", 38 | "scale", 39 | "photocentre_lat", 40 | "photocentre_lon", 41 | "date", 42 | "film", 43 | "film_sequence_no", 44 | "photo_type", 45 | "format", 46 | "source", 47 | "physical_film_condition", 48 | "image_anomalies", 49 | "scanned", 50 | "raw_filename", 51 | "released_filename", 52 | "when_scanned", 53 | "photo_version", 54 | ] 55 | row = [ 56 | "", 57 | "", 58 | "", 59 | "", 60 | "", 61 | "", 62 | "", 63 | "", 64 | "", 65 | "", 66 | "", 67 | "", 68 | "", 69 | "", 70 | "", 71 | "", 72 | "", 73 | "", 74 | "", 75 | "", 76 | "", 77 | "", 78 | "WRONG_PHOTO_TYPE", 79 | "", 80 | "", 81 | "", 82 | ] 83 | with open(temp_file.name, "a", encoding="utf-8") as csv_file: 84 | writer = csv.writer(csv_file) 85 | writer.writerow(header) 86 | writer.writerow(row) 87 | writer.writerow(row) 88 | 89 | with pytest.raises(Exception, match=r'Duplicate "WRONG_PHOTO_TYPE" found in "' + temp_file.name + '"'): 90 | read_csv(temp_file.name, "raw_filename", "sufi") 91 | 92 | 93 | def test_read_csv_column_filter() -> None: 94 | metadata_path = os.path.join(os.getcwd(), "test_data", "historical_survey_footprint_metadata.csv") 95 | metadata = read_csv(metadata_path, "SURVEY", columns=["NAME"]) 96 | 97 | assert len(metadata) == 4 98 | assert list(metadata.keys()) == ["SURVEY_1", "SURVEY_3", "SURVEY_2", "SURVEY_NO_NAME"] 99 | assert list(metadata.values()) == [{"NAME": "TE KUITI 1"}, {"NAME": "AUCKLAND 1"}, {"NAME": "WELLINGTON 2"}, {"NAME": ""}] 100 | -------------------------------------------------------------------------------- /topo_processor/metadata/data_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Dict 3 | 4 | 5 | class DataType(str, Enum): 6 | IMAGERY_HISTORIC = "imagery.historic" 7 | IMAGERY_AERIAL = "imagery.aerial" 8 | LIDAR_DSM = "lidar.dsm" 9 | LIDAR_DEM = "lidar.dem" 10 | LIDAR_POINT_CLOUD = "lidar.pointcloud" 11 | SURVEY_FOOTPRINT_HISTORIC = "survey.footprint.historic" 12 | 13 | 14 | data_type_layer: Dict[str, str] = {DataType.IMAGERY_HISTORIC: "51002", DataType.SURVEY_FOOTPRINT_HISTORIC: "51000"} 15 | 16 | 17 | def get_layer_id(data_type: str) -> str: 18 | return data_type_layer[data_type] 19 | 
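A short sketch of the `DataType` to LDS layer id mapping defined above, as used by the LDS cache to locate the correct layer:

```python
# Hedged sketch: only the historic imagery and survey footprint layers are
# mapped; other DataType values would raise a KeyError in get_layer_id().
from topo_processor.metadata.data_type import DataType, get_layer_id

assert get_layer_id(DataType.IMAGERY_HISTORIC) == "51002"
assert get_layer_id(DataType.SURVEY_FOOTPRINT_HISTORIC) == "51000"
```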
-------------------------------------------------------------------------------- /topo_processor/metadata/lds_cache/lds_cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | import pystac 6 | from linz_logger import get_log 7 | 8 | from topo_processor.metadata.csv_loader.csv_loader import read_csv 9 | from topo_processor.metadata.data_type import DataType, get_layer_id 10 | from topo_processor.util.aws_files import build_s3_path, load_file_content, s3_download 11 | from topo_processor.util.configuration import lds_cache_bucket, temp_folder 12 | from topo_processor.util.file_converter import geopackage_to_csv 13 | from topo_processor.util.file_extension import is_csv, is_geopackage 14 | from topo_processor.util.gzip import decompress_file 15 | 16 | metadata_store: Dict[str, Dict[str, Any]] = {} 17 | """Stores the metadata by layer id""" 18 | 19 | 20 | def get_latest_item_path(collection: pystac.Collection) -> pystac.Link: 21 | for link in reversed(collection.get_links()): 22 | if not link.rel == "item": 23 | continue 24 | return link 25 | 26 | raise Exception(f"No version found for Collection {collection.title}") 27 | 28 | 29 | def get_latest_item(layer: str) -> pystac.Item: 30 | collection = pystac.Collection.from_dict(load_file_content(lds_cache_bucket, layer + "/collection.json")) 31 | latest_item = get_latest_item_path(collection) 32 | latest_item_path = f"{layer}/{latest_item.href.lstrip('./')}" 33 | 34 | return pystac.Item.from_dict(load_file_content(lds_cache_bucket, latest_item_path)) 35 | 36 | 37 | def get_metadata( 38 | data_type: str, criteria: Optional[Dict[str, str]] = None, metadata_path: str = "", save_filtered: bool = False 39 | ) -> Dict[str, Any]: 40 | """Return a dictionary containing the metadata""" 41 | layer_id = get_layer_id(data_type) 42 | 43 | if not metadata_path: 44 | if not metadata_store.get(layer_id): 45 | latest_item = get_latest_item(layer_id) 46 | exported_asset = latest_item.assets.get("export", None) 47 | 48 | if exported_asset is None: 49 | raise Exception(f"No exported asset found for lds layer: {layer_id}") 50 | 51 | asset_path = exported_asset.href.lstrip("./") 52 | metadata_path = f"{temp_folder}/{asset_path}" 53 | s3_download(build_s3_path(lds_cache_bucket, f"{layer_id}/{asset_path}"), metadata_path) 54 | 55 | if os.path.isfile(metadata_path): 56 | if exported_asset.extra_fields.get("encoding", None) == "gzip": 57 | decompress_file(metadata_path) 58 | else: 59 | raise Exception(f"{metadata_path} not found") 60 | 61 | if os.path.isfile(metadata_path): 62 | if is_geopackage(metadata_path): 63 | new_metadata_path = os.path.splitext(metadata_path)[0] + "_" + time.strftime("%s") + ".csv" 64 | geopackage_to_csv(metadata_path, new_metadata_path).run() 65 | metadata_path = new_metadata_path 66 | elif not is_csv(metadata_path): 67 | raise Exception(f"Unsupported file format. 
{metadata_path} must be .csv or .gpkg") 68 | 69 | if data_type == DataType.IMAGERY_HISTORIC: 70 | metadata_store[layer_id] = read_csv(metadata_path, "raw_filename", "sufi") 71 | elif data_type == DataType.SURVEY_FOOTPRINT_HISTORIC: 72 | metadata_store[layer_id] = read_csv(metadata_path, "SURVEY", columns=["NAME"]) 73 | 74 | if criteria: 75 | filtered_metadata = filter_metadata(metadata_store[layer_id], criteria) 76 | if save_filtered: 77 | metadata_store[layer_id] = filtered_metadata 78 | else: 79 | return filtered_metadata 80 | 81 | return metadata_store[layer_id] 82 | 83 | 84 | def filter_metadata(metadata_to_filter: Dict[str, Any], criteria: Dict[str, Any]) -> Dict[str, Any]: 85 | get_log().debug("filter_metadata", criteria=criteria) 86 | filtered_dict: Dict[str, Any] = {} 87 | is_found = False 88 | 89 | for metadata_key, metadata_value in metadata_to_filter.items(): 90 | for criteria_key, criteria_value in criteria.items(): 91 | if metadata_value[criteria_key]: 92 | if metadata_value[criteria_key] == criteria_value: 93 | is_found = True 94 | else: 95 | is_found = False 96 | break 97 | if is_found: 98 | filtered_dict[metadata_key] = metadata_value 99 | return filtered_dict 100 | -------------------------------------------------------------------------------- /topo_processor/metadata/lds_cache/tests/lds_cache_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from tempfile import mkdtemp 4 | 5 | import pytest 6 | 7 | from topo_processor.metadata.data_type import DataType 8 | from topo_processor.metadata.lds_cache.lds_cache import filter_metadata, get_metadata 9 | 10 | 11 | def test_filter_metadata() -> None: 12 | metadata = { 13 | "file_a": {"survey": "survey_1", "camera": "camera_a", "raw_filename": "file_a"}, 14 | "file_b": {"survey": "survey_3", "camera": "camera_b", "raw_filename": "file_b"}, 15 | "file_c": {"survey": "survey_1", "camera": "camera_b", "raw_filename": "file_c"}, 16 | } 17 | 18 | criteria = {"survey": "survey_1"} 19 | 20 | metadata_filtered = { 21 | "file_a": {"survey": "survey_1", "camera": "camera_a", "raw_filename": "file_a"}, 22 | "file_c": {"survey": "survey_1", "camera": "camera_b", "raw_filename": "file_c"}, 23 | } 24 | 25 | result = filter_metadata(metadata, criteria) 26 | 27 | assert metadata_filtered == result 28 | 29 | 30 | def test_get_metadata_csv() -> None: 31 | metadata = { 32 | "WRONG_SURVEY": { 33 | "WKT": "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))", 34 | "sufi": "72352", 35 | "survey": "SURVEY_3", 36 | "run": "E", 37 | "photo_no": "48", 38 | "alternate_survey_name": "", 39 | "camera": "EAGLE IV", 40 | "camera_sequence_no": "89556", 41 | "nominal_focal_length": "508", 42 | "altitude": "11000", 43 | "scale": "6600", 44 | "photocentre_lat": "-45.8079", 45 | "photocentre_lon": "170.5496", 46 | "date": "1952-04-23T00:00:00.000", 47 | "film": "731", 48 | "film_sequence_no": "114", 49 | "photo_type": "B&W", 50 | "format": "18cm x 23cm", 51 | "source": "ORIGINAL ", 52 | "physical_film_condition": "", 53 | "image_anomalies": "", 54 | "scanned": "Y", 55 | "raw_filename": "WRONG_SURVEY", 56 | "released_filename": "CROWN_731_114", 57 | "when_scanned": "2020/Q4", 58 | "photo_version": "1", 59 | } 60 | } 61 | metadata_path = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata.csv")) 62 | criteria = {"survey": 
"SURVEY_3"} 63 | result = get_metadata(DataType.IMAGERY_HISTORIC, criteria, metadata_path) 64 | 65 | assert metadata == result 66 | 67 | 68 | def test_get_metadata_gpkg() -> None: 69 | metadata = { 70 | "WRONG_SURVEY": { 71 | "WKT": "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))", 72 | "sufi": "72352", 73 | "survey": "SURVEY_3", 74 | "run": "E", 75 | "photo_no": "48", 76 | "alternate_survey_name": "", 77 | "camera": "EAGLE IV", 78 | "camera_sequence_no": "89556", 79 | "nominal_focal_length": "508", 80 | "altitude": "11000", 81 | "scale": "6600", 82 | "photocentre_lat": "-45.8079", 83 | "photocentre_lon": "170.5496", 84 | "date": "1952-04-23T00:00:00.000", 85 | "film": "731", 86 | "film_sequence_no": "114", 87 | "photo_type": "B&W", 88 | "format": "18cm x 23cm", 89 | "source": "ORIGINAL ", 90 | "physical_film_condition": "", 91 | "image_anomalies": "", 92 | "scanned": "Y", 93 | "raw_filename": "WRONG_SURVEY", 94 | "released_filename": "CROWN_731_114", 95 | "when_scanned": "2020/Q4", 96 | "photo_version": "1", 97 | } 98 | } 99 | temp_folder: str = mkdtemp() 100 | source_metadata_path = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata.gpkg")) 101 | dest_metadata_path = os.path.abspath(os.path.join(temp_folder, "historical_aerial_photos_metadata.gpkg")) 102 | shutil.copyfile(source_metadata_path, dest_metadata_path) 103 | criteria = {"survey": "SURVEY_3"} 104 | result = get_metadata(DataType.IMAGERY_HISTORIC, criteria, dest_metadata_path) 105 | shutil.rmtree(temp_folder) 106 | 107 | assert metadata == result 108 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/__init__.py: -------------------------------------------------------------------------------- 1 | from .metadata_loader_imagery_historic import MetadataLoaderImageryHistoric 2 | from .metadata_loader_repo import MetadataLoaderRepository 3 | from .metadata_loader_tiff import MetadataLoaderTiff 4 | 5 | metadata_loader_rep = MetadataLoaderRepository() 6 | metadata_loader_rep.append(MetadataLoaderImageryHistoric()) 7 | metadata_loader_rep.append(MetadataLoaderTiff()) 8 | 9 | metadata_loader_imagery_hist = MetadataLoaderImageryHistoric() 10 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/metadata_loader.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import TYPE_CHECKING, Optional 5 | 6 | if TYPE_CHECKING: 7 | from topo_processor.stac.asset import Asset 8 | 9 | 10 | class MetadataLoader(ABC): 11 | @property 12 | @abstractmethod 13 | def name(self) -> str: 14 | pass 15 | 16 | @abstractmethod 17 | def is_applicable(self, asset: Optional[Asset] = None) -> bool: 18 | pass 19 | 20 | @abstractmethod 21 | def load_metadata(self, asset: Optional[Asset] = None) -> None: 22 | pass 23 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/metadata_loader_imagery_historic.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from typing import TYPE_CHECKING, Any, Dict, Optional 5 | 6 | import shapely.wkt 7 | from rasterio.enums 
import ColorInterp 8 | 9 | from topo_processor.metadata.data_type import DataType 10 | from topo_processor.metadata.lds_cache.lds_cache import get_metadata 11 | from topo_processor.stac.asset_key import AssetKey 12 | from topo_processor.stac.collection import Collection 13 | from topo_processor.stac.linz_provider import LinzProviders 14 | from topo_processor.stac.providers import Providers 15 | from topo_processor.stac.stac_extensions import StacExtensions 16 | from topo_processor.stac.store import get_collection, get_item 17 | from topo_processor.util.conversions import ( 18 | historical_imagery_photo_type_to_linz_geospatial_type, 19 | nzt_datetime_to_utc_datetime, 20 | quarterdate_to_date_string, 21 | remove_empty_strings, 22 | string_to_boolean, 23 | string_to_number, 24 | ) 25 | from topo_processor.util.file_extension import is_tiff 26 | 27 | from .metadata_loader import MetadataLoader 28 | 29 | if TYPE_CHECKING: 30 | from topo_processor.stac.asset import Asset 31 | from topo_processor.stac.item import Item 32 | 33 | 34 | class MetadataLoaderImageryHistoric(MetadataLoader): 35 | name = "metadata.loader.imagery.historic" 36 | is_init = False 37 | raw_metadata: Dict[str, Dict[str, str]] = {} 38 | 39 | def is_applicable(self, asset: Optional[Asset] = None) -> bool: 40 | if asset: 41 | return is_tiff(asset.source_path) 42 | else: 43 | return False 44 | 45 | def load_metadata(self, asset: Optional[Asset] = None, metadata_file: str = "", is_load_all: bool = False) -> None: 46 | criteria = {} 47 | filename = "" 48 | 49 | if asset: 50 | filename = os.path.splitext(os.path.basename(asset.source_path))[0] 51 | criteria = {"raw_filename": filename} 52 | 53 | self.raw_metadata = get_metadata(DataType.IMAGERY_HISTORIC.value, criteria, metadata_file) 54 | 55 | if is_load_all: 56 | for metadata in self.raw_metadata.values(): 57 | self.populate_item(metadata) 58 | elif asset: 59 | 60 | if filename not in self.raw_metadata: 61 | asset.add_error("Asset not found in CSV file", self.name) 62 | return 63 | asset_metadata = self.raw_metadata[filename] 64 | 65 | asset.target = f"{asset_metadata['survey']}/{asset_metadata['sufi']}{asset.file_ext()}" 66 | asset.key_name = AssetKey.Visual 67 | self.populate_item(asset_metadata, asset) 68 | 69 | def populate_item(self, metadata_row: Dict[str, str], asset: Optional[Asset] = None) -> None: 70 | survey = metadata_row["survey"] 71 | if not survey: 72 | survey = metadata_row["alternate_survey_name"] 73 | title = self.get_title(survey) 74 | 75 | collection = get_collection(title) 76 | collection.survey = survey 77 | 78 | item = get_item(metadata_row["sufi"]) 79 | collection.add_item(item) 80 | 81 | if asset: 82 | item.add_asset(asset) 83 | 84 | item.collection = collection 85 | self.populate_collection(collection) 86 | 87 | item.properties.update( 88 | { 89 | "mission": collection.survey, 90 | "platform": "Fixed-wing Aircraft", 91 | "instruments": [metadata_row["camera"]], 92 | } 93 | ) 94 | 95 | self.add_linz_geospatial_type(item, metadata_row["photo_type"]) 96 | self.add_aerial_photo_metadata(item, metadata_row) 97 | self.add_camera_metadata(item, metadata_row) 98 | self.add_film_metadata(item, metadata_row) 99 | self.add_centroid(item, metadata_row) 100 | self.add_projection_extent(item) 101 | self.add_scanning_metadata(item, metadata_row) 102 | self.add_datetime_property(item, metadata_row) 103 | self.add_spatial_extent(item, metadata_row) 104 | self.add_bands_extent(item, asset) 105 | 106 | item.add_extension(StacExtensions.historical_imagery.value) 107 | 
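        # Illustrative note (not from the source): with the sample row from
        # test_data/historical_aerial_photos_metadata.csv shown in the tests above, the
        # calls up to this point leave item.properties looking roughly like:
        #   {
        #       "version": "1",
        #       "mission": "SURVEY_3",
        #       "platform": "Fixed-wing Aircraft",
        #       "instruments": ["EAGLE IV"],
        #       "aerial-photo:run": "E",
        #       "aerial-photo:sequence_number": 48,
        #       "camera:nominal_focal_length": 508,
        #       "film:id": "731",
        #       "proj:epsg": None,
        #       "proj:centroid": {"lat": -45.8079, "lon": 170.5496},
        #       ...
        #   }
        # before the remaining LINZ and version STAC extensions are attached below.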
item.add_extension(StacExtensions.linz.value) 108 | item.add_extension(StacExtensions.version.value) 109 | 110 | def populate_collection(self, collection: Collection) -> None: 111 | collection.license = "CC-BY-4.0" 112 | collection.extra_fields.update( 113 | { 114 | "linz:history": "LINZ and its predecessors, Lands & Survey and Department of Survey and Land Information (DOSLI), commissioned aerial photography for the Crown between 1936 and 2008.\nOne of the predominant uses of the aerial photography at the time was the photogrammetric mapping of New Zealand, initially at 1inch to 1mile followed by the NZMS 260 and Topo50 map series at 1:50,000.\nThese photographs were scanned through the Crown Aerial Film Archive scanning project.", 115 | "linz:lifecycle": "completed", 116 | "quality:description": "Geographic coordinates provided with this aerial photographic survey were estimated from the associated survey chart and have low positional accuracy. They should be used for general referencing only.", 117 | } 118 | ) 119 | 120 | collection.add_extension(StacExtensions.quality.value) 121 | collection.add_linz_provider(LinzProviders.LTTW.value) 122 | collection.add_linz_provider(LinzProviders.LMPP.value) 123 | collection.add_provider(Providers.NZAM.value) 124 | 125 | def get_title(self, survey: str) -> str: 126 | survey_names = get_metadata(DataType.SURVEY_FOOTPRINT_HISTORIC) 127 | title: str = "" 128 | 129 | if len(survey_names) == 0: 130 | raise Exception(f"Empty footprint metadata file when processing survey {survey}") 131 | 132 | try: 133 | title = survey_names[survey]["NAME"] 134 | except Exception as e: 135 | raise Exception(f"No name found for survey {survey} in footprint metadata file") from e 136 | 137 | return title 138 | 139 | def add_spatial_extent(self, item: Item, asset_metadata: Dict[str, str]) -> None: 140 | wkt = asset_metadata.get("WKT", None) 141 | if wkt is None or wkt.lower() == "polygon empty": 142 | item.add_warning("Geometry is missing", "") 143 | return 144 | 145 | try: 146 | # EPSG:4167 -> EPSG:4326 is mostly a null conversion, in the future if we support additional projections we should reproject this 147 | poly = shapely.wkt.loads(wkt) 148 | # Reduce the precision of all the coordinates to approx 1M resolution 149 | poly = shapely.wkt.loads(shapely.wkt.dumps(poly, rounding_precision=5)) 150 | item.geometry_poly = poly 151 | except shapely.errors.WKTReadingError as e: 152 | item.add_error("Geometry is invalid", "", e) 153 | 154 | def add_camera_metadata(self, item: Item, asset_metadata: Dict[str, str]) -> None: 155 | camera_properties: Dict[str, Any] = {} 156 | 157 | camera_properties["camera:sequence_number"] = string_to_number(asset_metadata["camera_sequence_no"]) 158 | camera_properties["camera:nominal_focal_length"] = string_to_number(asset_metadata["nominal_focal_length"]) 159 | 160 | item.properties.update(remove_empty_strings(camera_properties)) 161 | item.add_extension(StacExtensions.camera.value) 162 | 163 | def add_film_metadata(self, item: Item, asset_metadata: Dict[str, str]) -> None: 164 | film_properties: Dict[str, Any] = {} 165 | 166 | film_properties["film:id"] = asset_metadata["film"] 167 | film_properties["film:negative_sequence"] = string_to_number(asset_metadata["film_sequence_no"]) 168 | film_properties["film:physical_condition"] = asset_metadata["physical_film_condition"] 169 | film_properties["film:physical_size"] = asset_metadata["format"] 170 | 171 | item.properties.update(remove_empty_strings(film_properties)) 172 | 
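        # For example (assumed behaviour of the util/conversions.py helper):
        #   remove_empty_strings({"film:id": "731", "film:physical_condition": ""})
        #   -> {"film:id": "731"}
        # so blank CSV columns such as physical_film_condition never end up as empty
        # string properties on the STAC item.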
item.add_extension(StacExtensions.film.value) 173 | 174 | def add_aerial_photo_metadata(self, item: Item, asset_metadata: Dict[str, str]) -> None: 175 | aerial_photo_properties: Dict[str, Any] = {} 176 | aerial_photo_properties["aerial-photo:run"] = asset_metadata["run"] 177 | aerial_photo_properties["aerial-photo:sequence_number"] = string_to_number(asset_metadata["photo_no"]) 178 | aerial_photo_properties["aerial-photo:anomalies"] = asset_metadata["image_anomalies"] 179 | altitude = string_to_number(asset_metadata["altitude"]) 180 | if isinstance(altitude, int) and altitude <= 0: 181 | item.add_warning( 182 | msg="Skipped Record", 183 | cause=self.name, 184 | e=Exception(f"stac field 'aerial-photo:altitude' has value: {altitude}"), 185 | ) 186 | else: 187 | aerial_photo_properties["aerial-photo:altitude"] = altitude 188 | scale = string_to_number(asset_metadata["scale"]) 189 | if isinstance(scale, int) and scale <= 0: 190 | item.add_warning( 191 | msg="Skipped Record", 192 | cause=self.name, 193 | e=Exception(f"stac field 'aerial-photo:scale' has value: {scale}"), 194 | ) 195 | else: 196 | aerial_photo_properties["aerial-photo:scale"] = scale 197 | 198 | item.properties.update(remove_empty_strings(aerial_photo_properties)) 199 | item.add_extension(StacExtensions.aerial_photo.value) 200 | 201 | def add_scanning_metadata(self, item: Item, asset_metadata: Dict[str, str]) -> None: 202 | scanning_properties: Dict[str, Any] = {} 203 | 204 | if asset_metadata["source"]: 205 | scanning_properties["scan:is_original"] = string_to_boolean(asset_metadata["source"], ["original"], ["copy"]) 206 | if asset_metadata["when_scanned"]: 207 | scanning_properties["scan:scanned"] = quarterdate_to_date_string(asset_metadata["when_scanned"]) 208 | 209 | item.properties.update(remove_empty_strings(scanning_properties)) 210 | item.add_extension(StacExtensions.scanning.value) 211 | 212 | def add_datetime_property(self, item: Item, asset_metadata: Dict[str, str]) -> None: 213 | item_date = asset_metadata.get("date", None) 214 | 215 | if item_date: 216 | try: 217 | item.datetime = nzt_datetime_to_utc_datetime(item_date) 218 | except Exception as e: 219 | item.add_error(msg="Invalid date", cause=self.name, e=e) 220 | else: 221 | item.add_error(msg="No date found", cause=self.name, e=Exception(f"item date has no value")) 222 | 223 | def add_centroid(self, item: Item, asset_metadata: Dict[str, str]) -> None: 224 | 225 | centroid = { 226 | "lat": string_to_number(asset_metadata.get("photocentre_lat", "")), 227 | "lon": string_to_number(asset_metadata.get("photocentre_lon", "")), 228 | } 229 | if self.is_valid_centroid(item, centroid): 230 | item.properties["proj:centroid"] = centroid 231 | item.add_extension(StacExtensions.projection.value) 232 | 233 | def add_projection_extent(self, item: Item) -> None: 234 | item.properties["proj:epsg"] = None 235 | item.add_extension(StacExtensions.projection.value) 236 | 237 | def add_bands_extent(self, item: Item, asset: Optional[Asset] = None) -> None: 238 | item.add_extension(StacExtensions.eo.value) 239 | 240 | if asset: 241 | # default value 242 | asset.properties["eo:bands"] = [{"name": ColorInterp.gray.name, "common_name": "pan"}] 243 | 244 | def is_valid_centroid(self, item: Item, centroid: Dict[str, Any]) -> bool: 245 | if not isinstance(centroid["lat"], (int, float)) or centroid["lat"] > 90 or centroid["lat"] < -90: 246 | item.add_warning( 247 | msg="Skipped Record", 248 | cause=self.name, 249 | e=Exception( 250 | f"stac field 'proj:centroid' has invalid lat value: 
{centroid['lat']}, instance: {type(centroid['lat'])}" 251 | ), 252 | ) 253 | return False 254 | if not isinstance(centroid["lon"], (int, float)) or centroid["lon"] > 180 or centroid["lon"] < -180: 255 | item.add_warning( 256 | msg="Skipped Record", 257 | cause=self.name, 258 | e=Exception( 259 | f"stac field 'proj:centroid' has invalid lon value: {centroid['lon']}, instance: {type(centroid['lon'])}" 260 | ), 261 | ) 262 | return False 263 | return True 264 | 265 | def add_linz_geospatial_type(self, item: Item, photo_type: str) -> None: 266 | 267 | item.linz_geospatial_type = historical_imagery_photo_type_to_linz_geospatial_type(photo_type) 268 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/metadata_loader_repo.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, List, Optional 4 | 5 | from linz_logger import get_log 6 | 7 | from topo_processor.util.time import time_in_ms 8 | 9 | from .metadata_loader import MetadataLoader 10 | 11 | if TYPE_CHECKING: 12 | from topo_processor.stac.asset import Asset 13 | 14 | 15 | class MetadataLoaderRepository: 16 | loaders: List[MetadataLoader] = [] 17 | 18 | def append(self, loader: MetadataLoader) -> None: 19 | self.loaders.append(loader) 20 | 21 | def load_metadata(self, asset: Optional[Asset] = None) -> None: 22 | for loader in self.loaders: 23 | if loader.is_applicable(asset): 24 | start_time = time_in_ms() 25 | try: 26 | loader.load_metadata(asset) 27 | if not asset or not asset.is_valid: 28 | break 29 | except Exception as e: 30 | # TODO refactor to report errors in a better way 31 | if asset: 32 | asset.add_error(str(e), loader.name, e) 33 | get_log().error("Metadata Load Failed", error=e, loader=loader.name) 34 | raise Exception(f"Metadata Load Failed: {e}") 35 | get_log().debug( 36 | "Metadata Loaded", 37 | loader=loader.name, 38 | asset=asset.source_path, 39 | duration=time_in_ms() - start_time, 40 | ) 41 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/metadata_loader_tiff.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from typing import TYPE_CHECKING, Any, Optional 5 | 6 | import rasterio 7 | from linz_logger import get_log 8 | from rasterio.enums import ColorInterp 9 | 10 | from topo_processor.file_system.get_fs import get_fs 11 | from topo_processor.stac.stac_extensions import StacExtensions 12 | from topo_processor.util.file_extension import is_tiff 13 | 14 | from .metadata_loader import MetadataLoader 15 | 16 | if TYPE_CHECKING: 17 | from topo_processor.stac.asset import Asset 18 | 19 | 20 | class MetadataLoaderTiff(MetadataLoader): 21 | name = "metadata.loader.imagery.tiff" 22 | 23 | def is_applicable(self, asset: Optional[Asset] = None) -> bool: 24 | if asset is None or asset.item is None: 25 | return False 26 | return is_tiff(asset.source_path) 27 | 28 | def load_metadata(self, asset: Optional[Asset] = None) -> None: 29 | if asset: 30 | fs = get_fs(asset.source_path) 31 | # FIXME: Should we download the file first as we could need it to do the coggification later? 32 | # This process takes quiet a long time locally. 
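            # Sketch only (not part of the current implementation): one way to address the
            # FIXME above would be to download the file to a local temp path first, so the
            # same copy could be reused later by the COG creation step instead of re-reading
            # the remote object, e.g.:
            #
            #   import os, tempfile
            #   local_path = os.path.join(tempfile.mkdtemp(), os.path.basename(asset.source_path))
            #   fs.get(asset.source_path, local_path)  # fsspec filesystems expose get() for downloads
            #   # ...then open local_path with rasterio below instead of the remote handle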
33 | 34 | with fs.open(asset.source_path) as f: 35 | with warnings.catch_warnings(record=True) as w: 36 | with rasterio.open(f) as tiff: 37 | self.add_epsg(tiff, asset) 38 | self.add_bands(tiff, asset) 39 | for warn in w: 40 | get_log().warning(f"Rasterio Warning: {warn.message}", file=asset.source_path, loader=self.name) 41 | 42 | def add_epsg(self, tiff: Any, asset: Asset) -> None: 43 | if tiff.crs: 44 | if not tiff.crs.is_epsg_code: 45 | raise Exception("The code is not a valid EPSG code.") 46 | crs = tiff.crs.to_epsg() 47 | else: 48 | crs = None 49 | if asset.item: 50 | asset.item.properties["proj:epsg"] = crs 51 | asset.item.add_extension(StacExtensions.projection.value) 52 | 53 | def add_bands(self, tiff: Any, asset: Asset) -> None: 54 | if asset.item: 55 | asset.item.add_extension(StacExtensions.eo.value) 56 | 57 | if ColorInterp.gray in tiff.colorinterp and len(tiff.colorinterp) == 1: 58 | asset.properties["eo:bands"] = [{"name": ColorInterp.gray.name, "common_name": "pan"}] 59 | elif all(band in [ColorInterp.red, ColorInterp.blue, ColorInterp.green] for band in tiff.colorinterp): 60 | asset.properties["eo:bands"] = [ 61 | {"name": ColorInterp.red.name, "common_name": "red"}, 62 | {"name": ColorInterp.green.name, "common_name": "green"}, 63 | {"name": ColorInterp.blue.name, "common_name": "blue"}, 64 | ] 65 | elif asset.item: 66 | asset.item.add_warning( 67 | msg="Skipped Asset Record", 68 | cause=self.name, 69 | e=Exception("stac field 'eo:bands' skipped. Tiff ColorInterp does not match specified values"), 70 | ) 71 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/metadata/metadata_loaders/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/tests/metadata_loader_tiff_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from topo_processor.metadata.metadata_loaders.metadata_loader_tiff import MetadataLoaderTiff 4 | from topo_processor.stac.asset import Asset 5 | from topo_processor.stac.collection import Collection 6 | from topo_processor.stac.item import Item 7 | from topo_processor.stac.stac_extensions import StacExtensions 8 | 9 | 10 | def test_load_metadata() -> None: 11 | source_path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff") 12 | asset = Asset(source_path) 13 | item = Item("item_id") 14 | item.add_asset(asset) 15 | item.collection = Collection("Collection") 16 | loader = MetadataLoaderTiff() 17 | assert loader.is_applicable(asset) 18 | 19 | loader.load_metadata(asset) 20 | assert item.properties["proj:epsg"] is None 21 | assert StacExtensions.projection.value in item.stac_extensions 22 | assert len(item.assets) == 1 23 | assert item.assets[0].properties["eo:bands"] == [{"name": "gray", "common_name": "pan"}] 24 | assert StacExtensions.eo.value in item.stac_extensions 25 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/__init__.py: -------------------------------------------------------------------------------- 1 | from topo_processor.metadata.metadata_validators.metadata_validator import MetadataValidator 2 | 3 | from 
.metadata_validator_repo import MetadataValidatorRepository 4 | from .metadata_validator_stac import MetadataValidatorStac 5 | from .metadata_validator_tiff import MetadataValidatorTiff 6 | 7 | metadata_validator_repo = MetadataValidatorRepository() 8 | metadata_validator_repo.append(MetadataValidatorTiff()) 9 | metadata_validator_repo.append(MetadataValidatorStac()) 10 | 11 | metadata_validator_stac = MetadataValidatorStac() 12 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/metadata_validator.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import TYPE_CHECKING 5 | 6 | if TYPE_CHECKING: 7 | from topo_processor.stac.item import Item 8 | 9 | 10 | class MetadataValidator(ABC): 11 | @property 12 | @abstractmethod 13 | def name(self) -> str: 14 | pass 15 | 16 | @abstractmethod 17 | def is_applicable(self, item: Item) -> bool: 18 | pass 19 | 20 | @abstractmethod 21 | def validate_metadata(self, item: Item) -> None: 22 | pass 23 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/metadata_validator_repo.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, List 4 | 5 | from linz_logger import get_log 6 | 7 | from topo_processor.util.time import time_in_ms 8 | 9 | from .metadata_validator import MetadataValidator 10 | 11 | if TYPE_CHECKING: 12 | from topo_processor.stac.item import Item 13 | 14 | 15 | class MetadataValidatorRepository: 16 | validators: List[MetadataValidator] = [] 17 | 18 | def append(self, validator: MetadataValidator) -> None: 19 | self.validators.append(validator) 20 | 21 | def validate_metadata(self, item: Item) -> bool: 22 | is_valid = True 23 | 24 | for validator in self.validators: 25 | if validator.is_applicable(item): 26 | start_time = time_in_ms() 27 | try: 28 | validator.validate_metadata(item) 29 | except Exception as e: 30 | is_valid = False 31 | item.add_error(str(e), validator.name, e) 32 | get_log().warning(f"Validation Failed: {e}", validator=validator.name) 33 | get_log().debug( 34 | "Validity Checked", 35 | validator=validator.name, 36 | duration=time_in_ms() - start_time, 37 | ) 38 | 39 | return is_valid 40 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/metadata_validator_stac.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from typing import Any, Dict, Union 5 | 6 | import fsspec 7 | import jsonschema_rs 8 | import pystac.validation 9 | from linz_logger import get_log 10 | from pystac.errors import STACValidationError 11 | 12 | from topo_processor.stac.collection import Collection 13 | from topo_processor.stac.item import Item 14 | from topo_processor.stac.iter_errors_validator import IterErrorsValidator 15 | 16 | from .metadata_validator import MetadataValidator 17 | 18 | 19 | class MetadataValidatorStac(MetadataValidator): 20 | name = "validator.stac" 21 | validator_cache: Dict[str, Any] = {} 22 | 23 | def get_validator_from_uri(self, schema_uri: str) -> Any: 24 | if schema_uri not in self.validator_cache: 25 | file = fsspec.open(schema_uri, "rt") 26 | with file as f: 27 | 
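                # Note: the compiled jsonschema_rs schema is cached per URI, so validating
                # many items against the same core/extension schemas only pays the
                # fetch-and-compile cost once per run.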
self.validator_cache[schema_uri] = jsonschema_rs.JSONSchema.from_str(f.read()) 28 | 29 | validator = self.validator_cache[schema_uri] 30 | 31 | return validator 32 | 33 | def is_applicable(self, stac_object: Union[Item, Collection]) -> bool: 34 | return True 35 | 36 | def validate_metadata(self, item: Item) -> None: 37 | 38 | if isinstance(pystac.validation.RegisteredValidator.get_validator(), IterErrorsValidator): 39 | with warnings.catch_warnings(record=True) as w: 40 | item.create_stac().validate() 41 | msg = "" 42 | for warn in w: 43 | msg = msg + ", " + str(warn.message) 44 | if w: 45 | raise STACValidationError(message=f"Not valid STAC: {msg}") 46 | else: 47 | try: 48 | item.create_stac().validate() 49 | except STACValidationError as e: 50 | raise STACValidationError(message=f"Not valid STAC: {e}") 51 | 52 | def validate_metadata_with_report(self, stac_object: Union[Item, Collection]) -> Dict[str, list[str]]: 53 | """Validate the STAC object (Item or Collection) against the core json schema and its extensions. 54 | Return an error report [{schemaURI, [errors]}] 55 | """ 56 | errors_report: Dict[str, list[str]] = {} 57 | if isinstance(stac_object, Collection): 58 | stac_collection = stac_object.create_stac() 59 | for item in stac_object.items: 60 | stac_item = stac_object.items[item].create_stac() 61 | stac_collection.add_item(stac_item) 62 | stac_object.generate_summaries(stac_collection) 63 | stac_dict = stac_collection.to_dict(include_self_link=False) 64 | else: 65 | stac_dict = stac_object.create_stac().to_dict(include_self_link=False) 66 | 67 | schema_uris: list[str] = [stac_object.schema] + stac_dict["stac_extensions"] 68 | 69 | for schema_uri in schema_uris: 70 | get_log().trace(f"{self.name}:validate_metadata_with_report", stacId=stac_dict["id"], schema=schema_uri) 71 | current_errors = [] 72 | v = self.get_validator_from_uri(schema_uri) 73 | errors = v.iter_errors(stac_dict) 74 | 75 | for error in errors: 76 | current_errors.append(error.message) 77 | get_log().warn(f"{self.name}:validate_metadata_with_report", stacId=stac_dict["id"], error=error.message) 78 | 79 | if current_errors: 80 | errors_report[schema_uri] = current_errors 81 | 82 | return errors_report 83 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/metadata_validator_tiff.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from typing import TYPE_CHECKING 5 | 6 | from linz_logger import get_log 7 | 8 | from topo_processor.file_system.get_fs import get_fs 9 | from topo_processor.util.file_extension import is_tiff 10 | 11 | from .metadata_validator import MetadataValidator 12 | 13 | if TYPE_CHECKING: 14 | from topo_processor.stac.item import Item 15 | 16 | 17 | class MetadataValidatorTiff(MetadataValidator): 18 | name = "validator.imagery.tiff" 19 | 20 | def is_applicable(self, item: Item) -> bool: 21 | for asset in item.assets: 22 | if is_tiff(asset.source_path): 23 | return True 24 | return False 25 | 26 | def validate_metadata(self, item: Item) -> None: 27 | 28 | for asset in item.assets: 29 | if not is_tiff(asset.source_path): 30 | continue 31 | 32 | geospatial_type = item.linz_geospatial_type 33 | eo_bands = asset.properties["eo:bands"] 34 | common_names = [common_names["common_name"] for common_names in eo_bands] 35 | 36 | with warnings.catch_warnings(record=True) as w: 37 | 38 | # black and white 39 | if geospatial_type in ["black 
and white image", "black and white infrared image"]: 40 | # check eo:bands matches geospatial_type 41 | if len(eo_bands) != 1 or eo_bands[0]["common_name"] != "pan": 42 | raise Exception(f"Wrong linz_geospatial_type of '{geospatial_type}' when bands = '{eo_bands}'") 43 | # color 44 | # check linz_geospatial_type matches colorinterp 45 | elif geospatial_type in ["color image", "color infrared image"]: 46 | # check eo:bands matches colorinterp 47 | if ( 48 | len(eo_bands) != 3 49 | or "red" not in common_names 50 | or "green" not in common_names 51 | or "blue" not in common_names 52 | ): 53 | raise Exception(f"Wrong linz_geospatial_type of '{geospatial_type}' when bands = '{eo_bands}'") 54 | else: 55 | raise Exception(f"Unknown linz_geospatial_type of '{geospatial_type}'") 56 | for warn in w: 57 | get_log().warning(f"Warning: {warn.message}", file=asset.source_path, loader=self.name) 58 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/metadata/metadata_validators/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/tests/metadata_validator_tiff_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from topo_processor.metadata.metadata_validators.metadata_validator_tiff import MetadataValidatorTiff 6 | from topo_processor.stac.asset import Asset 7 | from topo_processor.stac.item import Item 8 | 9 | 10 | def test_check_validity() -> None: 11 | source_path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff") 12 | asset = Asset(source_path) 13 | item = Item("item_id") 14 | item.add_asset(asset) 15 | item.linz_geospatial_type = "color image" 16 | asset.properties.update({"eo:bands": [{"name": "gray", "common_name": "pan"}]}) 17 | 18 | validator = MetadataValidatorTiff() 19 | assert validator.is_applicable(item) 20 | with pytest.raises( 21 | Exception, match=r"Wrong linz_geospatial_type of 'color image' when bands = [{'name': 'gray', 'common_name': 'pan'}]" 22 | ): 23 | validator.validate_metadata(item) 24 | 25 | 26 | def test_unknown_geospatial_type() -> None: 27 | source_path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff") 28 | asset = Asset(source_path) 29 | item = Item("item_id") 30 | item.add_asset(asset) 31 | item.linz_geospatial_type = "grayscale" 32 | asset.properties.update({"eo:bands": [{"name": "gray", "common_name": "pan"}]}) 33 | 34 | validator = MetadataValidatorTiff() 35 | assert validator.is_applicable(item) 36 | with pytest.raises(Exception, match=r"Unknown linz_geospatial_type of 'grayscale'"): 37 | validator.validate_metadata(item) 38 | -------------------------------------------------------------------------------- /topo_processor/stac/__init__.py: -------------------------------------------------------------------------------- 1 | from ..metadata.data_type import DataType 2 | from .asset import Asset 3 | from .collection import Collection 4 | from .item import Item 5 | from .item_factory import process_source 6 | from .linz_provider import LinzProvider, LinzProviderRole 7 | from .stac_extensions import StacExtensions 8 | from .store import collection_store 9 | from 
.validate_report import ValidateReport 10 | from .validation import validate_stac 11 | -------------------------------------------------------------------------------- /topo_processor/stac/asset.py: -------------------------------------------------------------------------------- 1 | from mimetypes import MimeTypes 2 | from os import path 3 | from typing import TYPE_CHECKING, Any, Dict, Optional, Union 4 | 5 | import pystac 6 | 7 | from topo_processor.util.checksum import multihash_as_hex 8 | from topo_processor.util.configuration import get_topo_processor_version 9 | from topo_processor.util.files import get_file_update_time 10 | from topo_processor.util.valid import Validity 11 | 12 | from .asset_key import AssetKey 13 | 14 | if TYPE_CHECKING: 15 | from .item import Item 16 | 17 | 18 | class Asset(Validity): 19 | source_path: str # The raw file location on disk 20 | target: Optional[str] = None # New file name used for uploading 21 | content_type: Optional[str] = None 22 | needs_upload: bool = True 23 | href: str 24 | properties: Dict[str, Any] 25 | item: Optional["Item"] = None 26 | key_name: Optional[AssetKey] = None 27 | 28 | def __init__(self, source_path: str): 29 | super().__init__() 30 | self.source_path = source_path 31 | self.properties = { 32 | "processing:software": get_topo_processor_version(), 33 | } 34 | 35 | def file_ext(self) -> str: 36 | return path.splitext(self.target if self.target else self.source_path)[1] 37 | 38 | def get_content_type(self) -> Union[str, None]: 39 | if self.content_type: 40 | return self.content_type 41 | return MimeTypes().guess_type(self.target if self.target else self.source_path)[0] 42 | 43 | def get_checksum(self) -> str: 44 | if "file:checksum" not in self.properties: 45 | checksum: str = multihash_as_hex(self.source_path) 46 | self.properties["file:checksum"] = checksum 47 | 48 | return_value: str = self.properties["file:checksum"] 49 | return return_value 50 | 51 | def set_output_asset_dates(self, output_path: str) -> None: 52 | if "created" not in self.properties: 53 | self.properties["created"] = get_file_update_time(output_path) 54 | # TODO: process for COG updates not created yet 55 | self.properties["updated"] = self.properties["created"] 56 | else: 57 | self.properties["updated"] = get_file_update_time(output_path) 58 | 59 | def create_stac(self) -> pystac.Asset: 60 | stac = pystac.Asset(href=self.href, extra_fields=self.properties, media_type=self.get_content_type()) 61 | return stac 62 | -------------------------------------------------------------------------------- /topo_processor/stac/asset_key.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class AssetKey(str, Enum): 5 | Visual = "visual" 6 | Thumbnail = "thumbnail" 7 | -------------------------------------------------------------------------------- /topo_processor/stac/collection.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import warnings 5 | from datetime import datetime 6 | from shutil import rmtree 7 | from tempfile import mkdtemp 8 | from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set 9 | 10 | import pystac 11 | import ulid 12 | from linz_logger import get_log 13 | from pystac.collection import Collection as PystacCollection 14 | from pystac.errors import STACValidationError 15 | from pystac.summaries import Summaries, Summarizer 16 | from pystac.validation.schema_uri_map import 
DefaultSchemaUriMap 17 | from shapely.ops import unary_union 18 | 19 | from topo_processor.metadata.data_type import DataType 20 | from topo_processor.stac.asset import Asset 21 | from topo_processor.stac.iter_errors_validator import IterErrorsValidator 22 | from topo_processor.util.time import get_min_max_interval 23 | from topo_processor.util.valid import Validity 24 | 25 | from .linz_provider import LinzProvider 26 | from .providers import Providers 27 | from .stac_extensions import StacExtensions 28 | 29 | if TYPE_CHECKING: 30 | from .item import Item 31 | 32 | TEMP_DIR: Optional[str] = None 33 | FIELDS_JSON_URL = "https://raw.githubusercontent.com/linz/stac/master/fields/fields.json" 34 | 35 | 36 | class Collection(Validity): 37 | id: str 38 | title: str 39 | survey: str 40 | description: str 41 | license: str 42 | items: Dict[str, "Item"] 43 | linz_providers: List[Dict[str, Any]] 44 | providers: List[pystac.Provider] 45 | schema: Optional[str] 46 | extra_fields: Dict[str, Any] 47 | linz_geospatial_type: str 48 | 49 | stac_extensions: Set[str] 50 | summaries: Summaries = Summaries.empty() 51 | 52 | def __init__(self, title: str): 53 | super().__init__() 54 | # FIXME: Do we want to generate this id like this? 55 | self.id = str(ulid.ULID()) 56 | self.title = title 57 | self.description = "" 58 | self.items = {} 59 | self.schema = DefaultSchemaUriMap().get_object_schema_uri(pystac.STACObjectType.COLLECTION, pystac.get_stac_version()) 60 | self.extra_fields = dict( 61 | { 62 | # TODO: decision to be made on version ref comments [TDE-230] hardcode to '1' for now 63 | "version": "1", 64 | "linz:security_classification": "unclassified", 65 | } 66 | ) 67 | self.linz_providers = [] 68 | self.stac_extensions = set([StacExtensions.file.value]) 69 | self.providers = [Providers.TTW.value] 70 | 71 | def add_item(self, item: Item) -> None: 72 | if item.collection is not None and item.collection != self: 73 | raise Exception(f"Remapping of collection? existing='{item.collection.title}' new='{self.title}' item='{item.id}'") 74 | if item.id in self.items: 75 | existing = self.items[item.id] 76 | if existing != item: 77 | raise Exception(f"Remapping of item id in collection='{self.title}' item='{item.id}'") 78 | return 79 | self.items[item.id] = item 80 | 81 | def add_extension(self, ext: str) -> None: 82 | self.stac_extensions.add(ext) 83 | 84 | def add_provider(self, provider: pystac.Provider) -> None: 85 | if provider not in self.providers: 86 | self.providers.append(provider) 87 | 88 | def add_linz_provider(self, linz_provider: LinzProvider) -> None: 89 | if linz_provider.to_dict() not in self.linz_providers: 90 | self.linz_providers.append(linz_provider.to_dict()) 91 | 92 | def update_description(self, stac_collection: pystac.Collection, data_type: DataType) -> None: 93 | if data_type == DataType.IMAGERY_HISTORIC: 94 | size = self.summaries.to_dict()["film:physical_size"] 95 | if len(size) == 1: 96 | size = size[0] 97 | colour = self.extra_fields["linz:geospatial_type"] 98 | stac_collection.description = ( 99 | self.description 100 | ) = f"This aerial photographic survey was digitised from {colour} {size} negatives in the Crown collection of the Crown Aerial Film Archive." 
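    # Illustrative usage (a sketch, not from the source; assumes the helpers in stac/store.py):
    #
    #   from topo_processor.stac.store import get_collection, get_item
    #   collection = get_collection("EXAMPLE SURVEY TITLE")  # hypothetical title
    #   collection.survey = "SURVEY_3"
    #   item = get_item("72352")
    #   collection.add_item(item)
    #   item.collection = collection
    #   stac_collection = collection.create_stac()
    #   for item_id in collection.items:
    #       stac_collection.add_item(collection.items[item_id].create_stac())
    #   collection.generate_summaries(stac_collection)
    #
    # add_item() is idempotent for the same Item instance and raises if a different item
    # re-uses an existing id, so repeated metadata rows cannot silently remap a collection.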
101 | 102 | def get_temp_dir(self) -> str: 103 | global TEMP_DIR 104 | if not TEMP_DIR: 105 | TEMP_DIR = mkdtemp() 106 | get_log().debug("Temp directory created", path=TEMP_DIR) 107 | temp_dir = os.path.join(TEMP_DIR, self.survey) 108 | if not os.path.exists(temp_dir): 109 | os.mkdir(temp_dir) 110 | return temp_dir 111 | 112 | def get_temporal_extent(self) -> List[Optional[datetime]]: 113 | dates: List[datetime] = [] 114 | 115 | for item in self.items.values(): 116 | if item.datetime: 117 | dates.append(item.datetime) 118 | 119 | return get_min_max_interval(dates) 120 | 121 | def get_bounding_boxes(self) -> List[List[float]]: 122 | """ 123 | create a union of all item bounding boxes inside the collection 124 | """ 125 | polys = [x.geometry_poly for x in self.items.values() if x.geometry_poly is not None] 126 | 127 | if len(polys) == 0: 128 | return [[0.0, 0.0, 0.0, 0.0]] 129 | union_poly = unary_union(polys) 130 | return [list(union_poly.bounds)] 131 | 132 | def get_linz_geospatial_type(self) -> str: 133 | geospatial_type_set = set(x.linz_geospatial_type for x in self.items.values() if x.linz_geospatial_type) 134 | if len(geospatial_type_set) != 1: 135 | get_log().warning(f"Invalid 'linz:geospatial_type' collection='{self.title}'") 136 | return "invalid geospatial type" 137 | geospatial_type_str = geospatial_type_set.pop() 138 | return geospatial_type_str 139 | 140 | def get_linz_asset_summaries(self) -> Dict[str, Any]: 141 | assets_checked: List[Asset] = [] 142 | dates_created: List[datetime] = [] 143 | dates_updated: List[datetime] = [] 144 | processing_software_versions: List[Dict[str, str]] = [] 145 | 146 | for item in self.items.values(): 147 | for asset in item.assets: 148 | if not asset.needs_upload: 149 | continue 150 | if not asset in assets_checked: 151 | if "created" in asset.properties: 152 | dates_created.append(asset.properties["created"]) 153 | dates_updated.append(asset.properties["updated"]) 154 | if "processing:software" in asset.properties: 155 | if asset.properties["processing:software"] not in processing_software_versions: 156 | processing_software_versions.append(asset.properties["processing:software"]) 157 | assets_checked.append(asset) 158 | 159 | interval_created = get_min_max_interval(dates_created) 160 | interval_updated = get_min_max_interval(dates_updated) 161 | 162 | # to pass metadata-only validation as there are no assets to populate mandatory linz:asset_summaries 163 | # TODO: review this workaround once validation command has been combined into upload command 164 | if not assets_checked: 165 | return { 166 | "created": {"minimum": "0000-01-01T00:00:00Z", "maximum": "0000-01-01T00:00:00Z"}, 167 | "updated": {"minimum": "0000-01-01T00:00:00Z", "maximum": "0000-01-01T00:00:00Z"}, 168 | } 169 | 170 | return { 171 | "processing:software": processing_software_versions, 172 | "created": {"minimum": interval_created[0], "maximum": interval_created[1]}, 173 | "updated": {"minimum": interval_updated[0], "maximum": interval_updated[1]}, 174 | } 175 | 176 | def delete_temp_dir(self) -> None: 177 | global TEMP_DIR 178 | if TEMP_DIR: 179 | if os.path.exists(TEMP_DIR): 180 | rmtree(TEMP_DIR) 181 | TEMP_DIR = None 182 | 183 | def generate_summaries(self, collection: pystac.Collection) -> None: 184 | summarizer = Summarizer(fields=FIELDS_JSON_URL) 185 | collection.summaries = summarizer.summarize(collection) 186 | self.summaries = collection.summaries 187 | 188 | def create_stac(self) -> pystac.Collection: 189 | if self.linz_providers: 190 | 
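            # Illustrative shape of the extra_fields written below (example values only):
            #   "linz:providers":        list of provider dicts added via add_linz_provider()
            #   "linz:geospatial_type":  e.g. "black and white image"
            #   "linz:asset_summaries":  {"processing:software": [...],
            #                             "created": {"minimum": ..., "maximum": ...},
            #                             "updated": {"minimum": ..., "maximum": ...}}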
self.extra_fields["linz:providers"] = self.linz_providers 191 | self.extra_fields["linz:geospatial_type"] = self.get_linz_geospatial_type() 192 | self.extra_fields["linz:asset_summaries"] = self.get_linz_asset_summaries() 193 | 194 | stac = pystac.Collection( 195 | id=self.id, 196 | description=self.description, 197 | extent=pystac.Extent( 198 | pystac.SpatialExtent(bboxes=self.get_bounding_boxes()), 199 | pystac.TemporalExtent(intervals=[self.get_temporal_extent()]), 200 | ), 201 | title=self.title, 202 | stac_extensions=list(sorted(self.stac_extensions)), 203 | href="./collection.json", 204 | extra_fields=self.extra_fields, 205 | license=self.license, 206 | providers=self.providers, 207 | ) 208 | get_log().info("Stac Collection Created", id=stac.id, title=stac.title) 209 | return stac 210 | 211 | def validate_pystac_collection(self, pystac_collection: PystacCollection) -> None: 212 | 213 | if isinstance(pystac.validation.RegisteredValidator.get_validator(), IterErrorsValidator): 214 | with warnings.catch_warnings(record=True) as w: 215 | pystac_collection.validate() 216 | msg = "" 217 | for warn in w: 218 | msg = msg + ", " + str(warn.message) 219 | if w: 220 | raise Exception(f"Not valid STAC: {msg}") 221 | 222 | else: 223 | try: 224 | pystac_collection.validate() 225 | except STACValidationError as e: 226 | raise Exception(f"Not valid STAC") from e 227 | -------------------------------------------------------------------------------- /topo_processor/stac/item.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | from typing import Any, Dict, List, Optional, Set 3 | 4 | import shapely.geometry 5 | from linz_logger import get_log 6 | from pystac import get_stac_version 7 | from pystac.item import Item as PystacItem 8 | from pystac.stac_object import STACObjectType 9 | from pystac.validation.schema_uri_map import DefaultSchemaUriMap 10 | 11 | from topo_processor.util.valid import Validity 12 | 13 | from .asset import Asset 14 | from .collection import Collection 15 | from .stac_extensions import StacExtensions 16 | 17 | 18 | class Item(Validity): 19 | 20 | id: str 21 | geometry_poly: Optional[shapely.geometry.Polygon] = None 22 | linz_geospatial_type: str = "" 23 | datetime: Optional[dt.datetime] = None 24 | properties: Dict[str, Any] 25 | stac_extensions: Set[str] 26 | assets: List[Asset] 27 | collection: Optional[Collection] = None 28 | schema: Optional[str] 29 | 30 | def __init__(self, item_id: str): 31 | super().__init__() 32 | self.id = item_id 33 | self.properties = { 34 | # TODO: decision to be made on version ref comments [TDE-230] hardcode to '1' for now 35 | "version": "1", 36 | } 37 | self.stac_extensions = set([StacExtensions.file.value]) 38 | self.assets = [] 39 | self.schema = DefaultSchemaUriMap().get_object_schema_uri(STACObjectType.ITEM, get_stac_version()) 40 | 41 | def is_valid(self) -> bool: 42 | if not super().is_valid(): 43 | return False 44 | for asset in self.assets: 45 | if not asset.is_valid(): 46 | return False 47 | return True 48 | 49 | def add_asset(self, asset: Asset) -> None: 50 | if asset.item: 51 | raise Exception(f"Asset is already associated with an item: existing item='{asset.item.id}' new item='{self.id}'") 52 | self.assets.append(asset) 53 | asset.item = self 54 | 55 | def add_extension(self, ext: str, add_to_collection: bool = True) -> None: 56 | self.stac_extensions.add(ext) 57 | if not self.collection: 58 | return 59 | if add_to_collection: 60 | self.collection.add_extension(ext) 61 | 62 
| def create_stac(self) -> PystacItem: 63 | geometry = None 64 | bbox = None 65 | if self.geometry_poly is not None: 66 | geometry = shapely.geometry.mapping(self.geometry_poly) 67 | bbox = self.geometry_poly.bounds 68 | 69 | stac = PystacItem( 70 | id=self.id, 71 | geometry=geometry, 72 | bbox=bbox, 73 | datetime=self.datetime, 74 | properties=self.properties, 75 | stac_extensions=list(sorted(self.stac_extensions)), 76 | ) 77 | get_log().info("Stac Item Created", id=stac.id) 78 | return stac 79 | -------------------------------------------------------------------------------- /topo_processor/stac/item_factory.py: -------------------------------------------------------------------------------- 1 | from linz_logger import get_log 2 | 3 | from topo_processor.data.data_transformers import data_transformer_repo 4 | from topo_processor.file_system.assets import get_assets 5 | from topo_processor.metadata.data_type import DataType 6 | from topo_processor.metadata.metadata_loaders import metadata_loader_rep 7 | from topo_processor.metadata.metadata_validators import metadata_validator_repo 8 | from topo_processor.stac.store import asset_store, item_store 9 | from topo_processor.util.time import time_in_ms 10 | 11 | 12 | def process_source(source: str, data_type: DataType, metadata_path: str = "", force: bool = False) -> None: 13 | start_time = time_in_ms() 14 | _create_assets(source, data_type, metadata_path) 15 | total_asset = len(asset_store) 16 | if total_asset == 0: 17 | get_log().warn("No Assets Found", assets=total_asset, source=source, duration=time_in_ms() - start_time) 18 | return 19 | 20 | get_log().debug("Assets Created", assets=total_asset, source=source, duration=time_in_ms() - start_time) 21 | 22 | start_time = time_in_ms() 23 | _create_items(force) 24 | total_item = len(item_store) 25 | if len(item_store) == 0: 26 | get_log().warn("No Items Created", items=total_item, source=source, duration=time_in_ms() - start_time) 27 | return 28 | 29 | get_log().debug("Items Created", items=total_item, source=source, duration=time_in_ms() - start_time) 30 | 31 | 32 | def _create_assets(source: str, data_type: str, metadata_path: str) -> None: 33 | assets = get_assets(source, data_type, metadata_path) 34 | for asset in assets: 35 | metadata_loader_rep.load_metadata(asset) 36 | 37 | 38 | def _create_items(force: bool = False) -> None: 39 | all_items_valid = True 40 | # Validate metadata of valid items 41 | # Item can be already detected invalid from the metadata loader 42 | # For those, we don't want to validate their metadata 43 | for item in item_store.values(): 44 | if item.is_valid(): 45 | metadata_is_valid = metadata_validator_repo.validate_metadata(item) 46 | if all_items_valid and not metadata_is_valid: 47 | all_items_valid = metadata_is_valid 48 | 49 | if not all_items_valid and not force: 50 | raise Exception("At least one Item is not valid. 
Process is stopped") 51 | 52 | for item in item_store.values(): 53 | if item.is_valid(): 54 | data_transformer_repo.transform_data(item) 55 | -------------------------------------------------------------------------------- /topo_processor/stac/iter_errors_validator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import warnings 3 | from typing import Any, Dict, List, Optional, Tuple 4 | 5 | import jsonschema 6 | import pystac 7 | from linz_logger import get_log 8 | from pystac import STACObjectType 9 | from pystac.validation.schema_uri_map import DefaultSchemaUriMap, SchemaUriMap 10 | from pystac.validation.stac_validator import STACValidator 11 | 12 | 13 | class IterErrorsValidator(STACValidator): 14 | 15 | schema_uri_map: SchemaUriMap 16 | schema_cache: Dict[str, Dict[str, Any]] 17 | 18 | def __init__(self, schema_uri_map: Optional[SchemaUriMap] = None) -> None: 19 | 20 | if schema_uri_map is not None: 21 | self.schema_uri_map = schema_uri_map 22 | else: 23 | self.schema_uri_map = DefaultSchemaUriMap() 24 | 25 | self.schema_cache = {} 26 | 27 | def get_schema_from_uri(self, schema_uri: str) -> Tuple[Dict[str, Any], Any]: 28 | if schema_uri not in self.schema_cache: 29 | s = json.loads(pystac.StacIO.default().read_text(schema_uri)) 30 | self.schema_cache[schema_uri] = s 31 | 32 | schema = self.schema_cache[schema_uri] 33 | 34 | resolver = jsonschema.validators.RefResolver(base_uri=schema_uri, referrer=schema, store=self.schema_cache) 35 | 36 | return schema, resolver 37 | 38 | def _validate_from_uri(self, stac_dict: Dict[str, Any], schema_uri: str) -> List[str]: 39 | """Return a list of the error(s) found during the validation of stac_dict against schema_uri""" 40 | errors = [] 41 | schema, resolver = self.get_schema_from_uri(schema_uri) 42 | 43 | # Draft7 for pystac 44 | validator = jsonschema.Draft7Validator(schema) 45 | for error in sorted(validator.evolve(schema=schema).iter_errors(stac_dict), key=str): 46 | errors.append(error.message) 47 | 48 | for uri in resolver.store: 49 | if uri not in self.schema_cache: 50 | self.schema_cache[uri] = resolver.store[uri] 51 | 52 | return errors 53 | 54 | def _get_error_message( 55 | self, 56 | schema_uri: str, 57 | stac_object_type: STACObjectType, 58 | extension_id: Optional[str], 59 | href: Optional[str], 60 | stac_id: Optional[str], 61 | errors: Optional[List[str]], 62 | ) -> str: 63 | s = "Validation failed for {} ".format(stac_object_type) 64 | if href is not None: 65 | s += "at {} ".format(href) 66 | if stac_id is not None: 67 | s += "with ID {} ".format(stac_id) 68 | s += "against schema at {}".format(schema_uri) 69 | if extension_id is not None: 70 | s += " for STAC extension '{}'".format(extension_id) 71 | if errors: 72 | s += " with the following error(s): '{}'".format(", ".join(errors)) 73 | return s 74 | 75 | def validate_core( 76 | self, 77 | stac_dict: Dict[str, Any], 78 | stac_object_type: STACObjectType, 79 | stac_version: str, 80 | href: Optional[str] = None, 81 | ) -> Optional[str]: 82 | """Validate a core stac object. 83 | Return value can be None or specific to the implementation. 84 | Args: 85 | stac_dict : Dictionary that is the STAC json of the object. 86 | stac_object_type : The stac object type of the object encoded in 87 | stac_dict. One of :class:`~pystac.STACObjectType`. 88 | stac_version : The version of STAC to validate the object against. 89 | href : Optional HREF of the STAC object being validated. 
90 | Returns: 91 | str: URI for the JSON schema that was validated against, or None if 92 | no validation occurred. 93 | """ 94 | schema_uri = self.schema_uri_map.get_object_schema_uri(stac_object_type, stac_version) 95 | 96 | if schema_uri is None: 97 | return None 98 | try: 99 | errors = self._validate_from_uri(stac_dict, schema_uri) 100 | except Exception as e: 101 | get_log().error(f"Exception while validating {stac_object_type} href: {href}") 102 | raise e 103 | if errors: 104 | msg = self._get_error_message(schema_uri, stac_object_type, None, href, stac_dict.get("id"), errors) 105 | warnings.warn(msg) 106 | 107 | return schema_uri 108 | 109 | def validate_extension( 110 | self, 111 | stac_dict: Dict[str, Any], 112 | stac_object_type: STACObjectType, 113 | stac_version: str, 114 | extension_id: str, 115 | href: Optional[str] = None, 116 | ) -> Optional[str]: 117 | """Validate an extension stac object. 118 | Return value can be None or specific to the implementation. 119 | Args: 120 | stac_dict : Dictionary that is the STAC json of the object. 121 | stac_object_type : The stac object type of the object encoded in 122 | stac_dict. One of :class:`~pystac.STACObjectType`. 123 | stac_version : The version of STAC to validate the object against. 124 | extension_id : The extension ID to validate against. 125 | href : Optional HREF of the STAC object being validated. 126 | Returns: 127 | str: URI for the JSON schema that was validated against, or None if 128 | no validation occurred. 129 | """ 130 | schema_uri = extension_id 131 | 132 | if schema_uri is None: 133 | return None 134 | 135 | try: 136 | errors = self._validate_from_uri(stac_dict, schema_uri) 137 | except Exception as e: 138 | get_log().error(f"Exception while validating {stac_object_type} href: {href}") 139 | raise e 140 | if errors: 141 | msg = self._get_error_message(schema_uri, stac_object_type, extension_id, href, stac_dict.get("id"), errors) 142 | warnings.warn(msg) 143 | 144 | return schema_uri 145 | -------------------------------------------------------------------------------- /topo_processor/stac/linz_provider.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Any, Dict, List, Optional 3 | 4 | from pystac import Provider 5 | from pystac.utils import StringEnum 6 | 7 | 8 | class LinzProviderRole(StringEnum): 9 | """Enumerates the allows values of the LinzProvider "role" field.""" 10 | 11 | MANAGER = "manager" 12 | CUSTODIAN = "custodian" 13 | 14 | 15 | class LinzProvider(Provider): 16 | 17 | roles: Optional[List[LinzProviderRole]] # type:ignore 18 | """Optional roles of the provider. Any of manager or custodian. 19 | LINZ override of pystac.ProviderRole Enum. 
20 | Type ignored due to: https://github.com/radiantearth/stac-spec/issues/1147 21 | """ 22 | 23 | def __init__( 24 | self, 25 | name: str, 26 | description: Optional[str] = None, 27 | roles: Optional[List[LinzProviderRole]] = None, 28 | url: Optional[str] = None, 29 | extra_fields: Optional[Dict[str, Any]] = None, 30 | ): 31 | self.name = name 32 | self.description = description 33 | self.roles = roles 34 | self.url = url 35 | self.extra_fields = extra_fields or {} 36 | 37 | 38 | class LinzProviders(Enum): 39 | LTTW = LinzProvider( 40 | name="Toitū Te Whenua LINZ", 41 | description="The New Zealand Government's lead agency for location and property information, Crown land and managing overseas investment.", 42 | roles=[LinzProviderRole.CUSTODIAN], 43 | url="https://www.linz.govt.nz/about-linz/what-were-doing/projects/crown-aerial-film-archive-historical-imagery-scanning-project", 44 | ) 45 | LMPP = LinzProvider(name="Manager Partnership Programmes", roles=[LinzProviderRole.MANAGER]) 46 | -------------------------------------------------------------------------------- /topo_processor/stac/providers.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import pystac 4 | 5 | 6 | class Providers(Enum): 7 | TTW = pystac.Provider( 8 | name="Toitū Te Whenua LINZ", 9 | description="The New Zealand Government's lead agency for location and property information, Crown land and managing overseas investment.", 10 | roles=[pystac.ProviderRole.HOST, pystac.ProviderRole.LICENSOR, pystac.ProviderRole.PROCESSOR], 11 | url="https://www.linz.govt.nz/about-linz/what-were-doing/projects/crown-aerial-film-archive-historical-imagery-scanning-project", 12 | ) 13 | NZAM = pystac.Provider( 14 | name="NZ Aerial Mapping", 15 | description="Aerial survey and geospatial services firm. 
Went into liquidation in 2014.", 16 | roles=[pystac.ProviderRole.PRODUCER], 17 | ) 18 | -------------------------------------------------------------------------------- /topo_processor/stac/stac_extensions.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class StacExtensions(str, Enum): 5 | linz = "https://stac.linz.govt.nz/v0.0.15/linz/schema.json" 6 | quality = "https://stac.linz.govt.nz/v0.0.15/quality/schema.json" 7 | historical_imagery = "https://stac.linz.govt.nz/v0.0.15/historical-imagery/schema.json" 8 | aerial_photo = "https://stac.linz.govt.nz/v0.0.15/aerial-photo/schema.json" 9 | camera = "https://stac.linz.govt.nz/v0.0.15/camera/schema.json" 10 | film = "https://stac.linz.govt.nz/v0.0.15/film/schema.json" 11 | scanning = "https://stac.linz.govt.nz/v0.0.15/scanning/schema.json" 12 | eo = "https://stac-extensions.github.io/eo/v1.0.0/schema.json" 13 | file = "https://stac-extensions.github.io/file/v2.0.0/schema.json" 14 | projection = "https://stac-extensions.github.io/projection/v1.0.0/schema.json" 15 | version = "https://stac-extensions.github.io/version/v1.0.0/schema.json" 16 | -------------------------------------------------------------------------------- /topo_processor/stac/store.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from .asset import Asset 4 | from .collection import Collection 5 | from .item import Item 6 | 7 | collection_store: Dict[str, Collection] = {} 8 | item_store: Dict[str, Item] = {} 9 | asset_store: Dict[str, Asset] = {} 10 | 11 | 12 | def get_collection(title: str) -> Collection: 13 | if title not in collection_store: 14 | collection = Collection(title) 15 | collection_store[title] = collection 16 | return collection_store[title] 17 | 18 | 19 | def get_asset(source_path: str) -> Asset: 20 | if source_path not in asset_store: 21 | asset = Asset(source_path) 22 | asset_store[source_path] = asset 23 | return asset_store[source_path] 24 | 25 | 26 | def get_item(item_id: str) -> Item: 27 | if item_id not in item_store: 28 | item = Item(item_id) 29 | item_store[item_id] = item 30 | return item_store[item_id] 31 | -------------------------------------------------------------------------------- /topo_processor/stac/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/stac/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/stac/tests/asset_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from topo_processor.stac.asset import Asset 6 | 7 | 8 | def test_asset() -> None: 9 | """validate adding of extra field: file:checksum""" 10 | source_path = os.path.abspath(os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff")) 11 | asset = Asset(source_path) 12 | asset.href = "test_asset" 13 | checksum = asset.get_checksum() 14 | json_asset = asset.create_stac().to_dict() 15 | assert json_asset["file:checksum"] == checksum 16 | -------------------------------------------------------------------------------- /topo_processor/stac/tests/file_extension_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.metadata.data_type import 
DataType 4 | from topo_processor.util.file_extension import FILE_EXTENSIONS, is_extension, is_tiff 5 | 6 | 7 | def test_is_tiff() -> None: 8 | file_a = "file.tiff" 9 | file_b = "file.tif" 10 | file_c = "file.TIFF" 11 | file_d = "file.jpg" 12 | 13 | assert is_tiff(file_a) is True 14 | assert is_tiff(file_b) is True 15 | assert is_tiff(file_c) is True 16 | assert is_tiff(file_d) is False 17 | 18 | 19 | def test_is_extension_imagery_historic() -> None: 20 | file_a = "file.tiff" 21 | file_b = "file.tif" 22 | file_c = "file.TIFF" 23 | file_d = "file.jpg" 24 | 25 | assert is_extension(file_a, FILE_EXTENSIONS[DataType.IMAGERY_HISTORIC]) is True 26 | assert is_extension(file_b, FILE_EXTENSIONS[DataType.IMAGERY_HISTORIC]) is True 27 | assert is_extension(file_c, FILE_EXTENSIONS[DataType.IMAGERY_HISTORIC]) is True 28 | assert is_extension(file_d, FILE_EXTENSIONS[DataType.IMAGERY_HISTORIC]) is False 29 | -------------------------------------------------------------------------------- /topo_processor/stac/tests/iter_errors_validator_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | import pytest 5 | from pystac import STACValidationError, validation 6 | 7 | from topo_processor.metadata.metadata_validators.metadata_validator_stac import MetadataValidatorStac 8 | from topo_processor.stac.asset import Asset 9 | from topo_processor.stac.item import Item 10 | from topo_processor.stac.stac_extensions import StacExtensions 11 | 12 | 13 | def test_iter_errors_validator() -> None: 14 | """check error details is in exception message""" 15 | source_path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff") 16 | asset = Asset(source_path) 17 | item = Item("item_id") 18 | item.datetime = datetime.now() 19 | item.add_asset(asset) 20 | item.properties.update({"camera:nominal_focal_length": "string"}) 21 | item.properties.update({"camera:sequence_number": 1234}) 22 | item.add_extension(StacExtensions.camera.value, add_to_collection=False) 23 | validator = MetadataValidatorStac() 24 | assert validator.is_applicable(item) 25 | with pytest.raises(STACValidationError) as e: 26 | validator.validate_metadata(item) 27 | assert "'string' is not of type 'integer'" in str(e.value) 28 | 29 | 30 | def test_iter_errors_validator_multiple_extensions() -> None: 31 | """check error details is in exception message""" 32 | source_path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff") 33 | asset = Asset(source_path) 34 | item = Item("item_id") 35 | item.datetime = datetime.now() 36 | item.add_asset(asset) 37 | item.properties.update({"camera:nominal_focal_length": "string"}) 38 | item.properties.update({"camera:sequence_number": 1234}) 39 | item.add_extension(StacExtensions.camera.value, add_to_collection=False) 40 | item.add_extension(StacExtensions.aerial_photo.value, add_to_collection=False) 41 | validator = MetadataValidatorStac() 42 | assert validator.is_applicable(item) 43 | 44 | with pytest.raises(STACValidationError) as e: 45 | validator.validate_metadata(item) 46 | assert "'string' is not of type 'integer'" in str(e.value) 47 | assert "'aerial-photo:run' is a required property" in str(e.value) 48 | -------------------------------------------------------------------------------- /topo_processor/stac/tests/validate_report_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.stac.validate_report import 
ValidateReport 4 | 5 | 6 | def test_increment_error() -> None: 7 | """Check that increment_error counts errors per schema and error type""" 8 | error_report: ValidateReport = ValidateReport() 9 | error_report.increment_error("schema_a", "error_1") 10 | assert error_report.report_per_error_type["schema_a"]["error_1"] == 1 11 | error_report.increment_error("schema_a", "error_1") 12 | assert error_report.report_per_error_type["schema_a"]["error_1"] == 2 13 | error_report.increment_error("schema_b", "error_1") 14 | assert error_report.report_per_error_type["schema_b"]["error_1"] == 1 15 | error_report.increment_error("schema_a", "error_2") 16 | assert error_report.report_per_error_type["schema_a"]["error_2"] == 1 17 | -------------------------------------------------------------------------------- /topo_processor/stac/validate_report.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | 4 | class ValidateReport: 5 | total: int 6 | report_per_error_type: Dict[str, Dict[str, int]] 7 | 8 | def __init__(self) -> None: 9 | self.total = 0 10 | self.report_per_error_type = {} 11 | 12 | def add_errors(self, errors_per_schema: Dict[str, List[str]]) -> None: 13 | for schema_uri in errors_per_schema: 14 | for error in errors_per_schema[schema_uri]: 15 | self.increment_error(schema_uri, error) 16 | self.total = self.total + 1 17 | 18 | def increment_error(self, schema: str, error: str) -> None: 19 | existing = self.report_per_error_type.get(schema) 20 | if existing is None: 21 | self.report_per_error_type[schema] = existing = {} 22 | existing[error] = existing.get(error, 0) + 1 23 | -------------------------------------------------------------------------------- /topo_processor/stac/validation.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Union 2 | 3 | from linz_logger import get_log 4 | 5 | from topo_processor.metadata.data_type import DataType, get_layer_id 6 | from topo_processor.metadata.metadata_loaders import metadata_loader_imagery_hist 7 | from topo_processor.metadata.metadata_validators import metadata_validator_stac 8 | from topo_processor.stac.validate_report import ValidateReport 9 | from topo_processor.util.time import time_in_ms 10 | 11 | from .collection import Collection 12 | from .item import Item 13 | from .store import collection_store, item_store 14 | 15 | 16 | def validate_stac(metadata_file: str = "", validate_item: bool = True, validate_collection: bool = True) -> None: 17 | """This function only validates the Historical Imagery layer at the moment.""" 18 | # FIXME: Make this function generic by validating other layers (vs only Historical Imagery atm) 19 | start_time = time_in_ms() 20 | item_report: ValidateReport = ValidateReport() 21 | collection_report: ValidateReport = ValidateReport() 22 | 23 | get_log().debug("validate_stac", layer=get_layer_id(DataType.IMAGERY_HISTORIC)) 24 | 25 | # Load metadata from the metadata CSV file 26 | metadata_loader_imagery_hist.load_metadata(None, metadata_file, True) 27 | get_log().debug("Metadata Loaded", metadata_file=metadata_file, duration=time_in_ms() - start_time) 28 | 29 | # Validate metadata from stored STAC objects 30 | if validate_item: 31 | item_report = validate_store(item_store) 32 | if validate_collection: 33 | collection_report = validate_store(collection_store) 34 | 35 | # Print report 36 | get_log().info( 37 | "Metadata Validated", 38 | metadata_file=metadata_file, 39 | nbItemsValidated=item_report.total, 40 | nbCollectionsValidated=collection_report.total, 41
| duration=time_in_ms() - start_time, 42 | itemErrors=item_report.report_per_error_type, 43 | collectionErrors=collection_report.report_per_error_type, 44 | ) 45 | 46 | 47 | def validate_store(store: Union[Dict[str, Item], Dict[str, Collection]]) -> ValidateReport: 48 | validate_report: ValidateReport = ValidateReport() 49 | 50 | for stac_object in store.values(): 51 | if stac_object.is_valid(): 52 | validate_report.add_errors(metadata_validator_stac.validate_metadata_with_report(stac_object)) 53 | 54 | return validate_report 55 | -------------------------------------------------------------------------------- /topo_processor/util/__init__.py: -------------------------------------------------------------------------------- 1 | from .command import Command 2 | from .execution import ExecutionDocker, ExecutionLocal 3 | -------------------------------------------------------------------------------- /topo_processor/util/aws_credentials.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import TYPE_CHECKING, Dict 3 | 4 | from boto3 import Session 5 | from linz_logger import get_log 6 | 7 | from topo_processor.util.configuration import aws_profile, linz_ssm_bucket_config_name 8 | 9 | if TYPE_CHECKING: 10 | from mypy_boto3_sts import STSClient 11 | else: 12 | STSClient = object 13 | 14 | 15 | class Credentials: 16 | access_key: str 17 | secret_key: str 18 | token: str 19 | 20 | def __init__(self, access_key: str, secret_key: str, token: str): 21 | self.access_key = access_key 22 | self.secret_key = secret_key 23 | self.token = token 24 | 25 | 26 | session = Session(profile_name=aws_profile) 27 | client_sts: STSClient = session.client("sts") 28 | bucket_roles: Dict[str, Dict[str, str]] = {} 29 | bucket_credentials: Dict[str, Credentials] = {} 30 | 31 | # Load bucket to roleArn mapping for LINZ internal buckets from SSM 32 | def init_roles() -> None: 33 | get_log().debug("init_roles", linz_ssm_bucket_name=linz_ssm_bucket_config_name, aws_profile=aws_profile) 34 | if linz_ssm_bucket_config_name is None: 35 | return 36 | 37 | get_log().debug("load_bucket_config", ssm=linz_ssm_bucket_config_name) 38 | role_config_param = session.client("ssm").get_parameter(Name=linz_ssm_bucket_config_name) 39 | role_config = json.loads(role_config_param["Parameter"]["Value"]) 40 | 41 | for cfg in role_config: 42 | bucket_roles[cfg["bucket"]] = cfg 43 | get_log().info("load_bucket_config_done", ssm=linz_ssm_bucket_config_name, buckets=len(role_config)) 44 | 45 | 46 | def get_credentials_from_bucket(bucket_name: str) -> Credentials: 47 | get_log().debug("get_credentials_from_bucket", bucket_name=bucket_name) 48 | # FIXME: check if the token is expired - add a parameter 49 | if bucket_name not in bucket_credentials: 50 | role_arn = get_role_arn(bucket_name) 51 | if role_arn: 52 | bucket_credentials[bucket_name] = get_credentials_from_role(role_arn) 53 | else: 54 | session_credentials = session.get_credentials() 55 | default_credentials = Credentials( 56 | session_credentials.access_key, session_credentials.secret_key, session_credentials.token 57 | ) 58 | 59 | return default_credentials 60 | return bucket_credentials[bucket_name] 61 | 62 | 63 | def get_credentials_from_role(role_arn: str) -> Credentials: 64 | get_log().debug("get_credentials_from_role", role_arn=role_arn) 65 | assumed_role = client_sts.assume_role(RoleArn=role_arn, RoleSessionName="TopoProcessor") 66 | credentials = Credentials( 67 | assumed_role["Credentials"]["AccessKeyId"], 68 | 
assumed_role["Credentials"]["SecretAccessKey"], 69 | assumed_role["Credentials"]["SessionToken"], 70 | ) 71 | return credentials 72 | 73 | 74 | def get_role_arn(bucket_name: str) -> str: 75 | role_arn = "" 76 | if not bucket_roles: 77 | init_roles() 78 | if bucket_name in bucket_roles: 79 | role_arn = bucket_roles[bucket_name]["roleArn"] 80 | else: 81 | get_log().warn("role_arn_not_found", bucketName=bucket_name) 82 | 83 | return role_arn 84 | -------------------------------------------------------------------------------- /topo_processor/util/aws_files.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime, timedelta, timezone 3 | from typing import Any, Dict, List, Union 4 | from urllib.parse import urlparse 5 | 6 | import boto3 7 | from botocore import exceptions as botocore_exceptions 8 | from linz_logger import get_log 9 | 10 | from topo_processor.util.aws_credentials import Credentials, get_credentials_from_bucket 11 | from topo_processor.util.configuration import historical_imagery_bucket 12 | from topo_processor.util.file_extension import is_tiff 13 | from topo_processor.util.time import time_in_ms 14 | 15 | 16 | def s3_download(source_path: str, dest_path: str, credentials: Union[Credentials, None] = None) -> None: 17 | start_time = time_in_ms() 18 | get_log().debug("s3_download started", objectPath=source_path, destinationPath=dest_path) 19 | 20 | url_o = urlparse(source_path) 21 | bucket_name = url_o.netloc 22 | object_name = url_o.path[1:] 23 | 24 | if not credentials: 25 | credentials = get_credentials_from_bucket(bucket_name) 26 | 27 | s3 = boto3.resource( 28 | "s3", 29 | aws_access_key_id=credentials.access_key, 30 | aws_secret_access_key=credentials.secret_key, 31 | aws_session_token=credentials.token, 32 | ) 33 | 34 | try: 35 | s3.Bucket(bucket_name).download_file(object_name, dest_path) 36 | except Exception as e: 37 | get_log().error("s3_download failed", objectPath=source_path, error=e) 38 | raise e 39 | 40 | get_log().debug( 41 | "s3_download ended", 42 | objectPath=source_path, 43 | destinationPath=dest_path, 44 | duration=time_in_ms() - start_time, 45 | ) 46 | 47 | 48 | def load_file_content(bucket_name: str, object_path: str) -> Dict[str, Any]: 49 | get_log().debug("bucket_name", bucket_name=bucket_name) 50 | credentials: Credentials = get_credentials_from_bucket(bucket_name) 51 | 52 | s3 = boto3.resource( 53 | "s3", 54 | aws_access_key_id=credentials.access_key, 55 | aws_secret_access_key=credentials.secret_key, 56 | aws_session_token=credentials.token, 57 | ) 58 | 59 | object_content = s3.Object(bucket_name=bucket_name, key=object_path) 60 | 61 | if object_path.endswith(".json"): 62 | json_result: Dict[str, Any] = json.loads(object_content.get()["Body"].read()) 63 | return json_result 64 | 65 | result: Dict[str, Any] = json.loads(object_content.get()["Body"].read().decode("utf-8")) 66 | return result 67 | 68 | 69 | def build_s3_path(bucket_name: str, object_path: str) -> str: 70 | return f"s3://{bucket_name}/" + (object_path[1:] if object_path.startswith("/") else object_path) 71 | 72 | 73 | def create_s3_manifest(manifest_source_path: str) -> None: 74 | # TODO:lock file 75 | start_time = time_in_ms() 76 | get_log().debug("check_manifest", manifestPath=manifest_source_path) 77 | 78 | url_o = urlparse(manifest_source_path) 79 | bucket_name = url_o.netloc 80 | manifest_path = url_o.path[1:] 81 | credentials: Credentials = get_credentials_from_bucket(bucket_name) 82 | 83 | s3_client = 
boto3.client( 84 | "s3", 85 | aws_access_key_id=credentials.access_key, 86 | aws_secret_access_key=credentials.secret_key, 87 | aws_session_token=credentials.token, 88 | ) 89 | 90 | try: 91 | manifest_modified_datetime = s3_client.head_object(Bucket=bucket_name, Key=manifest_path)["LastModified"] 92 | cutoff_datetime = datetime.now(timezone.utc) - timedelta(days=28) 93 | if cutoff_datetime < manifest_modified_datetime: 94 | return 95 | 96 | except botocore_exceptions.ClientError as e: 97 | if e.response["Error"]["Code"] == "404": 98 | get_log().debug("no_manifest_file_found", bucketName=bucket_name, manifestPath=manifest_path, error=e) 99 | else: 100 | raise e 101 | 102 | try: 103 | get_log().debug("create_manifest", bucketName=bucket_name, manifestPath=manifest_path) 104 | manifest_new: Dict[str, Any] = {} 105 | manifest_file_list = _list_objects(historical_imagery_bucket) 106 | manifest_new["path"] = manifest_path 107 | manifest_new["time"] = time_in_ms() 108 | manifest_new["files"] = manifest_file_list 109 | 110 | s3_client.put_object( 111 | Body=json.dumps(manifest_new).encode("UTF-8"), 112 | ContentType="application/json", 113 | Bucket=bucket_name, 114 | Key=manifest_path, 115 | ) 116 | 117 | except Exception as e: 118 | get_log().error("create_manifest_failed", bucketPath=bucket_name, manifestPath=manifest_path, error=e) 119 | raise e 120 | 121 | get_log().debug( 122 | "log_manifest_create_time", 123 | manifestSourcePath=manifest_source_path, 124 | duration=time_in_ms() - start_time, 125 | ) 126 | 127 | 128 | def _list_objects(bucket_name: str) -> List[Dict[str, str]]: 129 | 130 | credentials: Credentials = get_credentials_from_bucket(bucket_name) 131 | 132 | s3_client = boto3.client( 133 | "s3", 134 | aws_access_key_id=credentials.access_key, 135 | aws_secret_access_key=credentials.secret_key, 136 | aws_session_token=credentials.token, 137 | ) 138 | 139 | file_list: List[Dict[str, str]] = [] 140 | paginator = s3_client.get_paginator("list_objects_v2") 141 | response_iterator = paginator.paginate(Bucket=bucket_name) 142 | for response in response_iterator: 143 | for contents_data in response["Contents"]: 144 | key = contents_data["Key"] 145 | if is_tiff(key): 146 | file_list.append({"path": key}) 147 | 148 | return file_list 149 | -------------------------------------------------------------------------------- /topo_processor/util/checksum.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | import multihash 4 | 5 | from topo_processor.file_system.get_fs import get_fs 6 | 7 | CHUNK_SIZE = 1024 * 1024 # 1MB 8 | 9 | 10 | def multihash_as_hex(path: str) -> str: 11 | file_hash = hashlib.sha256() 12 | with get_fs(path).open(path, "rb") as file: 13 | while chunk := file.read(CHUNK_SIZE): 14 | file_hash.update(chunk) 15 | result: str = multihash.to_hex_string(multihash.encode(file_hash.digest(), "sha2-256")) 16 | return result 17 | -------------------------------------------------------------------------------- /topo_processor/util/command.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Optional, Tuple, TypedDict 3 | 4 | from topo_processor.util.execution import ExecutionDocker, ExecutionLocal 5 | 6 | 7 | class CommandDocker(TypedDict): 8 | container: str 9 | tag: Optional[str] 10 | 11 | 12 | class Command: 13 | use_docker: bool 14 | 15 | def __init__(self, command: str, docker_ref: Optional[CommandDocker] = None) -> None: 16 | self.command = command 17 
| self.arguments: List[str] = [] 18 | self.volumes: List[str] = [] 19 | self.envs: List[str] = [] 20 | if docker_ref is None: 21 | self.use_docker = False 22 | else: 23 | self.use_docker = True 24 | self.container = docker_ref.get("container", None) 25 | self.container_tag = docker_ref.get("tag", None) 26 | 27 | def arg(self, *args: str) -> "Command": 28 | for argument in args: 29 | self.arguments.append(argument) 30 | return self 31 | 32 | def mount(self, *args: str) -> "Command": 33 | """Mount a folder, useful only if the command is run inside of docker""" 34 | for volume in args: 35 | self.volumes.append(volume) 36 | return self 37 | 38 | def env(self, *args: str) -> "Command": 39 | """Only useful when using docker""" 40 | for env in args: 41 | self.envs.append(env) 42 | return self 43 | 44 | def to_full_command(self) -> List[str]: 45 | return [self.command] + self.arguments 46 | 47 | def redacted_command(self) -> List[str]: 48 | """Provide redacted argument string for logging which removes sensitive information""" 49 | redacted = [] 50 | for arg in self.arguments: 51 | if arg.startswith("AWS"): 52 | split_arg = arg.split("=") 53 | arg = f"{split_arg[0]}=******" 54 | redacted.append(arg) 55 | return [self.command] + redacted 56 | 57 | def to_docker(self) -> "Command": 58 | if not self.container: 59 | raise Exception(f"No container found for command {self.command}") 60 | docker = Command("docker") 61 | docker.arg("run") 62 | for env in self.envs: 63 | docker.arg("--env", env) 64 | docker.arg("--user", f"{os.geteuid()}:{os.getegid()}") 65 | for volume in self.volumes: 66 | docker.arg("-v", f"{volume}:{volume}") 67 | docker.arg("--rm") 68 | 69 | if not self.container_tag: 70 | docker.arg(self.container) 71 | else: 72 | docker.arg(f"{self.container}:{self.container_tag}") 73 | 74 | docker.arg(self.command) 75 | for argument in self.arguments: 76 | docker.arg(argument) 77 | return docker 78 | 79 | def run(self) -> Tuple[int, str, str]: 80 | if self.use_docker: 81 | return ExecutionDocker.run(self) 82 | return ExecutionLocal.run(self) 83 | -------------------------------------------------------------------------------- /topo_processor/util/configuration.py: -------------------------------------------------------------------------------- 1 | from os import environ, path 2 | from tempfile import mkdtemp 3 | from typing import Dict, Optional 4 | 5 | from dotenv import load_dotenv 6 | from linz_logger import get_log 7 | 8 | load_dotenv() 9 | 10 | 11 | def get_env(env_name: str) -> str: 12 | env_var = environ.get(env_name) 13 | if env_var is None: 14 | raise Exception(f"Missing environment variable ${env_name}") 15 | return env_var 16 | 17 | 18 | lds_cache_bucket: str = get_env("LINZ_CACHE_BUCKET") 19 | historical_imagery_bucket = get_env("LINZ_HISTORICAL_IMAGERY_BUCKET") 20 | aws_profile: Optional[str] = environ.get("AWS_PROFILE") 21 | linz_ssm_bucket_config_name: Optional[str] = environ.get("LINZ_SSM_BUCKET_CONFIG_NAME") 22 | temp_folder: str = mkdtemp() 23 | get_log().debug( 24 | "from_environment_variables", lds_cache_bucket=lds_cache_bucket, aws_profile=aws_profile, ssm=linz_ssm_bucket_config_name 25 | ) 26 | 27 | 28 | def get_topo_processor_version() -> Dict[str, str]: 29 | with open(path.join("VERSION")) as version_file: 30 | version: str = version_file.read().strip() 31 | return {"Topo Processor": version} 32 | -------------------------------------------------------------------------------- /topo_processor/util/conversions.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | from datetime import datetime 3 | from typing import Any, Dict, List, Union 4 | 5 | from dateutil import parser, tz 6 | 7 | 8 | def string_to_number(value: str) -> Union[float, int, str]: 9 | """If possible this function returns the int/float of the input value, 10 | if not it returns the string. 11 | """ 12 | try: 13 | int_number = int(value) 14 | return int_number 15 | except ValueError: 16 | try: 17 | float_number = float(value) 18 | return float_number 19 | except ValueError: 20 | return value 21 | 22 | 23 | def remove_empty_strings(properties: Dict[str, Any]) -> Dict[str, Any]: 24 | return {key: value for key, value in properties.items() if value != ""} 25 | 26 | 27 | def string_to_boolean(value: str, true_values: List[str], false_values: List[str]) -> Union[bool, str]: 28 | """Find value in lists and return boolean, 29 | else returns the original value string. 30 | """ 31 | clean_value = value.strip().lower() 32 | if clean_value in true_values: 33 | return True 34 | if clean_value in false_values: 35 | return False 36 | return value 37 | 38 | 39 | def nzt_datetime_to_utc_datetime(date: str) -> datetime: 40 | utc_tz = tz.gettz("UTC") 41 | nz_tz = tz.gettz("Pacific/Auckland") 42 | 43 | try: 44 | nz_time = parser.parse(date).replace(tzinfo=nz_tz) 45 | except parser.ParserError as err: 46 | raise Exception(f"Not a valid date: {err}") from err 47 | 48 | utc_time: datetime = nz_time.astimezone(utc_tz) 49 | 50 | return utc_time 51 | 52 | 53 | def quarterdate_to_date_string(value: str) -> str: 54 | """If possible this function converts quarter e.g. 'Q3' to RFC3339 format, 55 | e.g. '2021-03-01T00:00:00.000Z', then to UTC, else returns original value string. 56 | """ 57 | re_result = re.search(r"(\d{4})[/][qQ]([1-4])", value) 58 | 59 | if re_result is not None: 60 | 61 | year = re_result.group(1) 62 | month = (3 * (int(re_result.group(2)))) - 2 63 | 64 | date_string_nz = f"{year}-{month}-01T00:00:00.000" 65 | datetime_utc = nzt_datetime_to_utc_datetime(date_string_nz) 66 | date_string_utc = datetime_utc.strftime("%Y-%m-%dT%H:%M:%S") + "Z" 67 | return date_string_utc 68 | 69 | return value 70 | 71 | 72 | def historical_imagery_photo_type_to_linz_geospatial_type(photo_type: str) -> str: 73 | """Find value in dict and return linz_geospatial_type, 74 | else return the original value string. 
75 | """ 76 | geospatial_type_conversion_table = { 77 | "B&W": "black and white image", 78 | "B&W IR": "black and white infrared image", 79 | "COLOUR": "color image", 80 | "COLOUR IR": "color infrared image", 81 | } 82 | 83 | lgs_value = geospatial_type_conversion_table.get(photo_type.strip().upper()) 84 | if lgs_value: 85 | return lgs_value 86 | else: 87 | return photo_type 88 | -------------------------------------------------------------------------------- /topo_processor/util/execution.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from typing import TYPE_CHECKING, Tuple 3 | 4 | from linz_logger import get_log 5 | 6 | from topo_processor.util.time import time_in_ms 7 | 8 | if TYPE_CHECKING: 9 | from .command import Command 10 | 11 | 12 | class ExecutionLocal: 13 | cmd: "Command" 14 | 15 | @staticmethod 16 | def run(cmd: "Command") -> Tuple[int, str, str]: 17 | start_time = time_in_ms() 18 | 19 | proc = subprocess.run(cmd.to_full_command(), stdout=subprocess.PIPE, stderr=subprocess.PIPE) 20 | if proc.returncode != 0: 21 | get_log().error("Run command failed", command=cmd.redacted_command(), duration=time_in_ms() - start_time) 22 | raise Exception(proc.stderr.decode()) 23 | get_log().trace("Run command succeeded", command=cmd.redacted_command(), duration=time_in_ms() - start_time) 24 | return proc.returncode, proc.stdout.decode(), proc.stderr.decode() 25 | 26 | 27 | class ExecutionDocker: 28 | cmd: "Command" 29 | 30 | @staticmethod 31 | def run(cmd: "Command") -> Tuple[int, str, str]: 32 | return ExecutionLocal.run(cmd.to_docker()) 33 | -------------------------------------------------------------------------------- /topo_processor/util/file_converter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from topo_processor.util.command import Command 4 | 5 | 6 | def geopackage_to_csv(input_path: str, output_path: str) -> Command: 7 | if os.environ.get("IS_DOCKER") == "true": 8 | cmd = Command("ogr2ogr") 9 | else: 10 | cmd = Command("ogr2ogr", {"container": "osgeo/gdal", "tag": "ubuntu-small-3.5.0"}) 11 | 12 | cmd.mount(input_path) 13 | cmd.mount(os.path.dirname(output_path)) 14 | cmd.arg("-f", "CSV") 15 | cmd.arg("-lco", "GEOMETRY=AS_WKT") 16 | cmd.arg("-nlt", "POLYGON") 17 | cmd.arg(output_path) 18 | cmd.arg(input_path) 19 | return cmd 20 | -------------------------------------------------------------------------------- /topo_processor/util/file_extension.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Tuple 2 | 3 | FILE_EXTENSIONS: Dict[str, Tuple[str, ...]] = {"imagery.historic": (".tif", ".tiff")} 4 | 5 | 6 | def is_extension(file_name: str, extensions: Tuple[str, ...]) -> bool: 7 | return file_name.lower().endswith(extensions) 8 | 9 | 10 | def is_tiff(path: str) -> bool: 11 | return is_extension(path, (".tiff", ".tif")) 12 | 13 | 14 | def is_csv(path: str) -> bool: 15 | return is_extension(path, (".csv",)) 16 | 17 | 18 | def is_geopackage(path: str) -> bool: 19 | return is_extension(path, (".gpkg",)) 20 | -------------------------------------------------------------------------------- /topo_processor/util/files.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | 5 | def get_file_update_time(path: str) -> str: 6 | """Return the time (as an ISO 8601 UTC string) of the last update of the path metadata 7 |
https://docs.python.org/3.9/library/os.path.html#os.path.getctime 8 | Here ctime refers to the last metadata change for specified path in UNIX while in Windows, it refers to path creation time.""" 9 | update_ctime = os.path.getctime(path) 10 | update_time = datetime.utcfromtimestamp(update_ctime).strftime("%Y-%m-%dT%H:%M:%S") + "Z" 11 | return update_time 12 | -------------------------------------------------------------------------------- /topo_processor/util/gzip.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | from typing import Optional 3 | 4 | from linz_logger.logger import get_log 5 | 6 | 7 | def is_gzip_file(file_path: str) -> bool: 8 | with open(file_path, "rb") as file: 9 | # gzip magic number == "1f 8b" 10 | return file.read(2) == b"\x1f\x8b" 11 | 12 | 13 | def decompress_file(file_path: str) -> None: 14 | input: Optional[gzip.GzipFile] = None 15 | 16 | try: 17 | input = gzip.GzipFile(file_path, "rb") 18 | s = input.read() 19 | except gzip.BadGzipFile as e: 20 | get_log().error("File decompression failed", file=file_path, error=e) 21 | raise e 22 | finally: 23 | if input: 24 | input.close() 25 | 26 | output = open(file_path, "wb") 27 | output.write(s) 28 | output.close() 29 | -------------------------------------------------------------------------------- /topo_processor/util/s3.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from linz_logger import get_log 3 | 4 | 5 | def bucket_name_from_path(path: str) -> str: 6 | path_parts = path.replace("s3://", "").split("/") 7 | return path_parts.pop(0) 8 | 9 | 10 | def bucket_name_from_stack(stack_name: str) -> str: 11 | get_log().debug("stack_name", stack_name=stack_name) 12 | session = boto3.Session() 13 | cloudformation = session.resource("cloudformation") 14 | stack = cloudformation.Stack(stack_name) 15 | 16 | temp_bucket: str = "" 17 | 18 | for output in stack.outputs: 19 | if output["OutputKey"] == "TempBucketName": 20 | get_log().debug("bucket_name", bucket_name=output["OutputValue"]) 21 | temp_bucket = output["OutputValue"] 22 | 23 | if not temp_bucket: 24 | get_log().error("bucket_name_not_found", stackName=stack_name) 25 | raise Exception("No temp_bucket found in stack") 26 | 27 | return temp_bucket 28 | 29 | 30 | def is_s3_path(path: str) -> bool: 31 | if path.startswith("s3://"): 32 | return True 33 | return False 34 | -------------------------------------------------------------------------------- /topo_processor/util/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/util/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/util/tests/aws_credentials_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.util.aws_credentials import bucket_roles, get_role_arn 4 | 5 | 6 | # Add test with AWS mock 7 | def test_get_role_arn() -> None: 8 | bucket_roles["bucket-test"] = {"roleArn": "arn:aws:iam::123456789012:role/S3Access"} 9 | assert get_role_arn("bucket-test") == "arn:aws:iam::123456789012:role/S3Access" 10 | -------------------------------------------------------------------------------- /topo_processor/util/tests/aws_files_test.py: -------------------------------------------------------------------------------- 1 | 
import pytest 2 | 3 | from topo_processor.util.aws_files import build_s3_path 4 | 5 | 6 | # Add test with AWS mock 7 | def test_build_s3_path() -> None: 8 | assert build_s3_path("test-bucket", "/test-folder/object.ext") == "s3://test-bucket/test-folder/object.ext" 9 | -------------------------------------------------------------------------------- /topo_processor/util/tests/checksum_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from topo_processor.util.checksum import multihash_as_hex 6 | 7 | 8 | def test_multihash_as_hex() -> None: 9 | path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "WRONG_SURVEY.tiff") 10 | assert multihash_as_hex(path) == "1220d1bed69013d3dbcf4b1ef90016d77be83ad9b1759865ef5f9969ed540f902f53" 11 | 12 | path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff") 13 | assert multihash_as_hex(path) == "1220d3e42a62bb123eeeb96358f1e4ed46d20b1a329a4738dd643d27623ba8452957" 14 | 15 | path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "MULTIPLE_ASSET.tiff") 16 | assert multihash_as_hex(path) == "1220d3e42a62bb123eeeb96358f1e4ed46d20b1a329a4738dd643d27623ba8452957" 17 | -------------------------------------------------------------------------------- /topo_processor/util/tests/command_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.util.command import Command 4 | 5 | 6 | def test_hello_world_local() -> None: 7 | cmd = Command("echo") 8 | cmd.arg("Hello World Local!!!") 9 | return_code, stdout, stderr = cmd.run() 10 | assert stdout == "Hello World Local!!!\n" 11 | assert stderr == "" 12 | assert return_code == 0 13 | 14 | 15 | def test_hello_world_docker(mocker) -> None: # type: ignore 16 | cmd = Command("/bin/echo", {"container": "busybox", "tag": "latest"}) 17 | cmd.arg("Hello World Docker!!!") 18 | mocker.patch("topo_processor.util.execution.ExecutionLocal.run", return_value=[0, "Hello World Docker!!!\n", ""]) 19 | return_code, stdout, _ = cmd.run() 20 | assert stdout == "Hello World Docker!!!\n" 21 | assert return_code == 0 22 | -------------------------------------------------------------------------------- /topo_processor/util/tests/conversions_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.util.conversions import ( 4 | historical_imagery_photo_type_to_linz_geospatial_type, 5 | nzt_datetime_to_utc_datetime, 6 | quarterdate_to_date_string, 7 | ) 8 | 9 | 10 | def test_nzt_datetime_to_utc_datetime_daylight_saving_on() -> None: 11 | utc_date = nzt_datetime_to_utc_datetime("1988-01-11T00:00:00.000") 12 | assert utc_date.isoformat() == "1988-01-10T11:00:00+00:00" 13 | 14 | 15 | def test_nzt_datetime_to_utc_datetime_daylight_saving_off() -> None: 16 | utc_date = nzt_datetime_to_utc_datetime("1988-07-11T00:00:00.000") 17 | assert utc_date.isoformat() == "1988-07-10T12:00:00+00:00" 18 | 19 | 20 | def test_quarter_date_to_utc_correct_format() -> None: 21 | utc_date_string = quarterdate_to_date_string("2020/Q1") 22 | assert utc_date_string == "2019-12-31T11:00:00Z" 23 | 24 | 25 | def test_quarter_date_to_utc_incorrect_format() -> None: 26 | returned_string = quarterdate_to_date_string("nzam_pilot") 27 | assert returned_string == "nzam_pilot" 28 | 29 | 30 | def test_historical_imagery_photo_type_to_linz_geospatial_type_empty_string() -> None: 31 | returned_string = 
historical_imagery_photo_type_to_linz_geospatial_type("") 32 | assert returned_string == "" 33 | 34 | 35 | def test_historical_imagery_photo_type_to_linz_geospatial_type_whitespace_case() -> None: 36 | returned_string = historical_imagery_photo_type_to_linz_geospatial_type(" B&w IR ") 37 | assert returned_string == "black and white infrared image" 38 | -------------------------------------------------------------------------------- /topo_processor/util/tests/files_test.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | 4 | import pytest 5 | 6 | from topo_processor.util.gzip import is_gzip_file 7 | 8 | 9 | def test_is_gzip_file_true(setup: str) -> None: 10 | compressed_file = os.path.abspath(os.path.join(setup, "file.gz")) 11 | cf = gzip.open(compressed_file, "wb") 12 | cf.write("test".encode("utf-8")) 13 | cf.close() 14 | 15 | assert is_gzip_file(compressed_file) == True 16 | 17 | 18 | def test_is_gzip_file_false(setup: str) -> None: 19 | file = os.path.abspath(os.path.join(setup, "file.txt")) 20 | cf = open(file, "wb") 21 | cf.write("test".encode("utf-8")) 22 | cf.close() 23 | 24 | assert is_gzip_file(file) == False 25 | -------------------------------------------------------------------------------- /topo_processor/util/tests/time_test.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import List 3 | 4 | from topo_processor.util.time import get_min_max_interval 5 | 6 | 7 | def test_get_min_max_interval() -> None: 8 | dates: List[datetime] = [] 9 | datetime_earliest = datetime.strptime("1918-11-11", "%Y-%m-%d") 10 | datetime_mid = datetime.strptime("1945-05-08", "%Y-%m-%d") 11 | datetime_latest = datetime.strptime("1989-11-09", "%Y-%m-%d") 12 | dates.append(datetime_earliest) 13 | dates.append(datetime_latest) 14 | dates.append(datetime_mid) 15 | 16 | assert get_min_max_interval(dates) == [datetime_earliest, datetime_latest] 17 | -------------------------------------------------------------------------------- /topo_processor/util/tests/transfer_collection_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from datetime import datetime 4 | 5 | import pytest 6 | 7 | from topo_processor.metadata.data_type import DataType 8 | from topo_processor.metadata.metadata_loaders.metadata_loader_imagery_historic import MetadataLoaderImageryHistoric 9 | from topo_processor.stac.asset import Asset 10 | from topo_processor.stac.asset_key import AssetKey 11 | from topo_processor.stac.collection import Collection 12 | from topo_processor.stac.item import Item 13 | from topo_processor.util.transfer_collection import transfer_collection 14 | 15 | 16 | def test_fail_on_duplicate_assets(setup: str) -> None: 17 | target = setup 18 | collection = Collection("fake_title") 19 | collection.survey = "survey_id" 20 | collection.description = "fake_description" 21 | collection.license = "fake_license" 22 | item = Item("item_id") 23 | item.datetime = datetime.now() 24 | item.linz_geospatial_type = "black and white image" 25 | collection.add_item(item) 26 | item.collection = collection 27 | 28 | cog_1 = Asset("./test_data/tiffs/SURVEY_1/CONTROL.tiff") 29 | cog_1.target = "fake_title/fake_target.tiff" 30 | cog_1.key_name = AssetKey.Visual 31 | item.add_asset(cog_1) 32 | 33 | cog_2 = Asset("test_data/tiffs/SURVEY_1/MULTIPLE_ASSET.tiff") 34 | cog_2.target = "fake_title/fake_target.tiff" 35 | 
cog_2.key_name = AssetKey.Visual 36 | item.add_asset(cog_2) 37 | 38 | with pytest.raises(Exception, match=r"./item_id.tiff already exists."): 39 | transfer_collection(item.collection, target, DataType("imagery.historic")) 40 | 41 | 42 | def test_asset_key_not_in_list(setup: str) -> None: 43 | target = setup 44 | collection = Collection("fake_title") 45 | collection.survey = "survey_id" 46 | collection.description = "fake_description" 47 | collection.license = "fake_license" 48 | item = Item("item_id") 49 | item.datetime = datetime.now() 50 | item.linz_geospatial_type = "black and white image" 51 | collection.add_item(item) 52 | item.collection = collection 53 | 54 | test_asset = Asset("./test_data/tiffs/SURVEY_1/CONTROL.tiff") 55 | test_asset.target = "fake_title/fake_target.tiff" 56 | test_asset.key_name = None 57 | item.add_asset(test_asset) 58 | 59 | with pytest.raises(Exception, match=r"No asset key set for asset ./item_id.tiff"): 60 | transfer_collection(item.collection, target, DataType("imagery.historic")) 61 | 62 | 63 | def test_generate_summaries(setup: str) -> None: 64 | target = setup 65 | collection = Collection("AUCKLAND 1") 66 | collection.description = "fake_description" 67 | collection.license = "face_license" 68 | collection.survey = "SURVEY_1" 69 | test_geom = { 70 | "WKT": "POLYGON ((177.168157744315 -38.7538525409217," 71 | "177.23423558687 -38.7514276946524," 72 | "177.237358655351 -38.8031681573174," 73 | "177.17123348276 -38.8055953066942," 74 | "177.168157744315 -38.7538525409217))" 75 | } 76 | test_datetime = datetime.strptime("1918-11-11", "%Y-%m-%d") 77 | 78 | item_1 = Item("item_1_id") 79 | metadata_loader_imagery_historic = MetadataLoaderImageryHistoric() 80 | metadata_loader_imagery_historic.add_spatial_extent(item_1, asset_metadata=test_geom) 81 | item_1.datetime = test_datetime 82 | item_1.properties = { 83 | "mission": "SURVEY_1", 84 | "proj:centroid": {"lat": -45.8079, "lon": 170.5548}, 85 | "camera:sequence_number": 89555, 86 | "film:id": "731", 87 | "aerial-photo:scale": 6600, 88 | "scan:scanned": "2014-06-30T12:00:00Z", 89 | "proj:epsg": "null", 90 | } 91 | collection.add_item(item_1) 92 | item_1.collection = collection 93 | 94 | item_2 = Item("item_2_id") 95 | metadata_loader_imagery_historic = MetadataLoaderImageryHistoric() 96 | metadata_loader_imagery_historic.add_spatial_extent(item_2, asset_metadata=test_geom) 97 | item_2.datetime = test_datetime 98 | item_2.properties = { 99 | "mission": "SURVEY_1", 100 | "proj:centroid": {"lat": -45.8079, "lon": 170.5599}, 101 | "camera:sequence_number": 89554, 102 | "film:id": "731", 103 | "aerial-photo:scale": 5600, 104 | "scan:scanned": "2019-12-31T11:00:00Z", 105 | "proj:epsg": "null", 106 | } 107 | collection.add_item(item_2) 108 | item_2.collection = collection 109 | 110 | transfer_collection(item_1.collection, target, DataType("imagery.aerial")) 111 | 112 | with open(os.path.join(target, "SURVEY_1", "collection.json")) as collection_json_file: 113 | collection_metadata = json.load(collection_json_file) 114 | assert collection_metadata["summaries"]["mission"] == ["SURVEY_1"] 115 | assert collection_metadata["summaries"]["film:id"] == ["731"] 116 | assert collection_metadata["summaries"]["proj:epsg"] == ["null"] 117 | assert collection_metadata["summaries"]["aerial-photo:scale"] == {"minimum": 5600, "maximum": 6600} 118 | assert collection_metadata["summaries"]["scan:scanned"] == { 119 | "minimum": "2014-06-30T12:00:00Z", 120 | "maximum": "2019-12-31T11:00:00Z", 121 | } 122 | assert 
collection_metadata["summaries"]["camera:sequence_number"] == {"minimum": 89554, "maximum": 89555} 123 | assert "proj:centroid" not in collection_metadata["summaries"].keys() 124 | -------------------------------------------------------------------------------- /topo_processor/util/time.py: -------------------------------------------------------------------------------- 1 | import time 2 | from datetime import datetime 3 | from typing import List, Union 4 | 5 | 6 | def time_in_ms() -> float: 7 | return time.time() * 1000 8 | 9 | 10 | def get_min_max_interval(times: List[datetime]) -> List[Union[datetime, None]]: 11 | min_date = None 12 | max_date = None 13 | 14 | for date in times: 15 | if not min_date: 16 | min_date = date 17 | elif date < min_date: 18 | min_date = date 19 | if not max_date: 20 | max_date = date 21 | elif date > max_date: 22 | max_date = date 23 | 24 | return [min_date, max_date] 25 | -------------------------------------------------------------------------------- /topo_processor/util/transfer_collection.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from typing import TYPE_CHECKING, Any 5 | 6 | from linz_logger import get_log 7 | from pystac.catalog import CatalogType 8 | 9 | from topo_processor.file_system.transfer import transfer_file 10 | from topo_processor.file_system.write_json import write_json 11 | from topo_processor.metadata.data_type import DataType 12 | 13 | if TYPE_CHECKING: 14 | from topo_processor.stac.collection import Collection 15 | 16 | 17 | def transfer_collection(collection: Collection, target: str, data_type: DataType, force: bool = False) -> None: 18 | stac_collection = collection.create_stac() 19 | files_to_transfer: dict[str, Any] = {} 20 | # pystac v1.1.0 21 | # Required to remove cwd from collection self_href, 22 | # Must be come after collection.create_stac and be before stac_collection.add_item(..) 
23 | stac_collection.catalog_type = CatalogType.SELF_CONTAINED 24 | 25 | for item in collection.items.values(): 26 | if not item.is_valid(): 27 | get_log().warning("Invalid item won't be uploaded:", error=item.log) 28 | continue 29 | if item.log: 30 | get_log().warning(f"Item {item.id} contains warnings:", error=item.log) 31 | 32 | stac_item = item.create_stac() 33 | stac_collection.add_item(stac_item) 34 | # pystac v1.1.0 35 | # Required to change the pystac default of ./{id}/{id}.json 36 | # Must come after stac_collection.add_item(stac_item) 37 | stac_item.set_self_href(f"./{item.id}.json") 38 | 39 | existing_asset_hrefs = {} 40 | 41 | for asset in item.assets: 42 | 43 | if not asset.needs_upload: 44 | continue 45 | asset.href = f"./{item.id}{asset.file_ext()}" 46 | if asset.href in existing_asset_hrefs: 47 | raise Exception(f"{asset.href} already exists.") 48 | if not asset.target: 49 | raise Exception(f"No asset target set for asset {asset.href}") 50 | asset_transfer = { 51 | "source": asset.source_path, 52 | "checksum": asset.get_checksum(), 53 | "contentType": asset.get_content_type(), 54 | "target": os.path.join(target, asset.target), 55 | } 56 | 57 | if not asset.key_name: 58 | raise Exception(f"No asset key set for asset {asset.href}") 59 | else: 60 | stac_item.add_asset(key=asset.key_name, asset=asset.create_stac()) 61 | 62 | existing_asset_hrefs[asset.href] = asset_transfer 63 | 64 | files_to_transfer[item.id] = {"images": existing_asset_hrefs} 65 | 66 | # pystac v1.1.0 67 | # Required to not add a self link with an 'absolute' link from the cwd 68 | json_item = stac_item.to_dict(include_self_link=False) 69 | if not item.collection: 70 | raise Exception(f"No collection set for item {item.id}") 71 | files_to_transfer[item.id]["stac"] = { 72 | "item": json_item, 73 | "target": os.path.join(target, item.collection.survey, f"{item.id}.json"), 74 | } 75 | 76 | # after all items have been processed generate summaries 77 | collection.generate_summaries(stac_collection) 78 | collection.update_description(stac_collection, data_type) 79 | 80 | try: 81 | collection.validate_pystac_collection(stac_collection) 82 | except Exception as e: 83 | get_log().error(f"Collection Validation Warning: {e}", collection_id=collection.id) 84 | if not force: 85 | raise Exception("Collection failed the validation. 
Process is stopped.") from e 86 | 87 | # Transfer the files 88 | for item_transfer in files_to_transfer.values(): 89 | for asset_transfer in item_transfer["images"].values(): 90 | transfer_file( 91 | str(asset_transfer["source"]), 92 | str(asset_transfer["checksum"]), 93 | str(asset_transfer["contentType"]), 94 | str(asset_transfer["target"]), 95 | ) 96 | write_json(item_transfer["stac"]["item"], item_transfer["stac"]["target"]) 97 | 98 | # pystac v1.1.0 99 | # Required to not add a self link with an 'absolute' link from the cwd 100 | json_collection = stac_collection.to_dict(include_self_link=False) 101 | 102 | write_json(json_collection, os.path.join(target, collection.survey, "collection.json")) 103 | -------------------------------------------------------------------------------- /topo_processor/util/valid.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional 2 | 3 | 4 | class Validity: 5 | def __init__(self) -> None: 6 | self.log: List[Dict[str, Any]] = [] 7 | self._valid = True 8 | 9 | def add_error(self, msg: str, cause: str, e: Optional[Exception] = None) -> None: 10 | self.log.append({"msg": msg, "level": "error", "cause": cause, "error": e}) 11 | self._valid = False 12 | 13 | def add_warning(self, msg: str, cause: str, e: Optional[Exception] = None) -> None: 14 | self.log.append({"msg": msg, "level": "warning", "cause": cause, "error": e}) 15 | 16 | def is_valid(self) -> bool: 17 | return self._valid 18 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@linzjs/style/tsconfig.base.json", 3 | "compilerOptions": { 4 | "lib": ["ES2020"], 5 | "outDir": "build" 6 | }, 7 | "include": ["infra/src"] 8 | } 9 | -------------------------------------------------------------------------------- /upload: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from topo_processor.cli import upload 4 | 5 | upload.main() 6 | --------------------------------------------------------------------------------
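A minimal usage sketch of the ValidateReport class from topo_processor/stac/validate_report.py above. The schema URI and error message are placeholder values; in the pipeline the mapping passed to add_errors() comes from MetadataValidatorStac.validate_metadata_with_report(), as shown in validation.py.

from topo_processor.stac.validate_report import ValidateReport

report = ValidateReport()
# Each add_errors() call counts one validated STAC object and folds its
# per-schema error messages into report_per_error_type.
report.add_errors({"https://example.com/schema.json": ["'string' is not of type 'integer'"]})
report.add_errors({"https://example.com/schema.json": ["'string' is not of type 'integer'"]})

assert report.total == 2
assert report.report_per_error_type["https://example.com/schema.json"]["'string' is not of type 'integer'"] == 2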
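A sketch of driving the Command and Execution helpers from topo_processor/util/command.py and execution.py above. The container, tag, mount path and environment value are illustrative only (geopackage_to_csv() above uses the osgeo/gdal image in the same way), and run() shells out, so treat this as a sketch rather than something to execute verbatim.

from topo_processor.util.command import Command

# Local execution: run() goes through ExecutionLocal and plain subprocess.
local_cmd = Command("echo").arg("hello")
return_code, stdout, stderr = local_cmd.run()
assert stdout == "hello\n"

# Docker execution: same API, but run() wraps the command in `docker run`.
docker_cmd = Command("ogr2ogr", {"container": "osgeo/gdal", "tag": "ubuntu-small-3.5.0"})
docker_cmd.mount("/tmp/data")             # added as -v /tmp/data:/tmp/data
docker_cmd.env("AWS_PROFILE=my-profile")  # added as --env AWS_PROFILE=my-profile (illustrative value)
docker_cmd.arg("-f", "CSV").arg("/tmp/data/out.csv").arg("/tmp/data/in.gpkg")
print(docker_cmd.redacted_command())      # redacted_command() masks any argument starting with "AWS"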
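A sketch exercising the conversion helpers in topo_processor/util/conversions.py above; the expected values mirror the behaviour shown in conversions_test.py.

from topo_processor.util.conversions import (
    historical_imagery_photo_type_to_linz_geospatial_type,
    quarterdate_to_date_string,
    string_to_number,
)

assert string_to_number("6600") == 6600   # int when possible
assert string_to_number("0.85") == 0.85   # float as a fallback
assert string_to_number("n/a") == "n/a"   # otherwise the string is returned unchanged

# "2020/Q1" is interpreted as 1 January 2020 NZT, then converted to UTC.
assert quarterdate_to_date_string("2020/Q1") == "2019-12-31T11:00:00Z"
assert quarterdate_to_date_string("nzam_pilot") == "nzam_pilot"  # no quarter pattern, returned as-is

assert historical_imagery_photo_type_to_linz_geospatial_type(" B&w IR ") == "black and white infrared image"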
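A sketch of the small S3 path helpers in topo_processor/util/aws_files.py and s3.py above; the bucket and key names are placeholders.

from topo_processor.util.aws_files import build_s3_path
from topo_processor.util.s3 import bucket_name_from_path, is_s3_path

assert build_s3_path("test-bucket", "/test-folder/object.ext") == "s3://test-bucket/test-folder/object.ext"
assert bucket_name_from_path("s3://test-bucket/test-folder/object.ext") == "test-bucket"
assert is_s3_path("s3://test-bucket/object.ext") is True
assert is_s3_path("./local/object.ext") is False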
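A sketch of the Validity helper in topo_processor/util/valid.py above, which appears to back the is_valid()/log checks used by validate_store() and transfer_collection(); the messages and cause strings are placeholders.

from topo_processor.util.valid import Validity

record = Validity()
record.add_warning("scan date is missing", cause="metadata_loader_imagery_historic")
assert record.is_valid() is True   # warnings are logged but do not invalidate the record

record.add_error("invalid photo type", cause="metadata_validator_stac")
assert record.is_valid() is False
assert len(record.log) == 2        # both the warning and the error are kept in the log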