├── .dockerignore ├── .env ├── .eslintrc.cjs ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── build.python.yml │ ├── build.ts.yml │ └── codeql-analysis.yml ├── .gitignore ├── .kodiak.toml ├── .prettierrc.cjs ├── .pylintrc ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── README.md ├── VERSION ├── cdk.json ├── conftest.py ├── infra └── src │ ├── batch-monitor.ts │ ├── batch.ts │ ├── index.ts │ ├── lambda-code │ └── index.js │ └── submit.ts ├── package.json ├── poetry.lock ├── pyproject.toml ├── scripts └── version.bump.sh ├── test_data ├── historical_aerial_photos_metadata.csv ├── historical_aerial_photos_metadata.gpkg ├── historical_aerial_photos_metadata_error.csv ├── historical_survey_footprint_metadata.csv ├── manifest.json ├── manifest_duplicate.json ├── schemas │ └── README.md └── tiffs │ ├── SURVEY_1 │ ├── CONTROL.tiff │ ├── MULTIPLE_ASSET.his │ ├── MULTIPLE_ASSET.tiff │ ├── WRONG_PHOTO_TYPE.tiff │ └── WRONG_SURVEY.tiff │ └── SURVEY_2 │ └── CONTROL_2.tif ├── topo_processor ├── __init__.py ├── cli │ ├── __init__.py │ ├── geostore │ │ ├── __init__.py │ │ ├── add.py │ │ ├── delete.py │ │ ├── list.py │ │ └── status.py │ ├── tests │ │ ├── __init__.py │ │ └── upload_test.py │ ├── upload.py │ └── validate.py ├── cog │ ├── __init__.py │ ├── create_cog.py │ └── tests │ │ ├── __init__.py │ │ └── create_cog_test.py ├── data │ ├── __init__.py │ └── data_transformers │ │ ├── __init__.py │ │ ├── data_transformer.py │ │ ├── data_transformer_imagery_historic.py │ │ └── data_transformer_repo.py ├── file_system │ ├── __init__.py │ ├── assets.py │ ├── file_searcher.py │ ├── get_fs.py │ ├── get_path_with_protocol.py │ ├── manifest.py │ ├── tests │ │ ├── __init__.py │ │ ├── assets_test.py │ │ ├── file_searcher_test.py │ │ ├── get_fs_test.py │ │ ├── get_path_with_protocol_test.py │ │ ├── transfer_test.py │ │ └── write_json_test.py │ ├── transfer.py │ └── write_json.py ├── geostore │ ├── invoke.py │ └── tests │ │ └── invoke_test.py ├── metadata │ ├── __init__.py │ ├── csv_loader │ │ ├── csv_loader.py │ │ └── tests │ │ │ └── csv_loader_test.py │ ├── data_type.py │ ├── lds_cache │ │ ├── lds_cache.py │ │ └── tests │ │ │ └── lds_cache_test.py │ ├── metadata_loaders │ │ ├── __init__.py │ │ ├── metadata_loader.py │ │ ├── metadata_loader_imagery_historic.py │ │ ├── metadata_loader_repo.py │ │ ├── metadata_loader_tiff.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── metadata_loader_imagery_historic_test.py │ │ │ └── metadata_loader_tiff_test.py │ └── metadata_validators │ │ ├── __init__.py │ │ ├── metadata_validator.py │ │ ├── metadata_validator_repo.py │ │ ├── metadata_validator_stac.py │ │ ├── metadata_validator_tiff.py │ │ └── tests │ │ ├── __init__.py │ │ ├── metadata_validator_stac_test.py │ │ └── metadata_validator_tiff_test.py ├── stac │ ├── __init__.py │ ├── asset.py │ ├── asset_key.py │ ├── collection.py │ ├── item.py │ ├── item_factory.py │ ├── iter_errors_validator.py │ ├── linz_provider.py │ ├── providers.py │ ├── stac_extensions.py │ ├── store.py │ ├── tests │ │ ├── __init__.py │ │ ├── asset_test.py │ │ ├── collection_test.py │ │ ├── file_extension_test.py │ │ ├── iter_errors_validator_test.py │ │ └── validate_report_test.py │ ├── validate_report.py │ └── validation.py └── util │ ├── __init__.py │ ├── aws_credentials.py │ ├── aws_files.py │ ├── checksum.py │ ├── command.py │ ├── configuration.py │ ├── conversions.py │ ├── execution.py │ ├── file_converter.py │ ├── file_extension.py │ ├── 
files.py │ ├── gzip.py │ ├── s3.py │ ├── tests │ ├── __init__.py │ ├── aws_credentials_test.py │ ├── aws_files_test.py │ ├── checksum_test.py │ ├── command_test.py │ ├── conversions_test.py │ ├── files_test.py │ ├── time_test.py │ └── transfer_collection_test.py │ ├── time.py │ ├── transfer_collection.py │ └── valid.py ├── tsconfig.json ├── upload └── yarn.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | infra/ 2 | README.md 3 | .gitignore 4 | .pylintrc 5 | __pycache__ 6 | .github 7 | .mypy_cache 8 | .pytest_cache 9 | .vscode 10 | .kodiak.toml 11 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | LINZ_CACHE_BUCKET=linz-lds-cache 2 | LINZ_HISTORICAL_IMAGERY_BUCKET=linz-historical-imagery-staging 3 | LINZ_SSM_BUCKET_CONFIG_NAME=BucketConfig 4 | -------------------------------------------------------------------------------- /.eslintrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | ...require("@linzjs/style/.eslintrc.js"), 3 | }; 4 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @linz/li-topo-data-engineering 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | ### Bug Description 8 | A clear and concise description of what the bug is and what you expected to happen. 9 | 10 | #### Steps to Reproduce 11 | Steps to reproduce the behavior: 12 | 1. Go to '...' 13 | 2. Click on '....' 14 | 3. Scroll down to '....' 15 | 4. See error 16 | 17 | #### Desktop 18 | - Environment: [e.g. Windows / DaaS / Ubuntu] 19 | - Relevant Software Versions [e.g. QGIS 2.18.21] 20 | 21 | #### Screenshots 22 | If applicable, add screenshots to help explain your problem. 23 | 24 | **Add an _Assignee_, _Milestone_, _Release_ and any relevant _Labels_.** 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | ### User Story 8 | 9 | In order to [accomplish goal] as a [role] I want [capability] 10 | (optional: instead of [existing behaviour]). 11 | 12 | #### Acceptance Criteria 13 | - [ ] ... 14 | - [ ] ... 15 | - [ ] ... 16 | 17 | **Add an _Assignee_, _Milestone_, _Release_ and any relevant _Labels_.** 18 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes: # 2 | _Add new line for each issue that was fixed_ 3 | 4 | ### Change Description: 5 | 6 | ... 7 | 8 | ### Notes for Testing: 9 | 10 | ... 
11 | 12 | #### Source Code Documentation Tasks: 13 | - [ ] README updated (where applicable) 14 | - [ ] CHANGELOG (Unreleased section) updated 15 | - [ ] Docstrings / comments included to help explain code 16 | 17 | #### User Documentation Tasks: 18 | - [ ] Confluence updated (where applicable) 19 | 20 | #### Testing Tasks: 21 | - [ ] Added tests that fail without this change 22 | - [ ] All tests are passing in development environment 23 | - [ ] Reviewers assigned 24 | - [ ] Linked to main issue for ZenHub board 25 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | - package-ecosystem: npm 8 | directory: "/" 9 | schedule: 10 | interval: daily 11 | open-pull-requests-limit: 10 12 | - package-ecosystem: pip 13 | directory: "/" 14 | schedule: 15 | interval: daily 16 | open-pull-requests-limit: 10 17 | -------------------------------------------------------------------------------- /.github/workflows/build.python.yml: -------------------------------------------------------------------------------- 1 | name: Build Python 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | 14 | - name: Use Python "3.8" 15 | uses: actions/setup-python@v3 16 | with: 17 | python-version: "3.8" 18 | - name: Install 19 | run: | 20 | pip install poetry 21 | poetry install 22 | - name: Format 23 | run: | 24 | poetry run black . --check --diff 25 | - name: Lint 26 | run: | 27 | poetry run pylint topo_processor/ --exit-zero 28 | - name: Import Sorting 29 | run: | 30 | poetry run isort -rc . --check --diff 31 | - name: Test 32 | run: | 33 | poetry run pytest --slow --cov topo_processor 34 | - name: Mypy 35 | run: | 36 | poetry run mypy . 
37 | -------------------------------------------------------------------------------- /.github/workflows/build.ts.yml: -------------------------------------------------------------------------------- 1 | name: Build Typescript 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | id-token: write 13 | contents: write 14 | steps: 15 | - name: Build and test 16 | uses: linz/action-typescript@v1 17 | 18 | - name: (Prod) Setup git config 19 | if: github.ref == 'refs/heads/master' && startsWith(github.event.head_commit.message, 'release:') 20 | run: | 21 | git config user.name "github-actions[bot]" 22 | git config user.email "41898282+github-actions[bot]@users.noreply.github.com" 23 | 24 | - name: (Prod) Create tag 25 | if: github.ref == 'refs/heads/master' && startsWith(github.event.head_commit.message, 'release:') 26 | run: | 27 | CURRENT_VERSION=$(node -p "require('./package.json').version") 28 | git tag v${CURRENT_VERSION} -m v${CURRENT_VERSION} || true 29 | git push --tags 30 | 31 | - name: (Prod) Create github release 32 | if: github.ref == 'refs/heads/master' && startsWith(github.event.head_commit.message, 'release:') 33 | run: npx conventional-github-releaser -p angular 34 | env: 35 | CONVENTIONAL_GITHUB_RELEASER_TOKEN: ${{secrets.GITHUB_TOKEN}} 36 | 37 | - name: (NonProd) Configure AWS Credentials 38 | if: github.ref == 'refs/heads/master' 39 | uses: aws-actions/configure-aws-credentials@v1 40 | with: 41 | aws-region: 'ap-southeast-2' 42 | role-to-assume: ${{ secrets.AWS_ROLE_NON_PROD }} 43 | 44 | - name: (NonProd) Deploy to NonProd 45 | if: github.ref == 'refs/heads/master' 46 | run: | 47 | npx cdk deploy --all -y --require-approval never 48 | env: 49 | AWS_ORG_ID: ${{secrets.AWS_ORG_ID}} 50 | 51 | - name: (Prod) Configure AWS Credentials 52 | if: github.ref == 'refs/heads/master' && startsWith(github.event.head_commit.message, 'release:') 53 | uses: aws-actions/configure-aws-credentials@v1 54 | with: 55 | aws-region: 'ap-southeast-2' 56 | role-to-assume: ${{ secrets.AWS_ROLE_PROD }} 57 | 58 | - name: (Prod) Deploy to Prod 59 | if: github.ref == 'refs/heads/master' && startsWith(github.event.head_commit.message, 'release:') 60 | run: | 61 | npx cdk deploy --all -y --require-approval never 62 | env: 63 | AWS_ORG_ID: ${{secrets.AWS_ORG_ID}} 64 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [master, ] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: [master] 9 | schedule: 10 | - cron: '0 0 * * 1' 11 | 12 | jobs: 13 | analyse: 14 | name: Analyse 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v3 20 | with: 21 | # We must fetch at least the immediate parents so that if this is 22 | # a pull request then we can checkout the head. 23 | fetch-depth: 2 24 | 25 | # If this run was triggered by a pull request event, then checkout 26 | # the head of the pull request instead of the merge commit. 27 | - run: git checkout HEAD^2 28 | if: ${{ github.event_name == 'pull_request' }} 29 | 30 | # Initializes the CodeQL tools for scanning. 
31 | - name: Initialize CodeQL 32 | uses: github/codeql-action/init@v2 33 | # Override language selection by uncommenting this and choosing your languages 34 | # with: 35 | # languages: go, javascript, csharp, python, cpp, java 36 | 37 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 38 | # If this step fails, then you should remove it and run the build manually (see below) 39 | - name: Autobuild 40 | uses: github/codeql-action/autobuild@v2 41 | 42 | # ℹ️ Command-line programs to run using the OS shell. 43 | # 📚 https://git.io/JvXDl 44 | 45 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 46 | # and modify them (or add more) to build your code if your project 47 | # uses a compiled language 48 | 49 | #- run: | 50 | # make bootstrap 51 | # make release 52 | 53 | - name: Perform CodeQL Analysis 54 | uses: github/codeql-action/analyze@v2 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .pytest_cache 3 | *.pyc 4 | .venv 5 | .vscode 6 | Thumbs.db 7 | build/ 8 | .coverage 9 | batch/roles.json 10 | node_modules 11 | cdk.out 12 | cdk.context.json 13 | -------------------------------------------------------------------------------- /.kodiak.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | 4 | [merge] 5 | method = "squash" 6 | automerge_label = "automerge 🚀" 7 | -------------------------------------------------------------------------------- /.prettierrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | ...require('@linzjs/style/.prettierrc.js'), 3 | }; 4 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | disable = 3 | bad-continuation, 4 | invalid-name, 5 | missing-class-docstring, 6 | missing-module-docstring, 7 | missing-function-docstring 8 | [FORMAT] 9 | max-line-length=127 10 | [MISCELLANEOUS] 11 | notes=FIXME,XXX 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM osgeo/gdal:ubuntu-small-3.5.0 2 | 3 | # Install Poetry 4 | RUN apt-get update 5 | RUN apt-get install python3-pip -y 6 | RUN pip install poetry 7 | 8 | # Set environment variable to prevent GDAL running in Docker 9 | ENV IS_DOCKER=true 10 | 11 | WORKDIR /app 12 | # Add Poetry config and scripts 13 | COPY poetry.lock pyproject.toml VERSION /app/ 14 | 15 | RUN poetry config virtualenvs.create false \ 16 | && poetry install --no-dev --no-interaction --no-ansi 17 | 18 | COPY ./topo_processor /app/topo_processor 19 | COPY ./upload /app/ 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Land Information New Zealand 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, 
and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Topo Processor 2 | 3 | [![GitHub Actions Status](https://github.com/linz/topo-processor/workflows/Build/badge.svg)](https://github.com/linz/topo-processor/actions) 4 | [![Alerts](https://badgen.net/lgtm/alerts/g/linz/topo-processor?icon=lgtm&labelColor=2e3a44&label=Alerts&color=3dc64b)](https://lgtm.com/projects/g/linz/topo-processor/context:python) 5 | [![Dependabot Status](https://badgen.net/dependabot/linz/topo-processor?icon=dependabot&labelColor=2e3a44&color=blue)](https://dependabot.com) 6 | [![License](https://badgen.net/github/license/linz/processor-aerial-imagery?labelColor=2e3a44&label=License)](https://github.com/linz/topo-processor/blob/master/LICENSE) 7 | [![Conventional Commits](https://badgen.net/badge/Commits/conventional?labelColor=2e3a44&color=EC5772)](https://conventionalcommits.org) 8 | [![Code Style](https://badgen.net/badge/Code%20Style/black?labelColor=2e3a44&color=000000)](https://github.com/psf/black) 9 | 10 | ## Description 11 | 12 | The Topo Processor is a collection of small components that can be combined together to create a pipeline. It can be run on a local workstation or using AWS Batch. 13 | 14 | These components include transforming data into cloud optimised formats like [COG](https://www.cogeo.org/) and the creation of [STAC](http://stacspec.org/) metadata. 15 | 16 | ## Installation 17 | 18 | ### Requirements to run Topo Processor locally: 19 | 20 | #### Poetry 21 | 22 | Follow the [Poetry installation guide](https://python-poetry.org/docs/). 23 | 24 | #### Docker 25 | 26 | Follow the [Docker Engine installation guide (Ubuntu)](https://docs.docker.com/engine/install/ubuntu/). 27 | 28 | ### Recommended 29 | 30 | - [node](https://nodejs.org/en/about/) 31 | - [pretty-json-log](https://npmjs.com/package/pretty-json-log) 32 | 33 | ### Use poetry to install 34 | 35 | ```shell 36 | poetry shell 37 | 38 | poetry install 39 | ``` 40 | 41 | ## Configuration 42 | 43 | The global user configuration is defined by environment variables, example environment variables are found in the `.env` file. 
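Below is a minimal, illustrative sketch of how these environment variables could be read in Python with `python-dotenv` (a dependency declared in `pyproject.toml`). It is not the project's actual configuration code (the real lookup lives in `topo_processor/util/configuration.py`, not shown here), and the `load_settings` helper name is hypothetical.

```python
# Illustrative sketch only: reading the .env variables with python-dotenv.
# The variable names come from the repository's .env file; the helper name
# `load_settings` is hypothetical and not part of the Topo Processor API.
import os
from typing import Dict

from dotenv import load_dotenv


def load_settings() -> Dict[str, str]:
    load_dotenv()  # loads key=value pairs from a local .env file, if present
    return {
        "cache_bucket": os.environ["LINZ_CACHE_BUCKET"],
        "historical_imagery_bucket": os.environ["LINZ_HISTORICAL_IMAGERY_BUCKET"],
        "ssm_bucket_config_name": os.environ["LINZ_SSM_BUCKET_CONFIG_NAME"],
    }


if __name__ == "__main__":
    print(load_settings())
```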
44 | 45 | ### Requirements to run Topo Processor using AWS Batch: 46 | 47 | #### Software 48 | 49 | ```shell 50 | yarn 51 | 52 | yarn build 53 | ``` 54 | 55 | #### AWS Batch Stack deployment 56 | 57 | **_NOTE:_** [AWS deployment is done automatically through GitHub Actions.](#aws-deployment--ci--cd) 58 | 59 | To deploy the Batch via CDK locally: 60 | 61 | On the AWS account you are logged into: 62 | 63 | ```shell 64 | yarn build 65 | 66 | npx cdk deploy 67 | ``` 68 | 69 | ### AWS Roles 70 | 71 | To allow the system to perform cross-account AWS requests, you'll need to configure AWS roles inside an AWS SSM parameter. 72 | 73 | This configuration parameter can be referenced via `$LINZ_SSM_BUCKET_CONFIG_NAME`. 74 | 75 | ## Usage 76 | 77 | ### AWS Batch Job Submission 78 | 79 | **_NOTE:_** Only the `upload` command is implemented to run on AWS Batch. Currently the job submission is restricted to only one job per survey. 80 | 81 | **_NOTE:_** You may need to set the `AWS_REGION` environment variable to your region. 82 | 83 | ```shell 84 | # Passing survey IDs as argument 85 | node ./build/infra/src/submit.js surveyId1 surveyId3 [...] 86 | 87 | # Passing S3 folder as argument 88 | node ./build/infra/src/submit.js s3://my-bucket/backup2/surveyId1/ s3://my-bucket/backup4/surveyId3/ [...] 89 | ``` 90 | 91 | ### `upload` 92 | 93 | **_NOTE:_** The `upload` command is restricted to one run per `survey` and only for the `Historical Imagery` layer. To run multiple surveys, please refer to `AWS Batch` described above. 94 | 95 | | Argument | Description | 96 | | --------------------------- | :---------------------------------------------------------------------------------------------: | 97 | | `-s` or `--source` | The source of the data to import. Can be a `survey ID` or a path (local or `s3`) to the survey. | 98 | | `-d` or `--datatype` | The datatype of the upload. _Only `imagery.historic` is available at the moment._ | 99 | | `-t` or `--target` | The target local directory path or `s3` path of the upload. | 100 | | `-cid` or `--correlationid` | OPTIONAL. The `correlation ID` of the batch job. _`AWS Batch` only._ | 101 | | `-m` or `--metadata` | OPTIONAL. The metadata file (local or `s3`) path. | 102 | | `-f` or `--footprint` | TESTING PURPOSE. The footprint metadata file (local or `s3`) path. | 103 | | `--force` | Flag to force the upload even if some data are invalid (some items might not be uploaded). | 104 | | `-v` or `--verbose` | Flag to display trace logs. | 105 | 106 | The user has to specify the survey ID or path (where the data is) as the `--source`, and it will be validated against the latest version of the metadata. A metadata file path can also be specified by using `--metadata` if the LDS cache version is not wanted. The `--datatype` has to be `imagery.historic`. The user also has to specify a target folder for the output. 
107 | 108 | ```shell 109 | # Run in a virtual environment (poetry shell): 110 | ./upload --source source_path --datatype data.type --target target_folder 111 | ``` 112 | 113 | ```shell 114 | # For help: 115 | ./upload --help 116 | ``` 117 | 118 | ```shell 119 | # To see all logs in a tidy format, use pretty-json-log: 120 | ./upload --source source_path --datatype data.type --target target_folder --verbose | pjl 121 | ``` 122 | 123 | The following source and target combinations can be used: 124 | 125 | | Source | Target | 126 | | ------ | :----: | 127 | | s3 | s3 | 128 | | s3 | local | 129 | | local | local | 130 | | local | s3 | 131 | 132 | ### `add` (Geostore) 133 | 134 | This command allows you to add a survey to the [Geostore](https://github.com/linz/geostore) by using the [Geostore API](https://github.com/linz/geostore/blob/master/USAGE.md). 135 | 136 | **_Prerequisites:_** The survey has to be processed by the `upload` command first. The output files of the `upload` command are what will be exported to the `Geostore`. 137 | 138 | | Argument | Description | 139 | | --------------------- | :----------------------------------------------------: | 140 | | `-s`, `--source` TEXT | The s3 path to the survey to export [required] | 141 | | `-r`, `--role` TEXT | The ARN role to access to the source bucket [required] | 142 | | `-c`, `--commit` | Use this flag to commit the creation of the dataset | 143 | | `-v`, `--verbose` | Use verbose to display debug logs | 144 | 145 | ```bash 146 | poetry run add -s "s3://bucket/survey-path/" -r "arn:aws:iam::123456789:role/read-role" 147 | ``` 148 | 149 | ### `status` (Geostore) 150 | 151 | This command follows the current status of an upload to the `Geostore` for a particular `dataset` version. You may have to run it several times as the status gets updated. 152 | 153 | | Argument | Description | 154 | | ---------------------------- | :------------------------------------------------------------------------------: | 155 | | `-a`, `--execution-arn` TEXT | The execution ARN received from the Geostore after invoking an upload [required] | 156 | | `-v`, `--verbose` | Use verbose to display debug logs | 157 | 158 | **_NOTE:_** The command to run is given in the logs after successfully calling the `add` command: 159 | 160 | ```json 161 | "info": "To check the import status, run the following command 'poetry run status -a arn:aws:states:ap-southeast-2:632223577832:execution:ABCD'" 162 | ``` 163 | 164 | ### `list` (Geostore) 165 | 166 | It gives you the information for one or all of the datasets created on the `Geostore`. 167 | 168 | | Argument | Description | 169 | | -------------------- | :-----------------------------------------------------------------------------------: | 170 | | `-t`, `--title` TEXT | The Geostore title of the survey to filter e.g. historical-aerial-imagery-survey-2660 | 171 | | `-v`, `--verbose` | Use verbose to display debug logs | 172 | 173 | ```bash 174 | poetry run list [-t historical-aerial-imagery-survey-2660] 175 | ``` 176 | 177 | ### `delete` (Geostore) 178 | 179 | Delete a dataset from the `Geostore`. This is only possible if the dataset does not contain any version. To delete a dataset which contains a version, contact the **Geostore** support. 180 | 181 | | Argument | Description | 182 | | ------------------------- | :-----------------------------------------------------: | 183 | | `-d`, `--dataset-id` TEXT | The dataset id to delete [required] | 184 | | `-c`, `--commit` | Use this flag to commit the suppression of the dataset. 
| 185 | `-v`, `--verbose` | Use verbose to display debug logs | 186 | 187 | ```bash 188 | poetry run delete -d ID123ABC [--commit] 189 | ``` 190 | 191 | ### `validate` 192 | 193 | **_NOTE:_** This command is currently only implemented for `Historical Imagery`. Other layers will come later. 194 | 195 | This command runs a validation against a layer. It gets the latest version of the layer metadata and generates the corresponding STAC objects on the fly. Then, it runs a JSON schema validation (using [jsonschema-rs](https://github.com/Stranger6667/jsonschema-rs)) for the `Items` and `Collections`. It outputs the errors and their recurrences grouped by JSON schemas as: 196 | 197 | ```json 198 | "errors": {"https://stac.linz.govt.nz/v0.0.11/aerial-photo/schema.json": {"'aerial-photo:run' is a required property": 4, "'aerial-photo:sequence_number' is a required property": 10}} 199 | ``` 200 | 201 | To validate a version other than the latest one, specify the metadata CSV file to validate by using the `--metadata` argument. 202 | 203 | The following commands have to be run in a virtual environment (poetry shell): 204 | 205 | ```shell 206 | # Run default: 207 | poetry run validate 208 | ``` 209 | 210 | ```shell 211 | # Run against a specific version (can be an s3 or local file): 212 | poetry run validate --metadata s3://bucket/layer_id/metadata_file.csv 213 | ``` 214 | 215 | ```shell 216 | # Run against the `Items` only: 217 | poetry run validate --item 218 | ``` 219 | 220 | ```shell 221 | # Run against the `Collections` only: 222 | poetry run validate --collection 223 | ``` 224 | 225 | ```shell 226 | # For help: 227 | poetry run validate --help 228 | ``` 229 | 230 | ```shell 231 | # To see all logs in a tidy format, use pretty-json-log: 232 | poetry run validate --verbose | pjl 233 | ``` 234 | 235 | ```shell 236 | # To record the output in an external file: 237 | poetry run validate | tee output.file 238 | ``` 239 | 240 | ## AWS Deployment / CI / CD 241 | 242 | CI/CD is used to deploy into AWS. To trigger a deployment, create a new "release:" commit and merge it to master. 243 | 244 | A helpful utility script, `./scripts/version.bump.sh`, is available to automate this process: 245 | 246 | ```bash 247 | ./scripts/version.bump.sh 248 | # Push branch release/v:versionNumber 249 | git push 250 | # Create the pull request 251 | gh pr create 252 | # Merge to master 253 | ``` 254 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | v0.15.0 2 | -------------------------------------------------------------------------------- /cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "node build/infra/src/index.js" 3 | } 4 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from tempfile import mkdtemp 3 | from typing import Generator 4 | 5 | import pystac 6 | import pytest 7 | 8 | from topo_processor.stac.iter_errors_validator import IterErrorsValidator 9 | 10 | 11 | @pytest.fixture(autouse=True) 12 | def set_iter_errors_validator() -> None: 13 | pystac.validation.set_validator(IterErrorsValidator()) 14 | 15 | 16 | def pytest_addoption(parser) -> None: # type: ignore 17 | parser.addoption("--slow", action="store_true", default=False, help="run slow tests") 18 | 19 | 20 | def 
pytest_runtest_setup(item) -> None: # type: ignore 21 | if "slow" in item.keywords and not item.config.getoption("--slow"): 22 | pytest.skip("need --slow option to run this test") 23 | 24 | 25 | @pytest.fixture(autouse=True) 26 | def setup() -> Generator[str, None, None]: 27 | """ 28 | This function creates a temporary directory and deletes it after each test. 29 | See following link for details: 30 | https://docs.pytest.org/en/stable/fixture.html#yield-fixtures-recommended 31 | """ 32 | target = mkdtemp() 33 | yield target 34 | shutil.rmtree(target) 35 | -------------------------------------------------------------------------------- /infra/src/batch-monitor.ts: -------------------------------------------------------------------------------- 1 | import { Stack, StackProps } from 'aws-cdk-lib'; 2 | import * as events from 'aws-cdk-lib/aws-events'; 3 | import * as evtTargets from 'aws-cdk-lib/aws-events-targets'; 4 | import * as lf from 'aws-cdk-lib/aws-lambda'; 5 | import { Code } from 'aws-cdk-lib/aws-lambda'; 6 | import { Construct } from 'constructs'; 7 | import * as path from 'path'; 8 | 9 | export class AwsBatchMonitor extends Stack { 10 | public constructor(scope: Construct, id: string, props: StackProps) { 11 | super(scope, id, props); 12 | 13 | const rule = new events.Rule(this, 'BatchEventRule', { 14 | eventPattern: { 15 | source: ['aws.batch'], 16 | detailType: ['Batch Job State Change'], 17 | }, 18 | }); 19 | 20 | const lambda = new lf.Function(this, 'BatchLog', { 21 | runtime: lf.Runtime.NODEJS_14_X, 22 | handler: 'index.handler', 23 | code: Code.fromAsset(path.join(process.cwd(), 'infra', 'src', 'lambda-code')), 24 | }); 25 | 26 | rule.addTarget(new evtTargets.LambdaFunction(lambda)); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /infra/src/batch.ts: -------------------------------------------------------------------------------- 1 | import { CfnOutput, Duration, RemovalPolicy, Stack, StackProps } from 'aws-cdk-lib'; 2 | import { DockerImageAsset } from 'aws-cdk-lib/aws-ecr-assets'; 3 | import { ContainerImage } from 'aws-cdk-lib/aws-ecs'; 4 | import { 5 | Role, 6 | CompositePrincipal, 7 | ServicePrincipal, 8 | CfnInstanceProfile, 9 | ManagedPolicy, 10 | PolicyStatement, 11 | } from 'aws-cdk-lib/aws-iam'; 12 | import { Vpc, InstanceClass, InstanceType, InstanceSize } from 'aws-cdk-lib/aws-ec2'; 13 | import { ComputeResourceType, ComputeEnvironment, JobDefinition, JobQueue } from '@aws-cdk/aws-batch-alpha'; 14 | import { BlockPublicAccess, Bucket } from 'aws-cdk-lib/aws-s3'; 15 | import { StringParameter } from 'aws-cdk-lib/aws-ssm'; 16 | import { Construct } from 'constructs'; 17 | 18 | interface BatchStackProps extends StackProps { 19 | container: string; 20 | } 21 | 22 | export class AwsBatchStack extends Stack { 23 | public constructor(scope: Construct, id: string, props: BatchStackProps) { 24 | super(scope, id, props); 25 | 26 | const container = new DockerImageAsset(this, 'BatchContainer', { directory: props.container }); 27 | const image = ContainerImage.fromDockerImageAsset(container); 28 | 29 | const vpc = Vpc.fromLookup(this, 'Vpc', { tags: { BaseVPC: 'true' } }); 30 | const instanceRole = new Role(this, 'BatchInstanceRole', { 31 | assumedBy: new CompositePrincipal( 32 | new ServicePrincipal('ec2.amazonaws.com'), 33 | new ServicePrincipal('ecs.amazonaws.com'), 34 | ), 35 | }); 36 | instanceRole.addManagedPolicy( 37 | ManagedPolicy.fromAwsManagedPolicyName('service-role/AmazonEC2ContainerServiceforEC2Role'), 38 | ); 
39 | instanceRole.addManagedPolicy(ManagedPolicy.fromAwsManagedPolicyName('AmazonSSMManagedInstanceCore')); 40 | 41 | instanceRole.addToPrincipalPolicy( 42 | new PolicyStatement({ resources: ['*'], actions: ['sts:AssumeRole', 'cloudformation:DescribeStacks'] }), 43 | ); 44 | 45 | const tempBucket = new Bucket(this, 'TempBucket', { 46 | removalPolicy: RemovalPolicy.RETAIN, 47 | blockPublicAccess: BlockPublicAccess.BLOCK_ALL, 48 | lifecycleRules: [{ expiration: Duration.days(90) }], 49 | }); 50 | 51 | const roRole = Role.fromRoleName(this, 'LINZReadRole', 'internal-user-read'); 52 | tempBucket.grantRead(roRole); 53 | tempBucket.grantReadWrite(instanceRole); 54 | StringParameter.fromStringParameterName(this, 'BucketConfig', 'BucketConfig').grantRead(instanceRole); 55 | 56 | new CfnInstanceProfile(this, 'BatchInstanceProfile', { 57 | instanceProfileName: instanceRole.roleName, 58 | roles: [instanceRole.roleName], 59 | }); 60 | 61 | const computeEnvironment = new ComputeEnvironment(this, 'BatchCompute', { 62 | managed: true, 63 | computeResources: { 64 | instanceRole: instanceRole.roleName, 65 | vpc, 66 | type: ComputeResourceType.SPOT, 67 | maxvCpus: 100, 68 | minvCpus: 0, 69 | instanceTypes: [ 70 | InstanceType.of(InstanceClass.C5, InstanceSize.LARGE), 71 | InstanceType.of(InstanceClass.C5, InstanceSize.XLARGE), 72 | InstanceType.of(InstanceClass.C5, InstanceSize.XLARGE2), 73 | InstanceType.of(InstanceClass.C5, InstanceSize.XLARGE4), 74 | ], 75 | }, 76 | }); 77 | 78 | const job = new JobDefinition(this, 'BatchJob', { container: { image } }); 79 | const queue = new JobQueue(this, 'BatchQueue', { computeEnvironments: [{ computeEnvironment, order: 1 }] }); 80 | 81 | new CfnOutput(this, 'BatchJobArn', { value: job.jobDefinitionArn }); 82 | new CfnOutput(this, 'BatchQueueArn', { value: queue.jobQueueArn }); 83 | new CfnOutput(this, 'BatchEc2InstanceRole', { value: instanceRole.roleArn }); 84 | new CfnOutput(this, 'TempBucketName', { value: tempBucket.bucketName }); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /infra/src/index.ts: -------------------------------------------------------------------------------- 1 | import { App } from 'aws-cdk-lib'; 2 | import { AwsBatchStack } from './batch'; 3 | import { AwsBatchMonitor } from './batch-monitor'; 4 | 5 | const app = new App(); 6 | new AwsBatchStack(app, 'TopoProcessorBatch', { 7 | env: { 8 | region: 'ap-southeast-2', 9 | account: process.env['CDK_DEFAULT_ACCOUNT'], 10 | }, 11 | container: './', 12 | }); 13 | new AwsBatchMonitor(app, 'TopoProcessorBatchMon', { 14 | env: { 15 | region: 'ap-southeast-2', 16 | account: process.env['CDK_DEFAULT_ACCOUNT'], 17 | }, 18 | }); 19 | -------------------------------------------------------------------------------- /infra/src/lambda-code/index.js: -------------------------------------------------------------------------------- 1 | function handler(event, ctx, cb) { 2 | console.log(JSON.stringify({ event })); 3 | cb(null, 'done'); 4 | } 5 | module.exports = { handler }; 6 | -------------------------------------------------------------------------------- /infra/src/submit.ts: -------------------------------------------------------------------------------- 1 | import * as sdk from 'aws-sdk'; 2 | import * as ulid from 'ulid'; 3 | import CloudFormation from 'aws-sdk/clients/cloudformation.js'; 4 | 5 | const batch = new sdk.Batch(); 6 | 7 | const cloudFormation = new CloudFormation({ region: 'ap-southeast-2' }); 8 | 9 | async function main(): Promise<void> { 10 | const 
correlationId = ulid.ulid(); 11 | console.log({ correlationId }); 12 | 13 | const environment = [ 14 | { name: 'AWS_DEFAULT_REGION', value: 'ap-southeast-2' }, 15 | { name: 'LINZ_CACHE_BUCKET', value: 'linz-lds-cache' }, 16 | { name: 'LINZ_CORRELATION_ID', value: correlationId }, 17 | { name: 'LINZ_HISTORICAL_IMAGERY_BUCKET', value: 'linz-historical-imagery-staging' }, 18 | { name: 'LINZ_SSM_BUCKET_CONFIG_NAME', value: 'BucketConfig' }, 19 | ]; 20 | 21 | const stackInfo = await cloudFormation.describeStacks({ StackName: 'TopoProcessorBatch' }).promise(); 22 | const stackOutputs = stackInfo.Stacks?.[0].Outputs; 23 | 24 | const JobDefinitionArn = stackOutputs?.find((f) => f.OutputKey === 'BatchJobArn')?.OutputValue; 25 | if (JobDefinitionArn == null) throw new Error('Unable to find CfnOutput "BatchJobArn"'); 26 | const JobQueueArn = stackOutputs?.find((f) => f.OutputKey === 'BatchQueueArn')?.OutputValue; 27 | if (JobQueueArn == null) throw new Error('Unable to find CfnOutput "BatchQueueArn"'); 28 | const TempBucketName = stackOutputs?.find((f) => f.OutputKey === 'TempBucketName')?.OutputValue; 29 | if (TempBucketName == null) throw new Error('Unable to find CfnOutput "TempBucketName"'); 30 | 31 | if (process.argv.length > 2) { 32 | for (let i = 2; i < process.argv.length; i++) { 33 | const res = await batch 34 | .submitJob({ 35 | jobName: ['Job', correlationId].join('-'), 36 | jobQueue: JobQueueArn, 37 | jobDefinition: JobDefinitionArn, 38 | containerOverrides: { 39 | resourceRequirements: [{ type: 'MEMORY', value: '3600' }], 40 | command: buildCommandArguments(correlationId, TempBucketName, process.argv[i]), 41 | environment, 42 | }, 43 | }) 44 | .promise(); 45 | console.log({ source: process.argv[i] }, '\n', res); 46 | } 47 | } else { 48 | console.log( 49 | 'You need to provide a source (a list of S3 bucket folders or a list of survey ID to process. Check the README for more information.', 50 | ); 51 | } 52 | } 53 | 54 | function buildCommandArguments(correlationId: string, tempBucket: string, source: string): string[] { 55 | const command: string[] = []; 56 | command.push('./upload'); 57 | command.push('--correlationid'); 58 | command.push(correlationId); 59 | command.push('--source'); 60 | command.push(source); 61 | command.push('--target'); 62 | command.push('s3://' + tempBucket + '/' + correlationId + '/'); 63 | command.push('--datatype'); 64 | command.push('imagery.historic'); 65 | command.push('-v'); 66 | 67 | return command; 68 | } 69 | 70 | main().catch(console.error); 71 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "@aws-cdk/aws-batch-alpha": "2.9.0-alpha.0", 4 | "@linzjs/style": "^3.9.0", 5 | "aws-cdk-lib": "^2.25.0", 6 | "aws-sdk": "^2.1140.0", 7 | "constructs": "^10.1.15", 8 | "conventional-changelog-cli": "^2.1.1", 9 | "ulid": "^2.3.0" 10 | }, 11 | "devDependencies": { 12 | "@types/node": "^17.0.21", 13 | "aws-cdk": "2.25.0", 14 | "conventional-github-releaser": "^3.1.5" 15 | }, 16 | "version": "0.15.0", 17 | "scripts": { 18 | "build": "tsc", 19 | "version": "conventional-changelog -p angular -i CHANGELOG.md -s && git add CHANGELOG.md", 20 | "lint": "npx eslint . 
--quiet --fix --report-unused-disable-directives --ignore-path .gitignore", 21 | "test": "echo No tests yet", 22 | "submit": "node build/src/infra/submit.js" 23 | }, 24 | "publishConfig": { 25 | "access": "public" 26 | }, 27 | "files": [ 28 | "build/src/**" 29 | ] 30 | } 31 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "topo-processor" 3 | version = "0.15.0" 4 | description = "" 5 | authors = ["Blayne Chard ", "Paul Fouquet ", "Megan Davidson "] 6 | 7 | [tool.poetry.dependencies] 8 | python = "^3.8" 9 | linz-logger= "^0.6.0" 10 | py-multihash = "^2.0.1" 11 | rasterio = "^1.2.10" 12 | click = "^8.1.3" 13 | boto3 = "^1.23.5" 14 | python-ulid = "^1.1.0" 15 | fsspec = {extras = ["s3"], version = "^2022.5.0"} 16 | pystac = "^1.4.0" 17 | jsonschema = "^4.5.1" 18 | jsonschema-rs = "^0.13.1" 19 | Shapely = "^1.8.2" 20 | requests = "^2.26.0" 21 | aiohttp = "^3.8.1" 22 | python-dotenv = "^0.20.0" 23 | 24 | [tool.poetry.dev-dependencies] 25 | black = "^22.3" 26 | boto3-stubs = {version = "*", extras = ["lambda","sts"]} 27 | isort = "^5.10.1" 28 | pylint = "^2.13.9" 29 | pytest = "^7.1.2" 30 | rope = "^1.0.0" 31 | pytest-cov = "^3.0.0" 32 | pytest-mock = "^3.6.1" 33 | mypy = "*" 34 | moto = "^3.1.4" 35 | mypy-boto3-lambda = "^1.24.0" 36 | 37 | [tool.poetry.scripts] 38 | add = 'topo_processor.cli.geostore.add:main' 39 | status = 'topo_processor.cli.geostore.status:main' 40 | list = 'topo_processor.cli.geostore.list:main' 41 | delete = 'topo_processor.cli.geostore.delete:main' 42 | validate = 'topo_processor.cli.validate:main' 43 | 44 | [build-system] 45 | requires = ["poetry-core>=1.0.0"] 46 | build-backend = "poetry.core.masonry.api" 47 | 48 | [tool.black] 49 | line-length = 127 50 | 51 | [tool.isort] 52 | line_length = 127 53 | case_sensitive = true 54 | profile = "black" 55 | 56 | [tool.mypy] 57 | show_error_codes = true 58 | strict = true 59 | disable_error_code = [ 60 | "import", 61 | ] 62 | 63 | [[tool.mypy.overrides]] 64 | module = [ 65 | "linz_logger", 66 | ] 67 | ignore_missing_imports = true 68 | 69 | [tool.pytest.ini_options] 70 | markers = [ 71 | "slow: marks tests as slow", 72 | ] 73 | testpaths = ["topo_processor"] 74 | -------------------------------------------------------------------------------- /scripts/version.bump.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Version bump the repo and create a branch ready for pull request 4 | # 5 | set -e 6 | 7 | git checkout master 8 | git pull --rebase 9 | 10 | # Validate that there are actually changes to be made, this will fail if nothing needs publishing 11 | npm version -m 'release: %s' minor 12 | 13 | # Set the version environment variable 14 | CURRENT_VERSION=$(node -p "require('./package.json').version") 15 | 16 | # Bump the version in Poetry pyproject.toml file 17 | poetry version ${CURRENT_VERSION} 18 | 19 | # Write version to a file for Topo Processor to use 20 | echo v${CURRENT_VERSION} | tee VERSION 21 | 22 | # Commit the changed files 23 | git commit -a --amend --no-edit 24 | 25 | # Checkout a new release branch 26 | git checkout -b release/v${CURRENT_VERSION} 27 | 28 | # This tag will be created once the pull request is merged 29 | git tag -d v${CURRENT_VERSION} 30 | -------------------------------------------------------------------------------- /test_data/historical_aerial_photos_metadata.csv: 
-------------------------------------------------------------------------------- 1 | WKT,sufi,survey,run,photo_no,alternate_survey_name,camera,camera_sequence_no,nominal_focal_length,altitude,scale,photocentre_lat,photocentre_lon,date,film,film_sequence_no,photo_type,format,source,physical_film_condition,image_anomalies,scanned,raw_filename,released_filename,when_scanned,photo_version 2 | "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))",72358,SURVEY_1,E,48,,EAGLE IV,89556,508,11000,6600,-45.8079,170.5496,1952-04-23T00:00:00.000,731,114,B&W,18cm x 23cm,ORIGINAL ,Film scratched,,Y,WRONG_PHOTO_TYPE,CROWN_731_114,2018/Q2,1 3 | "POLYGON ((170.550411567673 -45.8023873533434,170.569928799273 -45.802784811616,170.569485879683 -45.8134678833323,170.549964961905 -45.8130703891654,170.550411567673 -45.8023873533434))",72360,SURVEY_1,E,50,,EAGLE IV,89554,508,11000,6600,-45.8079,170.5599,1952-04-23T00:00:00.000,731,112,B&W,18cm x 23cm,ORIGINAL ,Metadata manually populated,,Y,MULTIPLE_ASSET,CROWN_731_112,2020/Q1,1 4 | "POLYGON ((170.545239253866 -45.8023714719313,170.564756441893 -45.8027698029763,170.564312545088 -45.813452864901,170.544791670795 -45.8130544977075,170.545239253866 -45.8023714719313))",72359,SURVEY_1,E,49,,EAGLE IV,89555,,11000,6600,-45.8079,170.5548,1952-04-23T00:00:00.000,731,113,B&W,18cm x 23cm,ORIGINAL ,Not Film 222,,Y,CONTROL,CROWN_731_113,2014/Q3,1 5 | "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))",72352,SURVEY_3,E,48,,EAGLE IV,89556,508,11000,6600,-45.8079,170.5496,1952-04-23T00:00:00.000,731,114,B&W,18cm x 23cm,ORIGINAL ,,,Y,WRONG_SURVEY,CROWN_731_114,2020/Q4,1 6 | "POLYGON ((172.625388669748 -41.762347842565,172.666876525333 -41.7624796188097,172.666710116737 -41.793542284359,172.625202147726 -41.7934103066203,172.625388669748 -41.762347842565))",29659,SURVEY_2,A,2,,ZEISS RMK,279,210,12750,15000,-41.7779,172.646,1982-02-16T00:00:00.000,C2559,100,COLOUR,23cm x 23cm,ORIGINAL,Film scratched,,Y,CONTROL_2,CROWN_2559c_100,,1 7 | -------------------------------------------------------------------------------- /test_data/historical_aerial_photos_metadata.gpkg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/historical_aerial_photos_metadata.gpkg -------------------------------------------------------------------------------- /test_data/historical_aerial_photos_metadata_error.csv: -------------------------------------------------------------------------------- 1 | WKT,sufi,survey,run,photo_no,alternate_survey_name,camera,camera_sequence_no,nominal_focal_length,altitude,scale,photocentre_lat,photocentre_lon,date,film,film_sequence_no,photo_type,format,source,physical_film_condition,image_anomalies,scanned,raw_filename,released_filename,when_scanned,photo_version 2 | "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))",72358,SURVEY_1,E,48,,EAGLE IV,89556,508,11000,6600,-45.8079,170.5496,1952-04-23T00:00:00.000,731,114,COLOUR,18cm x 23cm,ORIGINAL ,Film scratched,,Y,WRONG_PHOTO_TYPE,CROWN_731_114,2018/Q2,1 3 | "POLYGON ((170.550411567673 
-45.8023873533434,170.569928799273 -45.802784811616,170.569485879683 -45.8134678833323,170.549964961905 -45.8130703891654,170.550411567673 -45.8023873533434))",72360,SURVEY_1,E,50,,EAGLE IV,89554,508,11000,6600,-45.8079,170.5599,"ERROR",731,112,B&W,18cm x 23cm,ORIGINAL ,Metadata manually populated,,Y,MULTIPLE_ASSET,CROWN_731_112,2020/Q1,1 4 | "POLYGON ((170.545239253866 -45.8023714719313,170.564756441893 -45.8027698029763,170.564312545088 -45.813452864901,170.544791670795 -45.8130544977075,170.545239253866 -45.8023714719313))",72359,SURVEY_1,E,49,,EAGLE IV,89555,,11000,6600,-45.8079,170.5548,1952-04-23T00:00:00.000,731,113,B&W,18cm x 23cm,ORIGINAL ,Not Film 222,,Y,CONTROL,CROWN_731_113,2014/Q3,1 5 | "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))",72352,SURVEY_3,E,48,,EAGLE IV,89556,508,11000,6600,-45.8079,170.5496,1952-04-23T00:00:00.000,731,114,B&W,18cm x 23cm,ORIGINAL ,,,Y,WRONG_SURVEY,CROWN_731_114,2020/Q4,1 6 | "POLYGON ((172.625388669748 -41.762347842565,172.666876525333 -41.7624796188097,172.666710116737 -41.793542284359,172.625202147726 -41.7934103066203,172.625388669748 -41.762347842565))",29659,SURVEY_2,A,2,,ZEISS RMK,279,210,12750,15000,-41.7779,172.646,1982-02-16T00:00:00.000,C2559,100,COLOUR,23cm x 23cm,ORIGINAL,Film scratched,,Y,CONTROL_2,CROWN_2559c_100,,1 7 | -------------------------------------------------------------------------------- /test_data/historical_survey_footprint_metadata.csv: -------------------------------------------------------------------------------- 1 | WKT,SURVEY,COUNTRY,FILM_TYPE,COPYRIGHT,CONTRACTOR,NAME,COMMENTS,AREV,COORD_SYS,CHECKED,DONE 2 | "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))",SURVEY_1,NEW ZEALAND (MAINLAND),BLACK AND WHITE,CROWN,AERIAL SURVEYS (NELSON),TE KUITI 1,,f,NEW ZEALAND MAP GRID,t,1 3 | "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))",SURVEY_3,NEW ZEALAND (MAINLAND),BLACK AND WHITE,CROWN,AERIAL SURVEYS (NELSON),AUCKLAND 1,,f,NEW ZEALAND MAP GRID,t,1 4 | "POLYGON ((172.625388669748 -41.762347842565,172.666876525333 -41.7624796188097,172.666710116737 -41.793542284359,172.625202147726 -41.7934103066203,172.625388669748 -41.762347842565))",SURVEY_2,NEW ZEALAND (MAINLAND),BLACK AND WHITE,CROWN,AERIAL SURVEYS (NELSON),WELLINGTON 2,,f,NEW ZEALAND MAP GRID,t,1 5 | "POLYGON ((172.625388669748 -41.762347842565,172.666876525333 -41.7624796188097,172.666710116737 -41.793542284359,172.625202147726 -41.7934103066203,172.625388669748 -41.762347842565))",SURVEY_NO_NAME,NEW ZEALAND (MAINLAND),BLACK AND WHITE,CROWN,AERIAL SURVEYS (NELSON),,,f,NEW ZEALAND MAP GRID,t,1 6 | -------------------------------------------------------------------------------- /test_data/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "path": "test_data", 3 | "time": 1644961117023, 4 | "files": [ 5 | { "path": "/tiffs/SURVEY_1/WRONG_PHOTO_TYPE.tif" }, 6 | { "path": "/tiffs/SURVEY_1/MULTIPLE_ASSET.tif" }, 7 | { "path": "/tiffs/SURVEY_1/CONTROL.tif" } 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /test_data/manifest_duplicate.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "path": "test_data", 3 | "time": 1644961117023, 4 | "files": [ 5 | { "path": "/tiffs/SURVEY_1/WRONG_PHOTO_TYPE.tif" }, 6 | { "path": "/tiffs/SURVEY_1/MULTIPLE_ASSET.tif" }, 7 | { "path": "/tiffs/SURVEY_3/MULTIPLE_ASSET.tif" }, 8 | { "path": "/tiffs/SURVEY_1/CONTROL.tif" } 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /test_data/schemas/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Notes for Developers 3 | 4 | You can put local schemas here for testing against. See the example tests in: 5 | `topo_processor/metadata/metadata_validators/tests/metadata_validator_stac_test.py` 6 | 7 | For LINZ STAC extensions you will need to alter the stac_extensions stanza, e.g. change 8 | 9 | 10 | ``` 11 | "definitions": { 12 | "stac_extensions": { 13 | "type": "object", 14 | "required": ["stac_extensions"], 15 | "properties": { 16 | "stac_extensions": { 17 | "type": "array", 18 | "contains": { 19 | "const": "https://stac.linz.govt.nz/_STAC_VERSION_/film/schema.json" 20 | } 21 | } 22 | } 23 | }, 24 | ``` 25 | 26 | to reference the local path of the schema: 27 | 28 | ``` 29 | "definitions": { 30 | "stac_extensions": { 31 | "type": "object", 32 | "required": ["stac_extensions"], 33 | "properties": { 34 | "stac_extensions": { 35 | "type": "array", 36 | "contains": { 37 | "const": "file:///home/your_username/dev/topo-processor/test_data/schemas/film.json" 38 | } 39 | } 40 | } 41 | }, 42 | ``` 43 | -------------------------------------------------------------------------------- /test_data/tiffs/SURVEY_1/CONTROL.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/tiffs/SURVEY_1/CONTROL.tiff -------------------------------------------------------------------------------- /test_data/tiffs/SURVEY_1/MULTIPLE_ASSET.his: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/tiffs/SURVEY_1/MULTIPLE_ASSET.his -------------------------------------------------------------------------------- /test_data/tiffs/SURVEY_1/MULTIPLE_ASSET.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/tiffs/SURVEY_1/MULTIPLE_ASSET.tiff -------------------------------------------------------------------------------- /test_data/tiffs/SURVEY_1/WRONG_PHOTO_TYPE.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/tiffs/SURVEY_1/WRONG_PHOTO_TYPE.tiff -------------------------------------------------------------------------------- /test_data/tiffs/SURVEY_1/WRONG_SURVEY.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/tiffs/SURVEY_1/WRONG_SURVEY.tiff -------------------------------------------------------------------------------- /test_data/tiffs/SURVEY_2/CONTROL_2.tif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/test_data/tiffs/SURVEY_2/CONTROL_2.tif -------------------------------------------------------------------------------- /topo_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/__init__.py -------------------------------------------------------------------------------- /topo_processor/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/cli/__init__.py -------------------------------------------------------------------------------- /topo_processor/cli/geostore/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/cli/geostore/__init__.py -------------------------------------------------------------------------------- /topo_processor/cli/geostore/add.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | from typing import Any, Dict, List 5 | from urllib.parse import urlparse 6 | 7 | import boto3 8 | import click 9 | from linz_logger import LogLevel, get_log, set_level 10 | 11 | from topo_processor.geostore.invoke import invoke_import_status, invoke_lambda 12 | from topo_processor.stac.stac_extensions import StacExtensions 13 | from topo_processor.util.aws_credentials import Credentials 14 | from topo_processor.util.aws_files import s3_download 15 | from topo_processor.util.configuration import temp_folder 16 | from topo_processor.util.file_extension import is_tiff 17 | from topo_processor.util.s3 import is_s3_path 18 | from topo_processor.util.time import time_in_ms 19 | 20 | 21 | @click.command() 22 | @click.option( 23 | "-s", 24 | "--source", 25 | required=True, 26 | help="The s3 path to the survey to export", 27 | ) 28 | @click.option( 29 | "-r", 30 | "--role", 31 | required=True, 32 | help="The ARN role to access to the source bucket", 33 | ) 34 | @click.option( 35 | "-c", 36 | "--commit", 37 | is_flag=True, 38 | help="Use this flag to commit the creation of the dataset", 39 | ) 40 | @click.option( 41 | "-v", 42 | "--verbose", 43 | is_flag=True, 44 | help="Use verbose to display debug logs", 45 | ) 46 | def main(source: str, role: str, commit: bool, verbose: bool) -> None: 47 | """Create or add a new version of an existing dataset to the Geostore for the source (survey) passed as argument.""" 48 | start_time = time_in_ms() 49 | logger = get_log() 50 | logger.info("geostore_add_started", source=source) 51 | 52 | if not verbose: 53 | set_level(LogLevel.info) 54 | 55 | try: 56 | source_role_arn = role 57 | client_sts = boto3.client("sts") 58 | assumed_role = client_sts.assume_role(RoleArn=source_role_arn, RoleSessionName="read-session") 59 | # Get Collection information 60 | collection_local_path = os.path.join(temp_folder, "collection.json") 61 | 62 | if is_s3_path(source): 63 | try: 64 | credentials = Credentials( 65 | assumed_role["Credentials"]["AccessKeyId"], 66 | assumed_role["Credentials"]["SecretAccessKey"], 67 | assumed_role["Credentials"]["SessionToken"], 68 | ) 69 | s3_download(os.path.join(source, "collection.json"), collection_local_path, 
credentials) 70 | except Exception as e: 71 | logger.error("geostore_export_failed", source=source, error=e) 72 | return 73 | else: 74 | raise Exception("The source has to be a survey in a S3 bucket.") 75 | 76 | with open(collection_local_path) as collection_file: 77 | collection_json: Dict[str, Any] = json.load(collection_file) 78 | 79 | # Get survey id for dataset id, collection.title for Description, and datatype prefix 80 | survey_id = collection_json["summaries"]["mission"][0] 81 | if not survey_id: 82 | raise Exception("No survey ID found in collection.json") 83 | if StacExtensions.historical_imagery.value in collection_json["stac_extensions"]: 84 | title_prefix = "historical-aerial-imagery-survey-" 85 | else: 86 | raise Exception("No match for data type in collection.json stac_extensions.") 87 | title = collection_json["title"] 88 | 89 | prefixed_survey_id = title_prefix + survey_id 90 | 91 | if commit: 92 | # Check if a dataset for this survey already exists 93 | list_parameters = {"title": prefixed_survey_id} 94 | dataset_list = invoke_lambda("datasets", "GET", list_parameters) 95 | if len(dataset_list["body"]) == 1 and dataset_list["body"][0]["title"] == prefixed_survey_id: 96 | # A dataset already exists 97 | if click.confirm( 98 | f"A dataset for the survey {prefixed_survey_id} already exists. A new version will be created. Do you want to continue?", 99 | abort=True, 100 | ): 101 | # Create a new version 102 | dataset_id = dataset_list["body"][0]["id"] 103 | click.echo("A new version will be created.") 104 | else: 105 | # Create a dataset 106 | logger.info("create_new_dataset", surveyId=prefixed_survey_id, surveyTitle=title) 107 | create_dataset_parameters = {"title": prefixed_survey_id, "description": title} 108 | dataset_response_payload = invoke_lambda("datasets", "POST", create_dataset_parameters) 109 | dataset_id = dataset_response_payload["body"]["id"] 110 | if not dataset_id: 111 | raise Exception(f"No dataset ID found in datasets Lambda function response: {dataset_response_payload}") 112 | 113 | # Upload data 114 | upload_data_parameters = { 115 | "id": dataset_id, 116 | "metadata_url": os.path.join(source, "collection.json"), 117 | "s3_role_arn": source_role_arn, 118 | } 119 | version_response_payload = invoke_lambda("dataset-versions", "POST", upload_data_parameters) 120 | execution_arn = version_response_payload["body"]["execution_arn"] 121 | 122 | # Check import status 123 | import_status = invoke_import_status(execution_arn) 124 | 125 | logger.info( 126 | "geostore_add_invoked", 127 | info=f"To check the import status, run the following command 'poetry run status -a {execution_arn}'", 128 | ) 129 | 130 | logger.debug( 131 | "geostore_add_details", 132 | source=source, 133 | datasetId=dataset_id, 134 | executionArn=execution_arn, 135 | currentImportStatus=import_status, 136 | duration=time_in_ms() - start_time, 137 | ) 138 | else: 139 | source_parse = urlparse(source, allow_fragments=False) 140 | bucket_name = source_parse.netloc 141 | prefix = source_parse.path[1:].replace("collection.json", "") 142 | logger.debug("no_commit", action="list_objects", bucket=bucket_name, prefix=prefix) 143 | file_list: List[str] = [] 144 | s3 = boto3.client( 145 | "s3", 146 | aws_access_key_id=credentials.access_key, 147 | aws_secret_access_key=credentials.secret_key, 148 | aws_session_token=credentials.token, 149 | ) 150 | paginator = s3.get_paginator("list_objects_v2") 151 | response_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix) 152 | for response in 
response_iterator: 153 | for contents_data in response["Contents"]: 154 | key = contents_data["Key"] 155 | if is_tiff(key): 156 | file_list.append(key) 157 | logger.info( 158 | "The change won't be commit since the --commit flag has not been specified.", 159 | sourceFiles=file_list, 160 | surveyId=prefixed_survey_id, 161 | surveyTitle=title, 162 | ) 163 | 164 | except Exception as e: 165 | logger.error("geostore_add_failed", err=e) 166 | finally: 167 | shutil.rmtree(temp_folder) 168 | -------------------------------------------------------------------------------- /topo_processor/cli/geostore/delete.py: -------------------------------------------------------------------------------- 1 | import click 2 | from linz_logger import LogLevel, get_log, set_level 3 | 4 | from topo_processor.geostore.invoke import invoke_lambda 5 | from topo_processor.util.time import time_in_ms 6 | 7 | 8 | @click.command() 9 | @click.option( 10 | "-d", 11 | "--dataset-id", 12 | required=True, 13 | help="The dataset id to delete", 14 | ) 15 | @click.option( 16 | "-c", 17 | "--commit", 18 | is_flag=True, 19 | help="Use this flag to commit the suppression of the dataset.", 20 | ) 21 | @click.option( 22 | "-v", 23 | "--verbose", 24 | is_flag=True, 25 | help="Use verbose to display debug logs", 26 | ) 27 | def main(dataset_id: str, commit: bool, verbose: str) -> None: 28 | start_time = time_in_ms() 29 | logger = get_log() 30 | logger.info("delete_datasets_start", dataset_id=dataset_id) 31 | 32 | if not verbose: 33 | set_level(LogLevel.info) 34 | 35 | try: 36 | delete_parameters = {"title": dataset_id} 37 | operation = "GET" 38 | if commit: 39 | operation = "DELETE" 40 | 41 | response = invoke_lambda("datasets", operation, delete_parameters) 42 | if not commit: 43 | logger.info( 44 | f"You are about to delete the following dataset: {response['body']}. Run the command again with the --commit flag to confirm." 
45 | ) 46 | else: 47 | logger.info("delete_dataset_success", deleted_id=dataset_id, duration=time_in_ms() - start_time) 48 | except Exception as e: 49 | logger.error("delete_dataset_failed", err=e) 50 | -------------------------------------------------------------------------------- /topo_processor/cli/geostore/list.py: -------------------------------------------------------------------------------- 1 | import click 2 | from linz_logger import LogLevel, get_log, set_level 3 | 4 | from topo_processor.geostore.invoke import invoke_lambda 5 | from topo_processor.util.time import time_in_ms 6 | 7 | 8 | @click.command() 9 | @click.option( 10 | "-t", 11 | "--title", 12 | required=False, 13 | help="The Geostore title of the survey to filter", 14 | ) 15 | @click.option( 16 | "-v", 17 | "--verbose", 18 | is_flag=True, 19 | help="Use verbose to display debug logs", 20 | ) 21 | def main(title: str, verbose: bool) -> None: 22 | start_time = time_in_ms() 23 | logger = get_log() 24 | logger.info("list_datasets_start", title=title) 25 | 26 | if not verbose: 27 | set_level(LogLevel.info) 28 | 29 | try: 30 | list_parameters = {} 31 | if title: 32 | list_parameters = {"title": title} 33 | dataset_list = invoke_lambda("datasets", "GET", list_parameters) 34 | 35 | logger.info("list_datasets_end", dataset_list=dataset_list, duration=time_in_ms() - start_time) 36 | except Exception as e: 37 | logger.error("list_datasets_failed", err=e) 38 | -------------------------------------------------------------------------------- /topo_processor/cli/geostore/status.py: -------------------------------------------------------------------------------- 1 | import click 2 | from linz_logger import LogLevel, get_log, set_level 3 | 4 | from topo_processor.geostore.invoke import invoke_import_status 5 | from topo_processor.util.time import time_in_ms 6 | 7 | 8 | @click.command() 9 | @click.option( 10 | "-a", 11 | "--execution-arn", 12 | required=True, 13 | help="The execution arn received from the Geostore after invoking an upload", 14 | ) 15 | @click.option( 16 | "-v", 17 | "--verbose", 18 | is_flag=True, 19 | help="Use verbose to display debug logs", 20 | ) 21 | def main(execution_arn: str, verbose: bool) -> None: 22 | start_time = time_in_ms() 23 | logger = get_log() 24 | logger.info("check_import_status_start", arn=execution_arn) 25 | 26 | if not verbose: 27 | set_level(LogLevel.info) 28 | 29 | try: 30 | import_status = invoke_import_status(execution_arn) 31 | 32 | logger.info( 33 | "check_import_status", 34 | current_import_status=import_status, 35 | ) 36 | 37 | logger.debug( 38 | "check_export_status_end", 39 | duration=time_in_ms() - start_time, 40 | ) 41 | 42 | except Exception as e: 43 | logger.error("check_import_status_failed", err=e) 44 | -------------------------------------------------------------------------------- /topo_processor/cli/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/cli/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/cli/tests/upload_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import subprocess 4 | 5 | import pytest 6 | 7 | from topo_processor.stac.stac_extensions import StacExtensions 8 | 9 | 10 | @pytest.mark.slow 11 | def test_upload_local(setup: str) -> None: 12 | target = setup 13 | source 
= os.path.abspath(os.path.join(os.getcwd(), "test_data", "tiffs")) 14 | metadata_path = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata.csv")) 15 | footprint_metadata = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_survey_footprint_metadata.csv")) 16 | command = os.path.join(os.getcwd(), "upload") 17 | subprocess.run( 18 | [command, "-s", source, "-d", "imagery.historic", "-t", target, "-m", metadata_path, "-f", footprint_metadata], 19 | check=True, 20 | ) 21 | 22 | assert os.path.isfile(os.path.join(target, "SURVEY_3", "72352.json")) 23 | assert os.path.isfile(os.path.join(target, "SURVEY_3", "72352.tiff")) 24 | assert os.path.isfile(os.path.join(target, "SURVEY_3", "collection.json")) 25 | 26 | assert os.path.isfile(os.path.join(target, "SURVEY_2", "29659.json")) 27 | assert os.path.isfile(os.path.join(target, "SURVEY_2", "29659.tif")) 28 | assert os.path.isfile(os.path.join(target, "SURVEY_2", "collection.json")) 29 | 30 | assert os.path.isfile(os.path.join(target, "SURVEY_1", "72360.json")) 31 | assert os.path.isfile(os.path.join(target, "SURVEY_1", "72360.tiff")) 32 | assert os.path.isfile(os.path.join(target, "SURVEY_1", "collection.json")) 33 | 34 | with open(os.path.join(target, "SURVEY_1", "72359.json")) as item_json_file: 35 | item_metadata = json.load(item_json_file) 36 | assert item_metadata["properties"]["camera:sequence_number"] == 89555 37 | assert StacExtensions.camera.value in item_metadata["stac_extensions"] 38 | assert "camera:nominal_focal_length" not in item_metadata["properties"].keys() 39 | 40 | with open(os.path.join(target, "SURVEY_3", "72352.json")) as item_json_file: 41 | item_metadata = json.load(item_json_file) 42 | assert item_metadata["properties"]["mission"] == "SURVEY_3" 43 | assert item_metadata["id"] == "72352" 44 | assert ( 45 | item_metadata["assets"]["visual"]["file:checksum"] 46 | == "1220e3e67b095835c5ae8d7b311af25606d3dc0915219f34838e1f0c78b980697ca4" 47 | ) 48 | assert (item_metadata["assets"]["visual"]["href"]) == "./72352.tiff" 49 | assert len(item_metadata["links"]) == 3 50 | for link in item_metadata["links"]: 51 | assert link["rel"] != "self" 52 | assert link["href"] == "./collection.json" 53 | 54 | with open(os.path.join(target, "SURVEY_3", "collection.json")) as collection_json_file: 55 | collection_metadata = json.load(collection_json_file) 56 | 57 | assert len(collection_metadata["links"]) == 2 58 | for link in collection_metadata["links"]: 59 | assert link["rel"] != "self" 60 | if link["rel"] == "root": 61 | assert link["href"] == "./collection.json" 62 | if link["rel"] == "item": 63 | assert link["href"] == "./72352.json" 64 | 65 | assert item_metadata["properties"]["camera:sequence_number"] == 89556 66 | assert item_metadata["properties"]["camera:nominal_focal_length"] == 508 67 | assert StacExtensions.camera.value in item_metadata["stac_extensions"] 68 | 69 | 70 | @pytest.mark.slow 71 | def test_upload_local_fail(setup: str) -> None: 72 | target = setup 73 | source = os.path.abspath(os.path.join(os.getcwd(), "test_data", "tiffs")) 74 | metadata_path = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata_error.csv")) 75 | footprint_metadata = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_survey_footprint_metadata.csv")) 76 | command = os.path.join(os.getcwd(), "upload") 77 | 78 | with pytest.raises(Exception) as e: 79 | subprocess.run( 80 | [command, "-s", source, "-d", "imagery.historic", "-t", target, "-m", 
metadata_path, "-f", footprint_metadata], 81 | check=True, 82 | ) 83 | assert "process is stopped" in str(e.value).lower() 84 | 85 | 86 | @pytest.mark.slow 87 | def test_upload_local_forced(setup: str) -> None: 88 | target = setup 89 | source = os.path.abspath(os.path.join(os.getcwd(), "test_data", "tiffs")) 90 | metadata_path = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata_error.csv")) 91 | footprint_metadata = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_survey_footprint_metadata.csv")) 92 | command = os.path.join(os.getcwd(), "upload") 93 | 94 | subprocess.run( 95 | [ 96 | command, 97 | "-s", 98 | source, 99 | "-d", 100 | "imagery.historic", 101 | "-t", 102 | target, 103 | "-m", 104 | metadata_path, 105 | "-f", 106 | footprint_metadata, 107 | "--force", 108 | ], 109 | check=True, 110 | ) 111 | 112 | assert os.path.isfile(os.path.join(target, "SURVEY_3", "72352.json")) 113 | assert os.path.isfile(os.path.join(target, "SURVEY_3", "72352.tiff")) 114 | assert os.path.isfile(os.path.join(target, "SURVEY_3", "collection.json")) 115 | 116 | assert os.path.isfile(os.path.join(target, "SURVEY_2", "29659.json")) 117 | assert os.path.isfile(os.path.join(target, "SURVEY_2", "29659.tif")) 118 | assert os.path.isfile(os.path.join(target, "SURVEY_2", "collection.json")) 119 | -------------------------------------------------------------------------------- /topo_processor/cli/upload.py: -------------------------------------------------------------------------------- 1 | import click 2 | import pystac 3 | from linz_logger import LogLevel, get_log, set_level 4 | 5 | from topo_processor.metadata.data_type import DataType 6 | from topo_processor.metadata.lds_cache.lds_cache import get_metadata 7 | from topo_processor.stac.item_factory import process_source 8 | from topo_processor.stac.iter_errors_validator import IterErrorsValidator 9 | from topo_processor.stac.store import collection_store 10 | from topo_processor.util.s3 import is_s3_path 11 | from topo_processor.util.time import time_in_ms 12 | from topo_processor.util.transfer_collection import transfer_collection 13 | 14 | 15 | @click.command() 16 | @click.option( 17 | "-s", 18 | "--source", 19 | required=True, 20 | help="The source of the data to import", 21 | ) 22 | @click.option( 23 | "-d", 24 | "--datatype", 25 | required=True, 26 | type=click.Choice([data_type for data_type in DataType], case_sensitive=True), 27 | help="The datatype of the upload", 28 | ) 29 | @click.option( 30 | "-t", 31 | "--target", 32 | required=True, 33 | help="The target directory path or bucket name of the upload", 34 | ) 35 | @click.option( 36 | "-c", 37 | "--correlationid", 38 | required=False, 39 | help="The correlation ID of the batch job", 40 | ) 41 | @click.option( 42 | "-m", 43 | "--metadata", 44 | required=False, 45 | help="The metadata file path", 46 | ) 47 | @click.option( 48 | "-v", 49 | "--verbose", 50 | is_flag=True, 51 | help="Use verbose to display trace logs", 52 | ) 53 | @click.option( 54 | "-f", 55 | "--footprint", 56 | required=False, 57 | help="The survey footprint metadata path", 58 | ) 59 | @click.option( 60 | "--force", 61 | is_flag=True, 62 | help="Force the upload even if all the data is not valid", 63 | ) 64 | def main( 65 | source: str, datatype: str, correlationid: str, target: str, metadata: str, verbose: str, footprint: str, force: bool 66 | ) -> None: 67 | get_log().info("upload_start", correlationId=correlationid, source=source, target=target, dataType=datatype, force=force) 
68 | try: 69 | pystac.validation.set_validator(IterErrorsValidator()) 70 | 71 | if verbose: 72 | set_level(LogLevel.trace) 73 | 74 | start_time = time_in_ms() 75 | data_type = DataType(datatype) 76 | 77 | # Caching the metadata required by the user. 78 | if metadata: 79 | get_metadata(data_type, None, metadata) 80 | if not is_s3_path(metadata): 81 | if not footprint: 82 | get_log().error( 83 | "survey_footprint_metadata_not_given", 84 | msg="You have to provide a local path for the survey footprint metadata", 85 | ) 86 | raise Exception("survey footprint metadata not given") 87 | else: 88 | if data_type == DataType.IMAGERY_HISTORIC: 89 | get_metadata(DataType.SURVEY_FOOTPRINT_HISTORIC, None, footprint) 90 | else: 91 | raise Exception("Not yet implemented") 92 | 93 | process_source(source, data_type, metadata, force) 94 | 95 | for collection in collection_store.values(): 96 | transfer_collection(collection, target, data_type, force) 97 | 98 | get_log().debug( 99 | "Job Completed", 100 | source=source, 101 | location=target, 102 | correlationid=correlationid, 103 | data_type=data_type, 104 | duration=time_in_ms() - start_time, 105 | ) 106 | except Exception as e: 107 | get_log().error("Job Failed", error=e, source=source, correlationid=correlationid, data_type=datatype) 108 | finally: 109 | for collection in collection_store.values(): 110 | collection.delete_temp_dir() 111 | -------------------------------------------------------------------------------- /topo_processor/cli/validate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | import click 5 | from linz_logger import LogLevel, get_log, set_level 6 | 7 | from topo_processor.stac.validation import validate_stac 8 | from topo_processor.util.configuration import temp_folder 9 | from topo_processor.util.s3 import is_s3_path 10 | from topo_processor.util.time import time_in_ms 11 | 12 | 13 | @click.command() 14 | @click.option( 15 | "-i", 16 | "--item", 17 | is_flag=True, 18 | help="Use item to validate items only.", 19 | ) 20 | @click.option( 21 | "-c", 22 | "--collection", 23 | is_flag=True, 24 | help="Use collection to validate collections only.", 25 | ) 26 | @click.option( 27 | "-m", 28 | "--metadata", 29 | required=False, 30 | help="(OPTIONAL) The path of the metadata csv file to validate.", 31 | ) 32 | @click.option( 33 | "-v", 34 | "--verbose", 35 | is_flag=True, 36 | help="Use verbose to display trace logs (it might be slower).", 37 | ) 38 | def main(item: bool, collection: bool, metadata: str, verbose: str) -> None: 39 | if verbose: 40 | set_level(LogLevel.trace) 41 | else: 42 | set_level(LogLevel.info) 43 | 44 | start_time = time_in_ms() 45 | 46 | if metadata: 47 | if not is_s3_path(metadata): 48 | metadata = os.path.abspath(metadata) 49 | 50 | if item == collection: 51 | validate_stac(metadata) 52 | else: 53 | validate_stac(metadata, item, collection) 54 | 55 | # Cleanup 56 | shutil.rmtree(temp_folder) 57 | 58 | get_log().info( 59 | "validate completed", 60 | duration=time_in_ms() - start_time, 61 | ) 62 | -------------------------------------------------------------------------------- /topo_processor/cog/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/cog/__init__.py -------------------------------------------------------------------------------- /topo_processor/cog/create_cog.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from topo_processor.util.aws_credentials import Credentials, get_credentials_from_bucket 4 | from topo_processor.util.command import Command 5 | from topo_processor.util.s3 import bucket_name_from_path, is_s3_path 6 | 7 | 8 | def create_cog(input_path: str, output_path: str) -> Command: 9 | is_s3 = is_s3_path(input_path) 10 | if is_s3: 11 | credentials: Credentials = get_credentials_from_bucket(bucket_name_from_path(input_path)) 12 | input_path = f"/vsis3/{input_path.replace('s3://', '')}" 13 | if os.environ.get("IS_DOCKER") == "true": 14 | cmd = Command("gdal_translate") 15 | if is_s3: 16 | os.environ["AWS_ACCESS_KEY_ID"] = credentials.access_key 17 | os.environ["AWS_SECRET_ACCESS_KEY"] = credentials.secret_key 18 | os.environ["AWS_SESSION_TOKEN"] = credentials.token 19 | else: 20 | cmd = Command("gdal_translate", {"container": "osgeo/gdal", "tag": "ubuntu-small-3.5.0"}) 21 | if is_s3: 22 | cmd.env(f"AWS_ACCESS_KEY_ID={credentials.access_key}") 23 | cmd.env(f"AWS_SECRET_ACCESS_KEY={credentials.secret_key}") 24 | cmd.env(f"AWS_SESSION_TOKEN={credentials.token}") 25 | 26 | cmd.env("GDAL_DISABLE_READDIR_ON_OPEN=EMPTY_DIR") 27 | cmd.mount(input_path) 28 | cmd.mount(os.path.dirname(output_path)) 29 | cmd.arg(input_path) 30 | cmd.arg("-of", "COG") 31 | cmd.arg("-co", "COMPRESS=LZW") 32 | cmd.arg("-co", "NUM_THREADS=ALL_CPUS") 33 | cmd.arg("-co", "PREDICTOR=2") 34 | cmd.arg("-co", "OVERVIEW_COMPRESS=JPEG") 35 | cmd.arg("-co", "BIGTIFF=YES") 36 | cmd.arg("-co", "OVERVIEW_RESAMPLING=LANCZOS") 37 | cmd.arg("-co", "BLOCKSIZE=512") 38 | cmd.arg("-co", "OVERVIEW_QUALITY=90") 39 | cmd.arg("-co", "SPARSE_OK=TRUE") 40 | cmd.arg(output_path) 41 | return cmd 42 | -------------------------------------------------------------------------------- /topo_processor/cog/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/cog/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/cog/tests/create_cog_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.cog.create_cog import create_cog 4 | 5 | 6 | def test_cog_command() -> None: 7 | input_path = "fake_input_dir/fake_input.tiff" 8 | output_path = "fake_input_dir/fake_output.tiff" 9 | 10 | cmd = create_cog(input_path, output_path) 11 | assert cmd.to_full_command() == [ 12 | "gdal_translate", 13 | "fake_input_dir/fake_input.tiff", 14 | "-of", 15 | "COG", 16 | "-co", 17 | "COMPRESS=LZW", 18 | "-co", 19 | "NUM_THREADS=ALL_CPUS", 20 | "-co", 21 | "PREDICTOR=2", 22 | "-co", 23 | "OVERVIEW_COMPRESS=JPEG", 24 | "-co", 25 | "BIGTIFF=YES", 26 | "-co", 27 | "OVERVIEW_RESAMPLING=LANCZOS", 28 | "-co", 29 | "BLOCKSIZE=512", 30 | "-co", 31 | "OVERVIEW_QUALITY=90", 32 | "-co", 33 | "SPARSE_OK=TRUE", 34 | "fake_input_dir/fake_output.tiff", 35 | ] 36 | -------------------------------------------------------------------------------- /topo_processor/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/data/__init__.py -------------------------------------------------------------------------------- 
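A minimal usage sketch for `create_cog` above, with hypothetical local paths: the function only assembles the `gdal_translate` command (run directly when `IS_DOCKER=true`, otherwise via the `osgeo/gdal` container), and nothing executes until `run()` is called.

```python
# Hedged sketch, hypothetical paths: create_cog() builds the gdal_translate
# Command; .run() is what actually executes it (directly, or via the
# osgeo/gdal container when not running inside Docker).
from topo_processor.cog.create_cog import create_cog

cmd = create_cog("fake_input_dir/fake_input.tiff", "fake_input_dir/fake_output.tiff")
cmd.run()
```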
/topo_processor/data/data_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_transformer_imagery_historic import DataTransformerImageryHistoric 2 | from .data_transformer_repo import DataTransformerRepository 3 | 4 | data_transformer_repo = DataTransformerRepository() 5 | data_transformer_repo.append(DataTransformerImageryHistoric()) 6 | -------------------------------------------------------------------------------- /topo_processor/data/data_transformers/data_transformer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import TYPE_CHECKING 5 | 6 | if TYPE_CHECKING: 7 | from topo_processor.stac.item import Item 8 | 9 | 10 | class DataTransformer(ABC): 11 | @property 12 | @abstractmethod 13 | def name(self) -> str: 14 | pass 15 | 16 | @abstractmethod 17 | def is_applicable(self, item: Item) -> bool: 18 | pass 19 | 20 | @abstractmethod 21 | def transform_data(self, item: Item) -> None: 22 | pass 23 | -------------------------------------------------------------------------------- /topo_processor/data/data_transformers/data_transformer_imagery_historic.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from typing import TYPE_CHECKING 5 | 6 | import pystac 7 | import ulid 8 | from linz_logger import get_log 9 | 10 | from topo_processor.cog.create_cog import create_cog 11 | from topo_processor.stac.asset import Asset 12 | from topo_processor.util.file_extension import is_tiff 13 | from topo_processor.util.time import time_in_ms 14 | 15 | from .data_transformer import DataTransformer 16 | 17 | if TYPE_CHECKING: 18 | from topo_processor.stac.item import Item 19 | 20 | 21 | class DataTransformerImageryHistoric(DataTransformer): 22 | name = "data.transformer.imagery.historic" 23 | 24 | def is_applicable(self, item: Item) -> bool: 25 | for asset in item.assets: 26 | if is_tiff(asset.source_path): 27 | return True 28 | return False 29 | 30 | def transform_data(self, item: Item) -> None: 31 | cog_asset_list = [] 32 | for asset in item.assets: 33 | if not is_tiff(asset.source_path): 34 | continue 35 | start_time = time_in_ms() 36 | if not item.collection: 37 | get_log().warning("Item has no collection", item_id=item.id) 38 | return 39 | output_path = os.path.join(item.collection.get_temp_dir(), f"{ulid.ULID()}.tiff") 40 | 41 | try: 42 | create_cog(asset.source_path, output_path).run() 43 | except Exception as e: 44 | raise Exception( 45 | f"COG creation failed for item {item.id} with source path {asset.source_path} and output path {output_path}." 
46 | ) from e 47 | 48 | get_log().debug("Created COG", output_path=output_path, duration=time_in_ms() - start_time) 49 | 50 | asset.needs_upload = False 51 | 52 | cog_asset = Asset(output_path) 53 | cog_asset.content_type = pystac.MediaType.COG 54 | cog_asset.key_name = asset.key_name 55 | cog_asset.target = asset.target 56 | cog_asset.properties = asset.properties 57 | cog_asset.set_output_asset_dates(output_path) 58 | cog_asset_list.append(cog_asset) 59 | 60 | for asset in cog_asset_list: 61 | item.add_asset(asset) 62 | -------------------------------------------------------------------------------- /topo_processor/data/data_transformers/data_transformer_repo.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, List 4 | 5 | from linz_logger import get_log 6 | 7 | from topo_processor.util.time import time_in_ms 8 | 9 | from .data_transformer import DataTransformer 10 | 11 | if TYPE_CHECKING: 12 | from topo_processor.stac.item import Item 13 | 14 | 15 | class DataTransformerRepository: 16 | transformers: List[DataTransformer] = [] 17 | 18 | def append(self, transformers: DataTransformer) -> None: 19 | self.transformers.append(transformers) 20 | 21 | def transform_data(self, item: Item) -> None: 22 | for transformer in self.transformers: 23 | if transformer.is_applicable(item): 24 | start_time = time_in_ms() 25 | try: 26 | transformer.transform_data(item) 27 | except Exception as e: 28 | item.add_error(str(e), transformer.name, e) 29 | get_log().error("Data Transform Failed. Process is stopped.", transformers=transformer.name, error=e) 30 | raise Exception(e) 31 | get_log().debug( 32 | "Data Transformed", 33 | duration=time_in_ms() - start_time, 34 | ) 35 | -------------------------------------------------------------------------------- /topo_processor/file_system/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/file_system/__init__.py -------------------------------------------------------------------------------- /topo_processor/file_system/assets.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | 4 | from topo_processor.file_system.file_searcher import get_file_path_from_survey 5 | from topo_processor.file_system.get_fs import get_fs 6 | from topo_processor.file_system.get_path_with_protocol import get_path_with_protocol 7 | from topo_processor.metadata.data_type import DataType 8 | from topo_processor.stac.asset import Asset 9 | from topo_processor.stac.store import get_asset 10 | from topo_processor.util.aws_files import build_s3_path 11 | from topo_processor.util.file_extension import FILE_EXTENSIONS, is_extension 12 | from topo_processor.util.s3 import bucket_name_from_stack, is_s3_path 13 | 14 | 15 | def get_assets(source: str, data_type: str, metadata_path: str = "") -> List[Asset]: 16 | if os.path.isdir(os.path.dirname(source)) or is_s3_path(source): 17 | return _get_assets_from_directory(source, data_type) 18 | else: 19 | if data_type == DataType.IMAGERY_HISTORIC: 20 | return _get_historical_imagery_assets(source, data_type, metadata_path) 21 | raise Exception(f"Source is neither Directory or Imagery Historic datatype, source= {source}") 22 | 23 | 24 | def _get_assets_from_directory(source: str, data_type: str) -> List[Asset]: 25 | 
assets_list: List[Asset] = [] 26 | if not is_s3_path(source): 27 | source = os.path.abspath(source) 28 | fs = get_fs(source) 29 | for (path, _, files) in fs.walk(source): 30 | if not files: 31 | continue 32 | for file_ in files: 33 | if not is_extension(file_, FILE_EXTENSIONS[data_type]): 34 | continue 35 | asset_path = get_path_with_protocol(source, fs, path) 36 | asset = get_asset(f"{asset_path}/{file_}") 37 | assets_list.append(asset) 38 | return assets_list 39 | 40 | 41 | def _get_historical_imagery_assets(source: str, data_type: str, metadata_path: str = "") -> List[Asset]: 42 | assets_list: List[Asset] = [] 43 | manifest_bucket = bucket_name_from_stack("TopoProcessorBatch") 44 | manifest_path = build_s3_path(manifest_bucket, "manifest.json") 45 | asset_path_list: List[str] = get_file_path_from_survey(source, manifest_path, metadata_path) 46 | for path in asset_path_list: 47 | if not is_extension(path, FILE_EXTENSIONS[data_type]): 48 | continue 49 | asset = get_asset(path) 50 | assets_list.append(asset) 51 | return assets_list 52 | -------------------------------------------------------------------------------- /topo_processor/file_system/file_searcher.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from linz_logger import get_log 4 | 5 | from topo_processor.file_system.manifest import get_file_path_from_manifest, load_manifest 6 | from topo_processor.metadata.data_type import DataType 7 | from topo_processor.metadata.lds_cache.lds_cache import get_metadata 8 | from topo_processor.util.aws_files import build_s3_path 9 | from topo_processor.util.configuration import historical_imagery_bucket 10 | 11 | 12 | def get_file_path_from_survey(survey_id: str, manifest_path: str, metadata_path: str = "") -> List[str]: 13 | list_file_path: List[str] = [] 14 | criteria = {"survey": survey_id} 15 | metadata = get_metadata(DataType.IMAGERY_HISTORIC, criteria, metadata_path, True) 16 | manifest = load_manifest(manifest_path) 17 | 18 | for metadata_row in metadata.values(): 19 | file_name_lower = str(metadata_row["raw_filename"]).lower() 20 | tmp_list = get_file_path_from_manifest(manifest, ("/" + file_name_lower + ".tif", "/" + file_name_lower + ".tiff")) 21 | if len(tmp_list) > 1: 22 | raise Exception( 23 | f"Duplicate files found for file name: {file_name_lower}. 
Duplicate path: {', '.join([duplicate for duplicate in tmp_list])}" 24 | ) 25 | elif len(tmp_list) == 1: 26 | path = build_s3_path(historical_imagery_bucket, tmp_list[0]) 27 | list_file_path.append(path) 28 | else: 29 | get_log().warn( 30 | "file_not_found", 31 | msg="No file found with this name.", 32 | file_name=file_name_lower, 33 | ) 34 | 35 | return list_file_path 36 | -------------------------------------------------------------------------------- /topo_processor/file_system/get_fs.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from fsspec.implementations.local import LocalFileSystem 4 | from s3fs import S3FileSystem 5 | 6 | from topo_processor.util.aws_credentials import Credentials, get_credentials_from_bucket 7 | from topo_processor.util.s3 import bucket_name_from_path, is_s3_path 8 | 9 | 10 | def get_fs(path: str) -> Any: 11 | if is_s3_path(path): 12 | credentials: Credentials = get_credentials_from_bucket(bucket_name_from_path(path)) 13 | return S3FileSystem(secret=credentials.secret_key, token=credentials.token, key=credentials.access_key) 14 | return LocalFileSystem(auto_mkdir="True") 15 | -------------------------------------------------------------------------------- /topo_processor/file_system/get_path_with_protocol.py: -------------------------------------------------------------------------------- 1 | from s3fs import S3FileSystem 2 | 3 | 4 | def get_path_with_protocol(source_dir: str, source_fs: S3FileSystem, path: str) -> str: 5 | source_dir = source_dir.rstrip("/") 6 | trimmed_source_dir = source_fs._strip_protocol(source_dir) 7 | output_path = f"{source_dir}{path[(len(trimmed_source_dir)):]}" 8 | return output_path 9 | -------------------------------------------------------------------------------- /topo_processor/file_system/manifest.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict, List, Tuple 3 | 4 | from topo_processor.util.aws_files import create_s3_manifest, s3_download 5 | from topo_processor.util.configuration import temp_folder 6 | from topo_processor.util.s3 import is_s3_path 7 | 8 | 9 | def load_manifest(manifest_path: str) -> Dict[str, Any]: 10 | if is_s3_path(manifest_path): 11 | create_s3_manifest(manifest_path) 12 | s3_download(manifest_path, f"{temp_folder}/manifest.json") 13 | manifest_path = f"{temp_folder}/manifest.json" 14 | 15 | with open(manifest_path) as manifest_json_file: 16 | manifest: Dict[str, Any] = json.load(manifest_json_file) 17 | 18 | return manifest 19 | 20 | 21 | def get_file_path_from_manifest(manifest: Dict[str, Any], file_names: Tuple[str, ...]) -> List[str]: 22 | list_str: List[str] = [] 23 | 24 | for manifest_file in manifest["files"]: 25 | if manifest_file["path"].lower().endswith(file_names): 26 | list_str.append(manifest_file["path"]) 27 | 28 | return list_str 29 | -------------------------------------------------------------------------------- /topo_processor/file_system/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/file_system/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/file_system/tests/assets_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from 
topo_processor.file_system.assets import _get_assets_from_directory 6 | from topo_processor.metadata.data_type import DataType 7 | 8 | 9 | def test_get_assets_from_directory() -> None: 10 | source = os.path.abspath(os.path.join(os.getcwd(), "test_data", "tiffs")) 11 | assets_list = _get_assets_from_directory(source, DataType.IMAGERY_HISTORIC) 12 | 13 | assert len(assets_list) == 5 14 | -------------------------------------------------------------------------------- /topo_processor/file_system/tests/file_searcher_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | 4 | import pytest 5 | 6 | from topo_processor.file_system.file_searcher import get_file_path_from_survey 7 | from topo_processor.file_system.manifest import get_file_path_from_manifest, load_manifest 8 | 9 | 10 | def test_get_file_path_from_manifest() -> None: 11 | assert_list: List[str] = [] 12 | assert_list.append("/tiffs/SURVEY_1/CONTROL.tif") 13 | 14 | result_list: List[str] = [] 15 | manifest = load_manifest(os.path.join(os.getcwd(), "test_data", "manifest.json")) 16 | result_list = get_file_path_from_manifest(manifest, ("control.tif", "control.tiff")) 17 | 18 | assert assert_list == result_list 19 | 20 | 21 | def test_get_file_path_from_survey() -> None: 22 | assert_list: List[str] = [] 23 | assert_list.append("s3://linz-historical-imagery-staging/tiffs/SURVEY_1/WRONG_PHOTO_TYPE.tif") 24 | assert_list.append("s3://linz-historical-imagery-staging/tiffs/SURVEY_1/MULTIPLE_ASSET.tif") 25 | assert_list.append("s3://linz-historical-imagery-staging/tiffs/SURVEY_1/CONTROL.tif") 26 | 27 | result_list: List[str] = get_file_path_from_survey( 28 | "SURVEY_1", os.path.join(os.getcwd(), "test_data", "manifest.json"), "test_data/historical_aerial_photos_metadata.csv" 29 | ) 30 | 31 | assert result_list == assert_list 32 | 33 | 34 | def test_get_file_path_from_survey_duplicate() -> None: 35 | assert_list: List[str] = [] 36 | assert_list.append("s3://linz-historical-imagery-staging/tiffs/SURVEY_1/WRONG_PHOTO_TYPE.tif") 37 | assert_list.append("s3://linz-historical-imagery-staging/tiffs/SURVEY_1/MULTIPLE_ASSET.tif") 38 | assert_list.append("s3://linz-historical-imagery-staging/tiffs/SURVEY_1/CONTROL.tif") 39 | 40 | with pytest.raises(Exception) as e: 41 | get_file_path_from_survey( 42 | "SURVEY_1", 43 | os.path.join(os.getcwd(), "test_data", "manifest_duplicate.json"), 44 | "test_data/historical_aerial_photos_metadata.csv", 45 | ) 46 | assert "Duplicate files found" in str(e.value) 47 | -------------------------------------------------------------------------------- /topo_processor/file_system/tests/get_fs_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from fsspec.implementations.local import LocalFileSystem 3 | from s3fs.core import S3FileSystem 4 | 5 | from topo_processor.file_system.get_fs import get_fs 6 | 7 | 8 | @pytest.mark.skip(reason="Skip this test for now before refactoring get_credentials") 9 | def test_get_fs_s3() -> None: 10 | path = "s3://testbucket" 11 | assert isinstance(get_fs(path), S3FileSystem) 12 | 13 | 14 | def test_get_fs_local() -> None: 15 | path = "./test" 16 | assert isinstance(get_fs(path), LocalFileSystem) 17 | path = "/home/test/location" 18 | assert isinstance(get_fs(path), LocalFileSystem) 19 | -------------------------------------------------------------------------------- /topo_processor/file_system/tests/get_path_with_protocol_test.py: 
-------------------------------------------------------------------------------- 1 | from fsspec.implementations.local import LocalFileSystem 2 | from s3fs import S3FileSystem 3 | 4 | from topo_processor.file_system.get_path_with_protocol import get_path_with_protocol 5 | 6 | 7 | def test_get_path_with_protocol_aws() -> None: 8 | source_dir_with_forwardslash = "s3://bucketname/folder/" 9 | path = "bucketname/folder/subfolder/subfolder2" 10 | fs = S3FileSystem() 11 | full_path = get_path_with_protocol(source_dir=source_dir_with_forwardslash, source_fs=fs, path=path) 12 | assert full_path == "s3://bucketname/folder/subfolder/subfolder2" 13 | source_dir_without_forwardslash = "s3://bucketname/folder" 14 | full_path = get_path_with_protocol(source_dir=source_dir_without_forwardslash, source_fs=fs, path=path) 15 | assert full_path == "s3://bucketname/folder/subfolder/subfolder2" 16 | 17 | 18 | def test_get_path_with_protocol_local() -> None: 19 | source_dir_with_forwardslash = "/home/username/dev/topo-processor/test_data/tiffs/" 20 | path = "/home/username/dev/topo-processor/test_data/tiffs/SURVEY_1" 21 | fs = LocalFileSystem(auto_mkdir="True") 22 | full_path = get_path_with_protocol(source_dir=source_dir_with_forwardslash, source_fs=fs, path=path) 23 | assert full_path == "/home/username/dev/topo-processor/test_data/tiffs/SURVEY_1" 24 | source_dir_without_forwardslash = "/home/username/dev/topo-processor/test_data/tiffs" 25 | full_path = get_path_with_protocol(source_dir=source_dir_without_forwardslash, source_fs=fs, path=path) 26 | assert full_path == "/home/username/dev/topo-processor/test_data/tiffs/SURVEY_1" 27 | -------------------------------------------------------------------------------- /topo_processor/file_system/tests/transfer_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from topo_processor.file_system.get_fs import get_fs 6 | from topo_processor.file_system.transfer import transfer_file 7 | 8 | 9 | def test_transfer_local(setup: str) -> None: 10 | dest_path = f"{setup}/test.tiff" 11 | input_path = os.path.join(os.getcwd(), "test_data/tiffs/SURVEY_1/CONTROL.tiff") 12 | transfer_file(input_path, "fakechecksum", "image/tiff", dest_path) 13 | assert get_fs(input_path).info(input_path)["size"] == get_fs(dest_path).info(dest_path)["size"] 14 | -------------------------------------------------------------------------------- /topo_processor/file_system/tests/write_json_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from topo_processor.file_system.write_json import write_json 6 | 7 | 8 | def test_write_json(setup: str) -> None: 9 | my_dict = {"foo": "foo", "bar": 1} 10 | target = setup + "/test.json" 11 | write_json(my_dict, target) 12 | assert os.path.isfile(target) 13 | -------------------------------------------------------------------------------- /topo_processor/file_system/transfer.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from linz_logger import get_log 4 | 5 | from topo_processor.util.time import time_in_ms 6 | 7 | from .get_fs import get_fs 8 | 9 | 10 | def transfer_file(source_file: str, checksum: str, content_type: Union[str, None], target_file: str) -> None: 11 | start_time = time_in_ms() 12 | with get_fs(source_file).open(source_file, "rb") as f1: 13 | data = f1.read() 14 | with get_fs(target_file).open(target_file, "wb", 
ContentType=content_type, Metadata={"hash": checksum}) as f2: 15 | f2.write(data) 16 | get_log().debug( 17 | "File transferred", 18 | source_file=source_file, 19 | target_file=target_file, 20 | checksum=checksum, 21 | duration=time_in_ms() - start_time, 22 | ) 23 | -------------------------------------------------------------------------------- /topo_processor/file_system/write_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | import pystac 5 | from linz_logger import get_log 6 | 7 | from topo_processor.util.time import time_in_ms 8 | 9 | from .get_fs import get_fs 10 | 11 | 12 | def write_json(dictionary: Dict[str, Any], target_json: str) -> None: 13 | start_time = time_in_ms() 14 | with get_fs(target_json).open(target_json, "w", encoding="utf8", ContentType=pystac.MediaType.JSON) as f1: 15 | f1.write(json.dumps(dictionary, indent=4, ensure_ascii=False)) 16 | get_log().debug("JSON Written", target_json=target_json, duration=time_in_ms() - start_time) 17 | -------------------------------------------------------------------------------- /topo_processor/geostore/invoke.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | import boto3 5 | from linz_logger import get_log 6 | 7 | from topo_processor.util.aws_credentials import Credentials 8 | 9 | ROLE_ARN = "arn:aws:iam::715898075157:role/api-users" 10 | logger = get_log() 11 | 12 | 13 | def invoke_lambda(name: str, http_method: str, parameters: Dict[str, str]) -> Dict[str, Any]: 14 | 15 | client_sts = boto3.client("sts") 16 | 17 | assumed_role = client_sts.assume_role(RoleArn=ROLE_ARN, RoleSessionName="invoke-geostore") 18 | credentials = Credentials( 19 | assumed_role["Credentials"]["AccessKeyId"], 20 | assumed_role["Credentials"]["SecretAccessKey"], 21 | assumed_role["Credentials"]["SessionToken"], 22 | ) 23 | client_lambda = boto3.client( 24 | "lambda", 25 | aws_access_key_id=credentials.access_key, 26 | aws_secret_access_key=credentials.secret_key, 27 | aws_session_token=credentials.token, 28 | ) 29 | payload = build_lambda_payload(http_method, parameters) 30 | logger.debug("invoke_lambda_function", name=name, payload=payload) 31 | 32 | raw_response = client_lambda.invoke( 33 | FunctionName=name, 34 | InvocationType="RequestResponse", 35 | LogType="Tail", 36 | Payload=json.dumps(payload).encode(), 37 | ) 38 | payload_response: Dict[str, Any] = json.loads(raw_response["Payload"].read()) 39 | 40 | if not is_response_ok(payload_response): 41 | raise Exception("invoke_lambda_function_error", payload_response) 42 | 43 | logger.debug("response_lambda_function", name=name, response=payload_response) 44 | return payload_response 45 | 46 | 47 | def build_lambda_payload(http_method: str, parameters: Dict[str, str]) -> Dict[str, Any]: 48 | payload: Dict[str, Any] = {} 49 | payload["http_method"] = http_method 50 | payload["body"] = {} 51 | if parameters: 52 | payload["body"] = parameters 53 | 54 | return payload 55 | 56 | 57 | def invoke_import_status(execution_arn: str) -> Dict[str, Any]: 58 | """Return the current status of the dataset version import process in the Geostore identified by 'execution_arn'""" 59 | import_status_parameters = {"execution_arn": execution_arn} 60 | import_status_response_payload = invoke_lambda("import-status", "GET", import_status_parameters) 61 | 62 | import_status: Dict[str, Any] = import_status_response_payload["body"] 63 | return 
import_status 64 | 65 | 66 | def is_response_ok(response: Dict[str, Any]) -> bool: 67 | try: 68 | if 200 <= response["status_code"] <= 299: 69 | return True 70 | return False 71 | except Exception as e: 72 | raise Exception("There is an issue with the response") from e 73 | -------------------------------------------------------------------------------- /topo_processor/geostore/tests/invoke_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.geostore.invoke import build_lambda_payload, is_response_ok 4 | 5 | 6 | def test_build_lambda_payload() -> None: 7 | payload = { 8 | "http_method": "GET", 9 | "body": {"id": "123", "metadata_url": "s3://my-bucket/my-survey/metadata.csv", "s3_role_arn": "arn:my-arn:1234567"}, 10 | } 11 | payload_param = { 12 | "id": "123", 13 | "metadata_url": "s3://my-bucket/my-survey/metadata.csv", 14 | "s3_role_arn": "arn:my-arn:1234567", 15 | } 16 | 17 | payload_built = build_lambda_payload("GET", payload_param) 18 | assert payload == payload_built 19 | 20 | 21 | def test_is_response_ok() -> None: 22 | response_ko = {"status_code": 404, "body": {"message": "Not Found: dataset '1234' does not exist"}} 23 | assert is_response_ok(response_ko) is False 24 | response_ok = { 25 | "status_code": 200, 26 | "body": { 27 | "created_at": "2022-03-23T02:41:53.940795+0000", 28 | "pk": "DATASET#01FYTADW8MSCNR8D68EX7APMD3", 29 | "title": "test_title", 30 | "updated_at": "2022-03-23T02:41:53.940911+0000", 31 | "id": "01FYTADW8MSCNR8D68EX7APMD3", 32 | }, 33 | } 34 | assert is_response_ok(response_ok) 35 | -------------------------------------------------------------------------------- /topo_processor/metadata/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/metadata/__init__.py -------------------------------------------------------------------------------- /topo_processor/metadata/csv_loader/csv_loader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from curses import meta 4 | from typing import Any, Dict, List 5 | 6 | from linz_logger import get_log 7 | 8 | 9 | def read_csv(metadata_file_path: str, key: str, alternative_key: str = "", columns: List[str] = []) -> Dict[str, Any]: 10 | metadata: Dict[str, Any] = {} 11 | 12 | csv_path = os.path.join(os.getcwd(), metadata_file_path) 13 | if not os.path.isfile(csv_path): 14 | raise Exception(f'Cannot find "{csv_path}"') 15 | 16 | with open(csv_path, "r") as csv_text: 17 | reader = csv.DictReader(csv_text, delimiter=",") 18 | for row in reader: 19 | filtered_row: Dict[str, str] = {} 20 | if columns: 21 | for col in columns: 22 | filtered_row[col] = row[col] 23 | else: 24 | filtered_row = row 25 | 26 | if row[key]: 27 | key_value = row[key] 28 | if key_value in metadata: 29 | if filtered_row == metadata[key_value]: 30 | raise Exception(f'Duplicate "{key_value}" found in "{metadata_file_path}"') 31 | elif alternative_key and row[alternative_key]: 32 | metadata[row[alternative_key]] = filtered_row 33 | metadata[key_value] = filtered_row 34 | elif alternative_key and row[alternative_key]: 35 | metadata[row[alternative_key]] = filtered_row 36 | else: 37 | get_log().debug("read_csv_key_not_found", key=key, alternative_key=alternative_key) 38 | 39 | return metadata 40 | 
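A minimal sketch of `read_csv` above, mirroring how the metadata loaders call it: rows keyed by `raw_filename` with `sufi` as the fallback key, and optionally restricted to selected columns. Paths assume the bundled `test_data` fixtures.

```python
# Hedged sketch using the bundled test_data fixtures: index the photo CSV by
# "raw_filename" (falling back to "sufi"), and the footprint CSV by "SURVEY"
# keeping only the "NAME" column.
import os

from topo_processor.metadata.csv_loader.csv_loader import read_csv

photos = read_csv(
    os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata.csv"),
    "raw_filename",
    "sufi",
)
footprints = read_csv(
    os.path.join(os.getcwd(), "test_data", "historical_survey_footprint_metadata.csv"),
    "SURVEY",
    columns=["NAME"],
)
print(list(photos.keys()), footprints.get("SURVEY_1"))
```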
-------------------------------------------------------------------------------- /topo_processor/metadata/csv_loader/tests/csv_loader_test.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import tempfile 4 | 5 | import pytest 6 | 7 | from topo_processor.metadata.csv_loader.csv_loader import read_csv 8 | 9 | 10 | def test_read_csv() -> None: 11 | metadata_path = os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata.csv") 12 | metadata = read_csv(metadata_path, "raw_filename", "sufi") 13 | 14 | assert len(metadata) == 5 15 | assert list(metadata.keys()) == ["WRONG_PHOTO_TYPE", "MULTIPLE_ASSET", "CONTROL", "WRONG_SURVEY", "CONTROL_2"] 16 | 17 | 18 | def test_error_on_wrong_file_name() -> None: 19 | metadata_path = "./data/historical_aerial_photos_metadata.csv" 20 | 21 | with pytest.raises(Exception, match=r"^Cannot find "): 22 | read_csv(metadata_path, "raw_filename", "sufi") 23 | 24 | 25 | def test_error_on_duplicate_file() -> None: 26 | temp_file = tempfile.NamedTemporaryFile() 27 | header = [ 28 | "WKT", 29 | "sufi", 30 | "survey", 31 | "run", 32 | "photo_no", 33 | "alternate_survey_name", 34 | "camera", 35 | "camera_sequence_no", 36 | "nominal_focal_length", 37 | "altitude", 38 | "scale", 39 | "photocentre_lat", 40 | "photocentre_lon", 41 | "date", 42 | "film", 43 | "film_sequence_no", 44 | "photo_type", 45 | "format", 46 | "source", 47 | "physical_film_condition", 48 | "image_anomalies", 49 | "scanned", 50 | "raw_filename", 51 | "released_filename", 52 | "when_scanned", 53 | "photo_version", 54 | ] 55 | row = [ 56 | "", 57 | "", 58 | "", 59 | "", 60 | "", 61 | "", 62 | "", 63 | "", 64 | "", 65 | "", 66 | "", 67 | "", 68 | "", 69 | "", 70 | "", 71 | "", 72 | "", 73 | "", 74 | "", 75 | "", 76 | "", 77 | "", 78 | "WRONG_PHOTO_TYPE", 79 | "", 80 | "", 81 | "", 82 | ] 83 | with open(temp_file.name, "a", encoding="utf-8") as csv_file: 84 | writer = csv.writer(csv_file) 85 | writer.writerow(header) 86 | writer.writerow(row) 87 | writer.writerow(row) 88 | 89 | with pytest.raises(Exception, match=r'Duplicate "WRONG_PHOTO_TYPE" found in "' + temp_file.name + '"'): 90 | read_csv(temp_file.name, "raw_filename", "sufi") 91 | 92 | 93 | def test_read_csv_column_filter() -> None: 94 | metadata_path = os.path.join(os.getcwd(), "test_data", "historical_survey_footprint_metadata.csv") 95 | metadata = read_csv(metadata_path, "SURVEY", columns=["NAME"]) 96 | 97 | assert len(metadata) == 4 98 | assert list(metadata.keys()) == ["SURVEY_1", "SURVEY_3", "SURVEY_2", "SURVEY_NO_NAME"] 99 | assert list(metadata.values()) == [{"NAME": "TE KUITI 1"}, {"NAME": "AUCKLAND 1"}, {"NAME": "WELLINGTON 2"}, {"NAME": ""}] 100 | -------------------------------------------------------------------------------- /topo_processor/metadata/data_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Dict 3 | 4 | 5 | class DataType(str, Enum): 6 | IMAGERY_HISTORIC = "imagery.historic" 7 | IMAGERY_AERIAL = "imagery.aerial" 8 | LIDAR_DSM = "lidar.dsm" 9 | LIDAR_DEM = "lidar.dem" 10 | LIDAR_POINT_CLOUD = "lidar.pointcloud" 11 | SURVEY_FOOTPRINT_HISTORIC = "survey.footprint.historic" 12 | 13 | 14 | data_type_layer: Dict[str, str] = {DataType.IMAGERY_HISTORIC: "51002", DataType.SURVEY_FOOTPRINT_HISTORIC: "51000"} 15 | 16 | 17 | def get_layer_id(data_type: str) -> str: 18 | return data_type_layer[data_type] 19 | 
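A short sketch of the `DataType` to LDS layer id mapping defined above, as used by the LDS cache to locate the correct layer:

```python
# Hedged sketch: only the historic imagery and survey footprint layers are
# mapped; other DataType values would raise a KeyError in get_layer_id().
from topo_processor.metadata.data_type import DataType, get_layer_id

assert get_layer_id(DataType.IMAGERY_HISTORIC) == "51002"
assert get_layer_id(DataType.SURVEY_FOOTPRINT_HISTORIC) == "51000"
```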
-------------------------------------------------------------------------------- /topo_processor/metadata/lds_cache/lds_cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional 4 | 5 | import pystac 6 | from linz_logger import get_log 7 | 8 | from topo_processor.metadata.csv_loader.csv_loader import read_csv 9 | from topo_processor.metadata.data_type import DataType, get_layer_id 10 | from topo_processor.util.aws_files import build_s3_path, load_file_content, s3_download 11 | from topo_processor.util.configuration import lds_cache_bucket, temp_folder 12 | from topo_processor.util.file_converter import geopackage_to_csv 13 | from topo_processor.util.file_extension import is_csv, is_geopackage 14 | from topo_processor.util.gzip import decompress_file 15 | 16 | metadata_store: Dict[str, Dict[str, Any]] = {} 17 | """Stores the metadata by layer id""" 18 | 19 | 20 | def get_latest_item_path(collection: pystac.Collection) -> pystac.Link: 21 | for link in reversed(collection.get_links()): 22 | if not link.rel == "item": 23 | continue 24 | return link 25 | 26 | raise Exception(f"No version found for Collection {collection.title}") 27 | 28 | 29 | def get_latest_item(layer: str) -> pystac.Item: 30 | collection = pystac.Collection.from_dict(load_file_content(lds_cache_bucket, layer + "/collection.json")) 31 | latest_item = get_latest_item_path(collection) 32 | latest_item_path = f"{layer}/{latest_item.href.lstrip('./')}" 33 | 34 | return pystac.Item.from_dict(load_file_content(lds_cache_bucket, latest_item_path)) 35 | 36 | 37 | def get_metadata( 38 | data_type: str, criteria: Optional[Dict[str, str]] = None, metadata_path: str = "", save_filtered: bool = False 39 | ) -> Dict[str, Any]: 40 | """Return a dictionary containing the metadata""" 41 | layer_id = get_layer_id(data_type) 42 | 43 | if not metadata_path: 44 | if not metadata_store.get(layer_id): 45 | latest_item = get_latest_item(layer_id) 46 | exported_asset = latest_item.assets.get("export", None) 47 | 48 | if exported_asset is None: 49 | raise Exception(f"No exported asset found for lds layer: {layer_id}") 50 | 51 | asset_path = exported_asset.href.lstrip("./") 52 | metadata_path = f"{temp_folder}/{asset_path}" 53 | s3_download(build_s3_path(lds_cache_bucket, f"{layer_id}/{asset_path}"), metadata_path) 54 | 55 | if os.path.isfile(metadata_path): 56 | if exported_asset.extra_fields.get("encoding", None) == "gzip": 57 | decompress_file(metadata_path) 58 | else: 59 | raise Exception(f"{metadata_path} not found") 60 | 61 | if os.path.isfile(metadata_path): 62 | if is_geopackage(metadata_path): 63 | new_metadata_path = os.path.splitext(metadata_path)[0] + "_" + time.strftime("%s") + ".csv" 64 | geopackage_to_csv(metadata_path, new_metadata_path).run() 65 | metadata_path = new_metadata_path 66 | elif not is_csv(metadata_path): 67 | raise Exception(f"Unsupported file format. 
{metadata_path} must be .csv or .gpkg") 68 | 69 | if data_type == DataType.IMAGERY_HISTORIC: 70 | metadata_store[layer_id] = read_csv(metadata_path, "raw_filename", "sufi") 71 | elif data_type == DataType.SURVEY_FOOTPRINT_HISTORIC: 72 | metadata_store[layer_id] = read_csv(metadata_path, "SURVEY", columns=["NAME"]) 73 | 74 | if criteria: 75 | filtered_metadata = filter_metadata(metadata_store[layer_id], criteria) 76 | if save_filtered: 77 | metadata_store[layer_id] = filtered_metadata 78 | else: 79 | return filtered_metadata 80 | 81 | return metadata_store[layer_id] 82 | 83 | 84 | def filter_metadata(metadata_to_filter: Dict[str, Any], criteria: Dict[str, Any]) -> Dict[str, Any]: 85 | get_log().debug("filter_metadata", criteria=criteria) 86 | filtered_dict: Dict[str, Any] = {} 87 | is_found = False 88 | 89 | for metadata_key, metadata_value in metadata_to_filter.items(): 90 | for criteria_key, criteria_value in criteria.items(): 91 | if metadata_value[criteria_key]: 92 | if metadata_value[criteria_key] == criteria_value: 93 | is_found = True 94 | else: 95 | is_found = False 96 | break 97 | if is_found: 98 | filtered_dict[metadata_key] = metadata_value 99 | return filtered_dict 100 | -------------------------------------------------------------------------------- /topo_processor/metadata/lds_cache/tests/lds_cache_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from tempfile import mkdtemp 4 | 5 | import pytest 6 | 7 | from topo_processor.metadata.data_type import DataType 8 | from topo_processor.metadata.lds_cache.lds_cache import filter_metadata, get_metadata 9 | 10 | 11 | def test_filter_metadata() -> None: 12 | metadata = { 13 | "file_a": {"survey": "survey_1", "camera": "camera_a", "raw_filename": "file_a"}, 14 | "file_b": {"survey": "survey_3", "camera": "camera_b", "raw_filename": "file_b"}, 15 | "file_c": {"survey": "survey_1", "camera": "camera_b", "raw_filename": "file_c"}, 16 | } 17 | 18 | criteria = {"survey": "survey_1"} 19 | 20 | metadata_filtered = { 21 | "file_a": {"survey": "survey_1", "camera": "camera_a", "raw_filename": "file_a"}, 22 | "file_c": {"survey": "survey_1", "camera": "camera_b", "raw_filename": "file_c"}, 23 | } 24 | 25 | result = filter_metadata(metadata, criteria) 26 | 27 | assert metadata_filtered == result 28 | 29 | 30 | def test_get_metadata_csv() -> None: 31 | metadata = { 32 | "WRONG_SURVEY": { 33 | "WKT": "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))", 34 | "sufi": "72352", 35 | "survey": "SURVEY_3", 36 | "run": "E", 37 | "photo_no": "48", 38 | "alternate_survey_name": "", 39 | "camera": "EAGLE IV", 40 | "camera_sequence_no": "89556", 41 | "nominal_focal_length": "508", 42 | "altitude": "11000", 43 | "scale": "6600", 44 | "photocentre_lat": "-45.8079", 45 | "photocentre_lon": "170.5496", 46 | "date": "1952-04-23T00:00:00.000", 47 | "film": "731", 48 | "film_sequence_no": "114", 49 | "photo_type": "B&W", 50 | "format": "18cm x 23cm", 51 | "source": "ORIGINAL ", 52 | "physical_film_condition": "", 53 | "image_anomalies": "", 54 | "scanned": "Y", 55 | "raw_filename": "WRONG_SURVEY", 56 | "released_filename": "CROWN_731_114", 57 | "when_scanned": "2020/Q4", 58 | "photo_version": "1", 59 | } 60 | } 61 | metadata_path = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata.csv")) 62 | criteria = {"survey": 
"SURVEY_3"} 63 | result = get_metadata(DataType.IMAGERY_HISTORIC, criteria, metadata_path) 64 | 65 | assert metadata == result 66 | 67 | 68 | def test_get_metadata_gpkg() -> None: 69 | metadata = { 70 | "WRONG_SURVEY": { 71 | "WKT": "POLYGON ((170.540066918452 -45.8023553587759,170.559584102313 -45.8027545634288,170.559139228268 -45.8134376154951,170.539618358047 -45.8130383744392,170.540066918452 -45.8023553587759))", 72 | "sufi": "72352", 73 | "survey": "SURVEY_3", 74 | "run": "E", 75 | "photo_no": "48", 76 | "alternate_survey_name": "", 77 | "camera": "EAGLE IV", 78 | "camera_sequence_no": "89556", 79 | "nominal_focal_length": "508", 80 | "altitude": "11000", 81 | "scale": "6600", 82 | "photocentre_lat": "-45.8079", 83 | "photocentre_lon": "170.5496", 84 | "date": "1952-04-23T00:00:00.000", 85 | "film": "731", 86 | "film_sequence_no": "114", 87 | "photo_type": "B&W", 88 | "format": "18cm x 23cm", 89 | "source": "ORIGINAL ", 90 | "physical_film_condition": "", 91 | "image_anomalies": "", 92 | "scanned": "Y", 93 | "raw_filename": "WRONG_SURVEY", 94 | "released_filename": "CROWN_731_114", 95 | "when_scanned": "2020/Q4", 96 | "photo_version": "1", 97 | } 98 | } 99 | temp_folder: str = mkdtemp() 100 | source_metadata_path = os.path.abspath(os.path.join(os.getcwd(), "test_data", "historical_aerial_photos_metadata.gpkg")) 101 | dest_metadata_path = os.path.abspath(os.path.join(temp_folder, "historical_aerial_photos_metadata.gpkg")) 102 | shutil.copyfile(source_metadata_path, dest_metadata_path) 103 | criteria = {"survey": "SURVEY_3"} 104 | result = get_metadata(DataType.IMAGERY_HISTORIC, criteria, dest_metadata_path) 105 | shutil.rmtree(temp_folder) 106 | 107 | assert metadata == result 108 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/__init__.py: -------------------------------------------------------------------------------- 1 | from .metadata_loader_imagery_historic import MetadataLoaderImageryHistoric 2 | from .metadata_loader_repo import MetadataLoaderRepository 3 | from .metadata_loader_tiff import MetadataLoaderTiff 4 | 5 | metadata_loader_rep = MetadataLoaderRepository() 6 | metadata_loader_rep.append(MetadataLoaderImageryHistoric()) 7 | metadata_loader_rep.append(MetadataLoaderTiff()) 8 | 9 | metadata_loader_imagery_hist = MetadataLoaderImageryHistoric() 10 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/metadata_loader.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import TYPE_CHECKING, Optional 5 | 6 | if TYPE_CHECKING: 7 | from topo_processor.stac.asset import Asset 8 | 9 | 10 | class MetadataLoader(ABC): 11 | @property 12 | @abstractmethod 13 | def name(self) -> str: 14 | pass 15 | 16 | @abstractmethod 17 | def is_applicable(self, asset: Optional[Asset] = None) -> bool: 18 | pass 19 | 20 | @abstractmethod 21 | def load_metadata(self, asset: Optional[Asset] = None) -> None: 22 | pass 23 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/metadata_loader_imagery_historic.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from typing import TYPE_CHECKING, Any, Dict, Optional 5 | 6 | import shapely.wkt 7 | from rasterio.enums 
import ColorInterp 8 | 9 | from topo_processor.metadata.data_type import DataType 10 | from topo_processor.metadata.lds_cache.lds_cache import get_metadata 11 | from topo_processor.stac.asset_key import AssetKey 12 | from topo_processor.stac.collection import Collection 13 | from topo_processor.stac.linz_provider import LinzProviders 14 | from topo_processor.stac.providers import Providers 15 | from topo_processor.stac.stac_extensions import StacExtensions 16 | from topo_processor.stac.store import get_collection, get_item 17 | from topo_processor.util.conversions import ( 18 | historical_imagery_photo_type_to_linz_geospatial_type, 19 | nzt_datetime_to_utc_datetime, 20 | quarterdate_to_date_string, 21 | remove_empty_strings, 22 | string_to_boolean, 23 | string_to_number, 24 | ) 25 | from topo_processor.util.file_extension import is_tiff 26 | 27 | from .metadata_loader import MetadataLoader 28 | 29 | if TYPE_CHECKING: 30 | from topo_processor.stac.asset import Asset 31 | from topo_processor.stac.item import Item 32 | 33 | 34 | class MetadataLoaderImageryHistoric(MetadataLoader): 35 | name = "metadata.loader.imagery.historic" 36 | is_init = False 37 | raw_metadata: Dict[str, Dict[str, str]] = {} 38 | 39 | def is_applicable(self, asset: Optional[Asset] = None) -> bool: 40 | if asset: 41 | return is_tiff(asset.source_path) 42 | else: 43 | return False 44 | 45 | def load_metadata(self, asset: Optional[Asset] = None, metadata_file: str = "", is_load_all: bool = False) -> None: 46 | criteria = {} 47 | filename = "" 48 | 49 | if asset: 50 | filename = os.path.splitext(os.path.basename(asset.source_path))[0] 51 | criteria = {"raw_filename": filename} 52 | 53 | self.raw_metadata = get_metadata(DataType.IMAGERY_HISTORIC.value, criteria, metadata_file) 54 | 55 | if is_load_all: 56 | for metadata in self.raw_metadata.values(): 57 | self.populate_item(metadata) 58 | elif asset: 59 | 60 | if filename not in self.raw_metadata: 61 | asset.add_error("Asset not found in CSV file", self.name) 62 | return 63 | asset_metadata = self.raw_metadata[filename] 64 | 65 | asset.target = f"{asset_metadata['survey']}/{asset_metadata['sufi']}{asset.file_ext()}" 66 | asset.key_name = AssetKey.Visual 67 | self.populate_item(asset_metadata, asset) 68 | 69 | def populate_item(self, metadata_row: Dict[str, str], asset: Optional[Asset] = None) -> None: 70 | survey = metadata_row["survey"] 71 | if not survey: 72 | survey = metadata_row["alternate_survey_name"] 73 | title = self.get_title(survey) 74 | 75 | collection = get_collection(title) 76 | collection.survey = survey 77 | 78 | item = get_item(metadata_row["sufi"]) 79 | collection.add_item(item) 80 | 81 | if asset: 82 | item.add_asset(asset) 83 | 84 | item.collection = collection 85 | self.populate_collection(collection) 86 | 87 | item.properties.update( 88 | { 89 | "mission": collection.survey, 90 | "platform": "Fixed-wing Aircraft", 91 | "instruments": [metadata_row["camera"]], 92 | } 93 | ) 94 | 95 | self.add_linz_geospatial_type(item, metadata_row["photo_type"]) 96 | self.add_aerial_photo_metadata(item, metadata_row) 97 | self.add_camera_metadata(item, metadata_row) 98 | self.add_film_metadata(item, metadata_row) 99 | self.add_centroid(item, metadata_row) 100 | self.add_projection_extent(item) 101 | self.add_scanning_metadata(item, metadata_row) 102 | self.add_datetime_property(item, metadata_row) 103 | self.add_spatial_extent(item, metadata_row) 104 | self.add_bands_extent(item, asset) 105 | 106 | item.add_extension(StacExtensions.historical_imagery.value) 107 | 
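        # Illustrative note (not from the source): with the sample row from
        # test_data/historical_aerial_photos_metadata.csv shown in the tests above, the
        # calls up to this point leave item.properties looking roughly like:
        #   {
        #       "version": "1",
        #       "mission": "SURVEY_3",
        #       "platform": "Fixed-wing Aircraft",
        #       "instruments": ["EAGLE IV"],
        #       "aerial-photo:run": "E",
        #       "aerial-photo:sequence_number": 48,
        #       "camera:nominal_focal_length": 508,
        #       "film:id": "731",
        #       "proj:epsg": None,
        #       "proj:centroid": {"lat": -45.8079, "lon": 170.5496},
        #       ...
        #   }
        # before the remaining LINZ and version STAC extensions are attached below.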
item.add_extension(StacExtensions.linz.value) 108 | item.add_extension(StacExtensions.version.value) 109 | 110 | def populate_collection(self, collection: Collection) -> None: 111 | collection.license = "CC-BY-4.0" 112 | collection.extra_fields.update( 113 | { 114 | "linz:history": "LINZ and its predecessors, Lands & Survey and Department of Survey and Land Information (DOSLI), commissioned aerial photography for the Crown between 1936 and 2008.\nOne of the predominant uses of the aerial photography at the time was the photogrammetric mapping of New Zealand, initially at 1inch to 1mile followed by the NZMS 260 and Topo50 map series at 1:50,000.\nThese photographs were scanned through the Crown Aerial Film Archive scanning project.", 115 | "linz:lifecycle": "completed", 116 | "quality:description": "Geographic coordinates provided with this aerial photographic survey were estimated from the associated survey chart and have low positional accuracy. They should be used for general referencing only.", 117 | } 118 | ) 119 | 120 | collection.add_extension(StacExtensions.quality.value) 121 | collection.add_linz_provider(LinzProviders.LTTW.value) 122 | collection.add_linz_provider(LinzProviders.LMPP.value) 123 | collection.add_provider(Providers.NZAM.value) 124 | 125 | def get_title(self, survey: str) -> str: 126 | survey_names = get_metadata(DataType.SURVEY_FOOTPRINT_HISTORIC) 127 | title: str = "" 128 | 129 | if len(survey_names) == 0: 130 | raise Exception(f"Empty footprint metadata file when processing survey {survey}") 131 | 132 | try: 133 | title = survey_names[survey]["NAME"] 134 | except Exception as e: 135 | raise Exception(f"No name found for survey {survey} in footprint metadata file") from e 136 | 137 | return title 138 | 139 | def add_spatial_extent(self, item: Item, asset_metadata: Dict[str, str]) -> None: 140 | wkt = asset_metadata.get("WKT", None) 141 | if wkt is None or wkt.lower() == "polygon empty": 142 | item.add_warning("Geometry is missing", "") 143 | return 144 | 145 | try: 146 | # EPSG:4167 -> EPSG:4326 is mostly a null conversion, in the future if we support additional projections we should reproject this 147 | poly = shapely.wkt.loads(wkt) 148 | # Reduce the precision of all the coordinates to approx 1M resolution 149 | poly = shapely.wkt.loads(shapely.wkt.dumps(poly, rounding_precision=5)) 150 | item.geometry_poly = poly 151 | except shapely.errors.WKTReadingError as e: 152 | item.add_error("Geometry is invalid", "", e) 153 | 154 | def add_camera_metadata(self, item: Item, asset_metadata: Dict[str, str]) -> None: 155 | camera_properties: Dict[str, Any] = {} 156 | 157 | camera_properties["camera:sequence_number"] = string_to_number(asset_metadata["camera_sequence_no"]) 158 | camera_properties["camera:nominal_focal_length"] = string_to_number(asset_metadata["nominal_focal_length"]) 159 | 160 | item.properties.update(remove_empty_strings(camera_properties)) 161 | item.add_extension(StacExtensions.camera.value) 162 | 163 | def add_film_metadata(self, item: Item, asset_metadata: Dict[str, str]) -> None: 164 | film_properties: Dict[str, Any] = {} 165 | 166 | film_properties["film:id"] = asset_metadata["film"] 167 | film_properties["film:negative_sequence"] = string_to_number(asset_metadata["film_sequence_no"]) 168 | film_properties["film:physical_condition"] = asset_metadata["physical_film_condition"] 169 | film_properties["film:physical_size"] = asset_metadata["format"] 170 | 171 | item.properties.update(remove_empty_strings(film_properties)) 172 | 
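        # For example (assumed behaviour of the util/conversions.py helper):
        #   remove_empty_strings({"film:id": "731", "film:physical_condition": ""})
        #   -> {"film:id": "731"}
        # so blank CSV columns such as physical_film_condition never end up as empty
        # string properties on the STAC item.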
item.add_extension(StacExtensions.film.value) 173 | 174 | def add_aerial_photo_metadata(self, item: Item, asset_metadata: Dict[str, str]) -> None: 175 | aerial_photo_properties: Dict[str, Any] = {} 176 | aerial_photo_properties["aerial-photo:run"] = asset_metadata["run"] 177 | aerial_photo_properties["aerial-photo:sequence_number"] = string_to_number(asset_metadata["photo_no"]) 178 | aerial_photo_properties["aerial-photo:anomalies"] = asset_metadata["image_anomalies"] 179 | altitude = string_to_number(asset_metadata["altitude"]) 180 | if isinstance(altitude, int) and altitude <= 0: 181 | item.add_warning( 182 | msg="Skipped Record", 183 | cause=self.name, 184 | e=Exception(f"stac field 'aerial-photo:altitude' has value: {altitude}"), 185 | ) 186 | else: 187 | aerial_photo_properties["aerial-photo:altitude"] = altitude 188 | scale = string_to_number(asset_metadata["scale"]) 189 | if isinstance(scale, int) and scale <= 0: 190 | item.add_warning( 191 | msg="Skipped Record", 192 | cause=self.name, 193 | e=Exception(f"stac field 'aerial-photo:scale' has value: {scale}"), 194 | ) 195 | else: 196 | aerial_photo_properties["aerial-photo:scale"] = scale 197 | 198 | item.properties.update(remove_empty_strings(aerial_photo_properties)) 199 | item.add_extension(StacExtensions.aerial_photo.value) 200 | 201 | def add_scanning_metadata(self, item: Item, asset_metadata: Dict[str, str]) -> None: 202 | scanning_properties: Dict[str, Any] = {} 203 | 204 | if asset_metadata["source"]: 205 | scanning_properties["scan:is_original"] = string_to_boolean(asset_metadata["source"], ["original"], ["copy"]) 206 | if asset_metadata["when_scanned"]: 207 | scanning_properties["scan:scanned"] = quarterdate_to_date_string(asset_metadata["when_scanned"]) 208 | 209 | item.properties.update(remove_empty_strings(scanning_properties)) 210 | item.add_extension(StacExtensions.scanning.value) 211 | 212 | def add_datetime_property(self, item: Item, asset_metadata: Dict[str, str]) -> None: 213 | item_date = asset_metadata.get("date", None) 214 | 215 | if item_date: 216 | try: 217 | item.datetime = nzt_datetime_to_utc_datetime(item_date) 218 | except Exception as e: 219 | item.add_error(msg="Invalid date", cause=self.name, e=e) 220 | else: 221 | item.add_error(msg="No date found", cause=self.name, e=Exception(f"item date has no value")) 222 | 223 | def add_centroid(self, item: Item, asset_metadata: Dict[str, str]) -> None: 224 | 225 | centroid = { 226 | "lat": string_to_number(asset_metadata.get("photocentre_lat", "")), 227 | "lon": string_to_number(asset_metadata.get("photocentre_lon", "")), 228 | } 229 | if self.is_valid_centroid(item, centroid): 230 | item.properties["proj:centroid"] = centroid 231 | item.add_extension(StacExtensions.projection.value) 232 | 233 | def add_projection_extent(self, item: Item) -> None: 234 | item.properties["proj:epsg"] = None 235 | item.add_extension(StacExtensions.projection.value) 236 | 237 | def add_bands_extent(self, item: Item, asset: Optional[Asset] = None) -> None: 238 | item.add_extension(StacExtensions.eo.value) 239 | 240 | if asset: 241 | # default value 242 | asset.properties["eo:bands"] = [{"name": ColorInterp.gray.name, "common_name": "pan"}] 243 | 244 | def is_valid_centroid(self, item: Item, centroid: Dict[str, Any]) -> bool: 245 | if not isinstance(centroid["lat"], (int, float)) or centroid["lat"] > 90 or centroid["lat"] < -90: 246 | item.add_warning( 247 | msg="Skipped Record", 248 | cause=self.name, 249 | e=Exception( 250 | f"stac field 'proj:centroid' has invalid lat value: 
{centroid['lat']}, instance: {type(centroid['lat'])}" 251 | ), 252 | ) 253 | return False 254 | if not isinstance(centroid["lon"], (int, float)) or centroid["lon"] > 180 or centroid["lon"] < -180: 255 | item.add_warning( 256 | msg="Skipped Record", 257 | cause=self.name, 258 | e=Exception( 259 | f"stac field 'proj:centroid' has invalid lon value: {centroid['lon']}, instance: {type(centroid['lon'])}" 260 | ), 261 | ) 262 | return False 263 | return True 264 | 265 | def add_linz_geospatial_type(self, item: Item, photo_type: str) -> None: 266 | 267 | item.linz_geospatial_type = historical_imagery_photo_type_to_linz_geospatial_type(photo_type) 268 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/metadata_loader_repo.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, List, Optional 4 | 5 | from linz_logger import get_log 6 | 7 | from topo_processor.util.time import time_in_ms 8 | 9 | from .metadata_loader import MetadataLoader 10 | 11 | if TYPE_CHECKING: 12 | from topo_processor.stac.asset import Asset 13 | 14 | 15 | class MetadataLoaderRepository: 16 | loaders: List[MetadataLoader] = [] 17 | 18 | def append(self, loader: MetadataLoader) -> None: 19 | self.loaders.append(loader) 20 | 21 | def load_metadata(self, asset: Optional[Asset] = None) -> None: 22 | for loader in self.loaders: 23 | if loader.is_applicable(asset): 24 | start_time = time_in_ms() 25 | try: 26 | loader.load_metadata(asset) 27 | if not asset or not asset.is_valid: 28 | break 29 | except Exception as e: 30 | # TODO refactor to report errors in a better way 31 | if asset: 32 | asset.add_error(str(e), loader.name, e) 33 | get_log().error("Metadata Load Failed", error=e, loader=loader.name) 34 | raise Exception(f"Metadata Load Failed: {e}") 35 | get_log().debug( 36 | "Metadata Loaded", 37 | loader=loader.name, 38 | asset=asset.source_path, 39 | duration=time_in_ms() - start_time, 40 | ) 41 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/metadata_loader_tiff.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from typing import TYPE_CHECKING, Any, Optional 5 | 6 | import rasterio 7 | from linz_logger import get_log 8 | from rasterio.enums import ColorInterp 9 | 10 | from topo_processor.file_system.get_fs import get_fs 11 | from topo_processor.stac.stac_extensions import StacExtensions 12 | from topo_processor.util.file_extension import is_tiff 13 | 14 | from .metadata_loader import MetadataLoader 15 | 16 | if TYPE_CHECKING: 17 | from topo_processor.stac.asset import Asset 18 | 19 | 20 | class MetadataLoaderTiff(MetadataLoader): 21 | name = "metadata.loader.imagery.tiff" 22 | 23 | def is_applicable(self, asset: Optional[Asset] = None) -> bool: 24 | if asset is None or asset.item is None: 25 | return False 26 | return is_tiff(asset.source_path) 27 | 28 | def load_metadata(self, asset: Optional[Asset] = None) -> None: 29 | if asset: 30 | fs = get_fs(asset.source_path) 31 | # FIXME: Should we download the file first as we could need it to do the coggification later? 32 | # This process takes quiet a long time locally. 
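            # Sketch only (not part of the current implementation): one way to address the
            # FIXME above would be to download the file to a local temp path first, so the
            # same copy could be reused later by the COG creation step instead of re-reading
            # the remote object, e.g.:
            #
            #   import os, tempfile
            #   local_path = os.path.join(tempfile.mkdtemp(), os.path.basename(asset.source_path))
            #   fs.get(asset.source_path, local_path)  # fsspec filesystems expose get() for downloads
            #   # ...then open local_path with rasterio below instead of the remote handle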
33 | 34 | with fs.open(asset.source_path) as f: 35 | with warnings.catch_warnings(record=True) as w: 36 | with rasterio.open(f) as tiff: 37 | self.add_epsg(tiff, asset) 38 | self.add_bands(tiff, asset) 39 | for warn in w: 40 | get_log().warning(f"Rasterio Warning: {warn.message}", file=asset.source_path, loader=self.name) 41 | 42 | def add_epsg(self, tiff: Any, asset: Asset) -> None: 43 | if tiff.crs: 44 | if not tiff.crs.is_epsg_code: 45 | raise Exception("The code is not a valid EPSG code.") 46 | crs = tiff.crs.to_epsg() 47 | else: 48 | crs = None 49 | if asset.item: 50 | asset.item.properties["proj:epsg"] = crs 51 | asset.item.add_extension(StacExtensions.projection.value) 52 | 53 | def add_bands(self, tiff: Any, asset: Asset) -> None: 54 | if asset.item: 55 | asset.item.add_extension(StacExtensions.eo.value) 56 | 57 | if ColorInterp.gray in tiff.colorinterp and len(tiff.colorinterp) == 1: 58 | asset.properties["eo:bands"] = [{"name": ColorInterp.gray.name, "common_name": "pan"}] 59 | elif all(band in [ColorInterp.red, ColorInterp.blue, ColorInterp.green] for band in tiff.colorinterp): 60 | asset.properties["eo:bands"] = [ 61 | {"name": ColorInterp.red.name, "common_name": "red"}, 62 | {"name": ColorInterp.green.name, "common_name": "green"}, 63 | {"name": ColorInterp.blue.name, "common_name": "blue"}, 64 | ] 65 | elif asset.item: 66 | asset.item.add_warning( 67 | msg="Skipped Asset Record", 68 | cause=self.name, 69 | e=Exception("stac field 'eo:bands' skipped. Tiff ColorInterp does not match specified values"), 70 | ) 71 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/metadata/metadata_loaders/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_loaders/tests/metadata_loader_tiff_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from topo_processor.metadata.metadata_loaders.metadata_loader_tiff import MetadataLoaderTiff 4 | from topo_processor.stac.asset import Asset 5 | from topo_processor.stac.collection import Collection 6 | from topo_processor.stac.item import Item 7 | from topo_processor.stac.stac_extensions import StacExtensions 8 | 9 | 10 | def test_load_metadata() -> None: 11 | source_path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff") 12 | asset = Asset(source_path) 13 | item = Item("item_id") 14 | item.add_asset(asset) 15 | item.collection = Collection("Collection") 16 | loader = MetadataLoaderTiff() 17 | assert loader.is_applicable(asset) 18 | 19 | loader.load_metadata(asset) 20 | assert item.properties["proj:epsg"] is None 21 | assert StacExtensions.projection.value in item.stac_extensions 22 | assert len(item.assets) == 1 23 | assert item.assets[0].properties["eo:bands"] == [{"name": "gray", "common_name": "pan"}] 24 | assert StacExtensions.eo.value in item.stac_extensions 25 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/__init__.py: -------------------------------------------------------------------------------- 1 | from topo_processor.metadata.metadata_validators.metadata_validator import MetadataValidator 2 | 3 | from 
.metadata_validator_repo import MetadataValidatorRepository 4 | from .metadata_validator_stac import MetadataValidatorStac 5 | from .metadata_validator_tiff import MetadataValidatorTiff 6 | 7 | metadata_validator_repo = MetadataValidatorRepository() 8 | metadata_validator_repo.append(MetadataValidatorTiff()) 9 | metadata_validator_repo.append(MetadataValidatorStac()) 10 | 11 | metadata_validator_stac = MetadataValidatorStac() 12 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/metadata_validator.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import TYPE_CHECKING 5 | 6 | if TYPE_CHECKING: 7 | from topo_processor.stac.item import Item 8 | 9 | 10 | class MetadataValidator(ABC): 11 | @property 12 | @abstractmethod 13 | def name(self) -> str: 14 | pass 15 | 16 | @abstractmethod 17 | def is_applicable(self, item: Item) -> bool: 18 | pass 19 | 20 | @abstractmethod 21 | def validate_metadata(self, item: Item) -> None: 22 | pass 23 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/metadata_validator_repo.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, List 4 | 5 | from linz_logger import get_log 6 | 7 | from topo_processor.util.time import time_in_ms 8 | 9 | from .metadata_validator import MetadataValidator 10 | 11 | if TYPE_CHECKING: 12 | from topo_processor.stac.item import Item 13 | 14 | 15 | class MetadataValidatorRepository: 16 | validators: List[MetadataValidator] = [] 17 | 18 | def append(self, validator: MetadataValidator) -> None: 19 | self.validators.append(validator) 20 | 21 | def validate_metadata(self, item: Item) -> bool: 22 | is_valid = True 23 | 24 | for validator in self.validators: 25 | if validator.is_applicable(item): 26 | start_time = time_in_ms() 27 | try: 28 | validator.validate_metadata(item) 29 | except Exception as e: 30 | is_valid = False 31 | item.add_error(str(e), validator.name, e) 32 | get_log().warning(f"Validation Failed: {e}", validator=validator.name) 33 | get_log().debug( 34 | "Validity Checked", 35 | validator=validator.name, 36 | duration=time_in_ms() - start_time, 37 | ) 38 | 39 | return is_valid 40 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/metadata_validator_stac.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from typing import Any, Dict, Union 5 | 6 | import fsspec 7 | import jsonschema_rs 8 | import pystac.validation 9 | from linz_logger import get_log 10 | from pystac.errors import STACValidationError 11 | 12 | from topo_processor.stac.collection import Collection 13 | from topo_processor.stac.item import Item 14 | from topo_processor.stac.iter_errors_validator import IterErrorsValidator 15 | 16 | from .metadata_validator import MetadataValidator 17 | 18 | 19 | class MetadataValidatorStac(MetadataValidator): 20 | name = "validator.stac" 21 | validator_cache: Dict[str, Any] = {} 22 | 23 | def get_validator_from_uri(self, schema_uri: str) -> Any: 24 | if schema_uri not in self.validator_cache: 25 | file = fsspec.open(schema_uri, "rt") 26 | with file as f: 27 | 
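                # Note: the compiled jsonschema_rs schema is cached per URI, so validating
                # many items against the same core/extension schemas only pays the
                # fetch-and-compile cost once per run.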
self.validator_cache[schema_uri] = jsonschema_rs.JSONSchema.from_str(f.read()) 28 | 29 | validator = self.validator_cache[schema_uri] 30 | 31 | return validator 32 | 33 | def is_applicable(self, stac_object: Union[Item, Collection]) -> bool: 34 | return True 35 | 36 | def validate_metadata(self, item: Item) -> None: 37 | 38 | if isinstance(pystac.validation.RegisteredValidator.get_validator(), IterErrorsValidator): 39 | with warnings.catch_warnings(record=True) as w: 40 | item.create_stac().validate() 41 | msg = "" 42 | for warn in w: 43 | msg = msg + ", " + str(warn.message) 44 | if w: 45 | raise STACValidationError(message=f"Not valid STAC: {msg}") 46 | else: 47 | try: 48 | item.create_stac().validate() 49 | except STACValidationError as e: 50 | raise STACValidationError(message=f"Not valid STAC: {e}") 51 | 52 | def validate_metadata_with_report(self, stac_object: Union[Item, Collection]) -> Dict[str, list[str]]: 53 | """Validate the STAC object (Item or Collection) against the core json schema and its extensions. 54 | Return an error report [{schemaURI, [errors]}] 55 | """ 56 | errors_report: Dict[str, list[str]] = {} 57 | if isinstance(stac_object, Collection): 58 | stac_collection = stac_object.create_stac() 59 | for item in stac_object.items: 60 | stac_item = stac_object.items[item].create_stac() 61 | stac_collection.add_item(stac_item) 62 | stac_object.generate_summaries(stac_collection) 63 | stac_dict = stac_collection.to_dict(include_self_link=False) 64 | else: 65 | stac_dict = stac_object.create_stac().to_dict(include_self_link=False) 66 | 67 | schema_uris: list[str] = [stac_object.schema] + stac_dict["stac_extensions"] 68 | 69 | for schema_uri in schema_uris: 70 | get_log().trace(f"{self.name}:validate_metadata_with_report", stacId=stac_dict["id"], schema=schema_uri) 71 | current_errors = [] 72 | v = self.get_validator_from_uri(schema_uri) 73 | errors = v.iter_errors(stac_dict) 74 | 75 | for error in errors: 76 | current_errors.append(error.message) 77 | get_log().warn(f"{self.name}:validate_metadata_with_report", stacId=stac_dict["id"], error=error.message) 78 | 79 | if current_errors: 80 | errors_report[schema_uri] = current_errors 81 | 82 | return errors_report 83 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/metadata_validator_tiff.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from typing import TYPE_CHECKING 5 | 6 | from linz_logger import get_log 7 | 8 | from topo_processor.file_system.get_fs import get_fs 9 | from topo_processor.util.file_extension import is_tiff 10 | 11 | from .metadata_validator import MetadataValidator 12 | 13 | if TYPE_CHECKING: 14 | from topo_processor.stac.item import Item 15 | 16 | 17 | class MetadataValidatorTiff(MetadataValidator): 18 | name = "validator.imagery.tiff" 19 | 20 | def is_applicable(self, item: Item) -> bool: 21 | for asset in item.assets: 22 | if is_tiff(asset.source_path): 23 | return True 24 | return False 25 | 26 | def validate_metadata(self, item: Item) -> None: 27 | 28 | for asset in item.assets: 29 | if not is_tiff(asset.source_path): 30 | continue 31 | 32 | geospatial_type = item.linz_geospatial_type 33 | eo_bands = asset.properties["eo:bands"] 34 | common_names = [common_names["common_name"] for common_names in eo_bands] 35 | 36 | with warnings.catch_warnings(record=True) as w: 37 | 38 | # black and white 39 | if geospatial_type in ["black 
and white image", "black and white infrared image"]: 40 | # check eo:bands matches geospatial_type 41 | if len(eo_bands) != 1 or eo_bands[0]["common_name"] != "pan": 42 | raise Exception(f"Wrong linz_geospatial_type of '{geospatial_type}' when bands = '{eo_bands}'") 43 | # color 44 | # check linz_geospatial_type matches colorinterp 45 | elif geospatial_type in ["color image", "color infrared image"]: 46 | # check eo:bands matches colorinterp 47 | if ( 48 | len(eo_bands) != 3 49 | or "red" not in common_names 50 | or "green" not in common_names 51 | or "blue" not in common_names 52 | ): 53 | raise Exception(f"Wrong linz_geospatial_type of '{geospatial_type}' when bands = '{eo_bands}'") 54 | else: 55 | raise Exception(f"Unknown linz_geospatial_type of '{geospatial_type}'") 56 | for warn in w: 57 | get_log().warning(f"Warning: {warn.message}", file=asset.source_path, loader=self.name) 58 | -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/metadata/metadata_validators/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/metadata/metadata_validators/tests/metadata_validator_tiff_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from topo_processor.metadata.metadata_validators.metadata_validator_tiff import MetadataValidatorTiff 6 | from topo_processor.stac.asset import Asset 7 | from topo_processor.stac.item import Item 8 | 9 | 10 | def test_check_validity() -> None: 11 | source_path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff") 12 | asset = Asset(source_path) 13 | item = Item("item_id") 14 | item.add_asset(asset) 15 | item.linz_geospatial_type = "color image" 16 | asset.properties.update({"eo:bands": [{"name": "gray", "common_name": "pan"}]}) 17 | 18 | validator = MetadataValidatorTiff() 19 | assert validator.is_applicable(item) 20 | with pytest.raises( 21 | Exception, match=r"Wrong linz_geospatial_type of 'color image' when bands = [{'name': 'gray', 'common_name': 'pan'}]" 22 | ): 23 | validator.validate_metadata(item) 24 | 25 | 26 | def test_unknown_geospatial_type() -> None: 27 | source_path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff") 28 | asset = Asset(source_path) 29 | item = Item("item_id") 30 | item.add_asset(asset) 31 | item.linz_geospatial_type = "grayscale" 32 | asset.properties.update({"eo:bands": [{"name": "gray", "common_name": "pan"}]}) 33 | 34 | validator = MetadataValidatorTiff() 35 | assert validator.is_applicable(item) 36 | with pytest.raises(Exception, match=r"Unknown linz_geospatial_type of 'grayscale'"): 37 | validator.validate_metadata(item) 38 | -------------------------------------------------------------------------------- /topo_processor/stac/__init__.py: -------------------------------------------------------------------------------- 1 | from ..metadata.data_type import DataType 2 | from .asset import Asset 3 | from .collection import Collection 4 | from .item import Item 5 | from .item_factory import process_source 6 | from .linz_provider import LinzProvider, LinzProviderRole 7 | from .stac_extensions import StacExtensions 8 | from .store import collection_store 9 | from 
.validate_report import ValidateReport 10 | from .validation import validate_stac 11 | -------------------------------------------------------------------------------- /topo_processor/stac/asset.py: -------------------------------------------------------------------------------- 1 | from mimetypes import MimeTypes 2 | from os import path 3 | from typing import TYPE_CHECKING, Any, Dict, Optional, Union 4 | 5 | import pystac 6 | 7 | from topo_processor.util.checksum import multihash_as_hex 8 | from topo_processor.util.configuration import get_topo_processor_version 9 | from topo_processor.util.files import get_file_update_time 10 | from topo_processor.util.valid import Validity 11 | 12 | from .asset_key import AssetKey 13 | 14 | if TYPE_CHECKING: 15 | from .item import Item 16 | 17 | 18 | class Asset(Validity): 19 | source_path: str # The raw file location on disk 20 | target: Optional[str] = None # New file name used for uploading 21 | content_type: Optional[str] = None 22 | needs_upload: bool = True 23 | href: str 24 | properties: Dict[str, Any] 25 | item: Optional["Item"] = None 26 | key_name: Optional[AssetKey] = None 27 | 28 | def __init__(self, source_path: str): 29 | super().__init__() 30 | self.source_path = source_path 31 | self.properties = { 32 | "processing:software": get_topo_processor_version(), 33 | } 34 | 35 | def file_ext(self) -> str: 36 | return path.splitext(self.target if self.target else self.source_path)[1] 37 | 38 | def get_content_type(self) -> Union[str, None]: 39 | if self.content_type: 40 | return self.content_type 41 | return MimeTypes().guess_type(self.target if self.target else self.source_path)[0] 42 | 43 | def get_checksum(self) -> str: 44 | if "file:checksum" not in self.properties: 45 | checksum: str = multihash_as_hex(self.source_path) 46 | self.properties["file:checksum"] = checksum 47 | 48 | return_value: str = self.properties["file:checksum"] 49 | return return_value 50 | 51 | def set_output_asset_dates(self, output_path: str) -> None: 52 | if "created" not in self.properties: 53 | self.properties["created"] = get_file_update_time(output_path) 54 | # TODO: process for COG updates not created yet 55 | self.properties["updated"] = self.properties["created"] 56 | else: 57 | self.properties["updated"] = get_file_update_time(output_path) 58 | 59 | def create_stac(self) -> pystac.Asset: 60 | stac = pystac.Asset(href=self.href, extra_fields=self.properties, media_type=self.get_content_type()) 61 | return stac 62 | -------------------------------------------------------------------------------- /topo_processor/stac/asset_key.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class AssetKey(str, Enum): 5 | Visual = "visual" 6 | Thumbnail = "thumbnail" 7 | -------------------------------------------------------------------------------- /topo_processor/stac/collection.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import warnings 5 | from datetime import datetime 6 | from shutil import rmtree 7 | from tempfile import mkdtemp 8 | from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set 9 | 10 | import pystac 11 | import ulid 12 | from linz_logger import get_log 13 | from pystac.collection import Collection as PystacCollection 14 | from pystac.errors import STACValidationError 15 | from pystac.summaries import Summaries, Summarizer 16 | from pystac.validation.schema_uri_map import 
DefaultSchemaUriMap 17 | from shapely.ops import unary_union 18 | 19 | from topo_processor.metadata.data_type import DataType 20 | from topo_processor.stac.asset import Asset 21 | from topo_processor.stac.iter_errors_validator import IterErrorsValidator 22 | from topo_processor.util.time import get_min_max_interval 23 | from topo_processor.util.valid import Validity 24 | 25 | from .linz_provider import LinzProvider 26 | from .providers import Providers 27 | from .stac_extensions import StacExtensions 28 | 29 | if TYPE_CHECKING: 30 | from .item import Item 31 | 32 | TEMP_DIR: Optional[str] = None 33 | FIELDS_JSON_URL = "https://raw.githubusercontent.com/linz/stac/master/fields/fields.json" 34 | 35 | 36 | class Collection(Validity): 37 | id: str 38 | title: str 39 | survey: str 40 | description: str 41 | license: str 42 | items: Dict[str, "Item"] 43 | linz_providers: List[Dict[str, Any]] 44 | providers: List[pystac.Provider] 45 | schema: Optional[str] 46 | extra_fields: Dict[str, Any] 47 | linz_geospatial_type: str 48 | 49 | stac_extensions: Set[str] 50 | summaries: Summaries = Summaries.empty() 51 | 52 | def __init__(self, title: str): 53 | super().__init__() 54 | # FIXME: Do we want to generate this id like this? 55 | self.id = str(ulid.ULID()) 56 | self.title = title 57 | self.description = "" 58 | self.items = {} 59 | self.schema = DefaultSchemaUriMap().get_object_schema_uri(pystac.STACObjectType.COLLECTION, pystac.get_stac_version()) 60 | self.extra_fields = dict( 61 | { 62 | # TODO: decision to be made on version ref comments [TDE-230] hardcode to '1' for now 63 | "version": "1", 64 | "linz:security_classification": "unclassified", 65 | } 66 | ) 67 | self.linz_providers = [] 68 | self.stac_extensions = set([StacExtensions.file.value]) 69 | self.providers = [Providers.TTW.value] 70 | 71 | def add_item(self, item: Item) -> None: 72 | if item.collection is not None and item.collection != self: 73 | raise Exception(f"Remapping of collection? existing='{item.collection.title}' new='{self.title}' item='{item.id}'") 74 | if item.id in self.items: 75 | existing = self.items[item.id] 76 | if existing != item: 77 | raise Exception(f"Remapping of item id in collection='{self.title}' item='{item.id}'") 78 | return 79 | self.items[item.id] = item 80 | 81 | def add_extension(self, ext: str) -> None: 82 | self.stac_extensions.add(ext) 83 | 84 | def add_provider(self, provider: pystac.Provider) -> None: 85 | if provider not in self.providers: 86 | self.providers.append(provider) 87 | 88 | def add_linz_provider(self, linz_provider: LinzProvider) -> None: 89 | if linz_provider.to_dict() not in self.linz_providers: 90 | self.linz_providers.append(linz_provider.to_dict()) 91 | 92 | def update_description(self, stac_collection: pystac.Collection, data_type: DataType) -> None: 93 | if data_type == DataType.IMAGERY_HISTORIC: 94 | size = self.summaries.to_dict()["film:physical_size"] 95 | if len(size) == 1: 96 | size = size[0] 97 | colour = self.extra_fields["linz:geospatial_type"] 98 | stac_collection.description = ( 99 | self.description 100 | ) = f"This aerial photographic survey was digitised from {colour} {size} negatives in the Crown collection of the Crown Aerial Film Archive." 
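    # Illustrative usage (a sketch, not from the source; assumes the helpers in stac/store.py):
    #
    #   from topo_processor.stac.store import get_collection, get_item
    #   collection = get_collection("EXAMPLE SURVEY TITLE")  # hypothetical title
    #   collection.survey = "SURVEY_3"
    #   item = get_item("72352")
    #   collection.add_item(item)
    #   item.collection = collection
    #   stac_collection = collection.create_stac()
    #   for item_id in collection.items:
    #       stac_collection.add_item(collection.items[item_id].create_stac())
    #   collection.generate_summaries(stac_collection)
    #
    # add_item() is idempotent for the same Item instance and raises if a different item
    # re-uses an existing id, so repeated metadata rows cannot silently remap a collection.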
101 | 102 | def get_temp_dir(self) -> str: 103 | global TEMP_DIR 104 | if not TEMP_DIR: 105 | TEMP_DIR = mkdtemp() 106 | get_log().debug("Temp directory created", path=TEMP_DIR) 107 | temp_dir = os.path.join(TEMP_DIR, self.survey) 108 | if not os.path.exists(temp_dir): 109 | os.mkdir(temp_dir) 110 | return temp_dir 111 | 112 | def get_temporal_extent(self) -> List[Optional[datetime]]: 113 | dates: List[datetime] = [] 114 | 115 | for item in self.items.values(): 116 | if item.datetime: 117 | dates.append(item.datetime) 118 | 119 | return get_min_max_interval(dates) 120 | 121 | def get_bounding_boxes(self) -> List[List[float]]: 122 | """ 123 | create a union of all item bounding boxes inside the collection 124 | """ 125 | polys = [x.geometry_poly for x in self.items.values() if x.geometry_poly is not None] 126 | 127 | if len(polys) == 0: 128 | return [[0.0, 0.0, 0.0, 0.0]] 129 | union_poly = unary_union(polys) 130 | return [list(union_poly.bounds)] 131 | 132 | def get_linz_geospatial_type(self) -> str: 133 | geospatial_type_set = set(x.linz_geospatial_type for x in self.items.values() if x.linz_geospatial_type) 134 | if len(geospatial_type_set) != 1: 135 | get_log().warning(f"Invalid 'linz:geospatial_type' collection='{self.title}'") 136 | return "invalid geospatial type" 137 | geospatial_type_str = geospatial_type_set.pop() 138 | return geospatial_type_str 139 | 140 | def get_linz_asset_summaries(self) -> Dict[str, Any]: 141 | assets_checked: List[Asset] = [] 142 | dates_created: List[datetime] = [] 143 | dates_updated: List[datetime] = [] 144 | processing_software_versions: List[Dict[str, str]] = [] 145 | 146 | for item in self.items.values(): 147 | for asset in item.assets: 148 | if not asset.needs_upload: 149 | continue 150 | if not asset in assets_checked: 151 | if "created" in asset.properties: 152 | dates_created.append(asset.properties["created"]) 153 | dates_updated.append(asset.properties["updated"]) 154 | if "processing:software" in asset.properties: 155 | if asset.properties["processing:software"] not in processing_software_versions: 156 | processing_software_versions.append(asset.properties["processing:software"]) 157 | assets_checked.append(asset) 158 | 159 | interval_created = get_min_max_interval(dates_created) 160 | interval_updated = get_min_max_interval(dates_updated) 161 | 162 | # to pass metadata-only validation as there are no assets to populate mandatory linz:asset_summaries 163 | # TODO: review this workaround once validation command has been combined into upload command 164 | if not assets_checked: 165 | return { 166 | "created": {"minimum": "0000-01-01T00:00:00Z", "maximum": "0000-01-01T00:00:00Z"}, 167 | "updated": {"minimum": "0000-01-01T00:00:00Z", "maximum": "0000-01-01T00:00:00Z"}, 168 | } 169 | 170 | return { 171 | "processing:software": processing_software_versions, 172 | "created": {"minimum": interval_created[0], "maximum": interval_created[1]}, 173 | "updated": {"minimum": interval_updated[0], "maximum": interval_updated[1]}, 174 | } 175 | 176 | def delete_temp_dir(self) -> None: 177 | global TEMP_DIR 178 | if TEMP_DIR: 179 | if os.path.exists(TEMP_DIR): 180 | rmtree(TEMP_DIR) 181 | TEMP_DIR = None 182 | 183 | def generate_summaries(self, collection: pystac.Collection) -> None: 184 | summarizer = Summarizer(fields=FIELDS_JSON_URL) 185 | collection.summaries = summarizer.summarize(collection) 186 | self.summaries = collection.summaries 187 | 188 | def create_stac(self) -> pystac.Collection: 189 | if self.linz_providers: 190 | 
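            # Illustrative shape of the extra_fields written below (example values only):
            #   "linz:providers":        list of provider dicts added via add_linz_provider()
            #   "linz:geospatial_type":  e.g. "black and white image"
            #   "linz:asset_summaries":  {"processing:software": [...],
            #                             "created": {"minimum": ..., "maximum": ...},
            #                             "updated": {"minimum": ..., "maximum": ...}}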
self.extra_fields["linz:providers"] = self.linz_providers 191 | self.extra_fields["linz:geospatial_type"] = self.get_linz_geospatial_type() 192 | self.extra_fields["linz:asset_summaries"] = self.get_linz_asset_summaries() 193 | 194 | stac = pystac.Collection( 195 | id=self.id, 196 | description=self.description, 197 | extent=pystac.Extent( 198 | pystac.SpatialExtent(bboxes=self.get_bounding_boxes()), 199 | pystac.TemporalExtent(intervals=[self.get_temporal_extent()]), 200 | ), 201 | title=self.title, 202 | stac_extensions=list(sorted(self.stac_extensions)), 203 | href="./collection.json", 204 | extra_fields=self.extra_fields, 205 | license=self.license, 206 | providers=self.providers, 207 | ) 208 | get_log().info("Stac Collection Created", id=stac.id, title=stac.title) 209 | return stac 210 | 211 | def validate_pystac_collection(self, pystac_collection: PystacCollection) -> None: 212 | 213 | if isinstance(pystac.validation.RegisteredValidator.get_validator(), IterErrorsValidator): 214 | with warnings.catch_warnings(record=True) as w: 215 | pystac_collection.validate() 216 | msg = "" 217 | for warn in w: 218 | msg = msg + ", " + str(warn.message) 219 | if w: 220 | raise Exception(f"Not valid STAC: {msg}") 221 | 222 | else: 223 | try: 224 | pystac_collection.validate() 225 | except STACValidationError as e: 226 | raise Exception(f"Not valid STAC") from e 227 | -------------------------------------------------------------------------------- /topo_processor/stac/item.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | from typing import Any, Dict, List, Optional, Set 3 | 4 | import shapely.geometry 5 | from linz_logger import get_log 6 | from pystac import get_stac_version 7 | from pystac.item import Item as PystacItem 8 | from pystac.stac_object import STACObjectType 9 | from pystac.validation.schema_uri_map import DefaultSchemaUriMap 10 | 11 | from topo_processor.util.valid import Validity 12 | 13 | from .asset import Asset 14 | from .collection import Collection 15 | from .stac_extensions import StacExtensions 16 | 17 | 18 | class Item(Validity): 19 | 20 | id: str 21 | geometry_poly: Optional[shapely.geometry.Polygon] = None 22 | linz_geospatial_type: str = "" 23 | datetime: Optional[dt.datetime] = None 24 | properties: Dict[str, Any] 25 | stac_extensions: Set[str] 26 | assets: List[Asset] 27 | collection: Optional[Collection] = None 28 | schema: Optional[str] 29 | 30 | def __init__(self, item_id: str): 31 | super().__init__() 32 | self.id = item_id 33 | self.properties = { 34 | # TODO: decision to be made on version ref comments [TDE-230] hardcode to '1' for now 35 | "version": "1", 36 | } 37 | self.stac_extensions = set([StacExtensions.file.value]) 38 | self.assets = [] 39 | self.schema = DefaultSchemaUriMap().get_object_schema_uri(STACObjectType.ITEM, get_stac_version()) 40 | 41 | def is_valid(self) -> bool: 42 | if not super().is_valid(): 43 | return False 44 | for asset in self.assets: 45 | if not asset.is_valid(): 46 | return False 47 | return True 48 | 49 | def add_asset(self, asset: Asset) -> None: 50 | if asset.item: 51 | raise Exception(f"Asset is already associated with an item: existing item='{asset.item.id}' new item='{self.id}'") 52 | self.assets.append(asset) 53 | asset.item = self 54 | 55 | def add_extension(self, ext: str, add_to_collection: bool = True) -> None: 56 | self.stac_extensions.add(ext) 57 | if not self.collection: 58 | return 59 | if add_to_collection: 60 | self.collection.add_extension(ext) 61 | 62 
| def create_stac(self) -> PystacItem: 63 | geometry = None 64 | bbox = None 65 | if self.geometry_poly is not None: 66 | geometry = shapely.geometry.mapping(self.geometry_poly) 67 | bbox = self.geometry_poly.bounds 68 | 69 | stac = PystacItem( 70 | id=self.id, 71 | geometry=geometry, 72 | bbox=bbox, 73 | datetime=self.datetime, 74 | properties=self.properties, 75 | stac_extensions=list(sorted(self.stac_extensions)), 76 | ) 77 | get_log().info("Stac Item Created", id=stac.id) 78 | return stac 79 | -------------------------------------------------------------------------------- /topo_processor/stac/item_factory.py: -------------------------------------------------------------------------------- 1 | from linz_logger import get_log 2 | 3 | from topo_processor.data.data_transformers import data_transformer_repo 4 | from topo_processor.file_system.assets import get_assets 5 | from topo_processor.metadata.data_type import DataType 6 | from topo_processor.metadata.metadata_loaders import metadata_loader_rep 7 | from topo_processor.metadata.metadata_validators import metadata_validator_repo 8 | from topo_processor.stac.store import asset_store, item_store 9 | from topo_processor.util.time import time_in_ms 10 | 11 | 12 | def process_source(source: str, data_type: DataType, metadata_path: str = "", force: bool = False) -> None: 13 | start_time = time_in_ms() 14 | _create_assets(source, data_type, metadata_path) 15 | total_asset = len(asset_store) 16 | if total_asset == 0: 17 | get_log().warn("No Assets Found", assets=total_asset, source=source, duration=time_in_ms() - start_time) 18 | return 19 | 20 | get_log().debug("Assets Created", assets=total_asset, source=source, duration=time_in_ms() - start_time) 21 | 22 | start_time = time_in_ms() 23 | _create_items(force) 24 | total_item = len(item_store) 25 | if len(item_store) == 0: 26 | get_log().warn("No Items Created", items=total_item, source=source, duration=time_in_ms() - start_time) 27 | return 28 | 29 | get_log().debug("Items Created", items=total_item, source=source, duration=time_in_ms() - start_time) 30 | 31 | 32 | def _create_assets(source: str, data_type: str, metadata_path: str) -> None: 33 | assets = get_assets(source, data_type, metadata_path) 34 | for asset in assets: 35 | metadata_loader_rep.load_metadata(asset) 36 | 37 | 38 | def _create_items(force: bool = False) -> None: 39 | all_items_valid = True 40 | # Validate metadata of valid items 41 | # Item can be already detected invalid from the metadata loader 42 | # For those, we don't want to validate their metadata 43 | for item in item_store.values(): 44 | if item.is_valid(): 45 | metadata_is_valid = metadata_validator_repo.validate_metadata(item) 46 | if all_items_valid and not metadata_is_valid: 47 | all_items_valid = metadata_is_valid 48 | 49 | if not all_items_valid and not force: 50 | raise Exception("At least one Item is not valid. 
Process is stopped") 51 | 52 | for item in item_store.values(): 53 | if item.is_valid(): 54 | data_transformer_repo.transform_data(item) 55 | -------------------------------------------------------------------------------- /topo_processor/stac/iter_errors_validator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import warnings 3 | from typing import Any, Dict, List, Optional, Tuple 4 | 5 | import jsonschema 6 | import pystac 7 | from linz_logger import get_log 8 | from pystac import STACObjectType 9 | from pystac.validation.schema_uri_map import DefaultSchemaUriMap, SchemaUriMap 10 | from pystac.validation.stac_validator import STACValidator 11 | 12 | 13 | class IterErrorsValidator(STACValidator): 14 | 15 | schema_uri_map: SchemaUriMap 16 | schema_cache: Dict[str, Dict[str, Any]] 17 | 18 | def __init__(self, schema_uri_map: Optional[SchemaUriMap] = None) -> None: 19 | 20 | if schema_uri_map is not None: 21 | self.schema_uri_map = schema_uri_map 22 | else: 23 | self.schema_uri_map = DefaultSchemaUriMap() 24 | 25 | self.schema_cache = {} 26 | 27 | def get_schema_from_uri(self, schema_uri: str) -> Tuple[Dict[str, Any], Any]: 28 | if schema_uri not in self.schema_cache: 29 | s = json.loads(pystac.StacIO.default().read_text(schema_uri)) 30 | self.schema_cache[schema_uri] = s 31 | 32 | schema = self.schema_cache[schema_uri] 33 | 34 | resolver = jsonschema.validators.RefResolver(base_uri=schema_uri, referrer=schema, store=self.schema_cache) 35 | 36 | return schema, resolver 37 | 38 | def _validate_from_uri(self, stac_dict: Dict[str, Any], schema_uri: str) -> List[str]: 39 | """Return a list of the error(s) found during the validation of stac_dict against schema_uri""" 40 | errors = [] 41 | schema, resolver = self.get_schema_from_uri(schema_uri) 42 | 43 | # Draft7 for pystac 44 | validator = jsonschema.Draft7Validator(schema) 45 | for error in sorted(validator.evolve(schema=schema).iter_errors(stac_dict), key=str): 46 | errors.append(error.message) 47 | 48 | for uri in resolver.store: 49 | if uri not in self.schema_cache: 50 | self.schema_cache[uri] = resolver.store[uri] 51 | 52 | return errors 53 | 54 | def _get_error_message( 55 | self, 56 | schema_uri: str, 57 | stac_object_type: STACObjectType, 58 | extension_id: Optional[str], 59 | href: Optional[str], 60 | stac_id: Optional[str], 61 | errors: Optional[List[str]], 62 | ) -> str: 63 | s = "Validation failed for {} ".format(stac_object_type) 64 | if href is not None: 65 | s += "at {} ".format(href) 66 | if stac_id is not None: 67 | s += "with ID {} ".format(stac_id) 68 | s += "against schema at {}".format(schema_uri) 69 | if extension_id is not None: 70 | s += " for STAC extension '{}'".format(extension_id) 71 | if errors: 72 | s += " with the following error(s): '{}'".format(", ".join(errors)) 73 | return s 74 | 75 | def validate_core( 76 | self, 77 | stac_dict: Dict[str, Any], 78 | stac_object_type: STACObjectType, 79 | stac_version: str, 80 | href: Optional[str] = None, 81 | ) -> Optional[str]: 82 | """Validate a core stac object. 83 | Return value can be None or specific to the implementation. 84 | Args: 85 | stac_dict : Dictionary that is the STAC json of the object. 86 | stac_object_type : The stac object type of the object encoded in 87 | stac_dict. One of :class:`~pystac.STACObjectType`. 88 | stac_version : The version of STAC to validate the object against. 89 | href : Optional HREF of the STAC object being validated. 
90 | Returns: 91 | str: URI for the JSON schema that was validated against, or None if 92 | no validation occurred. 93 | """ 94 | schema_uri = self.schema_uri_map.get_object_schema_uri(stac_object_type, stac_version) 95 | 96 | if schema_uri is None: 97 | return None 98 | try: 99 | errors = self._validate_from_uri(stac_dict, schema_uri) 100 | except Exception as e: 101 | get_log().error(f"Exception while validating {stac_object_type} href: {href}") 102 | raise e 103 | if errors: 104 | msg = self._get_error_message(schema_uri, stac_object_type, None, href, stac_dict.get("id"), errors) 105 | warnings.warn(msg) 106 | 107 | return schema_uri 108 | 109 | def validate_extension( 110 | self, 111 | stac_dict: Dict[str, Any], 112 | stac_object_type: STACObjectType, 113 | stac_version: str, 114 | extension_id: str, 115 | href: Optional[str] = None, 116 | ) -> Optional[str]: 117 | """Validate an extension stac object. 118 | Return value can be None or specific to the implementation. 119 | Args: 120 | stac_dict : Dictionary that is the STAC json of the object. 121 | stac_object_type : The stac object type of the object encoded in 122 | stac_dict. One of :class:`~pystac.STACObjectType`. 123 | stac_version : The version of STAC to validate the object against. 124 | extension_id : The extension ID to validate against. 125 | href : Optional HREF of the STAC object being validated. 126 | Returns: 127 | str: URI for the JSON schema that was validated against, or None if 128 | no validation occurred. 129 | """ 130 | schema_uri = extension_id 131 | 132 | if schema_uri is None: 133 | return None 134 | 135 | try: 136 | errors = self._validate_from_uri(stac_dict, schema_uri) 137 | except Exception as e: 138 | get_log().error(f"Exception while validating {stac_object_type} href: {href}") 139 | raise e 140 | if errors: 141 | msg = self._get_error_message(schema_uri, stac_object_type, extension_id, href, stac_dict.get("id"), errors) 142 | warnings.warn(msg) 143 | 144 | return schema_uri 145 | -------------------------------------------------------------------------------- /topo_processor/stac/linz_provider.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Any, Dict, List, Optional 3 | 4 | from pystac import Provider 5 | from pystac.utils import StringEnum 6 | 7 | 8 | class LinzProviderRole(StringEnum): 9 | """Enumerates the allows values of the LinzProvider "role" field.""" 10 | 11 | MANAGER = "manager" 12 | CUSTODIAN = "custodian" 13 | 14 | 15 | class LinzProvider(Provider): 16 | 17 | roles: Optional[List[LinzProviderRole]] # type:ignore 18 | """Optional roles of the provider. Any of manager or custodian. 19 | LINZ override of pystac.ProviderRole Enum. 
20 | Type ignored due to: https://github.com/radiantearth/stac-spec/issues/1147 21 | """ 22 | 23 | def __init__( 24 | self, 25 | name: str, 26 | description: Optional[str] = None, 27 | roles: Optional[List[LinzProviderRole]] = None, 28 | url: Optional[str] = None, 29 | extra_fields: Optional[Dict[str, Any]] = None, 30 | ): 31 | self.name = name 32 | self.description = description 33 | self.roles = roles 34 | self.url = url 35 | self.extra_fields = extra_fields or {} 36 | 37 | 38 | class LinzProviders(Enum): 39 | LTTW = LinzProvider( 40 | name="Toitū Te Whenua LINZ", 41 | description="The New Zealand Government's lead agency for location and property information, Crown land and managing overseas investment.", 42 | roles=[LinzProviderRole.CUSTODIAN], 43 | url="https://www.linz.govt.nz/about-linz/what-were-doing/projects/crown-aerial-film-archive-historical-imagery-scanning-project", 44 | ) 45 | LMPP = LinzProvider(name="Manager Partnership Programmes", roles=[LinzProviderRole.MANAGER]) 46 | -------------------------------------------------------------------------------- /topo_processor/stac/providers.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import pystac 4 | 5 | 6 | class Providers(Enum): 7 | TTW = pystac.Provider( 8 | name="Toitū Te Whenua LINZ", 9 | description="The New Zealand Government's lead agency for location and property information, Crown land and managing overseas investment.", 10 | roles=[pystac.ProviderRole.HOST, pystac.ProviderRole.LICENSOR, pystac.ProviderRole.PROCESSOR], 11 | url="https://www.linz.govt.nz/about-linz/what-were-doing/projects/crown-aerial-film-archive-historical-imagery-scanning-project", 12 | ) 13 | NZAM = pystac.Provider( 14 | name="NZ Aerial Mapping", 15 | description="Aerial survey and geospatial services firm. 
Went into liquidation in 2014.", 16 | roles=[pystac.ProviderRole.PRODUCER], 17 | ) 18 | -------------------------------------------------------------------------------- /topo_processor/stac/stac_extensions.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class StacExtensions(str, Enum): 5 | linz = "https://stac.linz.govt.nz/v0.0.15/linz/schema.json" 6 | quality = "https://stac.linz.govt.nz/v0.0.15/quality/schema.json" 7 | historical_imagery = "https://stac.linz.govt.nz/v0.0.15/historical-imagery/schema.json" 8 | aerial_photo = "https://stac.linz.govt.nz/v0.0.15/aerial-photo/schema.json" 9 | camera = "https://stac.linz.govt.nz/v0.0.15/camera/schema.json" 10 | film = "https://stac.linz.govt.nz/v0.0.15/film/schema.json" 11 | scanning = "https://stac.linz.govt.nz/v0.0.15/scanning/schema.json" 12 | eo = "https://stac-extensions.github.io/eo/v1.0.0/schema.json" 13 | file = "https://stac-extensions.github.io/file/v2.0.0/schema.json" 14 | projection = "https://stac-extensions.github.io/projection/v1.0.0/schema.json" 15 | version = "https://stac-extensions.github.io/version/v1.0.0/schema.json" 16 | -------------------------------------------------------------------------------- /topo_processor/stac/store.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from .asset import Asset 4 | from .collection import Collection 5 | from .item import Item 6 | 7 | collection_store: Dict[str, Collection] = {} 8 | item_store: Dict[str, Item] = {} 9 | asset_store: Dict[str, Asset] = {} 10 | 11 | 12 | def get_collection(title: str) -> Collection: 13 | if title not in collection_store: 14 | collection = Collection(title) 15 | collection_store[title] = collection 16 | return collection_store[title] 17 | 18 | 19 | def get_asset(source_path: str) -> Asset: 20 | if source_path not in asset_store: 21 | asset = Asset(source_path) 22 | asset_store[source_path] = asset 23 | return asset_store[source_path] 24 | 25 | 26 | def get_item(item_id: str) -> Item: 27 | if item_id not in item_store: 28 | item = Item(item_id) 29 | item_store[item_id] = item 30 | return item_store[item_id] 31 | -------------------------------------------------------------------------------- /topo_processor/stac/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/stac/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/stac/tests/asset_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from topo_processor.stac.asset import Asset 6 | 7 | 8 | def test_asset() -> None: 9 | """validate adding of extra field: file:checksum""" 10 | source_path = os.path.abspath(os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff")) 11 | asset = Asset(source_path) 12 | asset.href = "test_asset" 13 | checksum = asset.get_checksum() 14 | json_asset = asset.create_stac().to_dict() 15 | assert json_asset["file:checksum"] == checksum 16 | -------------------------------------------------------------------------------- /topo_processor/stac/tests/file_extension_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.metadata.data_type import 
DataType 4 | from topo_processor.util.file_extension import FILE_EXTENSIONS, is_extension, is_tiff 5 | 6 | 7 | def test_is_tiff() -> None: 8 | file_a = "file.tiff" 9 | file_b = "file.tif" 10 | file_c = "file.TIFF" 11 | file_d = "file.jpg" 12 | 13 | assert is_tiff(file_a) is True 14 | assert is_tiff(file_b) is True 15 | assert is_tiff(file_c) is True 16 | assert is_tiff(file_d) is False 17 | 18 | 19 | def test_is_extension_imagery_historic() -> None: 20 | file_a = "file.tiff" 21 | file_b = "file.tif" 22 | file_c = "file.TIFF" 23 | file_d = "file.jpg" 24 | 25 | assert is_extension(file_a, FILE_EXTENSIONS[DataType.IMAGERY_HISTORIC]) is True 26 | assert is_extension(file_b, FILE_EXTENSIONS[DataType.IMAGERY_HISTORIC]) is True 27 | assert is_extension(file_c, FILE_EXTENSIONS[DataType.IMAGERY_HISTORIC]) is True 28 | assert is_extension(file_d, FILE_EXTENSIONS[DataType.IMAGERY_HISTORIC]) is False 29 | -------------------------------------------------------------------------------- /topo_processor/stac/tests/iter_errors_validator_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | import pytest 5 | from pystac import STACValidationError, validation 6 | 7 | from topo_processor.metadata.metadata_validators.metadata_validator_stac import MetadataValidatorStac 8 | from topo_processor.stac.asset import Asset 9 | from topo_processor.stac.item import Item 10 | from topo_processor.stac.stac_extensions import StacExtensions 11 | 12 | 13 | def test_iter_errors_validator() -> None: 14 | """check error details is in exception message""" 15 | source_path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff") 16 | asset = Asset(source_path) 17 | item = Item("item_id") 18 | item.datetime = datetime.now() 19 | item.add_asset(asset) 20 | item.properties.update({"camera:nominal_focal_length": "string"}) 21 | item.properties.update({"camera:sequence_number": 1234}) 22 | item.add_extension(StacExtensions.camera.value, add_to_collection=False) 23 | validator = MetadataValidatorStac() 24 | assert validator.is_applicable(item) 25 | with pytest.raises(STACValidationError) as e: 26 | validator.validate_metadata(item) 27 | assert "'string' is not of type 'integer'" in str(e.value) 28 | 29 | 30 | def test_iter_errors_validator_multiple_extensions() -> None: 31 | """check error details is in exception message""" 32 | source_path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff") 33 | asset = Asset(source_path) 34 | item = Item("item_id") 35 | item.datetime = datetime.now() 36 | item.add_asset(asset) 37 | item.properties.update({"camera:nominal_focal_length": "string"}) 38 | item.properties.update({"camera:sequence_number": 1234}) 39 | item.add_extension(StacExtensions.camera.value, add_to_collection=False) 40 | item.add_extension(StacExtensions.aerial_photo.value, add_to_collection=False) 41 | validator = MetadataValidatorStac() 42 | assert validator.is_applicable(item) 43 | 44 | with pytest.raises(STACValidationError) as e: 45 | validator.validate_metadata(item) 46 | assert "'string' is not of type 'integer'" in str(e.value) 47 | assert "'aerial-photo:run' is a required property" in str(e.value) 48 | -------------------------------------------------------------------------------- /topo_processor/stac/tests/validate_report_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.stac.validate_report import 
ValidateReport 4 | 5 | 6 | def test_increment_error() -> None: 7 | """Check that increment_error counts errors per schema and error type""" 8 | error_report: ValidateReport = ValidateReport() 9 | error_report.increment_error("schema_a", "error_1") 10 | assert error_report.report_per_error_type["schema_a"]["error_1"] == 1 11 | error_report.increment_error("schema_a", "error_1") 12 | assert error_report.report_per_error_type["schema_a"]["error_1"] == 2 13 | error_report.increment_error("schema_b", "error_1") 14 | assert error_report.report_per_error_type["schema_b"]["error_1"] == 1 15 | error_report.increment_error("schema_a", "error_2") 16 | assert error_report.report_per_error_type["schema_a"]["error_2"] == 1 17 | -------------------------------------------------------------------------------- /topo_processor/stac/validate_report.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | 4 | class ValidateReport: 5 | total: int 6 | report_per_error_type: Dict[str, Dict[str, int]] 7 | 8 | def __init__(self) -> None: 9 | self.total = 0 10 | self.report_per_error_type = {} 11 | 12 | def add_errors(self, errors_per_schema: Dict[str, List[str]]) -> None: 13 | for schema_uri in errors_per_schema: 14 | for error in errors_per_schema[schema_uri]: 15 | self.increment_error(schema_uri, error) 16 | self.total = self.total + 1 17 | 18 | def increment_error(self, schema: str, error: str) -> None: 19 | existing = self.report_per_error_type.get(schema) 20 | if existing is None: 21 | self.report_per_error_type[schema] = existing = {} 22 | existing[error] = existing.get(error, 0) + 1 23 | -------------------------------------------------------------------------------- /topo_processor/stac/validation.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Union 2 | 3 | from linz_logger import get_log 4 | 5 | from topo_processor.metadata.data_type import DataType, get_layer_id 6 | from topo_processor.metadata.metadata_loaders import metadata_loader_imagery_hist 7 | from topo_processor.metadata.metadata_validators import metadata_validator_stac 8 | from topo_processor.stac.validate_report import ValidateReport 9 | from topo_processor.util.time import time_in_ms 10 | 11 | from .collection import Collection 12 | from .item import Item 13 | from .store import collection_store, item_store 14 | 15 | 16 | def validate_stac(metadata_file: str = "", validate_item: bool = True, validate_collection: bool = True) -> None: 17 | """This function only validates the Historical Imagery layer at the moment.""" 18 | # FIXME: Make this function generic by validating other layers (vs only Historical Imagery atm) 19 | start_time = time_in_ms() 20 | item_report: ValidateReport = ValidateReport() 21 | collection_report: ValidateReport = ValidateReport() 22 | 23 | get_log().debug("validate_stac", layer=get_layer_id(DataType.IMAGERY_HISTORIC)) 24 | 25 | # Load metadata from the metadata CSV file 26 | metadata_loader_imagery_hist.load_metadata(None, metadata_file, True) 27 | get_log().debug("Metadata Loaded", metadata_file=metadata_file, duration=time_in_ms() - start_time) 28 | 29 | # Validate metadata from stored STAC objects 30 | if validate_item: 31 | item_report = validate_store(item_store) 32 | if validate_collection: 33 | collection_report = validate_store(collection_store) 34 | 35 | # Print report 36 | get_log().info( 37 | "Metadata Validated", 38 | metadata_file=metadata_file, 39 | nbItemsValidated=item_report.total, 40 | nbCollectionsValidated=collection_report.total, 41
| duration=time_in_ms() - start_time, 42 | itemErrors=item_report.report_per_error_type, 43 | collectionErrors=collection_report.report_per_error_type, 44 | ) 45 | 46 | 47 | def validate_store(store: Union[Dict[str, Item], Dict[str, Collection]]) -> ValidateReport: 48 | validate_report: ValidateReport = ValidateReport() 49 | 50 | for stac_object in store.values(): 51 | if stac_object.is_valid(): 52 | validate_report.add_errors(metadata_validator_stac.validate_metadata_with_report(stac_object)) 53 | 54 | return validate_report 55 | -------------------------------------------------------------------------------- /topo_processor/util/__init__.py: -------------------------------------------------------------------------------- 1 | from .command import Command 2 | from .execution import ExecutionDocker, ExecutionLocal 3 | -------------------------------------------------------------------------------- /topo_processor/util/aws_credentials.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import TYPE_CHECKING, Dict 3 | 4 | from boto3 import Session 5 | from linz_logger import get_log 6 | 7 | from topo_processor.util.configuration import aws_profile, linz_ssm_bucket_config_name 8 | 9 | if TYPE_CHECKING: 10 | from mypy_boto3_sts import STSClient 11 | else: 12 | STSClient = object 13 | 14 | 15 | class Credentials: 16 | access_key: str 17 | secret_key: str 18 | token: str 19 | 20 | def __init__(self, access_key: str, secret_key: str, token: str): 21 | self.access_key = access_key 22 | self.secret_key = secret_key 23 | self.token = token 24 | 25 | 26 | session = Session(profile_name=aws_profile) 27 | client_sts: STSClient = session.client("sts") 28 | bucket_roles: Dict[str, Dict[str, str]] = {} 29 | bucket_credentials: Dict[str, Credentials] = {} 30 | 31 | # Load bucket to roleArn mapping for LINZ internal buckets from SSM 32 | def init_roles() -> None: 33 | get_log().debug("init_roles", linz_ssm_bucket_name=linz_ssm_bucket_config_name, aws_profile=aws_profile) 34 | if linz_ssm_bucket_config_name is None: 35 | return 36 | 37 | get_log().debug("load_bucket_config", ssm=linz_ssm_bucket_config_name) 38 | role_config_param = session.client("ssm").get_parameter(Name=linz_ssm_bucket_config_name) 39 | role_config = json.loads(role_config_param["Parameter"]["Value"]) 40 | 41 | for cfg in role_config: 42 | bucket_roles[cfg["bucket"]] = cfg 43 | get_log().info("load_bucket_config_done", ssm=linz_ssm_bucket_config_name, buckets=len(role_config)) 44 | 45 | 46 | def get_credentials_from_bucket(bucket_name: str) -> Credentials: 47 | get_log().debug("get_credentials_from_bucket", bucket_name=bucket_name) 48 | # FIXME: check if the token is expired - add a parameter 49 | if bucket_name not in bucket_credentials: 50 | role_arn = get_role_arn(bucket_name) 51 | if role_arn: 52 | bucket_credentials[bucket_name] = get_credentials_from_role(role_arn) 53 | else: 54 | session_credentials = session.get_credentials() 55 | default_credentials = Credentials( 56 | session_credentials.access_key, session_credentials.secret_key, session_credentials.token 57 | ) 58 | 59 | return default_credentials 60 | return bucket_credentials[bucket_name] 61 | 62 | 63 | def get_credentials_from_role(role_arn: str) -> Credentials: 64 | get_log().debug("get_credentials_from_role", role_arn=role_arn) 65 | assumed_role = client_sts.assume_role(RoleArn=role_arn, RoleSessionName="TopoProcessor") 66 | credentials = Credentials( 67 | assumed_role["Credentials"]["AccessKeyId"], 68 | 
assumed_role["Credentials"]["SecretAccessKey"], 69 | assumed_role["Credentials"]["SessionToken"], 70 | ) 71 | return credentials 72 | 73 | 74 | def get_role_arn(bucket_name: str) -> str: 75 | role_arn = "" 76 | if not bucket_roles: 77 | init_roles() 78 | if bucket_name in bucket_roles: 79 | role_arn = bucket_roles[bucket_name]["roleArn"] 80 | else: 81 | get_log().warn("role_arn_not_found", bucketName=bucket_name) 82 | 83 | return role_arn 84 | -------------------------------------------------------------------------------- /topo_processor/util/aws_files.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime, timedelta, timezone 3 | from typing import Any, Dict, List, Union 4 | from urllib.parse import urlparse 5 | 6 | import boto3 7 | from botocore import exceptions as botocore_exceptions 8 | from linz_logger import get_log 9 | 10 | from topo_processor.util.aws_credentials import Credentials, get_credentials_from_bucket 11 | from topo_processor.util.configuration import historical_imagery_bucket 12 | from topo_processor.util.file_extension import is_tiff 13 | from topo_processor.util.time import time_in_ms 14 | 15 | 16 | def s3_download(source_path: str, dest_path: str, credentials: Union[Credentials, None] = None) -> None: 17 | start_time = time_in_ms() 18 | get_log().debug("s3_download started", objectPath=source_path, destinationPath=dest_path) 19 | 20 | url_o = urlparse(source_path) 21 | bucket_name = url_o.netloc 22 | object_name = url_o.path[1:] 23 | 24 | if not credentials: 25 | credentials = get_credentials_from_bucket(bucket_name) 26 | 27 | s3 = boto3.resource( 28 | "s3", 29 | aws_access_key_id=credentials.access_key, 30 | aws_secret_access_key=credentials.secret_key, 31 | aws_session_token=credentials.token, 32 | ) 33 | 34 | try: 35 | s3.Bucket(bucket_name).download_file(object_name, dest_path) 36 | except Exception as e: 37 | get_log().error("s3_download failed", objectPath=source_path, error=e) 38 | raise e 39 | 40 | get_log().debug( 41 | "s3_download ended", 42 | objectPath=source_path, 43 | destinationPath=dest_path, 44 | duration=time_in_ms() - start_time, 45 | ) 46 | 47 | 48 | def load_file_content(bucket_name: str, object_path: str) -> Dict[str, Any]: 49 | get_log().debug("bucket_name", bucket_name=bucket_name) 50 | credentials: Credentials = get_credentials_from_bucket(bucket_name) 51 | 52 | s3 = boto3.resource( 53 | "s3", 54 | aws_access_key_id=credentials.access_key, 55 | aws_secret_access_key=credentials.secret_key, 56 | aws_session_token=credentials.token, 57 | ) 58 | 59 | object_content = s3.Object(bucket_name=bucket_name, key=object_path) 60 | 61 | if object_path.endswith(".json"): 62 | json_result: Dict[str, Any] = json.loads(object_content.get()["Body"].read()) 63 | return json_result 64 | 65 | result: Dict[str, Any] = json.loads(object_content.get()["Body"].read().decode("utf-8")) 66 | return result 67 | 68 | 69 | def build_s3_path(bucket_name: str, object_path: str) -> str: 70 | return f"s3://{bucket_name}/" + (object_path[1:] if object_path.startswith("/") else object_path) 71 | 72 | 73 | def create_s3_manifest(manifest_source_path: str) -> None: 74 | # TODO:lock file 75 | start_time = time_in_ms() 76 | get_log().debug("check_manifest", manifestPath=manifest_source_path) 77 | 78 | url_o = urlparse(manifest_source_path) 79 | bucket_name = url_o.netloc 80 | manifest_path = url_o.path[1:] 81 | credentials: Credentials = get_credentials_from_bucket(bucket_name) 82 | 83 | s3_client = 
boto3.client( 84 | "s3", 85 | aws_access_key_id=credentials.access_key, 86 | aws_secret_access_key=credentials.secret_key, 87 | aws_session_token=credentials.token, 88 | ) 89 | 90 | try: 91 | manifest_modified_datetime = s3_client.head_object(Bucket=bucket_name, Key=manifest_path)["LastModified"] 92 | cutoff_datetime = datetime.now(timezone.utc) - timedelta(days=28) 93 | if cutoff_datetime < manifest_modified_datetime: 94 | return 95 | 96 | except botocore_exceptions.ClientError as e: 97 | if e.response["Error"]["Code"] == "404": 98 | get_log().debug("no_manifest_file_found", bucketName=bucket_name, manifestPath=manifest_path, error=e) 99 | else: 100 | raise e 101 | 102 | try: 103 | get_log().debug("create_manifest", bucketName=bucket_name, manifestPath=manifest_path) 104 | manifest_new: Dict[str, Any] = {} 105 | manifest_file_list = _list_objects(historical_imagery_bucket) 106 | manifest_new["path"] = manifest_path 107 | manifest_new["time"] = time_in_ms() 108 | manifest_new["files"] = manifest_file_list 109 | 110 | s3_client.put_object( 111 | Body=json.dumps(manifest_new).encode("UTF-8"), 112 | ContentType="application/json", 113 | Bucket=bucket_name, 114 | Key=manifest_path, 115 | ) 116 | 117 | except Exception as e: 118 | get_log().error("create_manifest_failed", bucketPath=bucket_name, manifestPath=manifest_path, error=e) 119 | raise e 120 | 121 | get_log().debug( 122 | "log_manifest_create_time", 123 | manifestSourcePath=manifest_source_path, 124 | duration=time_in_ms() - start_time, 125 | ) 126 | 127 | 128 | def _list_objects(bucket_name: str) -> List[Dict[str, str]]: 129 | 130 | credentials: Credentials = get_credentials_from_bucket(bucket_name) 131 | 132 | s3_client = boto3.client( 133 | "s3", 134 | aws_access_key_id=credentials.access_key, 135 | aws_secret_access_key=credentials.secret_key, 136 | aws_session_token=credentials.token, 137 | ) 138 | 139 | file_list: List[Dict[str, str]] = [] 140 | paginator = s3_client.get_paginator("list_objects_v2") 141 | response_iterator = paginator.paginate(Bucket=bucket_name) 142 | for response in response_iterator: 143 | for contents_data in response["Contents"]: 144 | key = contents_data["Key"] 145 | if is_tiff(key): 146 | file_list.append({"path": key}) 147 | 148 | return file_list 149 | -------------------------------------------------------------------------------- /topo_processor/util/checksum.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | import multihash 4 | 5 | from topo_processor.file_system.get_fs import get_fs 6 | 7 | CHUNK_SIZE = 1024 * 1024 # 1MB 8 | 9 | 10 | def multihash_as_hex(path: str) -> str: 11 | file_hash = hashlib.sha256() 12 | with get_fs(path).open(path, "rb") as file: 13 | while chunk := file.read(CHUNK_SIZE): 14 | file_hash.update(chunk) 15 | result: str = multihash.to_hex_string(multihash.encode(file_hash.digest(), "sha2-256")) 16 | return result 17 | -------------------------------------------------------------------------------- /topo_processor/util/command.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Optional, Tuple, TypedDict 3 | 4 | from topo_processor.util.execution import ExecutionDocker, ExecutionLocal 5 | 6 | 7 | class CommandDocker(TypedDict): 8 | container: str 9 | tag: Optional[str] 10 | 11 | 12 | class Command: 13 | use_docker: bool 14 | 15 | def __init__(self, command: str, docker_ref: Optional[CommandDocker] = None) -> None: 16 | self.command = command 17 
| self.arguments: List[str] = [] 18 | self.volumes: List[str] = [] 19 | self.envs: List[str] = [] 20 | if docker_ref is None: 21 | self.use_docker = False 22 | else: 23 | self.use_docker = True 24 | self.container = docker_ref.get("container", None) 25 | self.container_tag = docker_ref.get("tag", None) 26 | 27 | def arg(self, *args: str) -> "Command": 28 | for argument in args: 29 | self.arguments.append(argument) 30 | return self 31 | 32 | def mount(self, *args: str) -> "Command": 33 | """Mount a folder, useful only if the command is run inside of docker""" 34 | for volume in args: 35 | self.volumes.append(volume) 36 | return self 37 | 38 | def env(self, *args: str) -> "Command": 39 | """Only useful when using docker""" 40 | for env in args: 41 | self.envs.append(env) 42 | return self 43 | 44 | def to_full_command(self) -> List[str]: 45 | return [self.command] + self.arguments 46 | 47 | def redacted_command(self) -> List[str]: 48 | """Provide redacted argument string for logging which removes sensitive information""" 49 | redacted = [] 50 | for arg in self.arguments: 51 | if arg.startswith("AWS"): 52 | split_arg = arg.split("=") 53 | arg = f"{split_arg[0]}=******" 54 | redacted.append(arg) 55 | return [self.command] + redacted 56 | 57 | def to_docker(self) -> "Command": 58 | if not self.container: 59 | raise Exception(f"No container found for command {self.command}") 60 | docker = Command("docker") 61 | docker.arg("run") 62 | for env in self.envs: 63 | docker.arg("--env", env) 64 | docker.arg("--user", f"{os.geteuid()}:{os.getegid()}") 65 | for volume in self.volumes: 66 | docker.arg("-v", f"{volume}:{volume}") 67 | docker.arg("--rm") 68 | 69 | if not self.container_tag: 70 | docker.arg(self.container) 71 | else: 72 | docker.arg(f"{self.container}:{self.container_tag}") 73 | 74 | docker.arg(self.command) 75 | for argument in self.arguments: 76 | docker.arg(argument) 77 | return docker 78 | 79 | def run(self) -> Tuple[int, str, str]: 80 | if self.use_docker: 81 | return ExecutionDocker.run(self) 82 | return ExecutionLocal.run(self) 83 | -------------------------------------------------------------------------------- /topo_processor/util/configuration.py: -------------------------------------------------------------------------------- 1 | from os import environ, path 2 | from tempfile import mkdtemp 3 | from typing import Dict, Optional 4 | 5 | from dotenv import load_dotenv 6 | from linz_logger import get_log 7 | 8 | load_dotenv() 9 | 10 | 11 | def get_env(env_name: str) -> str: 12 | env_var = environ.get(env_name) 13 | if env_var is None: 14 | raise Exception(f"Missing environment variable ${env_name}") 15 | return env_var 16 | 17 | 18 | lds_cache_bucket: str = get_env("LINZ_CACHE_BUCKET") 19 | historical_imagery_bucket = get_env("LINZ_HISTORICAL_IMAGERY_BUCKET") 20 | aws_profile: Optional[str] = environ.get("AWS_PROFILE") 21 | linz_ssm_bucket_config_name: Optional[str] = environ.get("LINZ_SSM_BUCKET_CONFIG_NAME") 22 | temp_folder: str = mkdtemp() 23 | get_log().debug( 24 | "from_environment_variables", lds_cache_bucket=lds_cache_bucket, aws_profile=aws_profile, ssm=linz_ssm_bucket_config_name 25 | ) 26 | 27 | 28 | def get_topo_processor_version() -> Dict[str, str]: 29 | with open(path.join("VERSION")) as version_file: 30 | version: str = version_file.read().strip() 31 | return {"Topo Processor": version} 32 | -------------------------------------------------------------------------------- /topo_processor/util/conversions.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | from datetime import datetime 3 | from typing import Any, Dict, List, Union 4 | 5 | from dateutil import parser, tz 6 | 7 | 8 | def string_to_number(value: str) -> Union[float, int, str]: 9 | """If possible this function returns the int/float of the input value, 10 | if not it returns the string. 11 | """ 12 | try: 13 | int_number = int(value) 14 | return int_number 15 | except ValueError: 16 | try: 17 | float_number = float(value) 18 | return float_number 19 | except ValueError: 20 | return value 21 | 22 | 23 | def remove_empty_strings(properties: Dict[str, Any]) -> Dict[str, Any]: 24 | return {key: value for key, value in properties.items() if value != ""} 25 | 26 | 27 | def string_to_boolean(value: str, true_values: List[str], false_values: List[str]) -> Union[bool, str]: 28 | """Find value in lists and return boolean, 29 | else returns the original value string. 30 | """ 31 | clean_value = value.strip().lower() 32 | if clean_value in true_values: 33 | return True 34 | if clean_value in false_values: 35 | return False 36 | return value 37 | 38 | 39 | def nzt_datetime_to_utc_datetime(date: str) -> datetime: 40 | utc_tz = tz.gettz("UTC") 41 | nz_tz = tz.gettz("Pacific/Auckland") 42 | 43 | try: 44 | nz_time = parser.parse(date).replace(tzinfo=nz_tz) 45 | except parser.ParserError as err: 46 | raise Exception(f"Not a valid date: {err}") from err 47 | 48 | utc_time: datetime = nz_time.astimezone(utc_tz) 49 | 50 | return utc_time 51 | 52 | 53 | def quarterdate_to_date_string(value: str) -> str: 54 | """If possible this function converts quarter e.g. 'Q3' to RFC3339 format, 55 | e.g. '2021-03-01T00:00:00.000Z', then to UTC, else returns original value string. 56 | """ 57 | re_result = re.search(r"(\d{4})[/][qQ]([1-4])", value) 58 | 59 | if re_result is not None: 60 | 61 | year = re_result.group(1) 62 | month = (3 * (int(re_result.group(2)))) - 2 63 | 64 | date_string_nz = f"{year}-{month}-01T00:00:00.000" 65 | datetime_utc = nzt_datetime_to_utc_datetime(date_string_nz) 66 | date_string_utc = datetime_utc.strftime("%Y-%m-%dT%H:%M:%S") + "Z" 67 | return date_string_utc 68 | 69 | return value 70 | 71 | 72 | def historical_imagery_photo_type_to_linz_geospatial_type(photo_type: str) -> str: 73 | """Find value in dict and return linz_geospatial_type, 74 | else return the original value string. 
75 | """ 76 | geospatial_type_conversion_table = { 77 | "B&W": "black and white image", 78 | "B&W IR": "black and white infrared image", 79 | "COLOUR": "color image", 80 | "COLOUR IR": "color infrared image", 81 | } 82 | 83 | lgs_value = geospatial_type_conversion_table.get(photo_type.strip().upper()) 84 | if lgs_value: 85 | return lgs_value 86 | else: 87 | return photo_type 88 | -------------------------------------------------------------------------------- /topo_processor/util/execution.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from typing import TYPE_CHECKING, Tuple 3 | 4 | from linz_logger import get_log 5 | 6 | from topo_processor.util.time import time_in_ms 7 | 8 | if TYPE_CHECKING: 9 | from .command import Command 10 | 11 | 12 | class ExecutionLocal: 13 | cmd: "Command" 14 | 15 | @staticmethod 16 | def run(cmd: "Command") -> Tuple[int, str, str]: 17 | start_time = time_in_ms() 18 | 19 | proc = subprocess.run(cmd.to_full_command(), stdout=subprocess.PIPE, stderr=subprocess.PIPE) 20 | if proc.returncode != 0: 21 | get_log().error("Run command failed", command=cmd.redacted_command(), duration=time_in_ms() - start_time) 22 | raise Exception(proc.stderr.decode()) 23 | get_log().trace("Run command succeeded", command=cmd.redacted_command(), duration=time_in_ms() - start_time) 24 | return proc.returncode, proc.stdout.decode(), proc.stderr.decode() 25 | 26 | 27 | class ExecutionDocker: 28 | cmd: "Command" 29 | 30 | @staticmethod 31 | def run(cmd: "Command") -> Tuple[int, str, str]: 32 | return ExecutionLocal.run(cmd.to_docker()) 33 | -------------------------------------------------------------------------------- /topo_processor/util/file_converter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from topo_processor.util.command import Command 4 | 5 | 6 | def geopackage_to_csv(input_path: str, output_path: str) -> Command: 7 | if os.environ.get("IS_DOCKER") == "true": 8 | cmd = Command("ogr2ogr") 9 | else: 10 | cmd = Command("ogr2ogr", {"container": "osgeo/gdal", "tag": "ubuntu-small-3.5.0"}) 11 | 12 | cmd.mount(input_path) 13 | cmd.mount(os.path.dirname(output_path)) 14 | cmd.arg("-f", "CSV") 15 | cmd.arg("-lco", "GEOMETRY=AS_WKT") 16 | cmd.arg("-nlt", "POLYGON") 17 | cmd.arg(output_path) 18 | cmd.arg(input_path) 19 | return cmd 20 | -------------------------------------------------------------------------------- /topo_processor/util/file_extension.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Tuple 2 | 3 | FILE_EXTENSIONS: Dict[str, Tuple[str, ...]] = {"imagery.historic": (".tif", ".tiff")} 4 | 5 | 6 | def is_extension(file_name: str, extensions: Tuple[str, ...]) -> bool: 7 | return file_name.lower().endswith(extensions) 8 | 9 | 10 | def is_tiff(path: str) -> bool: 11 | return is_extension(path, (".tiff", ".tif")) 12 | 13 | 14 | def is_csv(path: str) -> bool: 15 | return is_extension(path, (".csv",)) 16 | 17 | 18 | def is_geopackage(path: str) -> bool: 19 | return is_extension(path, (".gpkg",)) 20 | -------------------------------------------------------------------------------- /topo_processor/util/files.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | 5 | def get_file_update_time(path: str) -> str: 6 | """Return the time (as an ISO 8601 UTC string) of the last update of the path metadata 7 |
https://docs.python.org/3.9/library/os.path.html#os.path.getctime 8 | Here ctime refers to the last metadata change for specified path in UNIX while in Windows, it refers to path creation time.""" 9 | update_ctime = os.path.getctime(path) 10 | update_time = datetime.utcfromtimestamp(update_ctime).strftime("%Y-%m-%dT%H:%M:%S") + "Z" 11 | return update_time 12 | -------------------------------------------------------------------------------- /topo_processor/util/gzip.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | from typing import Optional 3 | 4 | from linz_logger.logger import get_log 5 | 6 | 7 | def is_gzip_file(file_path: str) -> bool: 8 | with open(file_path, "rb") as file: 9 | # gzip magic number == "1f 8b" 10 | return file.read(2) == b"\x1f\x8b" 11 | 12 | 13 | def decompress_file(file_path: str) -> None: 14 | input: Optional[gzip.GzipFile] = None 15 | 16 | try: 17 | input = gzip.GzipFile(file_path, "rb") 18 | s = input.read() 19 | except gzip.BadGzipFile as e: 20 | get_log().error("File decompression failed", file=file_path, error=e) 21 | raise e 22 | finally: 23 | if input: 24 | input.close() 25 | 26 | output = open(file_path, "wb") 27 | output.write(s) 28 | output.close() 29 | -------------------------------------------------------------------------------- /topo_processor/util/s3.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from linz_logger import get_log 3 | 4 | 5 | def bucket_name_from_path(path: str) -> str: 6 | path_parts = path.replace("s3://", "").split("/") 7 | return path_parts.pop(0) 8 | 9 | 10 | def bucket_name_from_stack(stack_name: str) -> str: 11 | get_log().debug("stack_name", stack_name=stack_name) 12 | session = boto3.Session() 13 | cloudformation = session.resource("cloudformation") 14 | stack = cloudformation.Stack(stack_name) 15 | 16 | temp_bucket: str = "" 17 | 18 | for output in stack.outputs: 19 | if output["OutputKey"] == "TempBucketName": 20 | get_log().debug("bucket_name", bucket_name=output["OutputValue"]) 21 | temp_bucket = output["OutputValue"] 22 | 23 | if not temp_bucket: 24 | get_log().error("bucket_name_not_found", stackName=stack_name) 25 | raise Exception("No temp_bucket found in stack") 26 | 27 | return temp_bucket 28 | 29 | 30 | def is_s3_path(path: str) -> bool: 31 | if path.startswith("s3://"): 32 | return True 33 | return False 34 | -------------------------------------------------------------------------------- /topo_processor/util/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linz/topo-processor/337ea60c1ec196fe634433c5aebf5696cdea1a99/topo_processor/util/tests/__init__.py -------------------------------------------------------------------------------- /topo_processor/util/tests/aws_credentials_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.util.aws_credentials import bucket_roles, get_role_arn 4 | 5 | 6 | # Add test with AWS mock 7 | def test_get_role_arn() -> None: 8 | bucket_roles["bucket-test"] = {"roleArn": "arn:aws:iam::123456789012:role/S3Access"} 9 | assert get_role_arn("bucket-test") == "arn:aws:iam::123456789012:role/S3Access" 10 | -------------------------------------------------------------------------------- /topo_processor/util/tests/aws_files_test.py: -------------------------------------------------------------------------------- 1 | 
import pytest 2 | 3 | from topo_processor.util.aws_files import build_s3_path 4 | 5 | 6 | # Add test with AWS mock 7 | def test_build_s3_path() -> None: 8 | assert build_s3_path("test-bucket", "/test-folder/object.ext") == "s3://test-bucket/test-folder/object.ext" 9 | -------------------------------------------------------------------------------- /topo_processor/util/tests/checksum_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from topo_processor.util.checksum import multihash_as_hex 6 | 7 | 8 | def test_multihash_as_hex() -> None: 9 | path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "WRONG_SURVEY.tiff") 10 | assert multihash_as_hex(path) == "1220d1bed69013d3dbcf4b1ef90016d77be83ad9b1759865ef5f9969ed540f902f53" 11 | 12 | path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "CONTROL.tiff") 13 | assert multihash_as_hex(path) == "1220d3e42a62bb123eeeb96358f1e4ed46d20b1a329a4738dd643d27623ba8452957" 14 | 15 | path = os.path.join(os.getcwd(), "test_data", "tiffs", "SURVEY_1", "MULTIPLE_ASSET.tiff") 16 | assert multihash_as_hex(path) == "1220d3e42a62bb123eeeb96358f1e4ed46d20b1a329a4738dd643d27623ba8452957" 17 | -------------------------------------------------------------------------------- /topo_processor/util/tests/command_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.util.command import Command 4 | 5 | 6 | def test_hello_world_local() -> None: 7 | cmd = Command("echo") 8 | cmd.arg("Hello World Local!!!") 9 | return_code, stdout, stderr = cmd.run() 10 | assert stdout == "Hello World Local!!!\n" 11 | assert stderr == "" 12 | assert return_code == 0 13 | 14 | 15 | def test_hello_world_docker(mocker) -> None: # type: ignore 16 | cmd = Command("/bin/echo", {"container": "busybox", "tag": "latest"}) 17 | cmd.arg("Hello World Docker!!!") 18 | mocker.patch("topo_processor.util.execution.ExecutionLocal.run", return_value=[0, "Hello World Docker!!!\n", ""]) 19 | return_code, stdout, _ = cmd.run() 20 | assert stdout == "Hello World Docker!!!\n" 21 | assert return_code == 0 22 | -------------------------------------------------------------------------------- /topo_processor/util/tests/conversions_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from topo_processor.util.conversions import ( 4 | historical_imagery_photo_type_to_linz_geospatial_type, 5 | nzt_datetime_to_utc_datetime, 6 | quarterdate_to_date_string, 7 | ) 8 | 9 | 10 | def test_nzt_datetime_to_utc_datetime_daylight_saving_on() -> None: 11 | utc_date = nzt_datetime_to_utc_datetime("1988-01-11T00:00:00.000") 12 | assert utc_date.isoformat() == "1988-01-10T11:00:00+00:00" 13 | 14 | 15 | def test_nzt_datetime_to_utc_datetime_daylight_saving_off() -> None: 16 | utc_date = nzt_datetime_to_utc_datetime("1988-07-11T00:00:00.000") 17 | assert utc_date.isoformat() == "1988-07-10T12:00:00+00:00" 18 | 19 | 20 | def test_quarter_date_to_utc_correct_format() -> None: 21 | utc_date_string = quarterdate_to_date_string("2020/Q1") 22 | assert utc_date_string == "2019-12-31T11:00:00Z" 23 | 24 | 25 | def test_quarter_date_to_utc_incorrect_format() -> None: 26 | returned_string = quarterdate_to_date_string("nzam_pilot") 27 | assert returned_string == "nzam_pilot" 28 | 29 | 30 | def test_historical_imagery_photo_type_to_linz_geospatial_type_empty_string() -> None: 31 | returned_string = 
historical_imagery_photo_type_to_linz_geospatial_type("") 32 | assert returned_string == "" 33 | 34 | 35 | def test_historical_imagery_photo_type_to_linz_geospatial_type_whitespace_case() -> None: 36 | returned_string = historical_imagery_photo_type_to_linz_geospatial_type(" B&w IR ") 37 | assert returned_string == "black and white infrared image" 38 | -------------------------------------------------------------------------------- /topo_processor/util/tests/files_test.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | 4 | import pytest 5 | 6 | from topo_processor.util.gzip import is_gzip_file 7 | 8 | 9 | def test_is_gzip_file_true(setup: str) -> None: 10 | compressed_file = os.path.abspath(os.path.join(setup, "file.gz")) 11 | cf = gzip.open(compressed_file, "wb") 12 | cf.write("test".encode("utf-8")) 13 | cf.close() 14 | 15 | assert is_gzip_file(compressed_file) == True 16 | 17 | 18 | def test_is_gzip_file_false(setup: str) -> None: 19 | file = os.path.abspath(os.path.join(setup, "file.txt")) 20 | cf = open(file, "wb") 21 | cf.write("test".encode("utf-8")) 22 | cf.close() 23 | 24 | assert is_gzip_file(file) == False 25 | -------------------------------------------------------------------------------- /topo_processor/util/tests/time_test.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import List 3 | 4 | from topo_processor.util.time import get_min_max_interval 5 | 6 | 7 | def test_get_min_max_interval() -> None: 8 | dates: List[datetime] = [] 9 | datetime_earliest = datetime.strptime("1918-11-11", "%Y-%m-%d") 10 | datetime_mid = datetime.strptime("1945-05-08", "%Y-%m-%d") 11 | datetime_latest = datetime.strptime("1989-11-09", "%Y-%m-%d") 12 | dates.append(datetime_earliest) 13 | dates.append(datetime_latest) 14 | dates.append(datetime_mid) 15 | 16 | assert get_min_max_interval(dates) == [datetime_earliest, datetime_latest] 17 | -------------------------------------------------------------------------------- /topo_processor/util/tests/transfer_collection_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from datetime import datetime 4 | 5 | import pytest 6 | 7 | from topo_processor.metadata.data_type import DataType 8 | from topo_processor.metadata.metadata_loaders.metadata_loader_imagery_historic import MetadataLoaderImageryHistoric 9 | from topo_processor.stac.asset import Asset 10 | from topo_processor.stac.asset_key import AssetKey 11 | from topo_processor.stac.collection import Collection 12 | from topo_processor.stac.item import Item 13 | from topo_processor.util.transfer_collection import transfer_collection 14 | 15 | 16 | def test_fail_on_duplicate_assets(setup: str) -> None: 17 | target = setup 18 | collection = Collection("fake_title") 19 | collection.survey = "survey_id" 20 | collection.description = "fake_description" 21 | collection.license = "fake_license" 22 | item = Item("item_id") 23 | item.datetime = datetime.now() 24 | item.linz_geospatial_type = "black and white image" 25 | collection.add_item(item) 26 | item.collection = collection 27 | 28 | cog_1 = Asset("./test_data/tiffs/SURVEY_1/CONTROL.tiff") 29 | cog_1.target = "fake_title/fake_target.tiff" 30 | cog_1.key_name = AssetKey.Visual 31 | item.add_asset(cog_1) 32 | 33 | cog_2 = Asset("test_data/tiffs/SURVEY_1/MULTIPLE_ASSET.tiff") 34 | cog_2.target = "fake_title/fake_target.tiff" 35 | 
cog_2.key_name = AssetKey.Visual 36 | item.add_asset(cog_2) 37 | 38 | with pytest.raises(Exception, match=r"./item_id.tiff already exists."): 39 | transfer_collection(item.collection, target, DataType("imagery.historic")) 40 | 41 | 42 | def test_asset_key_not_in_list(setup: str) -> None: 43 | target = setup 44 | collection = Collection("fake_title") 45 | collection.survey = "survey_id" 46 | collection.description = "fake_description" 47 | collection.license = "fake_license" 48 | item = Item("item_id") 49 | item.datetime = datetime.now() 50 | item.linz_geospatial_type = "black and white image" 51 | collection.add_item(item) 52 | item.collection = collection 53 | 54 | test_asset = Asset("./test_data/tiffs/SURVEY_1/CONTROL.tiff") 55 | test_asset.target = "fake_title/fake_target.tiff" 56 | test_asset.key_name = None 57 | item.add_asset(test_asset) 58 | 59 | with pytest.raises(Exception, match=r"No asset key set for asset ./item_id.tiff"): 60 | transfer_collection(item.collection, target, DataType("imagery.historic")) 61 | 62 | 63 | def test_generate_summaries(setup: str) -> None: 64 | target = setup 65 | collection = Collection("AUCKLAND 1") 66 | collection.description = "fake_description" 67 | collection.license = "face_license" 68 | collection.survey = "SURVEY_1" 69 | test_geom = { 70 | "WKT": "POLYGON ((177.168157744315 -38.7538525409217," 71 | "177.23423558687 -38.7514276946524," 72 | "177.237358655351 -38.8031681573174," 73 | "177.17123348276 -38.8055953066942," 74 | "177.168157744315 -38.7538525409217))" 75 | } 76 | test_datetime = datetime.strptime("1918-11-11", "%Y-%m-%d") 77 | 78 | item_1 = Item("item_1_id") 79 | metadata_loader_imagery_historic = MetadataLoaderImageryHistoric() 80 | metadata_loader_imagery_historic.add_spatial_extent(item_1, asset_metadata=test_geom) 81 | item_1.datetime = test_datetime 82 | item_1.properties = { 83 | "mission": "SURVEY_1", 84 | "proj:centroid": {"lat": -45.8079, "lon": 170.5548}, 85 | "camera:sequence_number": 89555, 86 | "film:id": "731", 87 | "aerial-photo:scale": 6600, 88 | "scan:scanned": "2014-06-30T12:00:00Z", 89 | "proj:epsg": "null", 90 | } 91 | collection.add_item(item_1) 92 | item_1.collection = collection 93 | 94 | item_2 = Item("item_2_id") 95 | metadata_loader_imagery_historic = MetadataLoaderImageryHistoric() 96 | metadata_loader_imagery_historic.add_spatial_extent(item_2, asset_metadata=test_geom) 97 | item_2.datetime = test_datetime 98 | item_2.properties = { 99 | "mission": "SURVEY_1", 100 | "proj:centroid": {"lat": -45.8079, "lon": 170.5599}, 101 | "camera:sequence_number": 89554, 102 | "film:id": "731", 103 | "aerial-photo:scale": 5600, 104 | "scan:scanned": "2019-12-31T11:00:00Z", 105 | "proj:epsg": "null", 106 | } 107 | collection.add_item(item_2) 108 | item_2.collection = collection 109 | 110 | transfer_collection(item_1.collection, target, DataType("imagery.aerial")) 111 | 112 | with open(os.path.join(target, "SURVEY_1", "collection.json")) as collection_json_file: 113 | collection_metadata = json.load(collection_json_file) 114 | assert collection_metadata["summaries"]["mission"] == ["SURVEY_1"] 115 | assert collection_metadata["summaries"]["film:id"] == ["731"] 116 | assert collection_metadata["summaries"]["proj:epsg"] == ["null"] 117 | assert collection_metadata["summaries"]["aerial-photo:scale"] == {"minimum": 5600, "maximum": 6600} 118 | assert collection_metadata["summaries"]["scan:scanned"] == { 119 | "minimum": "2014-06-30T12:00:00Z", 120 | "maximum": "2019-12-31T11:00:00Z", 121 | } 122 | assert 
collection_metadata["summaries"]["camera:sequence_number"] == {"minimum": 89554, "maximum": 89555} 123 | assert "proj:centroid" not in collection_metadata["summaries"].keys() 124 | -------------------------------------------------------------------------------- /topo_processor/util/time.py: -------------------------------------------------------------------------------- 1 | import time 2 | from datetime import datetime 3 | from typing import List, Union 4 | 5 | 6 | def time_in_ms() -> float: 7 | return time.time() * 1000 8 | 9 | 10 | def get_min_max_interval(times: List[datetime]) -> List[Union[datetime, None]]: 11 | min_date = None 12 | max_date = None 13 | 14 | for date in times: 15 | if not min_date: 16 | min_date = date 17 | elif date < min_date: 18 | min_date = date 19 | if not max_date: 20 | max_date = date 21 | elif date > max_date: 22 | max_date = date 23 | 24 | return [min_date, max_date] 25 | -------------------------------------------------------------------------------- /topo_processor/util/transfer_collection.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from typing import TYPE_CHECKING, Any 5 | 6 | from linz_logger import get_log 7 | from pystac.catalog import CatalogType 8 | 9 | from topo_processor.file_system.transfer import transfer_file 10 | from topo_processor.file_system.write_json import write_json 11 | from topo_processor.metadata.data_type import DataType 12 | 13 | if TYPE_CHECKING: 14 | from topo_processor.stac.collection import Collection 15 | 16 | 17 | def transfer_collection(collection: Collection, target: str, data_type: DataType, force: bool = False) -> None: 18 | stac_collection = collection.create_stac() 19 | files_to_transfer: dict[str, Any] = {} 20 | # pystac v1.1.0 21 | # Required to remove cwd from collection self_href, 22 | # Must be come after collection.create_stac and be before stac_collection.add_item(..) 
23 | stac_collection.catalog_type = CatalogType.SELF_CONTAINED 24 | 25 | for item in collection.items.values(): 26 | if not item.is_valid(): 27 | get_log().warning("Invalid item won't be uploaded:", error=item.log) 28 | continue 29 | if item.log: 30 | get_log().warning(f"Item {item.id} contains warnings:", error=item.log) 31 | 32 | stac_item = item.create_stac() 33 | stac_collection.add_item(stac_item) 34 | # pystac v1.1.0 35 | # Required to change the pystac default of ./{id}/{id}.json 36 | # Must come after stac_collection.add_item(stac_item) 37 | stac_item.set_self_href(f"./{item.id}.json") 38 | 39 | existing_asset_hrefs = {} 40 | 41 | for asset in item.assets: 42 | 43 | if not asset.needs_upload: 44 | continue 45 | asset.href = f"./{item.id}{asset.file_ext()}" 46 | if asset.href in existing_asset_hrefs: 47 | raise Exception(f"{asset.href} already exists.") 48 | if not asset.target: 49 | raise Exception(f"No asset target set for asset {asset.href}") 50 | asset_transfer = { 51 | "source": asset.source_path, 52 | "checksum": asset.get_checksum(), 53 | "contentType": asset.get_content_type(), 54 | "target": os.path.join(target, asset.target), 55 | } 56 | 57 | if not asset.key_name: 58 | raise Exception(f"No asset key set for asset {asset.href}") 59 | else: 60 | stac_item.add_asset(key=asset.key_name, asset=asset.create_stac()) 61 | 62 | existing_asset_hrefs[asset.href] = asset_transfer 63 | 64 | files_to_transfer[item.id] = {"images": existing_asset_hrefs} 65 | 66 | # pystac v1.1.0 67 | # Required to not add a self link with an 'absolute' link from the cwd 68 | json_item = stac_item.to_dict(include_self_link=False) 69 | if not item.collection: 70 | raise Exception(f"No collection set for item {item.id}") 71 | files_to_transfer[item.id]["stac"] = { 72 | "item": json_item, 73 | "target": os.path.join(target, item.collection.survey, f"{item.id}.json"), 74 | } 75 | 76 | # after all items have been processed generate summaries 77 | collection.generate_summaries(stac_collection) 78 | collection.update_description(stac_collection, data_type) 79 | 80 | try: 81 | collection.validate_pystac_collection(stac_collection) 82 | except Exception as e: 83 | get_log().error(f"Collection Validation Warning: {e}", collection_id=collection.id) 84 | if not force: 85 | raise Exception("Collection failed the validation. 
Process is stopped.") from e 86 | 87 | # Transfer the files 88 | for item_transfer in files_to_transfer.values(): 89 | for asset_transfer in item_transfer["images"].values(): 90 | transfer_file( 91 | str(asset_transfer["source"]), 92 | str(asset_transfer["checksum"]), 93 | str(asset_transfer["contentType"]), 94 | str(asset_transfer["target"]), 95 | ) 96 | write_json(item_transfer["stac"]["item"], item_transfer["stac"]["target"]) 97 | 98 | # pystac v1.1.0 99 | # Required to not add a self link with an 'absolute' link from the cwd 100 | json_collection = stac_collection.to_dict(include_self_link=False) 101 | 102 | write_json(json_collection, os.path.join(target, collection.survey, "collection.json")) 103 | -------------------------------------------------------------------------------- /topo_processor/util/valid.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional 2 | 3 | 4 | class Validity: 5 | def __init__(self) -> None: 6 | self.log: List[Dict[str, Any]] = [] 7 | self._valid = True 8 | 9 | def add_error(self, msg: str, cause: str, e: Optional[Exception] = None) -> None: 10 | self.log.append({"msg": msg, "level": "error", "cause": cause, "error": e}) 11 | self._valid = False 12 | 13 | def add_warning(self, msg: str, cause: str, e: Optional[Exception] = None) -> None: 14 | self.log.append({"msg": msg, "level": "warning", "cause": cause, "error": e}) 15 | 16 | def is_valid(self) -> bool: 17 | return self._valid 18 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@linzjs/style/tsconfig.base.json", 3 | "compilerOptions": { 4 | "lib": ["ES2020"], 5 | "outDir": "build" 6 | }, 7 | "include": ["infra/src"] 8 | } 9 | -------------------------------------------------------------------------------- /upload: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from topo_processor.cli import upload 4 | 5 | upload.main() 6 | --------------------------------------------------------------------------------
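A minimal usage sketch of the ValidateReport class from topo_processor/stac/validate_report.py above. The schema URI and error message are placeholder values; in the pipeline the mapping passed to add_errors() comes from MetadataValidatorStac.validate_metadata_with_report(), as shown in validation.py.

from topo_processor.stac.validate_report import ValidateReport

report = ValidateReport()
# Each add_errors() call counts one validated STAC object and folds its
# per-schema error messages into report_per_error_type.
report.add_errors({"https://example.com/schema.json": ["'string' is not of type 'integer'"]})
report.add_errors({"https://example.com/schema.json": ["'string' is not of type 'integer'"]})

assert report.total == 2
assert report.report_per_error_type["https://example.com/schema.json"]["'string' is not of type 'integer'"] == 2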
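A sketch of driving the Command and Execution helpers from topo_processor/util/command.py and execution.py above. The container, tag, mount path and environment value are illustrative only (geopackage_to_csv() above uses the osgeo/gdal image in the same way), and run() shells out, so treat this as a sketch rather than something to execute verbatim.

from topo_processor.util.command import Command

# Local execution: run() goes through ExecutionLocal and plain subprocess.
local_cmd = Command("echo").arg("hello")
return_code, stdout, stderr = local_cmd.run()
assert stdout == "hello\n"

# Docker execution: same API, but run() wraps the command in `docker run`.
docker_cmd = Command("ogr2ogr", {"container": "osgeo/gdal", "tag": "ubuntu-small-3.5.0"})
docker_cmd.mount("/tmp/data")             # added as -v /tmp/data:/tmp/data
docker_cmd.env("AWS_PROFILE=my-profile")  # added as --env AWS_PROFILE=my-profile (illustrative value)
docker_cmd.arg("-f", "CSV").arg("/tmp/data/out.csv").arg("/tmp/data/in.gpkg")
print(docker_cmd.redacted_command())      # redacted_command() masks any argument starting with "AWS"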
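A sketch exercising the conversion helpers in topo_processor/util/conversions.py above; the expected values mirror the behaviour shown in conversions_test.py.

from topo_processor.util.conversions import (
    historical_imagery_photo_type_to_linz_geospatial_type,
    quarterdate_to_date_string,
    string_to_number,
)

assert string_to_number("6600") == 6600   # int when possible
assert string_to_number("0.85") == 0.85   # float as a fallback
assert string_to_number("n/a") == "n/a"   # otherwise the string is returned unchanged

# "2020/Q1" is interpreted as 1 January 2020 NZT, then converted to UTC.
assert quarterdate_to_date_string("2020/Q1") == "2019-12-31T11:00:00Z"
assert quarterdate_to_date_string("nzam_pilot") == "nzam_pilot"  # no quarter pattern, returned as-is

assert historical_imagery_photo_type_to_linz_geospatial_type(" B&w IR ") == "black and white infrared image"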
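A sketch of the small S3 path helpers in topo_processor/util/aws_files.py and s3.py above; the bucket and key names are placeholders.

from topo_processor.util.aws_files import build_s3_path
from topo_processor.util.s3 import bucket_name_from_path, is_s3_path

assert build_s3_path("test-bucket", "/test-folder/object.ext") == "s3://test-bucket/test-folder/object.ext"
assert bucket_name_from_path("s3://test-bucket/test-folder/object.ext") == "test-bucket"
assert is_s3_path("s3://test-bucket/object.ext") is True
assert is_s3_path("./local/object.ext") is False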
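A sketch of the Validity helper in topo_processor/util/valid.py above, which appears to back the is_valid()/log checks used by validate_store() and transfer_collection(); the messages and cause strings are placeholders.

from topo_processor.util.valid import Validity

record = Validity()
record.add_warning("scan date is missing", cause="metadata_loader_imagery_historic")
assert record.is_valid() is True   # warnings are logged but do not invalidate the record

record.add_error("invalid photo type", cause="metadata_validator_stac")
assert record.is_valid() is False
assert len(record.log) == 2        # both the warning and the error are kept in the log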