├── .nvmrc
├── tests
├── __init__.py
├── test_networking_stack.py
├── dynamodb_generators.py
├── test_get_latest_extension_schema_version.py
├── file_utils.py
├── test_datasets_model_base.py
├── test_prefix_non_prod_name.py
├── test_parameter_store.py
├── test_step_function_logging.py
├── conftest.py
├── test_storage_bucket.py
├── test_validation_summary.py
├── test_upload_status_logging.py
├── test_api_endpoint_handler.py
├── test_validation_summary_logging.py
├── test_check_files_checksums_logging.py
├── general_generators.py
├── test_upload_status.py
├── test_import_status_logging.py
├── test_step_function.py
├── test_dataset_versions_endpoint_logging.py
└── stac_generators.py
├── geostore
├── __init__.py
├── datasets
│   ├── __init__.py
│   ├── list.py
│   ├── entrypoint.py
│   ├── create.py
│   ├── update.py
│   ├── delete.py
│   └── get.py
├── content_iterator
│   ├── __init__.py
│   └── task.py
├── dataset_versions
│   ├── __init__.py
│   └── entrypoint.py
├── import_asset_file
│   ├── __init__.py
│   └── task.py
├── import_dataset
│   └── __init__.py
├── import_status
│   ├── __init__.py
│   ├── entrypoint.py
│   └── get.py
├── populate_catalog
│   ├── __init__.py
│   └── task.py
├── upload_status
│   ├── __init__.py
│   └── task.py
├── check_files_checksums
│   ├── __init__.py
│   └── task.py
├── check_stac_metadata
│   ├── __init__.py
│   ├── task.py
│   └── stac_validators.py
├── import_metadata_file
│   ├── __init__.py
│   └── task.py
├── notify_status_update
│   └── __init__.py
├── update_root_catalog
│   ├── __init__.py
│   └── task.py
├── validation_summary
│   ├── __init__.py
│   └── task.py
├── aws_response.py
├── error_response_keys.py
├── import_file_batch_job_id_keys.py
├── api_keys.py
├── boto3_config.py
├── types.py
├── clock.py
├── import_dataset_keys.py
├── aws_keys.py
├── dataset_properties.py
├── wheel.txt
├── models.py
├── pip.txt
├── environment.py
├── logging_keys.py
├── aws_message_attributes.py
├── sts.py
├── Dockerfile
├── check.py
├── resources.py
├── s3.py
├── step_function_keys.py
├── processing_assets_model.py
├── api_responses.py
├── pystac_io_methods.py
├── stac_format.py
├── parameter_store.py
├── datasets_model.py
├── validation_results_model.py
├── import_dataset_file.py
└── s3_utils.py
├── .python-version
├── infrastructure
├── __init__.py
├── constructs
│   ├── __init__.py
│   ├── backend.py
│   ├── lambda_layers
│   │   └── botocore
│   │   │   ├── .gitignore
│   │   │   ├── pyproject.toml
│   │   │   └── poetry.lock
│   ├── roles.py
│   ├── sts_policy.py
│   ├── s3_policy.py
│   ├── removal_policy.py
│   ├── lambda_config.py
│   ├── common.py
│   ├── version.py
│   ├── lambda_layers.py
│   ├── opentopo.py
│   ├── lds.py
│   ├── table.py
│   ├── staging.py
│   ├── lambda_task.py
│   ├── import_file_function.py
│   ├── lambda_endpoint.py
│   ├── bundled_lambda_function.py
│   ├── batch_submit_job_task.py
│   ├── task_job_definition.py
│   ├── bundled_code.py
│   ├── notify.py
│   ├── api.py
│   └── storage.py
├── networking_stack.py
└── application_stack.py
├── .hadolint.yaml
├── poetry.toml
├── .github
├── release.yml
├── codeql
│   └── codeql-config.yml
├── workflows
│   ├── .env
│   ├── update-license-year.yml
│   ├── codeql-analysis.yml
│   ├── package-cli.yml
│   └── mutation-test.yml
├── pull_request_template.md
├── dependabot.yml
└── ISSUE_TEMPLATE
│   ├── bug_report.md
│   ├── enabler_story.md
│   └── user_story.md
├── .dockerignore
├── package.json
├── .kodiak.toml
├── .editorconfig
├── cdk.json
├── .envrc
├── .gitignore
├── setup.cfg
├── nix
└── sources.json
├── generate-requirements-files.bash
├── .gitlint
├── app.py
├── .gitmodules
├── clear-s3-buckets.bash
├── LICENSE
├── .run
├── pytest.run.xml
├── pytest-infrastructure.run.xml
└── pytest-offline.run.xml
├── activate-dev-env.bash
├── shell.nix
├── reset-dev-env.bash
└── .pre-commit-config.yaml
/.nvmrc:
--------------------------------------------------------------------------------
1 | 18.14.1
2 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.9.15
2 |
--------------------------------------------------------------------------------
/infrastructure/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_networking_stack.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/content_iterator/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/dataset_versions/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/import_asset_file/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/import_dataset/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/import_status/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/populate_catalog/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/upload_status/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/infrastructure/constructs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.hadolint.yaml:
--------------------------------------------------------------------------------
1 | ignored:
2 | - DL3008
3 |
--------------------------------------------------------------------------------
/geostore/check_files_checksums/__init__.py:
--------------------------------------------------------------------------------
1 |
-------------------------------------------------------------------------------- /geostore/check_stac_metadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /geostore/import_metadata_file/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /geostore/notify_status_update/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /geostore/update_root_catalog/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /geostore/validation_summary/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /geostore/aws_response.py: -------------------------------------------------------------------------------- 1 | AWS_CODE_REQUEST_TIMEOUT = "RequestTimeout" 2 | -------------------------------------------------------------------------------- /infrastructure/constructs/backend.py: -------------------------------------------------------------------------------- 1 | BACKEND_DIRECTORY = "geostore" 2 | -------------------------------------------------------------------------------- /infrastructure/constructs/lambda_layers/botocore/.gitignore: -------------------------------------------------------------------------------- 1 | /requirements.txt 2 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- 1 | changelog: 2 | exclude: 3 | authors: 4 | - dependabot 5 | -------------------------------------------------------------------------------- /geostore/error_response_keys.py: -------------------------------------------------------------------------------- 1 | ERROR_KEY = "error" 2 | ERROR_MESSAGE_KEY = "error_message" 3 | -------------------------------------------------------------------------------- /.github/codeql/codeql-config.yml: -------------------------------------------------------------------------------- 1 | paths-ignore: 2 | - node_modules 3 | - tests 4 | - .venv 5 | -------------------------------------------------------------------------------- /geostore/import_file_batch_job_id_keys.py: -------------------------------------------------------------------------------- 1 | METADATA_JOB_ID_KEY = "metadata_job_id" 2 | ASSET_JOB_ID_KEY = "asset_job_id" 3 | -------------------------------------------------------------------------------- /geostore/api_keys.py: -------------------------------------------------------------------------------- 1 | MESSAGE_KEY = "message" 2 | STATUS_KEY = "status" 3 | SUCCESS_KEY = "success" 4 | 5 | EVENT_KEY = "event" 6 | -------------------------------------------------------------------------------- /geostore/boto3_config.py: -------------------------------------------------------------------------------- 1 | 
from botocore.config import Config 2 | 3 | CONFIG = Config(retries={"max_attempts": 5, "mode": "standard"}) 4 | -------------------------------------------------------------------------------- /geostore/types.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, MutableMapping 2 | 3 | JsonList = List[Any] 4 | JsonObject = MutableMapping[str, Any] 5 | -------------------------------------------------------------------------------- /geostore/clock.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | 3 | 4 | def now() -> datetime: 5 | return datetime.now(timezone.utc) 6 | -------------------------------------------------------------------------------- /geostore/import_dataset_keys.py: -------------------------------------------------------------------------------- 1 | ORIGINAL_KEY_KEY = "original_key" 2 | NEW_KEY_KEY = "new_key" 3 | TARGET_BUCKET_NAME_KEY = "target_bucket_name" 4 | -------------------------------------------------------------------------------- /infrastructure/constructs/roles.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Duration 2 | 3 | MAX_SESSION_DURATION = Duration.hours(12) 4 | LINZ_ORGANIZATION_ID = "o-g9kpx6ff4u" 5 | -------------------------------------------------------------------------------- /geostore/aws_keys.py: -------------------------------------------------------------------------------- 1 | AWS_DEFAULT_REGION_KEY = "AWS_DEFAULT_REGION" 2 | BODY_KEY = "body" 3 | HTTP_METHOD_KEY = "http_method" 4 | STATUS_CODE_KEY = "status_code" 5 | -------------------------------------------------------------------------------- /infrastructure/constructs/sts_policy.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import aws_iam 2 | 3 | ALLOW_ASSUME_ANY_ROLE = aws_iam.PolicyStatement(actions=["sts:AssumeRole"], resources=["*"]) 4 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | cdk.out 2 | .coverage 3 | .git 4 | .github 5 | .idea 6 | .mypy_cache 7 | node_modules 8 | *.pyc 9 | __pycache__ 10 | .pytest_cache 11 | .venv 12 | .vscode 13 | -------------------------------------------------------------------------------- /geostore/dataset_properties.py: -------------------------------------------------------------------------------- 1 | from string import ascii_letters, digits 2 | 3 | TITLE_CHARACTERS = f"āēīōūĀĒĪŌŪ{ascii_letters}{digits}_-" 4 | TITLE_PATTERN = f"^[{TITLE_CHARACTERS}]+$" 5 | -------------------------------------------------------------------------------- /infrastructure/constructs/s3_policy.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import aws_iam 2 | 3 | ALLOW_DESCRIBE_ANY_S3_JOB = aws_iam.PolicyStatement( 4 | resources=["*"], 5 | actions=["s3:DescribeJob"], 6 | ) 7 | -------------------------------------------------------------------------------- /geostore/wheel.txt: -------------------------------------------------------------------------------- 1 | wheel==0.40.0 \ 2 | --hash=sha256:cd1196f3faee2b31968d626e1731c94f99cbdb67cf5a46e4f5656cbee7738873 \ 3 | --hash=sha256:d236b20e7cb522daf2390fa84c55eea81c5c30190f90f29ae2ca1ad8355bf247 4 | -------------------------------------------------------------------------------- 
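A minimal usage sketch (not a file in the repository) of how the shared helpers above — boto3_config.CONFIG, types.JsonObject, clock.now and dataset_properties.TITLE_PATTERN — typically combine in a handler. It assumes the geostore package is importable and that AWS credentials and a default region are configured; the handler name and response shape are illustrative only.

import re

import boto3

from geostore.boto3_config import CONFIG
from geostore.clock import now
from geostore.dataset_properties import TITLE_PATTERN
from geostore.types import JsonObject

# Shared retry configuration: standard mode, at most 5 attempts
S3_CLIENT = boto3.client("s3", config=CONFIG)


def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject:
    # TITLE_PATTERN permits ASCII letters, digits, macrons, "_" and "-"
    title = event.get("title", "")
    if not re.match(TITLE_PATTERN, title):
        return {"error": f"invalid title: {title!r}"}
    return {"title": title, "received_at": now().isoformat()}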
/.github/workflows/.env: -------------------------------------------------------------------------------- 1 | AWS_DEFAULT_REGION=ap-southeast-2 2 | CiOidc=arn:aws:iam::586981104868:role/CiOidc 3 | NonProdOidc=arn:aws:iam::632223577832:role/NonProdOidc 4 | ProdOidc=arn:aws:iam::715898075157:role/ProdOidc 5 | -------------------------------------------------------------------------------- /geostore/models.py: -------------------------------------------------------------------------------- 1 | DB_KEY_SEPARATOR = "#" 2 | 3 | CHECK_ID_PREFIX = f"CHECK{DB_KEY_SEPARATOR}" 4 | DATASET_ID_PREFIX = f"DATASET{DB_KEY_SEPARATOR}" 5 | URL_ID_PREFIX = f"URL{DB_KEY_SEPARATOR}" 6 | VERSION_ID_PREFIX = f"VERSION{DB_KEY_SEPARATOR}" 7 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "aws-cdk": "*" 4 | }, 5 | "devDependencies": {}, 6 | "prettier": { 7 | "printWidth": 100, 8 | "proseWrap": "always", 9 | "singleQuote": true, 10 | "trailingComma": "all" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /tests/dynamodb_generators.py: -------------------------------------------------------------------------------- 1 | from geostore.step_function import get_hash_key 2 | 3 | from .stac_generators import any_dataset_id, any_dataset_version_id 4 | 5 | 6 | def any_hash_key() -> str: 7 | return get_hash_key(any_dataset_id(), any_dataset_version_id()) 8 | -------------------------------------------------------------------------------- /tests/test_get_latest_extension_schema_version.py: -------------------------------------------------------------------------------- 1 | from geostore.check_stac_metadata.stac_validators import get_latest_extension_schema_version 2 | 3 | 4 | def should_get_latest_stac_spec_version() -> None: 5 | assert get_latest_extension_schema_version("stac-spec") == "1.0.0" 6 | -------------------------------------------------------------------------------- /geostore/pip.txt: -------------------------------------------------------------------------------- 1 | 2 | # The following packages are considered to be unsafe in a requirements file: 3 | pip==23.0.1 \ 4 | --hash=sha256:236bcb61156d76c4b8a05821b988c7b8c35bf0da28a4b614e8d6ab5212c25c6f \ 5 | --hash=sha256:cd015ea1bfb0fcef59d8a286c1f8bebcb983f6317719d415dc5351efb7cd7024 6 | -------------------------------------------------------------------------------- /tests/file_utils.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from json import dumps 3 | from typing import BinaryIO 4 | 5 | from geostore.types import JsonObject 6 | 7 | 8 | def json_dict_to_file_object(value: JsonObject) -> BinaryIO: 9 | return BytesIO(initial_bytes=dumps(value).encode()) 10 | -------------------------------------------------------------------------------- /.kodiak.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | [approve] 4 | auto_approve_usernames = ["dependabot"] 5 | 6 | [merge] 7 | method = "squash" 8 | 9 | [merge.automerge_dependencies] 10 | usernames = ["dependabot"] 11 | versions = ["minor", "patch"] 12 | 13 | [merge.message] 14 | title = "pull_request_title" 15 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 
1 | # https://editorconfig.org 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | indent_size = 4 7 | indent_style = space 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | 11 | [*.{json,nix,yaml,yml}] 12 | indent_size = 2 13 | 14 | [*.md] 15 | indent_size = 3 16 | -------------------------------------------------------------------------------- /cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "context": { 4 | "@aws-cdk/core:newStyleStackSynthesis": true, 5 | "@aws-cdk/core:stackRelativeExports": true, 6 | "@aws-cdk:enableDiffNoFail": true, 7 | "enableLDSAccess": true, 8 | "enableOpenTopographyAccess": true 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /infrastructure/constructs/removal_policy.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | 3 | from aws_cdk import RemovalPolicy 4 | 5 | if environ.get("RESOURCE_REMOVAL_POLICY", "DESTROY").upper() == "RETAIN": 6 | REMOVAL_POLICY = RemovalPolicy.RETAIN 7 | 8 | else: 9 | REMOVAL_POLICY = RemovalPolicy.DESTROY 10 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # the shebang is ignored, but nice for editors 3 | 4 | if type -P lorri &>/dev/null; then 5 | eval "$(lorri direnv)" 6 | else 7 | echo 'while direnv evaluated .envrc, could not find the command "lorri" [https://github.com/nix-community/lorri]' 8 | use nix 9 | fi 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | /cdk.out/ 3 | /.coverage 4 | /dist/ 5 | html/ 6 | /htmlcov/ 7 | /.idea/ 8 | *.isorted 9 | /junit.xml 10 | /geostore/.lambda_out_*/ 11 | .mutmut-cache 12 | mutmut.xml 13 | /.mypy_cache/ 14 | /node_modules/ 15 | *.pyc 16 | __pycache__ 17 | /.pytest_cache 18 | Thumbs.db 19 | /.venv 20 | /.vscode 21 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [mutmut] 2 | dict_synonyms = JobManifestLocationTypeDef,JobManifestSpecTypeDef,JobManifestTypeDef,JobOperationTypeDef,JobReportTypeDef,JsonObject,LambdaInvokeOperationTypeDef,MessageAttributeValueTypeDef 3 | paths_to_mutate = geostore,infrastructure 4 | runner = python -m pytest --assert=plain --exitfirst -m 'not infrastructure' 5 | -------------------------------------------------------------------------------- /tests/test_datasets_model_base.py: -------------------------------------------------------------------------------- 1 | from pytest import mark 2 | 3 | from geostore.datasets_model import datasets_model_with_meta 4 | 5 | 6 | @mark.infrastructure 7 | def should_create_unique_id_per_dataset() -> None: 8 | model = datasets_model_with_meta() 9 | first = model() 10 | second = model() 11 | 12 | assert first.dataset_id != second.dataset_id 13 | -------------------------------------------------------------------------------- /geostore/environment.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | 3 | ENV_NAME_VARIABLE_NAME = "GEOSTORE_ENV_NAME" 4 | PRODUCTION_ENVIRONMENT_NAME = "prod" 5 | 6 | 7 | def environment_name() -> str: 8 | 
return environ.get(ENV_NAME_VARIABLE_NAME, PRODUCTION_ENVIRONMENT_NAME)
9 |
10 |
11 | def is_production() -> bool:
12 |     return environment_name() == PRODUCTION_ENVIRONMENT_NAME
13 |
--------------------------------------------------------------------------------
/geostore/logging_keys.py:
--------------------------------------------------------------------------------
1 | LOG_MESSAGE_LAMBDA_START = "Lambda Start"
2 | LOG_MESSAGE_LAMBDA_FAILURE = "Lambda Failure"
3 | LOG_MESSAGE_S3_BATCH_RESPONSE = "S3 Batch Response"
4 | LOG_MESSAGE_S3_DELETION_RESPONSE = "S3 Deletion Response"
5 | LOG_MESSAGE_STEP_FUNCTION_RESPONSE = "Step Function Response"
6 | LOG_MESSAGE_VALIDATION_COMPLETE = "Validation Complete"
7 | GIT_COMMIT = "git_commit"
8 |
--------------------------------------------------------------------------------
/.github/workflows/update-license-year.yml:
--------------------------------------------------------------------------------
1 | name: Update copyright year in license file
2 |
3 | on:
4 |   workflow_dispatch:
5 |
6 | jobs:
7 |   run:
8 |     runs-on: ubuntu-22.04
9 |     steps:
10 |       - uses: actions/checkout@v3.4.0
11 |         with:
12 |           fetch-depth: 0
13 |       - uses: FantasticFiasco/action-update-license-year@v2.3.0
14 |         with:
15 |           token: ${{ secrets.GITHUB_TOKEN }}
16 |
--------------------------------------------------------------------------------
/infrastructure/constructs/lambda_layers/botocore/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | build-backend = "poetry.core.masonry.api"
3 | requires = ["poetry-core>=1.0.0"]
4 |
5 | [tool.poetry]
6 | authors = ["Your Name "]
7 | description = ""
8 | name = "botocore-layer"
9 | version = "0.1.0"
10 |
11 | [tool.poetry.dependencies]
12 | botocore = "*"
13 | python = "^3.9,<3.10"
14 |
15 | [tool.poetry.dev-dependencies]
16 |
--------------------------------------------------------------------------------
/infrastructure/constructs/lambda_config.py:
--------------------------------------------------------------------------------
1 | from aws_cdk import Duration, aws_lambda, aws_logs
2 |
3 | from geostore.environment import is_production
4 |
5 | PYTHON_RUNTIME = aws_lambda.Runtime.PYTHON_3_9
6 |
7 | DEFAULT_LAMBDA_MAX_MEMORY_MEBIBYTES = 1024
8 | DEFAULT_LAMBDA_TIMEOUT = Duration.seconds(60)
9 |
10 | if is_production():
11 |     RETENTION_DAYS = aws_logs.RetentionDays.ONE_YEAR
12 | else:
13 |     RETENTION_DAYS = aws_logs.RetentionDays.THREE_MONTHS
14 |
--------------------------------------------------------------------------------
/infrastructure/constructs/common.py:
--------------------------------------------------------------------------------
1 | from logging import DEBUG, getLevelName
2 | from typing import Iterable, Mapping
3 |
4 | from aws_cdk import aws_iam, aws_ssm
5 |
6 | LOG_LEVEL = getLevelName(DEBUG)
7 |
8 |
9 | def grant_parameter_read_access(
10 |     parameter_readers: Mapping[aws_ssm.StringParameter, Iterable[aws_iam.IGrantable]]
11 | ) -> None:
12 |     for parameter, readers in parameter_readers.items():
13 |         for reader in readers:
14 |             parameter.grant_read(reader)
15 |
--------------------------------------------------------------------------------
/geostore/import_status/entrypoint.py:
--------------------------------------------------------------------------------
1 | """
2 | Import status endpoint Lambda function.
3 | """ 4 | from typing import Callable, Mapping 5 | 6 | from ..api_responses import handle_request 7 | from ..types import JsonObject 8 | from .get import get_import_status 9 | 10 | REQUEST_HANDLERS: Mapping[str, Callable[[JsonObject], JsonObject]] = { 11 | "GET": get_import_status, 12 | } 13 | 14 | 15 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 16 | return handle_request(event, REQUEST_HANDLERS) 17 | -------------------------------------------------------------------------------- /geostore/aws_message_attributes.py: -------------------------------------------------------------------------------- 1 | def decapitalize(key: str) -> str: 2 | """ 3 | This method will be used to lower case the first character of SQS 4 | message attributes being received by Lambda to resolve inconsistencies. 5 | Issue outlined here: https://github.com/boto/boto3/issues/2582 6 | """ 7 | return f"{key[:1].lower()}{key[1:]}" 8 | 9 | 10 | DATA_TYPE_KEY = "DataType" 11 | DATA_TYPE_STRING = "String" 12 | STRING_VALUE_KEY = "StringValue" 13 | STRING_VALUE_KEY_LOWER = decapitalize(STRING_VALUE_KEY) 14 | -------------------------------------------------------------------------------- /geostore/dataset_versions/entrypoint.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dataset-versions endpoint Lambda function. 3 | """ 4 | from typing import Callable, MutableMapping 5 | 6 | from ..api_responses import handle_request 7 | from ..types import JsonObject 8 | from .create import create_dataset_version 9 | 10 | REQUEST_HANDLERS: MutableMapping[str, Callable[[JsonObject], JsonObject]] = { 11 | "POST": create_dataset_version, 12 | } 13 | 14 | 15 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 16 | return handle_request(event, REQUEST_HANDLERS) 17 | -------------------------------------------------------------------------------- /nix/sources.json: -------------------------------------------------------------------------------- 1 | { 2 | "nixpkgs": { 3 | "branch": "release-22.11", 4 | "description": "Nix Packages collection", 5 | "homepage": "", 6 | "owner": "NixOS", 7 | "repo": "nixpkgs", 8 | "rev": "96e18717904dfedcd884541e5a92bf9ff632cf39", 9 | "sha256": "0zw1851mia86xqxdf8jgy1c6fm5lqw4rncv7v2lwxar3vhpn6c78", 10 | "type": "tarball", 11 | "url": "https://github.com/NixOS/nixpkgs/archive/96e18717904dfedcd884541e5a92bf9ff632cf39.tar.gz", 12 | "url_template": "https://github.com///archive/.tar.gz" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 6 | 7 | 12 | 13 | ## Reference 14 | 15 | [Code review checklist](https://github.com/linz/geostore/blob/master/CODING.md#Checklist) 16 | -------------------------------------------------------------------------------- /geostore/sts.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from typing import TYPE_CHECKING 3 | 4 | import boto3 5 | 6 | from .boto3_config import CONFIG 7 | 8 | if TYPE_CHECKING: 9 | from mypy_boto3_sts import STSClient 10 | else: 11 | STSClient = object # pragma: no mutate 12 | 13 | STS_CLIENT: STSClient = boto3.client("sts", config=CONFIG) 14 | 15 | 16 | @lru_cache 17 | def get_account_number() -> str: 18 | caller_identity = STS_CLIENT.get_caller_identity() 19 | assert "Account" in caller_identity, caller_identity 20 | return 
caller_identity["Account"] 21 | -------------------------------------------------------------------------------- /infrastructure/constructs/version.py: -------------------------------------------------------------------------------- 1 | from subprocess import PIPE, Popen 2 | 3 | with Popen(["git", "rev-parse", "--abbrev-ref", "HEAD"], stdout=PIPE) as branch_command: 4 | GIT_BRANCH = branch_command.communicate()[0].decode().strip() 5 | 6 | with Popen(["git", "rev-parse", "--short", "HEAD"], stdout=PIPE) as commit_command: 7 | GIT_COMMIT = commit_command.communicate()[0].decode().strip() 8 | 9 | with Popen(["git", "describe", "--tags", "--exact-match"], stdout=PIPE) as tag_command: 10 | GIT_TAG = tag_command.communicate()[0].decode().strip() 11 | if not GIT_TAG: 12 | GIT_TAG = "UNRELEASED" 13 | -------------------------------------------------------------------------------- /geostore/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG python_version 2 | FROM python:${python_version}-slim as build 3 | 4 | ARG task 5 | ARG packaging 6 | 7 | RUN python -m venv /opt/.venv 8 | 9 | COPY poetry.lock poetry.toml pyproject.toml /opt/ 10 | COPY ${packaging}/${task}.txt /opt/ 11 | 12 | RUN /opt/.venv/bin/pip install --no-cache-dir --no-deps --requirement=/opt/${task}.txt 13 | 14 | 15 | ARG python_version 16 | FROM python:${python_version}-slim 17 | 18 | ENTRYPOINT ["/opt/.venv/bin/python", "-bb", "-m", "src.task.task"] 19 | 20 | USER 10000:10000 21 | 22 | COPY --from=build /opt/.venv /opt/.venv 23 | 24 | COPY geostore/*.py /src/ 25 | ARG task 26 | COPY geostore/${task} /src/task/ 27 | -------------------------------------------------------------------------------- /geostore/check.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class Check(Enum): 5 | ASSETS_IN_DATASET = "assets in dataset" 6 | CHECKSUM = "checksum" 7 | DUPLICATE_OBJECT_KEY = "duplicate asset name" 8 | FILE_NOT_FOUND = "file not found in staging or storage" 9 | INVALID_STAC_ROOT_TYPE = "root type must be catalog or collection" 10 | JSON_PARSE = "JSON parse" 11 | JSON_SCHEMA = "JSON schema" 12 | NON_S3_URL = "not an s3 url" 13 | NO_ASSETS_IN_DATASET = "no assets in the dataset" 14 | SECURITY_CLASSIFICATION = "security classification" 15 | STAGING_ACCESS = "staging bucket access" 16 | UNKNOWN_CLIENT_ERROR = "unknown client error" 17 | UNKNOWN_MULTIHASH_ERROR = "unknown multihash error" 18 | -------------------------------------------------------------------------------- /infrastructure/constructs/lambda_layers.py: -------------------------------------------------------------------------------- 1 | import constructs 2 | from aws_cdk import aws_lambda_python_alpha 3 | from constructs import Construct 4 | 5 | from .lambda_config import PYTHON_RUNTIME 6 | 7 | 8 | class LambdaLayers(Construct): 9 | def __init__(self, scope: constructs.Construct, stack_id: str, *, env_name: str) -> None: 10 | super().__init__(scope, stack_id) 11 | 12 | self.botocore = aws_lambda_python_alpha.PythonLayerVersion( 13 | self, 14 | f"{env_name}-botocore-lambda-layer", 15 | entry="infrastructure/constructs/lambda_layers/botocore", 16 | compatible_runtimes=[PYTHON_RUNTIME], 17 | description="botocore library", 18 | ) 19 | -------------------------------------------------------------------------------- /tests/test_prefix_non_prod_name.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | 
from unittest.mock import patch 3 | 4 | from geostore.environment import ENV_NAME_VARIABLE_NAME, PRODUCTION_ENVIRONMENT_NAME 5 | from geostore.resources import prefix_non_prod_name 6 | 7 | 8 | def should_return_original_name_when_production() -> None: 9 | name = "any name" 10 | with patch.dict(environ, {ENV_NAME_VARIABLE_NAME: PRODUCTION_ENVIRONMENT_NAME}): 11 | assert prefix_non_prod_name(name) == name 12 | 13 | 14 | def should_return_prefixed_name_when_not_production() -> None: 15 | name = "any name" 16 | environment_name = f"not {PRODUCTION_ENVIRONMENT_NAME}" 17 | with patch.dict(environ, {ENV_NAME_VARIABLE_NAME: environment_name}): 18 | assert prefix_non_prod_name(name) == f"{environment_name}-{name}" 19 | -------------------------------------------------------------------------------- /geostore/datasets/list.py: -------------------------------------------------------------------------------- 1 | """List all datasets function.""" 2 | from http import HTTPStatus 3 | 4 | from ..api_responses import success_response 5 | from ..datasets_model import datasets_model_with_meta 6 | from ..models import DATASET_ID_PREFIX 7 | from ..types import JsonObject 8 | 9 | 10 | def list_datasets() -> JsonObject: 11 | """GET: List all Datasets.""" 12 | 13 | # list all datasets 14 | datasets_model_class = datasets_model_with_meta() 15 | datasets = datasets_model_class.scan( 16 | filter_condition=datasets_model_class.id.startswith(DATASET_ID_PREFIX) 17 | ) 18 | 19 | # return response 20 | resp_body = [] 21 | for dataset in datasets: 22 | resp_item = dataset.as_dict() 23 | resp_body.append(resp_item) 24 | 25 | return success_response(HTTPStatus.OK, resp_body) 26 | -------------------------------------------------------------------------------- /generate-requirements-files.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit -o noclobber -o nounset -o pipefail 4 | shopt -s failglob inherit_errexit 5 | 6 | if [[ $# -eq 0 ]]; then 7 | cat >&2 <<'EOF' 8 | Synopsis: ./generate-requirements-files.bash PATH [PATH…] 9 | 10 | Example: ./generate-requirements-files.bash geostore/poetry.txt 11 | 12 | Creates pip formatted requirements files (including dependencies and hashes) at each PATH with the package derived from the filename. 13 | 14 | This is used to work around Dependabot not knowing which package is the "main" one in a requirements file. 
15 | EOF 16 | exit 1 17 | fi 18 | 19 | for path; do 20 | package_name="$(basename "${path%.txt}")" 21 | pip-compile --allow-unsafe --generate-hashes --no-annotate --no-header --output-file="$path" --upgrade <(echo "$package_name") 22 | done 23 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: docker 4 | directory: /geostore 5 | schedule: 6 | interval: daily 7 | - package-ecosystem: github-actions 8 | directory: / 9 | schedule: 10 | interval: daily 11 | commit-message: 12 | prefix: build(deps) 13 | - package-ecosystem: gitsubmodule 14 | directory: / 15 | schedule: 16 | interval: daily 17 | - package-ecosystem: npm 18 | directory: / 19 | schedule: 20 | interval: daily 21 | - package-ecosystem: pip 22 | directory: / 23 | open-pull-requests-limit: 100 24 | schedule: 25 | interval: daily 26 | - package-ecosystem: pip 27 | directory: /geostore 28 | schedule: 29 | interval: daily 30 | - package-ecosystem: pip 31 | directory: /infrastructure/constructs/lambda_layers/botocore 32 | schedule: 33 | interval: daily 34 | -------------------------------------------------------------------------------- /.gitlint: -------------------------------------------------------------------------------- 1 | # Configuration file for gitlint, used via pre-commit 2 | # Configuration docs: http://jorisroovers.github.io/gitlint/configuration/ 3 | # Default rules: https://github.com/jorisroovers/gitlint/blob/master/docs/rules.md 4 | 5 | [general] 6 | # Ignore certain rules, you can reference them by their id or by their full name 7 | ignore = body-is-missing, body-max-line-length 8 | 9 | # Enable community contributed rule for conventional commits 10 | contrib = contrib-title-conventional-commits 11 | 12 | [title-max-length] 13 | line-length = 72 14 | 15 | # [title-match-regex] 16 | # Uncomment to ensure that there is an issue referenced in every commit title 17 | # regex=^.*?#[0-9]+\b.*?$ 18 | 19 | [contrib-title-conventional-commits] 20 | # Specify allowed commit types. For details see: https://www.conventionalcommits.org/ 21 | types = build, chore, ci, docs, feat, fix, perf, refactor, revert, style, test 22 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | """ 2 | CDK application entry point file. 3 | """ 4 | from aws_cdk import App, Tags 5 | 6 | from geostore.environment import environment_name 7 | from infrastructure.application_stack import Application 8 | from infrastructure.constructs.batch_job_queue import APPLICATION_NAME, APPLICATION_NAME_TAG_NAME 9 | 10 | 11 | def main() -> None: 12 | app = App() 13 | 14 | env_name = environment_name() 15 | Application(app, f"{env_name}-geostore") 16 | 17 | # tag all resources in stack 18 | Tags.of(app).add("CostCentre", "100005") 19 | Tags.of(app).add(APPLICATION_NAME_TAG_NAME, APPLICATION_NAME) 20 | Tags.of(app).add("Owner", "Bill M. 
Nelson") 21 | Tags.of(app).add("EnvironmentType", env_name) 22 | Tags.of(app).add("SupportType", "Dev") 23 | Tags.of(app).add("HoursOfOperation", "24x7") 24 | 25 | app.synth() 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "file"] 2 | path = geostore/check_stac_metadata/file 3 | url = git@github.com:stac-extensions/file.git 4 | branch = gh-pages 5 | [submodule "geojson-spec"] 6 | path = geostore/check_stac_metadata/geojson-spec 7 | url = https://github.com/geojson/schema.git 8 | branch = gh-pages 9 | [submodule "projection"] 10 | path = geostore/check_stac_metadata/projection 11 | url = https://github.com/stac-extensions/projection.git 12 | branch = gh-pages 13 | [submodule "stac"] 14 | path = geostore/check_stac_metadata/stac 15 | url = https://github.com/linz/stac.git 16 | branch = gh-pages 17 | [submodule "stac-spec"] 18 | path = geostore/check_stac_metadata/stac-spec 19 | url = https://github.com/radiantearth/stac-spec.git 20 | branch = gh-pages 21 | [submodule "version"] 22 | path = geostore/check_stac_metadata/version 23 | url = https://github.com/stac-extensions/version.git 24 | branch = gh-pages 25 | -------------------------------------------------------------------------------- /geostore/resources.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from .environment import environment_name, is_production 4 | 5 | 6 | def prefix_non_prod_name(name: str) -> str: 7 | if is_production(): 8 | return name 9 | 10 | return f"{environment_name()}-{name}" 11 | 12 | 13 | class Resource(Enum): 14 | @property 15 | def resource_name(self) -> str: 16 | return prefix_non_prod_name(self.value) 17 | 18 | API_USERS_ROLE_NAME = "api-users" 19 | CLOUDWATCH_RULE_NAME = "geostore-cloudwatch-rule" 20 | DATASETS_ENDPOINT_FUNCTION_NAME = "datasets" 21 | DATASET_VERSIONS_ENDPOINT_FUNCTION_NAME = "dataset-versions" 22 | IMPORT_STATUS_ENDPOINT_FUNCTION_NAME = "import-status" 23 | S3_USERS_ROLE_NAME = "s3-users" 24 | STAGING_USERS_ROLE_NAME = "staging-users" 25 | STAGING_BUCKET_NAME = "linz-geostore-staging" 26 | STORAGE_BUCKET_NAME = "linz-geostore" 27 | SNS_TOPIC_NAME = "geostore-import-status" 28 | -------------------------------------------------------------------------------- /tests/test_parameter_store.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | from pytest import mark, raises 4 | 5 | from geostore import parameter_store 6 | from geostore.parameter_store import ( 7 | LOG_MESSAGE_PARAMETER_NOT_FOUND, 8 | SSM_CLIENT, 9 | ParameterName, 10 | get_param, 11 | ) 12 | 13 | 14 | @mark.infrastructure 15 | @patch(f"{parameter_store.__name__}.{ParameterName.__name__}") 16 | def should_log_missing_parameter_name(parameter_name_mock: MagicMock) -> None: 17 | parameter_name = "invalid" 18 | parameter_name_mock.INVALID.value = parameter_name 19 | 20 | with patch(f"{parameter_store.__name__}.LOGGER.error") as logger_mock: 21 | with raises(SSM_CLIENT.exceptions.ParameterNotFound): 22 | get_param(parameter_name_mock.INVALID) 23 | 24 | logger_mock.assert_any_call( 25 | LOG_MESSAGE_PARAMETER_NOT_FOUND, extra={"parameter_value": parameter_name} 26 | ) 27 | -------------------------------------------------------------------------------- 
/infrastructure/constructs/opentopo.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Tags, aws_iam, aws_s3 2 | from constructs import Construct 3 | 4 | from .roles import MAX_SESSION_DURATION 5 | 6 | 7 | class OpenTopography(Construct): 8 | def __init__( 9 | self, scope: Construct, stack_id: str, *, env_name: str, storage_bucket: aws_s3.Bucket 10 | ) -> None: 11 | super().__init__(scope, stack_id) 12 | 13 | account_principal = aws_iam.AccountPrincipal(account_id="011766770214") 14 | external_id = "opentopography-bahX0" 15 | role = aws_iam.Role( 16 | self, 17 | "opentopography-read-role", 18 | role_name=f"opentopography-s3-access-read-{env_name}", 19 | assumed_by=account_principal, 20 | external_ids=[external_id], 21 | max_session_duration=MAX_SESSION_DURATION, 22 | ) 23 | storage_bucket.grant_read(role) 24 | 25 | Tags.of(self).add("ApplicationLayer", "opentopography") 26 | -------------------------------------------------------------------------------- /infrastructure/constructs/lds.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Tags, aws_iam, aws_s3 2 | from constructs import Construct 3 | 4 | from geostore.environment import is_production 5 | 6 | from .roles import MAX_SESSION_DURATION 7 | 8 | 9 | class LDS(Construct): 10 | def __init__( 11 | self, scope: Construct, stack_id: str, *, env_name: str, storage_bucket: aws_s3.Bucket 12 | ) -> None: 13 | super().__init__(scope, stack_id) 14 | 15 | account_principal = aws_iam.AccountPrincipal(account_id="276514628126") 16 | if is_production(): 17 | external_id = "koordinates-jAddR" 18 | else: 19 | external_id = "koordinates-4BnJQ" 20 | role = aws_iam.Role( 21 | self, 22 | "koordinates-read-role", 23 | role_name=f"koordinates-s3-access-read-{env_name}", 24 | assumed_by=account_principal, 25 | external_ids=[external_id], 26 | max_session_duration=MAX_SESSION_DURATION, 27 | ) 28 | storage_bucket.grant_read(role) 29 | 30 | Tags.of(self).add("ApplicationLayer", "lds") 31 | -------------------------------------------------------------------------------- /geostore/datasets/entrypoint.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dataset endpoint Lambda function. 
3 | """ 4 | from logging import Logger 5 | from typing import Callable, MutableMapping 6 | 7 | from linz_logger import get_log 8 | 9 | from ..api_responses import handle_request 10 | from ..logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_START 11 | from ..parameter_store import ParameterName, get_param 12 | from ..types import JsonObject 13 | from .create import create_dataset 14 | from .delete import delete_dataset 15 | from .get import handle_get 16 | from .update import update_dataset 17 | 18 | REQUEST_HANDLERS: MutableMapping[str, Callable[[JsonObject], JsonObject]] = { 19 | "DELETE": delete_dataset, 20 | "GET": handle_get, 21 | "PATCH": update_dataset, 22 | "POST": create_dataset, 23 | } 24 | 25 | LOGGER: Logger = get_log() 26 | 27 | 28 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 29 | LOGGER.debug( 30 | LOG_MESSAGE_LAMBDA_START, 31 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 32 | ) 33 | return handle_request(event, REQUEST_HANDLERS) 34 | -------------------------------------------------------------------------------- /geostore/s3.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | from uuid import uuid4 3 | 4 | import boto3 5 | 6 | from .boto3_config import CONFIG 7 | from .environment import environment_name 8 | 9 | if TYPE_CHECKING: 10 | from mypy_boto3_s3 import S3Client 11 | from mypy_boto3_sts import STSClient 12 | else: 13 | S3Client = STSClient = object # pragma: no mutate 14 | 15 | 16 | S3_SCHEMA = "s3" 17 | S3_URL_PREFIX = f"{S3_SCHEMA}://" 18 | 19 | CHUNK_SIZE = 1024 20 | 21 | STS_CLIENT: STSClient = boto3.client("sts", config=CONFIG) 22 | 23 | 24 | def get_s3_client_for_role(role_arn: str) -> S3Client: 25 | assume_role_response = STS_CLIENT.assume_role( 26 | RoleArn=role_arn, RoleSessionName=f"{environment_name()}_Geostore_{uuid4()}" 27 | ) 28 | credentials = assume_role_response["Credentials"] 29 | client: S3Client = boto3.client( 30 | "s3", 31 | config=CONFIG, 32 | aws_access_key_id=credentials["AccessKeyId"], 33 | aws_secret_access_key=credentials["SecretAccessKey"], 34 | aws_session_token=credentials["SessionToken"], 35 | ) 36 | return client 37 | -------------------------------------------------------------------------------- /geostore/import_asset_file/task.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | import boto3 4 | 5 | from ..boto3_config import CONFIG 6 | from ..import_dataset_file import get_import_result 7 | from ..types import JsonObject 8 | 9 | if TYPE_CHECKING: 10 | # When type checking we want to use the third party package's stub 11 | from mypy_boto3_s3 import S3Client 12 | else: 13 | # In production we want to avoid depending on a package which has no runtime impact 14 | S3Client = object # pragma: no mutate 15 | 16 | TARGET_S3_CLIENT: S3Client = boto3.client("s3", config=CONFIG) 17 | 18 | 19 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 20 | return get_import_result(event, importer) 21 | 22 | 23 | def importer( 24 | source_bucket_name: str, 25 | original_key: str, 26 | target_bucket_name: str, 27 | new_key: str, 28 | source_s3_client: S3Client, 29 | ) -> None: 30 | source_response = source_s3_client.get_object(Bucket=source_bucket_name, Key=original_key) 31 | 32 | TARGET_S3_CLIENT.upload_fileobj(source_response["Body"], Bucket=target_bucket_name, Key=new_key) 33 | 
-------------------------------------------------------------------------------- /clear-s3-buckets.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit -o noclobber -o nounset -o pipefail 4 | 5 | if [[ $# -eq 0 ]]; then 6 | cat >&2 <<'EOF' 7 | ./clear-s3-buckets.bash BUCKET [BUCKET…] 8 | 9 | Deletes *all* versions of *all* files in *all* given buckets. Only to be used in case of emergency! 10 | EOF 11 | exit 1 12 | fi 13 | 14 | read -n1 -p "THIS WILL DELETE EVERYTHING IN BUCKETS ${*}! Press Ctrl-c to cancel or anything else to continue: " -r 15 | 16 | delete_objects() { 17 | count="$(jq length <<<"$1")" 18 | 19 | if [[ $count -eq 0 ]]; then 20 | echo "No objects found; skipping" >&2 21 | return 22 | fi 23 | 24 | echo "Removing ${count} objects" 25 | jq --raw-output '.[] | [.Key, .VersionId] | @tsv' <<<"$1" | parallel --colsep='\t' --group aws s3api delete-object --bucket="$bucket" --key='{1}' --version-id='{2}' 26 | } 27 | 28 | for bucket; do 29 | versions="$(aws s3api list-object-versions --bucket="$bucket" | jq .Versions)" 30 | delete_objects "$versions" 31 | 32 | markers="$(aws s3api list-object-versions --bucket="$bucket" | jq .DeleteMarkers)" 33 | delete_objects "$markers" 34 | done 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-2022, 2021 Land Information New Zealand 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /infrastructure/constructs/table.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from aws_cdk import aws_dynamodb, aws_ssm 4 | from constructs import Construct 5 | 6 | from geostore.parameter_store import ParameterName 7 | 8 | from .removal_policy import REMOVAL_POLICY 9 | 10 | 11 | class Table(aws_dynamodb.Table): 12 | def __init__( 13 | self, 14 | scope: Construct, 15 | construct_id: str, 16 | *, 17 | env_name: str, 18 | parameter_name: ParameterName, 19 | sort_key: Optional[aws_dynamodb.Attribute] = None, 20 | ): 21 | super().__init__( 22 | scope, 23 | construct_id, 24 | partition_key=aws_dynamodb.Attribute(name="pk", type=aws_dynamodb.AttributeType.STRING), 25 | sort_key=sort_key, 26 | point_in_time_recovery=True, 27 | removal_policy=REMOVAL_POLICY, 28 | billing_mode=aws_dynamodb.BillingMode.PAY_PER_REQUEST, 29 | ) 30 | 31 | self.name_parameter = aws_ssm.StringParameter( 32 | self, 33 | f"{construct_id} table name for {env_name}", 34 | string_value=self.table_name, 35 | parameter_name=parameter_name.value, 36 | ) 37 | -------------------------------------------------------------------------------- /infrastructure/constructs/staging.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import RemovalPolicy, Tags, aws_iam, aws_s3 2 | from constructs import Construct 3 | 4 | from geostore.resources import Resource 5 | 6 | 7 | class Staging(Construct): 8 | def __init__(self, scope: Construct, stack_id: str, *, users_role: aws_iam.Role) -> None: 9 | super().__init__(scope, stack_id) 10 | 11 | ############################################################################################ 12 | # ### DATASET STAGING S3 BUCKET ############################################################ 13 | ############################################################################################ 14 | staging_bucket = aws_s3.Bucket( 15 | self, 16 | "dataset-staging-bucket", 17 | bucket_name=Resource.STAGING_BUCKET_NAME.resource_name, 18 | access_control=aws_s3.BucketAccessControl.PRIVATE, 19 | block_public_access=aws_s3.BlockPublicAccess.BLOCK_ALL, 20 | versioned=True, 21 | removal_policy=RemovalPolicy.DESTROY, 22 | enforce_ssl=True, 23 | ) 24 | staging_bucket.grant_read(users_role) 25 | 26 | Tags.of(self).add("ApplicationLayer", "staging") 27 | -------------------------------------------------------------------------------- /infrastructure/constructs/lambda_task.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Optional 2 | 3 | from aws_cdk import aws_lambda_python_alpha, aws_stepfunctions_tasks 4 | from aws_cdk.aws_stepfunctions import JsonPath 5 | from constructs import Construct 6 | 7 | from .bundled_lambda_function import BundledLambdaFunction 8 | 9 | 10 | class LambdaTask(aws_stepfunctions_tasks.LambdaInvoke): 11 | def __init__( 12 | self, 13 | scope: Construct, 14 | construct_id: str, 15 | *, 16 | lambda_directory: str, 17 | botocore_lambda_layer: aws_lambda_python_alpha.PythonLayerVersion, 18 | result_path: Optional[str] = JsonPath.DISCARD, 19 | extra_environment: Optional[Mapping[str, str]] = None, 20 | ): 21 | self.lambda_function = BundledLambdaFunction( 22 | scope, 23 | f"{construct_id}Function", 24 | lambda_directory=lambda_directory, 25 | extra_environment=extra_environment, 26 | botocore_lambda_layer=botocore_lambda_layer, 
27 | ) 28 | 29 | super().__init__( 30 | scope, 31 | construct_id, 32 | lambda_function=self.lambda_function, 33 | result_path=result_path, 34 | payload_response_only=True, 35 | ) 36 | -------------------------------------------------------------------------------- /.run/pytest.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | -------------------------------------------------------------------------------- /activate-dev-env.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit -o noclobber -o nounset -o pipefail 4 | 5 | usage() { 6 | cat >&2 <<'EOF' 7 | Usage: 8 | 9 | . activate-dev-env.bash 10 | EOF 11 | } 12 | 13 | if ! (return 0); then 14 | usage 15 | exit 2 16 | fi 17 | 18 | script_dir="$(dirname "${BASH_SOURCE[0]}")" 19 | 20 | if type nvm &>/dev/null; then 21 | nvm use 22 | fi 23 | PATH="${script_dir}/node_modules/.bin:${PATH}" 24 | 25 | if ! diff <(node --version | cut --delimiter=. --fields=1-2 | tr --delete v) <(cut --delimiter=. --fields=1-2 "${script_dir}/.nvmrc"); then 26 | # shellcheck disable=SC2016 27 | echo 'Wrong major/minor version of Node.js detected. Please run `nvm install` to update Node.js and then reset the dev env.' >&2 28 | exit 3 29 | fi 30 | 31 | set +o errexit +o nounset 32 | if [[ -e "${script_dir}/.venv/bin/activate" ]]; then 33 | # shellcheck source=/dev/null 34 | . "${script_dir}/.venv/bin/activate" 35 | fi 36 | 37 | if ! diff <(python <<<'import platform; print(platform.python_version())' | cut --delimiter=. --fields=1-2) <(cut --delimiter=. --fields=1-2 "${script_dir}/.python-version"); then 38 | # shellcheck disable=SC2016 39 | echo 'Wrong major/minor version of Python detected. Please run `pyenv install` to update Python and then reset the dev env.' 
>&2 40 | exit 4 41 | fi 42 | -------------------------------------------------------------------------------- /.run/pytest-infrastructure.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | -------------------------------------------------------------------------------- /.run/pytest-offline.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | -------------------------------------------------------------------------------- /tests/test_step_function_logging.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | from geostore.logging_keys import GIT_COMMIT, LOG_MESSAGE_S3_BATCH_RESPONSE 4 | from geostore.parameter_store import ParameterName, get_param 5 | from geostore.step_function import get_s3_batch_copy_status 6 | 7 | from .aws_utils import any_account_id 8 | 9 | 10 | @patch("geostore.step_function.S3CONTROL_CLIENT.describe_job") 11 | def should_log_s3_batch_response( 12 | describe_s3_job_mock: MagicMock, 13 | ) -> None: 14 | # Given 15 | describe_s3_job_mock.return_value = s3_batch_response = { 16 | "Job": { 17 | "Status": "Some Response", 18 | "FailureReasons": [], 19 | "ProgressSummary": {"NumberOfTasksFailed": 0}, 20 | } 21 | } 22 | 23 | with patch("geostore.step_function.LOGGER.debug") as logger_mock, patch( 24 | "geostore.step_function.get_account_number" 25 | ) as get_account_number_mock: 26 | get_account_number_mock.return_value = any_account_id() 27 | 28 | # When 29 | get_s3_batch_copy_status("test") 30 | 31 | # Then 32 | logger_mock.assert_any_call( 33 | LOG_MESSAGE_S3_BATCH_RESPONSE, 34 | extra={ 35 | "response": s3_batch_response, 36 | GIT_COMMIT: get_param(ParameterName.GIT_COMMIT), 37 | }, 38 | ) 39 | -------------------------------------------------------------------------------- /geostore/step_function_keys.py: -------------------------------------------------------------------------------- 1 | from typing import Final 2 | 3 | JOB_STATUS_FAILED = "FAILED" 4 | JOB_STATUS_RUNNING = "RUNNING" 5 | JOB_STATUS_SUCCEEDED = "SUCCEEDED" 6 | 7 | S3_BATCH_STATUS_FAILED: Final = "Failed" 8 | S3_BATCH_STATUS_CANCELLED: Final = "Cancelled" 9 | S3_BATCH_STATUS_COMPLETE: Final = "Complete" 10 | 11 | ASSET_UPLOAD_KEY = "asset_upload" 12 | CURRENT_VERSION_ID_KEY = "current_version_id" 13 | CURRENT_VERSION_EMPTY_VALUE = "None" 14 | DATASET_ID_KEY = "dataset_id" 15 | DATASET_ID_SHORT_KEY = "id" 16 | DESCRIPTION_KEY = "description" 17 | ERRORS_KEY = "errors" 18 | ERROR_CHECK_KEY = "check" 19 | ERROR_DETAILS_KEY = "details" 20 | ERROR_RESULT_KEY = "result" 21 | ERROR_URL_KEY = "url" 22 | EXECUTION_ARN_KEY = "execution_arn" 23 | FAILED_TASKS_KEY = "failed_tasks" 24 | FAILURE_REASONS_KEY = "failure_reasons" 25 | IMPORT_DATASET_KEY = "import_dataset" 26 | INPUT_KEY = "input" 27 | METADATA_UPLOAD_KEY = "metadata_upload" 28 | METADATA_URL_KEY = "metadata_url" 29 | NEW_VERSION_ID_KEY = "new_version_id" 30 | NEW_VERSION_S3_LOCATION = "new_version_s3_location" 31 | NOW_KEY = "now" 32 | OUTPUT_KEY = "output" 33 | S3_BATCH_RESPONSE_KEY = "s3_batch_response" 34 | S3_ROLE_ARN_KEY = "s3_role_arn" 35 | STATUS_KEY = "status" 36 | STEP_FUNCTION_KEY = "step_function" 37 | DATASET_TITLE_KEY = "title" 38 | UPDATE_DATASET_KEY = "update_root_catalog" 39 | UPLOAD_STATUS_KEY = "upload_status" 40 | VALIDATION_KEY = "validation" 41 | -------------------------------------------------------------------------------- 
/tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pytest configuration file. 3 | """ 4 | from logging import INFO, basicConfig 5 | 6 | import boto3 7 | import pytest 8 | from mypy_boto3_events import EventBridgeClient 9 | from mypy_boto3_lambda import LambdaClient 10 | from mypy_boto3_s3 import S3Client 11 | from mypy_boto3_s3control import S3ControlClient 12 | from mypy_boto3_sqs import SQSServiceResource 13 | from mypy_boto3_ssm import SSMClient 14 | from mypy_boto3_stepfunctions import SFNClient 15 | 16 | from geostore.boto3_config import CONFIG 17 | 18 | basicConfig(level=INFO) 19 | 20 | 21 | @pytest.fixture() 22 | def lambda_client() -> LambdaClient: 23 | return boto3.client("lambda", config=CONFIG) 24 | 25 | 26 | @pytest.fixture() 27 | def s3_client() -> S3Client: 28 | return boto3.client("s3", config=CONFIG) 29 | 30 | 31 | @pytest.fixture() 32 | def s3_control_client() -> S3ControlClient: 33 | return boto3.client("s3control", config=CONFIG) 34 | 35 | 36 | @pytest.fixture() 37 | def events_client() -> EventBridgeClient: 38 | return boto3.client("events", config=CONFIG) 39 | 40 | 41 | @pytest.fixture() 42 | def ssm_client() -> SSMClient: 43 | return boto3.client("ssm", config=CONFIG) 44 | 45 | 46 | @pytest.fixture() 47 | def step_functions_client() -> SFNClient: 48 | return boto3.client("stepfunctions", config=CONFIG) 49 | 50 | 51 | @pytest.fixture() 52 | def sqs_resource() -> SQSServiceResource: 53 | return boto3.resource("sqs") 54 | -------------------------------------------------------------------------------- /geostore/processing_assets_model.py: -------------------------------------------------------------------------------- 1 | """Dataset object DynamoDB model.""" 2 | from dataclasses import dataclass 3 | from enum import Enum 4 | from os import environ 5 | from typing import Optional, Type 6 | 7 | from pynamodb.attributes import BooleanAttribute, UnicodeAttribute 8 | from pynamodb.models import Model 9 | 10 | from .aws_keys import AWS_DEFAULT_REGION_KEY 11 | from .parameter_store import ParameterName, get_param 12 | 13 | 14 | class ProcessingAssetType(Enum): 15 | DATA = "DATA_ITEM_INDEX" 16 | METADATA = "METADATA_ITEM_INDEX" 17 | 18 | 19 | class ProcessingAssetsModelBase(Model): 20 | pk = UnicodeAttribute(hash_key=True) 21 | sk = UnicodeAttribute(range_key=True) 22 | url = UnicodeAttribute() 23 | filename = UnicodeAttribute() 24 | multihash = UnicodeAttribute(null=True) 25 | exists_in_staging = BooleanAttribute(null=True) 26 | replaced_in_new_version = BooleanAttribute(null=True) 27 | 28 | 29 | def processing_assets_model_with_meta( 30 | *, assets_table_name: Optional[str] = None 31 | ) -> Type[ProcessingAssetsModelBase]: 32 | if assets_table_name is None: 33 | assets_table_name = get_param(ParameterName.PROCESSING_ASSETS_TABLE_NAME) 34 | 35 | class ProcessingAssetsModel(ProcessingAssetsModelBase): 36 | @dataclass 37 | class Meta: 38 | table_name = assets_table_name 39 | region = environ[AWS_DEFAULT_REGION_KEY] 40 | 41 | return ProcessingAssetsModel 42 | -------------------------------------------------------------------------------- /infrastructure/constructs/import_file_function.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Duration, aws_iam, aws_lambda_python_alpha 2 | from constructs import Construct 3 | 4 | from geostore.environment import ENV_NAME_VARIABLE_NAME 5 | 6 | from .bundled_lambda_function import BundledLambdaFunction 7 | from 
.lambda_config import DEFAULT_LAMBDA_TIMEOUT 8 | from .sts_policy import ALLOW_ASSUME_ANY_ROLE 9 | 10 | 11 | class ImportFileFunction(BundledLambdaFunction): 12 | def __init__( 13 | self, 14 | scope: Construct, 15 | *, 16 | lambda_directory: str, 17 | invoker: aws_iam.Role, 18 | env_name: str, 19 | botocore_lambda_layer: aws_lambda_python_alpha.PythonLayerVersion, 20 | timeout: Duration = DEFAULT_LAMBDA_TIMEOUT, 21 | ): 22 | super().__init__( 23 | scope, 24 | lambda_directory.title().replace("_", ""), 25 | lambda_directory=lambda_directory, 26 | extra_environment={ENV_NAME_VARIABLE_NAME: env_name}, 27 | botocore_lambda_layer=botocore_lambda_layer, 28 | timeout=timeout, 29 | ) 30 | 31 | self.add_to_role_policy( 32 | aws_iam.PolicyStatement( 33 | actions=["s3:GetObject", "s3:GetObjectAcl", "s3:GetObjectTagging", "s3:ListBucket"], 34 | resources=["*"], 35 | ), 36 | ) 37 | self.add_to_role_policy(ALLOW_ASSUME_ANY_ROLE) 38 | 39 | self.grant_invoke(invoker) 40 | -------------------------------------------------------------------------------- /infrastructure/constructs/lambda_endpoint.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import aws_iam, aws_lambda, aws_lambda_python_alpha 2 | from constructs import Construct 3 | 4 | from geostore.environment import ENV_NAME_VARIABLE_NAME 5 | 6 | from .backend import BACKEND_DIRECTORY 7 | from .bundled_code import bundled_code 8 | from .lambda_config import ( 9 | DEFAULT_LAMBDA_MAX_MEMORY_MEBIBYTES, 10 | DEFAULT_LAMBDA_TIMEOUT, 11 | PYTHON_RUNTIME, 12 | RETENTION_DAYS, 13 | ) 14 | 15 | 16 | class LambdaEndpoint(aws_lambda.Function): 17 | def __init__( 18 | self, 19 | scope: Construct, 20 | construct_id: str, 21 | *, 22 | env_name: str, 23 | users_role: aws_iam.Role, 24 | package_name: str, 25 | botocore_lambda_layer: aws_lambda_python_alpha.PythonLayerVersion, 26 | ): 27 | super().__init__( 28 | scope, 29 | f"{construct_id}-function", 30 | function_name=construct_id, 31 | handler=f"{BACKEND_DIRECTORY}.{package_name}.entrypoint.lambda_handler", 32 | runtime=PYTHON_RUNTIME, 33 | timeout=DEFAULT_LAMBDA_TIMEOUT, 34 | code=bundled_code(package_name), 35 | layers=[botocore_lambda_layer], 36 | memory_size=DEFAULT_LAMBDA_MAX_MEMORY_MEBIBYTES, 37 | log_retention=RETENTION_DAYS, 38 | ) 39 | 40 | self.add_environment(ENV_NAME_VARIABLE_NAME, env_name) 41 | self.grant_invoke(users_role) 42 | -------------------------------------------------------------------------------- /tests/test_storage_bucket.py: -------------------------------------------------------------------------------- 1 | from mypy_boto3_s3 import S3Client 2 | from pytest import mark 3 | 4 | from geostore.resources import Resource 5 | 6 | 7 | @mark.infrastructure 8 | def should_create_storage_bucket_location_constraint(s3_client: S3Client) -> None: 9 | """Test if Geostore Storage S3 Bucket is created in correct region.""" 10 | response = s3_client.get_bucket_location(Bucket=Resource.STORAGE_BUCKET_NAME.resource_name) 11 | assert response["LocationConstraint"] == "ap-southeast-2" 12 | 13 | 14 | @mark.infrastructure 15 | def should_enable_storage_bucket_versioning(s3_client: S3Client) -> None: 16 | """Test if Geostore Storage S3 Bucket versioning is enabled.""" 17 | response = s3_client.get_bucket_versioning(Bucket=Resource.STORAGE_BUCKET_NAME.resource_name) 18 | assert response["Status"] == "Enabled" 19 | 20 | 21 | @mark.infrastructure 22 | def should_create_storage_bucket_public_access_block(s3_client: S3Client) -> None: 23 | """Test if Geostore 
Storage S3 Bucket access is blocked for public.""" 24 | response = s3_client.get_public_access_block(Bucket=Resource.STORAGE_BUCKET_NAME.resource_name) 25 | public_access_block_configuration = response["PublicAccessBlockConfiguration"] 26 | assert public_access_block_configuration["BlockPublicAcls"] is True 27 | assert public_access_block_configuration["IgnorePublicAcls"] is True 28 | assert public_access_block_configuration["BlockPublicPolicy"] is True 29 | assert public_access_block_configuration["RestrictPublicBuckets"] is True 30 | -------------------------------------------------------------------------------- /geostore/api_responses.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | from http.client import responses as http_responses 3 | from typing import Callable, Mapping, Union 4 | 5 | from jsonschema import ValidationError, validate 6 | 7 | from .api_keys import MESSAGE_KEY 8 | from .aws_keys import BODY_KEY, HTTP_METHOD_KEY, STATUS_CODE_KEY 9 | from .types import JsonList, JsonObject 10 | 11 | 12 | def error_response(code: int, message: str) -> JsonObject: 13 | return {STATUS_CODE_KEY: code, BODY_KEY: {MESSAGE_KEY: f"{http_responses[code]}: {message}"}} 14 | 15 | 16 | def success_response(code: int, body: Union[JsonList, JsonObject]) -> JsonObject: 17 | return {STATUS_CODE_KEY: code, BODY_KEY: body} 18 | 19 | 20 | def handle_request( 21 | event: JsonObject, request_handlers: Mapping[str, Callable[[JsonObject], JsonObject]] 22 | ) -> JsonObject: 23 | """Main Lambda entry point.""" 24 | 25 | # request validation 26 | try: 27 | validate( 28 | event, 29 | { 30 | "type": "object", 31 | "properties": { 32 | HTTP_METHOD_KEY: {"type": "string", "enum": list(request_handlers.keys())}, 33 | BODY_KEY: {"type": "object"}, 34 | }, 35 | "required": [HTTP_METHOD_KEY, BODY_KEY], 36 | }, 37 | ) 38 | except ValidationError as err: 39 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 40 | 41 | method = event[HTTP_METHOD_KEY] 42 | return request_handlers[method](event[BODY_KEY]) 43 | -------------------------------------------------------------------------------- /tests/test_validation_summary.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | from geostore.api_keys import SUCCESS_KEY 4 | from geostore.error_response_keys import ERROR_MESSAGE_KEY 5 | from geostore.step_function_keys import DATASET_ID_KEY, NEW_VERSION_ID_KEY 6 | from geostore.validation_summary.task import lambda_handler 7 | 8 | from .aws_utils import any_lambda_context 9 | from .stac_generators import any_dataset_id, any_dataset_version_id 10 | 11 | 12 | def should_require_dataset_id() -> None: 13 | response = lambda_handler({NEW_VERSION_ID_KEY: any_dataset_version_id()}, any_lambda_context()) 14 | 15 | assert response == {ERROR_MESSAGE_KEY: "'dataset_id' is a required property"} 16 | 17 | 18 | def should_require_dataset_version() -> None: 19 | response = lambda_handler({DATASET_ID_KEY: any_dataset_id()}, any_lambda_context()) 20 | 21 | assert response == {ERROR_MESSAGE_KEY: "'new_version_id' is a required property"} 22 | 23 | 24 | @patch("geostore.validation_summary.task.validation_results_model_with_meta") 25 | def should_return_success_false_if_any_validation_results_are_unsuccessful( 26 | validation_results_model_mock: MagicMock, 27 | ) -> None: 28 | # Given an unsuccessful result 29 | 
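    # (the mocked validation results index reports a FAILED entry for this dataset
    # version, so the summary handler should report overall success as False)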
validation_results_model_mock.return_value.validation_outcome_index.count.return_value = 1 30 | 31 | response = lambda_handler( 32 | {DATASET_ID_KEY: any_dataset_id(), NEW_VERSION_ID_KEY: any_dataset_version_id()}, 33 | any_lambda_context(), 34 | ) 35 | 36 | assert response == {SUCCESS_KEY: False} 37 | -------------------------------------------------------------------------------- /geostore/import_status/get.py: -------------------------------------------------------------------------------- 1 | """Import Status handler function.""" 2 | from http import HTTPStatus 3 | from logging import Logger 4 | 5 | from jsonschema import ValidationError, validate 6 | from linz_logger import get_log 7 | 8 | from ..api_responses import error_response, success_response 9 | from ..logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_FAILURE, LOG_MESSAGE_LAMBDA_START 10 | from ..parameter_store import ParameterName, get_param 11 | from ..step_function import get_import_status_given_arn 12 | from ..step_function_keys import EXECUTION_ARN_KEY 13 | from ..types import JsonObject 14 | 15 | LOGGER: Logger = get_log() 16 | 17 | 18 | def get_import_status(body: JsonObject) -> JsonObject: 19 | LOGGER.debug( 20 | LOG_MESSAGE_LAMBDA_START, 21 | extra={"lambda_input": body, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 22 | ) 23 | 24 | try: 25 | validate( 26 | body, 27 | { 28 | "type": "object", 29 | "properties": {EXECUTION_ARN_KEY: {"type": "string"}}, 30 | "required": [EXECUTION_ARN_KEY], 31 | }, 32 | ) 33 | except ValidationError as err: 34 | LOGGER.warning( 35 | LOG_MESSAGE_LAMBDA_FAILURE, 36 | extra={"error": err.message, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 37 | ) 38 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 39 | 40 | response_body = get_import_status_given_arn(body[EXECUTION_ARN_KEY]) 41 | 42 | return success_response(HTTPStatus.OK, response_body) 43 | -------------------------------------------------------------------------------- /tests/test_upload_status_logging.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | from geostore.api_keys import SUCCESS_KEY 4 | from geostore.import_file_batch_job_id_keys import ASSET_JOB_ID_KEY, METADATA_JOB_ID_KEY 5 | from geostore.logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_START 6 | from geostore.parameter_store import ParameterName, get_param 7 | from geostore.step_function_keys import ( 8 | DATASET_ID_KEY, 9 | IMPORT_DATASET_KEY, 10 | NEW_VERSION_ID_KEY, 11 | VALIDATION_KEY, 12 | ) 13 | from geostore.upload_status.task import lambda_handler 14 | 15 | from .aws_utils import any_job_id, any_lambda_context 16 | from .stac_generators import any_dataset_id, any_dataset_version_id 17 | 18 | 19 | @patch("geostore.upload_status.task.get_tasks_status") 20 | def should_log_event(get_tasks_status_mock: MagicMock) -> None: 21 | # Given 22 | get_tasks_status_mock.return_value = {} 23 | 24 | event = { 25 | DATASET_ID_KEY: any_dataset_id(), 26 | NEW_VERSION_ID_KEY: any_dataset_version_id(), 27 | VALIDATION_KEY: {SUCCESS_KEY: True}, 28 | IMPORT_DATASET_KEY: { 29 | METADATA_JOB_ID_KEY: any_job_id(), 30 | ASSET_JOB_ID_KEY: any_job_id(), 31 | }, 32 | } 33 | 34 | with patch("geostore.upload_status.task.LOGGER.debug") as logger_mock: 35 | # When 36 | lambda_handler(event, any_lambda_context()) 37 | 38 | # Then 39 | logger_mock.assert_any_call( 40 | LOG_MESSAGE_LAMBDA_START, 41 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 42 | ) 
43 | -------------------------------------------------------------------------------- /infrastructure/networking_stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Stack, Tags, aws_ec2 2 | from constructs import Construct 3 | 4 | from geostore.environment import is_production 5 | 6 | 7 | class NetworkingStack(Stack): 8 | def __init__(self, scope: Construct, stack_id: str) -> None: 9 | super().__init__(scope, stack_id) 10 | 11 | ############################################################################################ 12 | # ### NETWORKING ########################################################################### 13 | ############################################################################################ 14 | 15 | # create new VPC 16 | aws_ec2.Vpc( 17 | self, 18 | "geostore", 19 | # cidr='10.0.0.0/16', # TODO: use specific CIDR pylint:disable=fixme 20 | subnet_configuration=[ 21 | aws_ec2.SubnetConfiguration( 22 | cidr_mask=27, name="public", subnet_type=aws_ec2.SubnetType.PUBLIC 23 | ), 24 | aws_ec2.SubnetConfiguration( 25 | cidr_mask=20, 26 | name="ecs-cluster", 27 | subnet_type=aws_ec2.SubnetType.PRIVATE_ISOLATED, 28 | ), 29 | aws_ec2.SubnetConfiguration( 30 | name="reserved", 31 | subnet_type=aws_ec2.SubnetType.PRIVATE_ISOLATED, 32 | reserved=True, 33 | ), 34 | ], 35 | max_azs=99 if is_production() else 1, 36 | ) 37 | 38 | Tags.of(self).add("ApplicationLayer", "networking") 39 | -------------------------------------------------------------------------------- /infrastructure/constructs/bundled_lambda_function.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Optional 2 | 3 | from aws_cdk import Duration, aws_lambda, aws_lambda_python_alpha 4 | from constructs import Construct 5 | 6 | from .backend import BACKEND_DIRECTORY 7 | from .bundled_code import bundled_code 8 | from .common import LOG_LEVEL 9 | from .lambda_config import ( 10 | DEFAULT_LAMBDA_MAX_MEMORY_MEBIBYTES, 11 | DEFAULT_LAMBDA_TIMEOUT, 12 | PYTHON_RUNTIME, 13 | ) 14 | 15 | 16 | class BundledLambdaFunction(aws_lambda.Function): 17 | def __init__( 18 | self, 19 | scope: Construct, 20 | construct_id: str, 21 | *, 22 | lambda_directory: str, 23 | extra_environment: Optional[Mapping[str, str]], 24 | botocore_lambda_layer: aws_lambda_python_alpha.PythonLayerVersion, 25 | timeout: Duration = DEFAULT_LAMBDA_TIMEOUT, 26 | reserved_concurrent_executions: Optional[int] = None, 27 | ): 28 | environment = {"LOGLEVEL": LOG_LEVEL} 29 | if extra_environment is not None: 30 | environment.update(extra_environment) 31 | 32 | super().__init__( 33 | scope, 34 | construct_id, 35 | code=bundled_code(lambda_directory), 36 | handler=f"{BACKEND_DIRECTORY}.{lambda_directory}.task.lambda_handler", 37 | runtime=PYTHON_RUNTIME, 38 | environment=environment, 39 | layers=[botocore_lambda_layer], 40 | timeout=timeout, 41 | memory_size=DEFAULT_LAMBDA_MAX_MEMORY_MEBIBYTES, 42 | reserved_concurrent_executions=reserved_concurrent_executions, 43 | ) 44 | -------------------------------------------------------------------------------- /geostore/pystac_io_methods.py: -------------------------------------------------------------------------------- 1 | from logging import Logger 2 | from typing import TYPE_CHECKING, Any, Union 3 | 4 | import boto3 5 | from linz_logger import get_log 6 | from pystac.link import Link 7 | from pystac.stac_io import StacIO 8 | 9 | from .boto3_config import CONFIG 10 | from .s3_utils import 
calculate_s3_etag, get_bucket_and_key_from_url, get_s3_etag 11 | 12 | if TYPE_CHECKING: 13 | # When type checking we want to use the third party package's stub 14 | from mypy_boto3_s3 import S3Client 15 | else: 16 | # In production we want to avoid depending on a package which has no runtime impact 17 | S3Client = object # pragma: no mutate 18 | 19 | S3_CLIENT: S3Client = boto3.client("s3", config=CONFIG) 20 | LOGGER: Logger = get_log() 21 | 22 | 23 | class S3StacIO(StacIO): 24 | def read_text( # type: ignore[override] 25 | self, source: Union[str, Link], *_args: Any, **_kwargs: Any 26 | ) -> str: 27 | url = source.href if isinstance(source, Link) else source 28 | bucket, key = get_bucket_and_key_from_url(url) 29 | obj = S3_CLIENT.get_object(Bucket=bucket, Key=key) 30 | result: str = obj["Body"].read().decode("utf-8") 31 | 32 | return result 33 | 34 | def write_text( # type: ignore[override] 35 | self, dest: Union[str, Link], txt: str, *_args: Any, **_kwargs: Any 36 | ) -> None: 37 | url = dest.href if isinstance(dest, Link) else dest 38 | bucket, key = get_bucket_and_key_from_url(url) 39 | 40 | s3_etag = get_s3_etag(bucket, key, LOGGER) 41 | local_etag = calculate_s3_etag(txt.encode()) 42 | 43 | if s3_etag != local_etag: 44 | S3_CLIENT.put_object(Bucket=bucket, Key=key, Body=txt.encode()) 45 | -------------------------------------------------------------------------------- /geostore/datasets/create.py: -------------------------------------------------------------------------------- 1 | """Create dataset function.""" 2 | from http import HTTPStatus 3 | 4 | from jsonschema import ValidationError, validate 5 | from pystac.stac_io import StacIO 6 | 7 | from ..api_responses import error_response, success_response 8 | from ..dataset_properties import TITLE_PATTERN 9 | from ..datasets_model import datasets_model_with_meta 10 | from ..pystac_io_methods import S3StacIO 11 | from ..step_function_keys import DATASET_TITLE_KEY, DESCRIPTION_KEY 12 | from ..types import JsonObject 13 | 14 | StacIO.set_default(S3StacIO) 15 | 16 | 17 | def create_dataset(body: JsonObject) -> JsonObject: 18 | """POST: Create Dataset.""" 19 | 20 | body_schema = { 21 | "type": "object", 22 | "properties": { 23 | DATASET_TITLE_KEY: {"type": "string", "pattern": TITLE_PATTERN}, 24 | DESCRIPTION_KEY: {"type": "string"}, 25 | }, 26 | "required": [DATASET_TITLE_KEY, DESCRIPTION_KEY], 27 | } 28 | 29 | # request body validation 30 | try: 31 | validate(body, body_schema) 32 | except ValidationError as err: 33 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 34 | 35 | # check for duplicate type/title 36 | datasets_model_class = datasets_model_with_meta() 37 | dataset_title = body[DATASET_TITLE_KEY] 38 | if datasets_model_class.datasets_title_idx.count(hash_key=dataset_title): 39 | return error_response(HTTPStatus.CONFLICT, f"dataset '{dataset_title}' already exists") 40 | 41 | # create dataset 42 | dataset = datasets_model_class(title=dataset_title) 43 | dataset.save() 44 | dataset.refresh(consistent_read=True) 45 | 46 | # return response 47 | resp_body = dataset.as_dict() 48 | 49 | return success_response(HTTPStatus.CREATED, resp_body) 50 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: CodeQL Analysis 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ 
github.event.pull_request.number || github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | CodeQL-Build: 15 | runs-on: ubuntu-22.04 16 | 17 | steps: 18 | - name: Checkout repository 19 | if: ${{ github.event_name == 'push' }} 20 | uses: actions/checkout@v3.4.0 21 | 22 | - name: Checkout repository 23 | if: ${{ github.event_name == 'pull_request' }} 24 | uses: actions/checkout@v3.4.0 25 | with: 26 | ref: ${{ github.event.pull_request.head.sha }} 27 | 28 | - name: Get configuration 29 | run: | 30 | echo "PYTHON_VERSION=$(cat .python-version)" | tee -a $GITHUB_ENV 31 | 32 | - name: Use Python ${{ env.PYTHON_VERSION }} 33 | uses: actions/setup-python@v4.5.0 34 | with: 35 | python-version: ${{ env.PYTHON_VERSION }} 36 | 37 | - name: Install dependencies 38 | run: | 39 | python -m pip install --requirement=geostore/pip.txt 40 | python -m pip install --requirement=geostore/poetry.txt 41 | python -m poetry install --all-extras --no-root --only=main 42 | echo "CODEQL_PYTHON=$(python -m poetry run which python)" >> $GITHUB_ENV 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v2.2.7 47 | with: 48 | config-file: ./.github/codeql/codeql-config.yml 49 | setup-python-dependencies: false 50 | languages: python 51 | 52 | - name: Perform CodeQL Analysis 53 | uses: github/codeql-action/analyze@v2.2.7 54 | -------------------------------------------------------------------------------- /tests/test_api_endpoint_handler.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | from typing import Callable, MutableMapping 3 | from unittest.mock import MagicMock 4 | 5 | from pytest_subtests import SubTests 6 | 7 | from geostore.api_keys import MESSAGE_KEY 8 | from geostore.api_responses import handle_request 9 | from geostore.aws_keys import BODY_KEY, HTTP_METHOD_KEY, STATUS_CODE_KEY 10 | from geostore.types import JsonObject 11 | 12 | 13 | def should_return_required_property_error_when_missing_http_method() -> None: 14 | response = handle_request({BODY_KEY: {}}, MagicMock()) 15 | 16 | assert response == { 17 | STATUS_CODE_KEY: HTTPStatus.BAD_REQUEST, 18 | BODY_KEY: {MESSAGE_KEY: f"Bad Request: '{HTTP_METHOD_KEY}' is a required property"}, 19 | } 20 | 21 | 22 | def should_return_required_property_error_when_missing_body() -> None: 23 | response = handle_request({HTTP_METHOD_KEY: "GET"}, MagicMock()) 24 | 25 | assert response == { 26 | STATUS_CODE_KEY: HTTPStatus.BAD_REQUEST, 27 | BODY_KEY: {MESSAGE_KEY: f"Bad Request: '{BODY_KEY}' is a required property"}, 28 | } 29 | 30 | 31 | def should_call_relevant_http_method(subtests: SubTests) -> None: 32 | post_mock = MagicMock() 33 | 34 | get_mock = MagicMock() 35 | get_mock.return_value = expected_response = {"some key": "some value"} 36 | 37 | request_handlers: MutableMapping[str, Callable[[JsonObject], JsonObject]] = { 38 | "POST": post_mock, 39 | "GET": get_mock, 40 | } 41 | 42 | response = handle_request({HTTP_METHOD_KEY: "GET", BODY_KEY: {}}, request_handlers) 43 | 44 | with subtests.test("Should return response"): 45 | assert response == expected_response 46 | with subtests.test("Should call GET method"): 47 | assert get_mock.called 48 | with subtests.test("Should not call POST method"): 49 | assert not post_mock.called 50 | -------------------------------------------------------------------------------- /shell.nix: -------------------------------------------------------------------------------- 1 | let 2 | sources = 
import ./nix/sources.nix; 3 | pkgs = import sources.nixpkgs { }; 4 | python = pkgs.python39; 5 | projectDir = builtins.path { path = ./.; name = "geostore"; }; 6 | 7 | poetryEnv = pkgs.poetry2nix.mkPoetryEnv { 8 | inherit python projectDir; 9 | overrides = pkgs.poetry2nix.overrides.withDefaults (self: super: { 10 | filelock = super.filelock.overridePythonAttrs ( 11 | # In poetry2nix >1.39.1 12 | old: { 13 | nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ self.hatchling self.hatch-vcs ]; 14 | } 15 | ); 16 | python-ulid = super.python-ulid.overridePythonAttrs ( 17 | # In poetry2nix >1.39.1 18 | old: { 19 | nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ self.setuptools-scm ]; 20 | } 21 | ); 22 | virtualenv = super.virtualenv.overridePythonAttrs ( 23 | # https://github.com/nix-community/poetry2nix/pull/985 24 | old: { 25 | nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ self.hatchling self.hatch-vcs ]; 26 | } 27 | ); 28 | }); 29 | }; 30 | in 31 | poetryEnv.env.overrideAttrs ( 32 | oldAttrs: { 33 | buildInputs = [ 34 | pkgs.cacert 35 | pkgs.cargo 36 | pkgs.docker 37 | pkgs.gitFull 38 | pkgs.go 39 | pkgs.niv 40 | pkgs.nodejs 41 | pkgs.python39Packages.pip 42 | pkgs.python39Packages.pip-tools 43 | (pkgs.poetry.override { 44 | inherit python; 45 | }) 46 | pkgs.which 47 | ]; 48 | shellHook = '' 49 | . ${projectDir + "/activate-dev-env.bash"} 50 | ln --force --no-dereference --symbolic ${poetryEnv} .venv 51 | cat <<'EOF' 52 | Welcome to the Geostore development environment! 53 | 54 | Please run `npm install` to install Node.js packages, if you haven't already. 55 | 56 | You should now be able to run `cdk` and `pytest`. 57 | EOF 58 | ''; 59 | } 60 | ) 61 | -------------------------------------------------------------------------------- /geostore/stac_format.py: -------------------------------------------------------------------------------- 1 | STAC_ASSETS_KEY = "assets" 2 | STAC_DESCRIPTION_KEY = "description" 3 | STAC_EXTENSIONS_KEY = "stac_extensions" 4 | STAC_EXTENT_BBOX_KEY = "bbox" 5 | STAC_EXTENT_KEY = "extent" 6 | STAC_EXTENT_SPATIAL_KEY = "spatial" 7 | STAC_EXTENT_TEMPORAL_INTERVAL_KEY = "interval" 8 | STAC_EXTENT_TEMPORAL_KEY = "temporal" 9 | STAC_FILE_CHECKSUM_KEY = "file:checksum" 10 | STAC_GEOMETRY_KEY = "geometry" 11 | STAC_HREF_KEY = "href" 12 | STAC_ID_KEY = "id" 13 | STAC_LICENSE_KEY = "license" 14 | STAC_LINKS_KEY = "links" 15 | STAC_MAXIMUM_KEY = "maximum" 16 | STAC_MEDIA_TYPE_GEOJSON = "application/geo+json" 17 | STAC_MEDIA_TYPE_JSON = "application/json" 18 | STAC_MINIMUM_KEY = "minimum" 19 | STAC_PROPERTIES_DATETIME_KEY = "datetime" 20 | STAC_PROPERTIES_KEY = "properties" 21 | STAC_PROVIDERS_KEY = "providers" 22 | STAC_REL_CHILD = "child" 23 | STAC_REL_ITEM = "item" 24 | STAC_REL_KEY = "rel" 25 | STAC_REL_PARENT = "parent" 26 | STAC_REL_ROOT = "root" 27 | STAC_REL_SELF = "self" 28 | STAC_TITLE_KEY = "title" 29 | STAC_TYPE_CATALOG = "Catalog" 30 | STAC_TYPE_COLLECTION = "Collection" 31 | STAC_TYPE_ITEM = "Feature" 32 | STAC_TYPE_KEY = "type" 33 | STAC_VERSION_KEY = "stac_version" 34 | 35 | LINZ_STAC_EXTENSIONS_BASE_URL = "https://stac.linz.govt.nz" 36 | LINZ_STAC_EXTENSIONS_LOCAL_PATH = "stac" 37 | 38 | LINZ_STAC_EXTENSION_KEY_PREFIX = "linz" 39 | LINZ_STAC_CREATED_KEY = "created" 40 | LINZ_STAC_ASSET_SUMMARIES_KEY = f"{LINZ_STAC_EXTENSION_KEY_PREFIX}:asset_summaries" 41 | LINZ_STAC_GEOSPATIAL_TYPE_KEY = f"{LINZ_STAC_EXTENSION_KEY_PREFIX}:geospatial_type" 42 | LINZ_STAC_HISTORY_KEY = f"{LINZ_STAC_EXTENSION_KEY_PREFIX}:history" 43 | LINZ_STAC_LIFECYCLE_KEY 
= f"{LINZ_STAC_EXTENSION_KEY_PREFIX}:lifecycle" 44 | LINZ_STAC_PROVIDERS_KEY = f"{LINZ_STAC_EXTENSION_KEY_PREFIX}:providers" 45 | LINZ_STAC_SECURITY_CLASSIFICATION_KEY = f"{LINZ_STAC_EXTENSION_KEY_PREFIX}:security_classification" 46 | LINZ_STAC_SECURITY_CLASSIFICATION_UNCLASSIFIED = "unclassified" 47 | LINZ_STAC_UPDATED_KEY = "updated" 48 | 49 | PROJECTION_EPSG_KEY = "proj:epsg" 50 | 51 | VERSION_VERSION_KEY = "version" 52 | -------------------------------------------------------------------------------- /.github/workflows/package-cli.yml: -------------------------------------------------------------------------------- 1 | name: Package CLI for PyPI release 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | types: [opened, reopened, synchronize] 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | build: 15 | runs-on: ubuntu-22.04 16 | steps: 17 | - name: Checkout repository 18 | if: ${{ github.event_name == 'push' }} 19 | uses: actions/checkout@v3.4.0 20 | with: 21 | submodules: 'true' 22 | 23 | - name: Checkout repository 24 | if: ${{ github.event_name == 'pull_request' }} 25 | uses: actions/checkout@v3.4.0 26 | with: 27 | ref: ${{ github.event.pull_request.head.sha }} 28 | submodules: 'true' 29 | 30 | - name: Get Python version 31 | run: echo "PYTHON_VERSION=$(cat .python-version)" >> "$GITHUB_ENV" 32 | 33 | - name: Use Python ${{ env.PYTHON_VERSION }} 34 | uses: actions/setup-python@v4.5.0 35 | with: 36 | python-version: ${{ env.PYTHON_VERSION }} 37 | 38 | - name: Cache pip 39 | uses: actions/cache@v3.3.1 40 | with: 41 | path: ~/.cache/pip 42 | key: 43 | ${{ runner.os }}-pip-${{ secrets.CACHE_SEED }}-${{ env.PYTHON_VERSION }}-${{ 44 | hashFiles('./poetry.lock') }} 45 | restore-keys: | 46 | ${{ runner.os }}-pip-${{ secrets.CACHE_SEED }}-${{ env.PYTHON_VERSION }}- 47 | 48 | - name: Install Python dependencies 49 | run: | 50 | python -m pip install --requirement=geostore/pip.txt 51 | python -m pip install --requirement=geostore/poetry.txt 52 | python -m poetry install --no-root --only=main 53 | 54 | - name: Build 55 | run: poetry build 56 | 57 | - name: Archive build artifacts 58 | uses: actions/upload-artifact@v3.1.2 59 | with: 60 | name: packages 61 | path: dist/* 62 | if-no-files-found: error 63 | -------------------------------------------------------------------------------- /geostore/parameter_store.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | from functools import lru_cache 3 | from logging import Logger 4 | from typing import TYPE_CHECKING, Sequence 5 | 6 | import boto3 7 | from linz_logger import get_log 8 | 9 | from .boto3_config import CONFIG 10 | from .environment import environment_name 11 | 12 | if TYPE_CHECKING: 13 | # When type checking we want to use the third party package's stub 14 | from mypy_boto3_ssm import SSMClient 15 | else: 16 | # In production we want to avoid depending on a package which has no runtime impact 17 | SSMClient = object # pragma: no mutate 18 | 19 | LOGGER: Logger = get_log() 20 | SSM_CLIENT: SSMClient = boto3.client("ssm", config=CONFIG) 21 | LOG_MESSAGE_PARAMETER_NOT_FOUND = "Parameter:DoesNotExist" 22 | 23 | 24 | class ParameterName(Enum): 25 | # Use @staticmethod instead of all the ignores on the next line once we move to Python 3.9 26 | # . 
27 | def _generate_next_value_( # type: ignore[misc,override] # pylint:disable=no-self-argument,no-member 28 | name: str, _start: int, _count: int, _last_values: Sequence[str] 29 | ) -> str: 30 | return f"/{environment_name()}/{name.lower()}" 31 | 32 | GIT_COMMIT = auto() 33 | PROCESSING_ASSETS_TABLE_NAME = auto() 34 | PROCESSING_DATASET_VERSION_CREATION_STEP_FUNCTION_ARN = auto() 35 | PROCESSING_IMPORT_ASSET_FILE_FUNCTION_TASK_ARN = auto() 36 | PROCESSING_IMPORT_DATASET_ROLE_ARN = auto() 37 | PROCESSING_IMPORT_METADATA_FILE_FUNCTION_TASK_ARN = auto() 38 | UPDATE_CATALOG_MESSAGE_QUEUE_NAME = auto() 39 | S3_USERS_ROLE_ARN = auto() 40 | STATUS_SNS_TOPIC_ARN = auto() 41 | STORAGE_DATASETS_TABLE_NAME = auto() 42 | STORAGE_VALIDATION_RESULTS_TABLE_NAME = auto() 43 | 44 | 45 | @lru_cache 46 | def get_param(parameter: ParameterName) -> str: 47 | try: 48 | return SSM_CLIENT.get_parameter(Name=parameter.value)["Parameter"]["Value"] 49 | except SSM_CLIENT.exceptions.ParameterNotFound: 50 | LOGGER.error(LOG_MESSAGE_PARAMETER_NOT_FOUND, extra={"parameter_value": parameter.value}) 51 | raise 52 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: 'Bug: ' 5 | labels: 'bug' 6 | --- 7 | 8 | 15 | 16 | ## Bug Description 17 | 18 | 19 | 20 | ## How to Reproduce 21 | 22 | 23 | 24 | 1. Do … 25 | 1. Run `…` 26 | 27 | What did you expect to happen? 28 | 29 | What actually happened? 30 | 31 | ## Software Context 32 | 33 | Operating system: 34 | 35 | Environment: 36 | 37 | Relevant software versions: 38 | 39 | - AWS CLI: 40 | - Poetry: 41 | 42 | 43 | ## Additional context 44 | 45 | 46 | 47 | #### Definition of Done 48 | 49 | - [ ] This bug is **done**: 50 | - [ ] Bug resolved to **user's** satisfaction 51 | - [ ] Automated tests are passing 52 | - [ ] Code is peer reviewed and pushed to master 53 | - [ ] Deployed successfully to test environment 54 | - [ ] Checked against 55 | [CODING guidelines](https://github.com/linz/geostore/blob/master/CODING.md) 56 | - [ ] Relevant new tasks are added to backlog and communicated to the team 57 | - [ ] Important decisions recorded in the issue ticket 58 | - [ ] Readme/Changelog/Diagrams are updated 59 | - [ ] Product Owner has approved as complete 60 | - [ ] No regression to functional or 61 | [non-functional](https://github.com/linz/geostore/blob/master/.github/ISSUE_TEMPLATE/user_story.md) 62 | requirements 63 | -------------------------------------------------------------------------------- /infrastructure/constructs/batch_submit_job_task.py: -------------------------------------------------------------------------------- 1 | from typing import List, Mapping, Optional 2 | 3 | from aws_cdk import aws_batch_alpha, aws_iam, aws_stepfunctions, aws_stepfunctions_tasks 4 | from constructs import Construct 5 | 6 | from .common import LOG_LEVEL 7 | from .task_job_definition import TaskJobDefinition 8 | 9 | 10 | class BatchSubmitJobTask(Construct): 11 | def __init__( 12 | self, 13 | scope: Construct, 14 | construct_id: str, 15 | *, 16 | env_name: str, 17 | directory: str, 18 | s3_policy: aws_iam.IManagedPolicy, 19 | job_queue: aws_batch_alpha.JobQueue, 20 | payload_object: Mapping[str, str], 21 | container_overrides_command: List[str], 22 | array_size: Optional[int] = None, 23 | ): 24 | super().__init__(scope, construct_id) 25 | 26 | self.job_role = 
aws_iam.Role( 27 | self, 28 | f"{construct_id}-batch-job-role", 29 | assumed_by=aws_iam.ServicePrincipal("ecs-tasks.amazonaws.com"), 30 | managed_policies=[s3_policy], 31 | ) 32 | 33 | job_definition_arn = TaskJobDefinition( 34 | self, 35 | f"{construct_id}-task-definition", 36 | env_name=env_name, 37 | directory=directory, 38 | job_role=self.job_role, 39 | ).job_definition_arn 40 | 41 | container_overrides = aws_stepfunctions_tasks.BatchContainerOverrides( 42 | command=container_overrides_command, 43 | environment={"LOGLEVEL": LOG_LEVEL}, 44 | ) 45 | payload = aws_stepfunctions.TaskInput.from_object(payload_object) 46 | self.batch_submit_job = aws_stepfunctions_tasks.BatchSubmitJob( 47 | scope, 48 | f"{construct_id}-batch-submit-job", 49 | job_name=f"{construct_id}-job", 50 | job_definition_arn=job_definition_arn, 51 | job_queue_arn=job_queue.job_queue_arn, 52 | array_size=array_size, 53 | result_path=aws_stepfunctions.JsonPath.DISCARD, 54 | container_overrides=container_overrides, 55 | payload=payload, 56 | ) 57 | -------------------------------------------------------------------------------- /geostore/validation_summary/task.py: -------------------------------------------------------------------------------- 1 | from logging import Logger 2 | 3 | from jsonschema import ValidationError, validate 4 | from linz_logger import get_log 5 | 6 | from ..api_keys import SUCCESS_KEY 7 | from ..error_response_keys import ERROR_MESSAGE_KEY 8 | from ..logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_START, LOG_MESSAGE_VALIDATION_COMPLETE 9 | from ..models import DATASET_ID_PREFIX, DB_KEY_SEPARATOR, VERSION_ID_PREFIX 10 | from ..parameter_store import ParameterName, get_param 11 | from ..step_function import Outcome 12 | from ..step_function_keys import DATASET_ID_KEY, NEW_VERSION_ID_KEY 13 | from ..types import JsonObject 14 | from ..validation_results_model import ValidationResult, validation_results_model_with_meta 15 | 16 | LOGGER: Logger = get_log() 17 | 18 | 19 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 20 | LOGGER.debug( 21 | LOG_MESSAGE_LAMBDA_START, 22 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 23 | ) 24 | 25 | try: 26 | validate( 27 | event, 28 | { 29 | "type": "object", 30 | "properties": { 31 | DATASET_ID_KEY: {"type": "string"}, 32 | NEW_VERSION_ID_KEY: {"type": "string"}, 33 | }, 34 | "required": [DATASET_ID_KEY, NEW_VERSION_ID_KEY], 35 | }, 36 | ) 37 | except ValidationError as error: 38 | return {ERROR_MESSAGE_KEY: error.message} 39 | 40 | validation_results_model = validation_results_model_with_meta() 41 | success = not bool( 42 | validation_results_model.validation_outcome_index.count( 43 | ( 44 | f"{DATASET_ID_PREFIX}{event[DATASET_ID_KEY]}" 45 | f"{DB_KEY_SEPARATOR}{VERSION_ID_PREFIX}{event[NEW_VERSION_ID_KEY]}" 46 | ), 47 | range_key_condition=validation_results_model.result == ValidationResult.FAILED.value, 48 | limit=1, 49 | ) 50 | ) 51 | 52 | result = {SUCCESS_KEY: success} 53 | LOGGER.debug( 54 | LOG_MESSAGE_VALIDATION_COMPLETE, 55 | extra={"outcome": Outcome.PASSED, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 56 | ) 57 | return result 58 | -------------------------------------------------------------------------------- /infrastructure/constructs/task_job_definition.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | from pathlib import Path 3 | from subprocess import check_call 4 | 5 | from aws_cdk import aws_batch_alpha, aws_ecs, 
aws_iam 6 | from constructs import Construct 7 | 8 | from geostore.aws_keys import AWS_DEFAULT_REGION_KEY 9 | from geostore.environment import ENV_NAME_VARIABLE_NAME, is_production 10 | from infrastructure.constructs.bundled_code import LambdaPackaging 11 | 12 | from .backend import BACKEND_DIRECTORY 13 | 14 | 15 | class TaskJobDefinition(aws_batch_alpha.JobDefinition): 16 | def __init__( 17 | self, 18 | scope: Construct, 19 | construct_id: str, 20 | *, 21 | env_name: str, 22 | directory: str, 23 | job_role: aws_iam.Role, 24 | ): 25 | if is_production(): 26 | batch_job_definition_memory_limit = 3900 27 | else: 28 | batch_job_definition_memory_limit = 500 29 | 30 | python_version_path = Path(__file__).parent / "../../.python-version" 31 | with python_version_path.open() as python_version: 32 | docker_python_version = python_version.read().rstrip() 33 | 34 | check_call( 35 | [ 36 | "poetry", 37 | "export", 38 | f"--extras={directory}", 39 | "--without-hashes", 40 | f"--output={LambdaPackaging.directory}/{directory}.txt", 41 | ] 42 | ) 43 | 44 | image = aws_ecs.ContainerImage.from_asset( 45 | directory=".", 46 | build_args={ 47 | "python_version": docker_python_version, 48 | "task": directory, 49 | "packaging": LambdaPackaging.directory, 50 | }, 51 | file=join(BACKEND_DIRECTORY, "Dockerfile"), 52 | ) 53 | 54 | container = aws_batch_alpha.JobDefinitionContainer( 55 | image=image, 56 | job_role=job_role, 57 | memory_limit_mib=batch_job_definition_memory_limit, 58 | vcpus=1, 59 | environment={ 60 | AWS_DEFAULT_REGION_KEY: job_role.stack.region, 61 | ENV_NAME_VARIABLE_NAME: env_name, 62 | }, 63 | ) 64 | 65 | super().__init__(scope, construct_id, container=container) 66 | -------------------------------------------------------------------------------- /geostore/datasets/update.py: -------------------------------------------------------------------------------- 1 | """Update dataset function.""" 2 | from http import HTTPStatus 3 | 4 | from jsonschema import ValidationError, validate 5 | from pynamodb.exceptions import DoesNotExist 6 | 7 | from ..api_responses import error_response, success_response 8 | from ..datasets_model import DatasetsModelBase, datasets_model_with_meta 9 | from ..models import DATASET_ID_PREFIX 10 | from ..step_function_keys import DATASET_ID_SHORT_KEY, DATASET_TITLE_KEY 11 | from ..types import JsonObject 12 | 13 | 14 | def update_dataset(body: JsonObject) -> JsonObject: 15 | """PATCH: Update Dataset.""" 16 | 17 | body_schema = { 18 | "type": "object", 19 | "properties": { 20 | DATASET_ID_SHORT_KEY: {"type": "string"}, 21 | DATASET_TITLE_KEY: {"type": "string"}, 22 | }, 23 | "required": [DATASET_ID_SHORT_KEY, DATASET_TITLE_KEY], 24 | } 25 | 26 | # request body validation 27 | try: 28 | validate(body, body_schema) 29 | except ValidationError as err: 30 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 31 | 32 | # check for duplicate type/title 33 | datasets_model_class = datasets_model_with_meta() 34 | dataset_title = body[DATASET_TITLE_KEY] 35 | if datasets_model_class.datasets_title_idx.count(hash_key=dataset_title): 36 | return error_response(HTTPStatus.CONFLICT, f"dataset '{dataset_title}' already exists") 37 | 38 | # get dataset to update 39 | dataset_id = body[DATASET_ID_SHORT_KEY] 40 | try: 41 | dataset = datasets_model_class.get( 42 | hash_key=f"{DATASET_ID_PREFIX}{dataset_id}", consistent_read=True 43 | ) 44 | except DoesNotExist: 45 | return error_response(HTTPStatus.NOT_FOUND, f"dataset '{dataset_id}' does not exist") 46 | 47 | # update dataset 48 | 
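    # (update_dataset_attributes, defined below, copies each model attribute present
    # in the request body onto the record, skipping the immutable "id" attribute)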
update_dataset_attributes(dataset, body) 49 | dataset.save() 50 | dataset.refresh(consistent_read=True) 51 | 52 | # return response 53 | resp_body = dataset.as_dict() 54 | 55 | return success_response(HTTPStatus.OK, resp_body) 56 | 57 | 58 | def update_dataset_attributes(dataset: DatasetsModelBase, req_body: JsonObject) -> None: 59 | for attr in DatasetsModelBase.get_attributes(): 60 | if attr in req_body and attr != "id": 61 | setattr(dataset, attr, req_body[attr]) 62 | -------------------------------------------------------------------------------- /geostore/import_metadata_file/task.py: -------------------------------------------------------------------------------- 1 | from json import dumps, load 2 | from os.path import basename 3 | from typing import TYPE_CHECKING, Dict, Iterable, List 4 | 5 | import boto3 6 | 7 | from ..boto3_config import CONFIG 8 | from ..import_dataset_file import get_import_result 9 | from ..stac_format import ( 10 | STAC_ASSETS_KEY, 11 | STAC_HREF_KEY, 12 | STAC_LINKS_KEY, 13 | STAC_REL_KEY, 14 | STAC_REL_SELF, 15 | ) 16 | from ..types import JsonObject 17 | 18 | S3_BODY_KEY = "Body" 19 | 20 | if TYPE_CHECKING: 21 | from mypy_boto3_s3 import S3Client 22 | from mypy_boto3_s3.type_defs import PutObjectOutputTypeDef 23 | else: 24 | PutObjectOutputTypeDef = JsonObject # pragma: no mutate 25 | S3Client = object # pragma: no mutate 26 | 27 | TARGET_S3_CLIENT: S3Client = boto3.client("s3", config=CONFIG) 28 | 29 | 30 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 31 | return get_import_result(event, importer) 32 | 33 | 34 | def importer( 35 | source_bucket_name: str, 36 | original_key: str, 37 | target_bucket_name: str, 38 | new_key: str, 39 | source_s3_client: S3Client, 40 | ) -> PutObjectOutputTypeDef: 41 | get_object_response = source_s3_client.get_object(Bucket=source_bucket_name, Key=original_key) 42 | assert S3_BODY_KEY in get_object_response, get_object_response 43 | 44 | metadata = load(get_object_response["Body"]) 45 | 46 | assets = metadata.get(STAC_ASSETS_KEY, {}).values() 47 | change_href_to_basename(assets) 48 | 49 | links = metadata.get(STAC_LINKS_KEY, []) 50 | delete_self_links(links) 51 | change_href_to_basename(links) 52 | update_root_link(links) 53 | 54 | return TARGET_S3_CLIENT.put_object( 55 | Bucket=target_bucket_name, 56 | Key=new_key, 57 | Body=dumps(metadata).encode(), 58 | ) 59 | 60 | 61 | def change_href_to_basename(items: Iterable[Dict[str, str]]) -> None: 62 | for item in items: 63 | item[STAC_HREF_KEY] = basename(item[STAC_HREF_KEY]) 64 | 65 | 66 | def delete_self_links(items: List[Dict[str, str]]) -> None: 67 | items[:] = [item for item in items if item[STAC_REL_KEY] != STAC_REL_SELF] 68 | 69 | 70 | def update_root_link(items: List[Dict[str, str]]) -> None: 71 | for item in items: 72 | if item[STAC_REL_KEY] == "root": 73 | item[STAC_HREF_KEY] = "../catalog.json" 74 | -------------------------------------------------------------------------------- /geostore/datasets/delete.py: -------------------------------------------------------------------------------- 1 | """Delete dataset function.""" 2 | from http import HTTPStatus 3 | from typing import TYPE_CHECKING 4 | 5 | import boto3 6 | from jsonschema import ValidationError, validate 7 | from pynamodb.exceptions import DoesNotExist 8 | 9 | from ..api_responses import error_response, success_response 10 | from ..boto3_config import CONFIG 11 | from ..datasets_model import datasets_model_with_meta 12 | from ..models import DATASET_ID_PREFIX 13 | from ..resources import 
Resource 14 | from ..step_function_keys import DATASET_ID_SHORT_KEY 15 | from ..types import JsonObject 16 | 17 | if TYPE_CHECKING: 18 | # When type checking we want to use the third party package's stub 19 | from mypy_boto3_s3 import S3Client 20 | else: 21 | # In production we want to avoid depending on a package which has no runtime impact 22 | S3Client = object # pragma: no mutate 23 | 24 | S3_CLIENT: S3Client = boto3.client("s3", config=CONFIG) 25 | 26 | 27 | def delete_dataset(body: JsonObject) -> JsonObject: 28 | """DELETE: Delete Dataset.""" 29 | 30 | body_schema = { 31 | "type": "object", 32 | "properties": {DATASET_ID_SHORT_KEY: {"type": "string"}}, 33 | "required": [DATASET_ID_SHORT_KEY], 34 | } 35 | 36 | # request body validation 37 | try: 38 | validate(body, body_schema) 39 | except ValidationError as err: 40 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 41 | 42 | datasets_model_class = datasets_model_with_meta() 43 | 44 | # get dataset to delete 45 | dataset_id = body[DATASET_ID_SHORT_KEY] 46 | try: 47 | dataset = datasets_model_class.get( 48 | hash_key=f"{DATASET_ID_PREFIX}{dataset_id}", consistent_read=True 49 | ) 50 | except DoesNotExist: 51 | return error_response(HTTPStatus.NOT_FOUND, f"dataset '{dataset_id}' does not exist") 52 | 53 | # Verify that the dataset is empty 54 | list_objects_response = S3_CLIENT.list_objects_v2( 55 | Bucket=Resource.STORAGE_BUCKET_NAME.resource_name, MaxKeys=1, Prefix=f"{dataset_id}/" 56 | ) 57 | if list_objects_response["KeyCount"]: 58 | return error_response( 59 | HTTPStatus.CONFLICT, 60 | f"Can’t delete dataset “{dataset_id}”: dataset versions still exist", 61 | ) 62 | 63 | # delete dataset 64 | dataset.delete() 65 | 66 | return success_response(HTTPStatus.NO_CONTENT, {}) 67 | -------------------------------------------------------------------------------- /geostore/upload_status/task.py: -------------------------------------------------------------------------------- 1 | from logging import Logger 2 | 3 | from jsonschema import validate 4 | from linz_logger import get_log 5 | 6 | from ..api_keys import SUCCESS_KEY 7 | from ..import_file_batch_job_id_keys import ASSET_JOB_ID_KEY, METADATA_JOB_ID_KEY 8 | from ..logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_START 9 | from ..parameter_store import ParameterName, get_param 10 | from ..step_function import get_tasks_status 11 | from ..step_function_keys import ( 12 | ASSET_UPLOAD_KEY, 13 | DATASET_ID_KEY, 14 | IMPORT_DATASET_KEY, 15 | JOB_STATUS_RUNNING, 16 | METADATA_UPLOAD_KEY, 17 | NEW_VERSION_ID_KEY, 18 | VALIDATION_KEY, 19 | ) 20 | from ..types import JsonObject 21 | 22 | INPUT_KEY = "input" 23 | EXECUTION_ID_KEY = "execution_id" 24 | 25 | LOGGER: Logger = get_log() 26 | 27 | 28 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 29 | LOGGER.debug( 30 | LOG_MESSAGE_LAMBDA_START, 31 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 32 | ) 33 | 34 | validate( 35 | event, 36 | { 37 | "type": "object", 38 | "properties": { 39 | DATASET_ID_KEY: {"type": "string"}, 40 | NEW_VERSION_ID_KEY: {"type": "string"}, 41 | VALIDATION_KEY: { 42 | "type": "object", 43 | "properties": {SUCCESS_KEY: {"type": "boolean"}}, 44 | "required": [SUCCESS_KEY], 45 | }, 46 | IMPORT_DATASET_KEY: { 47 | "type": "object", 48 | "properties": { 49 | METADATA_JOB_ID_KEY: {"type": "string"}, 50 | ASSET_JOB_ID_KEY: {"type": "string"}, 51 | }, 52 | "required": [METADATA_JOB_ID_KEY, ASSET_JOB_ID_KEY], 53 | }, 54 | }, 55 | "required": [DATASET_ID_KEY, 
NEW_VERSION_ID_KEY, VALIDATION_KEY, IMPORT_DATASET_KEY], 56 | }, 57 | ) 58 | 59 | raw_import_status = get_tasks_status( 60 | JOB_STATUS_RUNNING, 61 | event[DATASET_ID_KEY], 62 | event[NEW_VERSION_ID_KEY], 63 | event[VALIDATION_KEY][SUCCESS_KEY], 64 | { 65 | METADATA_JOB_ID_KEY: event[IMPORT_DATASET_KEY][METADATA_JOB_ID_KEY], 66 | ASSET_JOB_ID_KEY: event[IMPORT_DATASET_KEY][ASSET_JOB_ID_KEY], 67 | }, 68 | ) 69 | return { 70 | key: raw_import_status[key] 71 | for key in [VALIDATION_KEY, ASSET_UPLOAD_KEY, METADATA_UPLOAD_KEY] 72 | if key in raw_import_status 73 | } 74 | -------------------------------------------------------------------------------- /geostore/check_files_checksums/task.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from logging import Logger 3 | from optparse import OptionParser, Values # pylint: disable=deprecated-module 4 | 5 | from linz_logger import get_log 6 | 7 | from ..models import DB_KEY_SEPARATOR 8 | from ..processing_assets_model import ProcessingAssetType 9 | from ..s3_utils import get_s3_url_reader 10 | from ..step_function import AssetGarbageCollector, get_hash_key 11 | from ..validation_results_model import ValidationResultFactory 12 | from .utils import ChecksumUtils, get_job_offset 13 | 14 | ASSETS_TABLE_NAME_ARGUMENT = "--assets-table-name" 15 | CURRENT_VERSION_ID_ARGUMENT = "--current-version-id" 16 | DATASET_ID_ARGUMENT = "--dataset-id" 17 | DATASET_TITLE_ARGUMENT = "--dataset-title" 18 | FIRST_ITEM_ARGUMENT = "--first-item" 19 | NEW_VERSION_ID_ARGUMENT = "--new-version-id" 20 | RESULTS_TABLE_NAME_ARGUMENT = "--results-table-name" 21 | S3_ROLE_ARN_ARGUMENT = "--s3-role-arn" 22 | 23 | LOGGER: Logger = get_log() 24 | 25 | 26 | def parse_arguments() -> Values: 27 | parser = OptionParser() 28 | parser.add_option(DATASET_ID_ARGUMENT) 29 | parser.add_option(NEW_VERSION_ID_ARGUMENT) 30 | parser.add_option(CURRENT_VERSION_ID_ARGUMENT) 31 | parser.add_option(DATASET_TITLE_ARGUMENT) 32 | parser.add_option(FIRST_ITEM_ARGUMENT, type=int) 33 | parser.add_option(RESULTS_TABLE_NAME_ARGUMENT) 34 | parser.add_option(ASSETS_TABLE_NAME_ARGUMENT) 35 | parser.add_option(S3_ROLE_ARN_ARGUMENT) 36 | (options, _args) = parser.parse_args() 37 | 38 | for option in parser.option_list: 39 | if option.dest is not None: 40 | assert hasattr(options, option.dest) 41 | 42 | return options 43 | 44 | 45 | def main() -> None: 46 | arguments = parse_arguments() 47 | 48 | index = arguments.first_item + get_job_offset() 49 | hash_key = get_hash_key(arguments.dataset_id, arguments.new_version_id) 50 | range_key = f"{ProcessingAssetType.DATA.value}{DB_KEY_SEPARATOR}{index}" 51 | validation_result_factory = ValidationResultFactory(hash_key, arguments.results_table_name) 52 | s3_url_reader = get_s3_url_reader(arguments.s3_role_arn, arguments.dataset_title, LOGGER) 53 | 54 | asset_garbage_collector = AssetGarbageCollector( 55 | arguments.dataset_id, 56 | arguments.current_version_id, 57 | ProcessingAssetType.DATA, 58 | LOGGER, 59 | arguments.assets_table_name, 60 | ) 61 | 62 | utils = ChecksumUtils( 63 | arguments.assets_table_name, 64 | validation_result_factory, 65 | s3_url_reader, 66 | asset_garbage_collector, 67 | LOGGER, 68 | ) 69 | utils.run(hash_key, range_key) 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | -------------------------------------------------------------------------------- /tests/test_validation_summary_logging.py: -------------------------------------------------------------------------------- 1 | 
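# Logging tests for the validation summary Lambda handler: it should log the incoming
# event on entry and a validation-complete message carrying the outcome.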
from unittest.mock import MagicMock, patch 2 | 3 | from geostore.logging_keys import ( 4 | GIT_COMMIT, 5 | LOG_MESSAGE_LAMBDA_START, 6 | LOG_MESSAGE_VALIDATION_COMPLETE, 7 | ) 8 | from geostore.parameter_store import ParameterName, get_param 9 | from geostore.step_function import Outcome 10 | from geostore.step_function_keys import DATASET_ID_KEY, NEW_VERSION_ID_KEY 11 | from geostore.validation_summary import task 12 | 13 | from .aws_utils import any_lambda_context 14 | from .stac_generators import any_dataset_id, any_dataset_version_id 15 | 16 | 17 | def should_log_event() -> None: 18 | # Given 19 | event = {DATASET_ID_KEY: any_dataset_id(), NEW_VERSION_ID_KEY: any_dataset_version_id()} 20 | 21 | with patch("geostore.validation_summary.task.validation_results_model_with_meta"), patch( 22 | "geostore.validation_summary.task.LOGGER.debug" 23 | ) as logger_mock: 24 | # When 25 | task.lambda_handler(event, any_lambda_context()) 26 | 27 | # Then 28 | logger_mock.assert_any_call( 29 | LOG_MESSAGE_LAMBDA_START, 30 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 31 | ) 32 | 33 | 34 | @patch("geostore.validation_summary.task.validation_results_model_with_meta") 35 | def should_log_failure_result(validation_results_model_mock: MagicMock) -> None: 36 | # Given 37 | event = {DATASET_ID_KEY: any_dataset_id(), NEW_VERSION_ID_KEY: any_dataset_version_id()} 38 | validation_results_model_mock.return_value.validation_outcome_index.count.return_value = 1 39 | 40 | with patch("geostore.validation_summary.task.LOGGER.debug") as logger_mock: 41 | # When 42 | task.lambda_handler(event, any_lambda_context()) 43 | 44 | # Then 45 | logger_mock.assert_any_call( 46 | LOG_MESSAGE_VALIDATION_COMPLETE, 47 | extra={"outcome": Outcome.PASSED, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 48 | ) 49 | 50 | 51 | @patch("geostore.validation_summary.task.validation_results_model_with_meta") 52 | def should_log_success_result(validation_results_model_mock: MagicMock) -> None: 53 | # Given 54 | event = {DATASET_ID_KEY: any_dataset_id(), NEW_VERSION_ID_KEY: any_dataset_version_id()} 55 | validation_results_model_mock.return_value.validation_outcome_index.count.return_value = 0 56 | 57 | with patch("geostore.validation_summary.task.LOGGER.debug") as logger_mock: 58 | # When 59 | task.lambda_handler(event, any_lambda_context()) 60 | 61 | # Then 62 | logger_mock.assert_any_call( 63 | LOG_MESSAGE_VALIDATION_COMPLETE, 64 | extra={"outcome": Outcome.PASSED, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 65 | ) 66 | -------------------------------------------------------------------------------- /reset-dev-env.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit -o noclobber -o nounset -o pipefail 4 | 5 | usage() { 6 | cat >&2 <<'EOF' 7 | ./reset-dev-env.bash --all 8 | ./reset-dev-env.bash [--delete] [--hooks] [--node] [--python] [--submodule] 9 | ./reset-dev-env.bash --help 10 | 11 | `--all` implies `--delete --hooks --node --python --submodule`. 
12 | EOF 13 | } 14 | 15 | arguments="$(getopt --options '' \ 16 | --longoptions all,delete,help,hooks,node,python,submodule --name "$0" -- "$@")" 17 | eval set -- "$arguments" 18 | unset arguments 19 | 20 | while true; do 21 | case "$1" in 22 | --all) 23 | delete=1 24 | hooks=1 25 | node=1 26 | python=1 27 | submodule=1 28 | shift 29 | ;; 30 | --delete) 31 | delete=1 32 | shift 33 | ;; 34 | --help) 35 | usage 36 | exit 37 | ;; 38 | --hooks) 39 | hooks=1 40 | shift 41 | ;; 42 | --node) 43 | node=1 44 | shift 45 | ;; 46 | --python) 47 | python=1 48 | shift 49 | ;; 50 | --submodule) 51 | submodule=1 52 | shift 53 | ;; 54 | --) 55 | shift 56 | break 57 | ;; 58 | *) 59 | printf 'Not implemented: %q\n' "$1" >&2 60 | exit 1 61 | ;; 62 | esac 63 | done 64 | 65 | if [[ -z ${hooks-} ]] && 66 | [[ -z ${node-} ]] && 67 | [[ -z ${python-} ]] && 68 | [[ -z ${submodule-} ]]; then 69 | usage 70 | exit 1 71 | fi 72 | 73 | cd "$(dirname "${BASH_SOURCE[0]}")" 74 | 75 | if [[ -n ${delete-} ]]; then 76 | echo "Cleaning Git repository" 77 | git clean -d --exclude='.idea' --force -x 78 | fi 79 | 80 | if [[ -n ${submodule-} ]]; then 81 | echo "Updating submodules" 82 | git submodule update --init 83 | fi 84 | 85 | if [[ -n ${node-} ]]; then 86 | if [[ -n ${delete-} ]]; then 87 | echo "Removing Node.js packages" 88 | rm --force --recursive ./node_modules 89 | fi 90 | 91 | echo "Installing Node.js packages" 92 | npm ci 93 | fi 94 | 95 | if [[ -n ${python-} ]]; then 96 | if [[ -n ${delete-} ]]; then 97 | echo "Removing Python packages" 98 | rm --force --recursive ./.venv 99 | fi 100 | 101 | echo "Installing Python packages" 102 | poetry env use "$(cat .python-version)" 103 | poetry install --all-extras --no-root --sync 104 | fi 105 | 106 | if [[ -n ${hooks-} ]]; then 107 | echo "Installing Git hooks" 108 | 109 | # shellcheck source=/dev/null 110 | . .venv/bin/activate 111 | 112 | pre-commit install --hook-type=commit-msg --overwrite 113 | pre-commit install --hook-type=pre-commit --overwrite 114 | fi 115 | -------------------------------------------------------------------------------- /infrastructure/constructs/bundled_code.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from dataclasses import dataclass 3 | from re import sub 4 | from subprocess import check_call, check_output 5 | from sys import executable 6 | from typing import List 7 | 8 | from aws_cdk import BundlingOptions, aws_lambda 9 | 10 | from .backend import BACKEND_DIRECTORY 11 | from .lambda_config import PYTHON_RUNTIME 12 | 13 | 14 | def poetry_export_extras(lambda_directory: str) -> List[str]: 15 | # There isn't an elegant way of getting poetry to install package dependencies in a bespoke 16 | # target lambda_directory within Python, so we export a requirements file and install using pip. 
17 | # This has been raised and discussed by the community as below: 18 | # https://github.com/python-poetry/poetry/issues/1937 19 | 20 | export_extras = check_output( 21 | ["poetry", "export", f"--extras={lambda_directory}", "--without-hashes"] 22 | ) 23 | # Remove botocore as this is already installed in the lambda layer 24 | export_extras = sub(b"botocore==.*\n", b"", export_extras) 25 | 26 | return export_extras.decode("utf-8").splitlines() 27 | 28 | 29 | def pip_install_requirements(lambda_directory: str, export_extras: List[str]) -> None: 30 | # Documentation recommend against calling pip internal api; rather, via command line 31 | # https://pip.pypa.io/en/latest/user_guide/#using-pip-from-your-program 32 | 33 | check_call( 34 | [ 35 | executable, 36 | "-m", 37 | "pip", 38 | "install", 39 | "--no-deps", 40 | "--quiet", 41 | f"--cache-dir={LambdaPackaging.directory}/cache", 42 | f"--target={LambdaPackaging.directory}/{lambda_directory}", 43 | *export_extras, 44 | ] 45 | ) 46 | 47 | 48 | @dataclass 49 | class LambdaPackaging: 50 | directory = tempfile.mkdtemp(dir=BACKEND_DIRECTORY, prefix=".lambda_out_") 51 | 52 | 53 | def bundled_code(lambda_directory: str) -> aws_lambda.Code: 54 | export_extras = poetry_export_extras(lambda_directory) 55 | pip_install_requirements(lambda_directory, export_extras) 56 | bundling_options = BundlingOptions( 57 | image=PYTHON_RUNTIME.bundling_image, # pylint:disable=no-member 58 | command=[ 59 | "bash", 60 | "-c", 61 | f"""mkdir --parents /asset-output/geostore/{lambda_directory} && \ 62 | cp --archive --update {LambdaPackaging.directory}/{lambda_directory}/* /asset-output/ && \ 63 | cp --archive --update /asset-input/geostore/*.py /asset-output/geostore/ && \ 64 | cp --archive --update /asset-input/geostore/{lambda_directory} /asset-output/geostore/""", # pylint: disable=line-too-long 65 | ], 66 | ) 67 | return aws_lambda.Code.from_asset(path=".", bundling=bundling_options) 68 | -------------------------------------------------------------------------------- /geostore/datasets/get.py: -------------------------------------------------------------------------------- 1 | """Get datasets functions.""" 2 | from http import HTTPStatus 3 | 4 | from jsonschema import ValidationError, validate 5 | from pynamodb.exceptions import DoesNotExist 6 | 7 | from ..api_responses import error_response, success_response 8 | from ..datasets_model import datasets_model_with_meta 9 | from ..models import DATASET_ID_PREFIX 10 | from ..step_function_keys import DATASET_ID_SHORT_KEY, DATASET_TITLE_KEY 11 | from ..types import JsonObject 12 | from .list import list_datasets 13 | 14 | 15 | def handle_get(body: JsonObject) -> JsonObject: 16 | if DATASET_ID_SHORT_KEY in body: 17 | return get_dataset_single(body) 18 | 19 | if DATASET_TITLE_KEY in body: 20 | return get_dataset_filter(body) 21 | 22 | if body == {}: 23 | return list_datasets() 24 | 25 | return error_response(HTTPStatus.BAD_REQUEST, "Unhandled request") 26 | 27 | 28 | def get_dataset_single(body: JsonObject) -> JsonObject: 29 | """GET: Get single Dataset.""" 30 | 31 | body_schema = { 32 | "type": "object", 33 | "properties": {DATASET_ID_SHORT_KEY: {"type": "string"}}, 34 | "required": [DATASET_ID_SHORT_KEY], 35 | } 36 | 37 | # request body validation 38 | try: 39 | validate(body, body_schema) 40 | except ValidationError as err: 41 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 42 | 43 | datasets_model_class = datasets_model_with_meta() 44 | 45 | # get dataset 46 | try: 47 | dataset = 
datasets_model_class.get( 48 | hash_key=f"{DATASET_ID_PREFIX}{body[DATASET_ID_SHORT_KEY]}", consistent_read=True 49 | ) 50 | except DoesNotExist: 51 | return error_response( 52 | HTTPStatus.NOT_FOUND, f"dataset '{body[DATASET_ID_SHORT_KEY]}' does not exist" 53 | ) 54 | 55 | # return response 56 | resp_body = dataset.as_dict() 57 | 58 | return success_response(HTTPStatus.OK, resp_body) 59 | 60 | 61 | def get_dataset_filter(body: JsonObject) -> JsonObject: 62 | """GET: Get Datasets by filter.""" 63 | 64 | body_schema = { 65 | "type": "object", 66 | "properties": {DATASET_TITLE_KEY: {"type": "string"}}, 67 | "required": [DATASET_TITLE_KEY], 68 | } 69 | 70 | # request body validation 71 | try: 72 | validate(body, body_schema) 73 | except ValidationError as err: 74 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 75 | 76 | # dataset query by filter 77 | datasets_model_class = datasets_model_with_meta() 78 | datasets = datasets_model_class.datasets_title_idx.query(hash_key=body[DATASET_TITLE_KEY]) 79 | 80 | # return response 81 | resp_body = [] 82 | for dataset in datasets: 83 | resp_item = dataset.as_dict() 84 | resp_body.append(resp_item) 85 | 86 | return success_response(HTTPStatus.OK, resp_body) 87 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enabler_story.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Enabler story 3 | about: Suggest an idea to enable the team to deliver a better product 4 | labels: enabler story 5 | --- 6 | 7 | ### Enabler 8 | 9 | 10 | 11 | So that [some reason], we want to [do something] 12 | 13 | #### Acceptance Criteria 14 | 15 | 16 | 17 | - [ ] ... 18 | - [ ] ... 19 | 20 | #### Additional context 21 | 22 | 23 | 24 | #### Tasks 25 | 26 | 27 | 28 | - [ ] ... 29 | - [ ] ... 30 | 31 | #### Definition of Ready 32 | 33 | - [ ] This story is **ready** to work on 34 | - [ ] Negotiable (team can decide how to design and implement) 35 | - [ ] Valuable (from a user perspective) 36 | - [ ] Estimate value applied (agreed by team) 37 | - [ ] Small (so as to fit within an iteration) 38 | - [ ] Testable (in principle, even if there isn't a test for it yet) 39 | - [ ] Environments are ready to meet definition of done 40 | - [ ] Resources required to implement will be ready 41 | - [ ] Everyone understands and agrees with the tasks to complete the story 42 | - [ ] Release value (e.g. Iteration 3) applied 43 | - [ ] Sprint value (e.g. Aug 1 - Aug 15) applied 44 | 45 | #### Definition of Done 46 | 47 | - [ ] This story is **done**: 48 | - [ ] Acceptance criteria completed 49 | - [ ] Automated tests are passing 50 | - [ ] Code is peer reviewed and pushed to master 51 | - [ ] Deployed successfully to test environment 52 | - [ ] Checked against 53 | [CODING guidelines](https://github.com/linz/geostore/blob/master/CODING.md) 54 | - [ ] Relevant new tasks are added to backlog and communicated to the team 55 | - [ ] Important decisions recorded in the issue ticket 56 | - [ ] Readme/Changelog/Diagrams are updated 57 | - [ ] Product Owner has approved acceptance criteria as complete 58 | - [ ] Meets non-functional requirements: 59 | - [ ] Scalability (data): Can scale to 300TB of data and 100,000,000 files and ability to 60 | increase 10% every year 61 | - [ ] Scability (users): Can scale to 100 concurrent users 62 | - [ ] Cost: Data can be stored at < 0.5 NZD per GB per year 63 | - [ ] Performance: A large dataset (500 GB and 50,000 files - e.g. 
Akl aerial imagery) can be 64 | validated, imported and stored within 24 hours 65 | - [ ] Accessibility: Can be used from LINZ networks and the public internet 66 | - [ ] Availability: System available 24 hours a day and 7 days a week, this does not include 67 | maintenance windows < 4 hours and does not include operational support 68 | - [ ] Recoverability: RPO of fully imported datasets < 4 hours, RTO of a single 3 TB dataset 69 | < 12 hours 70 | 71 | 72 | -------------------------------------------------------------------------------- /infrastructure/constructs/lambda_layers/botocore/poetry.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Poetry and should not be changed by hand. 2 | 3 | [[package]] 4 | name = "botocore" 5 | version = "1.29.91" 6 | description = "Low-level, data-driven core of boto 3." 7 | category = "main" 8 | optional = false 9 | python-versions = ">= 3.7" 10 | files = [ 11 | {file = "botocore-1.29.91-py3-none-any.whl", hash = "sha256:4ed6a488aee1b42367eace71f7d0993dda05b02eebd7dcdd78db5c9ce3d80da5"}, 12 | {file = "botocore-1.29.91.tar.gz", hash = "sha256:a8a800a2a945da807758cace539fc5b5ec1d5082ce363799d3a3870c2c4ed6fc"}, 13 | ] 14 | 15 | [package.dependencies] 16 | jmespath = ">=0.7.1,<2.0.0" 17 | python-dateutil = ">=2.1,<3.0.0" 18 | urllib3 = ">=1.25.4,<1.27" 19 | 20 | [package.extras] 21 | crt = ["awscrt (==0.16.9)"] 22 | 23 | [[package]] 24 | name = "jmespath" 25 | version = "0.10.0" 26 | description = "JSON Matching Expressions" 27 | category = "main" 28 | optional = false 29 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 30 | files = [ 31 | {file = "jmespath-0.10.0-py2.py3-none-any.whl", hash = "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"}, 32 | {file = "jmespath-0.10.0.tar.gz", hash = "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9"}, 33 | ] 34 | 35 | [[package]] 36 | name = "python-dateutil" 37 | version = "2.8.1" 38 | description = "Extensions to the standard Python datetime module" 39 | category = "main" 40 | optional = false 41 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" 42 | files = [ 43 | {file = "python-dateutil-2.8.1.tar.gz", hash = "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c"}, 44 | {file = "python_dateutil-2.8.1-py2.py3-none-any.whl", hash = "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"}, 45 | ] 46 | 47 | [package.dependencies] 48 | six = ">=1.5" 49 | 50 | [[package]] 51 | name = "six" 52 | version = "1.15.0" 53 | description = "Python 2 and 3 compatibility utilities" 54 | category = "main" 55 | optional = false 56 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 57 | files = [ 58 | {file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"}, 59 | {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"}, 60 | ] 61 | 62 | [[package]] 63 | name = "urllib3" 64 | version = "1.26.5" 65 | description = "HTTP library with thread-safe connection pooling, file post, and more." 
66 | category = "main" 67 | optional = false 68 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" 69 | files = [ 70 | {file = "urllib3-1.26.5-py2.py3-none-any.whl", hash = "sha256:753a0374df26658f99d826cfe40394a686d05985786d946fbe4165b5148f5a7c"}, 71 | {file = "urllib3-1.26.5.tar.gz", hash = "sha256:a7acd0977125325f516bda9735fa7142b909a8d01e8b2e4c8108d0984e6e0098"}, 72 | ] 73 | 74 | [package.extras] 75 | brotli = ["brotlipy (>=0.6.0)"] 76 | secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)"] 77 | socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] 78 | 79 | [metadata] 80 | lock-version = "2.0" 81 | python-versions = "^3.9,<3.10" 82 | content-hash = "92176b3eafd22453239cf3e9e7fcefb9e0a3c0af572a1d46403aee00f8e7d931" 83 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/user_story.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: User story 3 | about: Suggest an idea to give the user a more valuable product 4 | labels: user story 5 | --- 6 | 7 | ### User Story 8 | 9 | 10 | 11 | So that [some reason], as a [role], I want to [do something]. 12 | 13 | 14 | 15 | #### Acceptance Criteria 16 | 17 | 18 | 19 | - [ ] Given [a pre-condition], when [an event happens], then [an expected outcome occurs] 20 | - [ ] Given [a pre-condition], when [an event happens], then [an expected outcome occurs] 21 | 22 | #### Additional context 23 | 24 | 25 | 26 | #### Tasks 27 | 28 | 29 | 30 | - [ ] ... 31 | - [ ] ... 32 | 33 | #### Definition of Ready 34 | 35 | - [ ] This story is **ready** to work on 36 | - [ ] Independent (story is independent of all other tasks) 37 | - [ ] Negotiable (team can decide how to design and implement) 38 | - [ ] Valuable (from a user perspective) 39 | - [ ] Estimate value applied (agreed by team) 40 | - [ ] Small (so as to fit within an iteration) 41 | - [ ] Testable (in principle, even if there isn't a test for it yet) 42 | - [ ] Environments are ready to meet definition of done 43 | - [ ] Resources required to implement will be ready 44 | - [ ] Everyone understands and agrees with the tasks to complete the story 45 | - [ ] Release value (e.g. Iteration 3) applied 46 | - [ ] Sprint value (e.g. Aug 1 - Aug 15) applied 47 | 48 | #### Definition of Done 49 | 50 | - [ ] This story is **done**: 51 | - [ ] Acceptance criteria completed 52 | - [ ] Automated tests are passing 53 | - [ ] Code is peer reviewed and pushed to master 54 | - [ ] Deployed successfully to test environment 55 | - [ ] Checked against 56 | [CODING guidelines](https://github.com/linz/geostore/blob/master/CODING.md) 57 | - [ ] Relevant new tasks are added to backlog and communicated to the team 58 | - [ ] Important decisions recorded in the issue ticket 59 | - [ ] Readme/Changelog/Diagrams are updated 60 | - [ ] Product Owner has approved acceptance criteria as complete 61 | - [ ] Meets non-functional requirements: 62 | - [ ] Scalability (data): Can scale to 300TB of data and 100,000,000 files and ability to 63 | increase 10% every year 64 | - [ ] Scability (users): Can scale to 100 concurrent users 65 | - [ ] Cost: Data can be stored at < 0.5 NZD per GB per year 66 | - [ ] Performance: A large dataset (500 GB and 50,000 files - e.g. 
Akl aerial imagery) can be 67 | validated, imported and stored within 24 hours 68 | - [ ] Accessibility: Can be used from LINZ networks and the public internet 69 | - [ ] Availability: System available 24 hours a day and 7 days a week, this does not include 70 | maintenance windows < 4 hours and does not include operational support 71 | - [ ] Recoverability: RPO of fully imported datasets < 4 hours, RTO of a single 3 TB dataset 72 | < 12 hours 73 | -------------------------------------------------------------------------------- /geostore/content_iterator/task.py: -------------------------------------------------------------------------------- 1 | from jsonschema import validate 2 | 3 | from ..models import DATASET_ID_PREFIX, DB_KEY_SEPARATOR, VERSION_ID_PREFIX 4 | from ..parameter_store import ParameterName, get_param 5 | from ..processing_assets_model import ProcessingAssetType, processing_assets_model_with_meta 6 | from ..step_function_keys import DATASET_ID_KEY, METADATA_URL_KEY, NEW_VERSION_ID_KEY 7 | from ..types import JsonObject 8 | 9 | MAX_ITERATION_SIZE = 10_000 10 | 11 | ASSETS_TABLE_NAME_KEY = "assets_table_name" 12 | CONTENT_KEY = "content" 13 | FIRST_ITEM_KEY = "first_item" 14 | ITERATION_SIZE_KEY = "iteration_size" 15 | NEXT_ITEM_KEY = "next_item" 16 | RESULTS_TABLE_NAME_KEY = "results_table_name" 17 | 18 | EVENT_SCHEMA = { 19 | "type": "object", 20 | "properties": { 21 | CONTENT_KEY: { 22 | "type": "object", 23 | "properties": { 24 | FIRST_ITEM_KEY: {"type": "string", "pattern": r"^\d+$"}, 25 | ITERATION_SIZE_KEY: { 26 | "type": "integer", 27 | "minimum": 1, 28 | "maximum": MAX_ITERATION_SIZE, 29 | }, 30 | NEXT_ITEM_KEY: { 31 | "type": "integer", 32 | "minimum": MAX_ITERATION_SIZE, 33 | "multipleOf": MAX_ITERATION_SIZE, 34 | }, 35 | }, 36 | "required": [FIRST_ITEM_KEY, ITERATION_SIZE_KEY, NEXT_ITEM_KEY], 37 | "additionalProperties": False, 38 | }, 39 | DATASET_ID_KEY: {"type": "string"}, 40 | METADATA_URL_KEY: {"type": "string"}, 41 | NEW_VERSION_ID_KEY: {"type": "string"}, 42 | }, 43 | "required": [DATASET_ID_KEY, METADATA_URL_KEY, NEW_VERSION_ID_KEY], 44 | "additionalProperties": True, 45 | } 46 | 47 | 48 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 49 | validate(event, EVENT_SCHEMA) 50 | 51 | if CONTENT_KEY in event.keys(): 52 | assert int(event[CONTENT_KEY][FIRST_ITEM_KEY]) % MAX_ITERATION_SIZE == 0 53 | first_item_index = event[CONTENT_KEY][NEXT_ITEM_KEY] 54 | else: 55 | first_item_index = 0 56 | 57 | dataset_id = event[DATASET_ID_KEY] 58 | version_id = event[NEW_VERSION_ID_KEY] 59 | 60 | processing_assets_model = processing_assets_model_with_meta() 61 | 62 | asset_count = processing_assets_model.count( 63 | hash_key=( 64 | f"{DATASET_ID_PREFIX}{dataset_id}{DB_KEY_SEPARATOR}{VERSION_ID_PREFIX}{version_id}" 65 | ), 66 | range_key_condition=processing_assets_model.sk.startswith( 67 | f"{ProcessingAssetType.DATA.value}{DB_KEY_SEPARATOR}" 68 | ), 69 | ) 70 | 71 | remaining_assets = asset_count - first_item_index 72 | if remaining_assets > MAX_ITERATION_SIZE: 73 | next_item_index = first_item_index + MAX_ITERATION_SIZE 74 | iteration_size = MAX_ITERATION_SIZE 75 | else: 76 | next_item_index = -1 77 | iteration_size = remaining_assets 78 | 79 | return { 80 | FIRST_ITEM_KEY: str(first_item_index), 81 | ITERATION_SIZE_KEY: iteration_size, 82 | NEXT_ITEM_KEY: next_item_index, 83 | ASSETS_TABLE_NAME_KEY: get_param(ParameterName.PROCESSING_ASSETS_TABLE_NAME), 84 | RESULTS_TABLE_NAME_KEY: get_param(ParameterName.STORAGE_VALIDATION_RESULTS_TABLE_NAME), 85 
| } 86 | -------------------------------------------------------------------------------- /tests/test_check_files_checksums_logging.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from os import environ 3 | from unittest.mock import patch 4 | 5 | from pynamodb.exceptions import DoesNotExist 6 | from pytest import mark, raises 7 | from pytest_subtests import SubTests 8 | 9 | from geostore.api_keys import MESSAGE_KEY 10 | from geostore.check_files_checksums.task import ( 11 | ASSETS_TABLE_NAME_ARGUMENT, 12 | CURRENT_VERSION_ID_ARGUMENT, 13 | DATASET_ID_ARGUMENT, 14 | DATASET_TITLE_ARGUMENT, 15 | FIRST_ITEM_ARGUMENT, 16 | NEW_VERSION_ID_ARGUMENT, 17 | RESULTS_TABLE_NAME_ARGUMENT, 18 | S3_ROLE_ARN_ARGUMENT, 19 | main, 20 | ) 21 | from geostore.check_files_checksums.utils import ARRAY_INDEX_VARIABLE_NAME 22 | from geostore.error_response_keys import ERROR_KEY 23 | from geostore.logging_keys import GIT_COMMIT, LOG_MESSAGE_VALIDATION_COMPLETE 24 | from geostore.models import DATASET_ID_PREFIX, DB_KEY_SEPARATOR, VERSION_ID_PREFIX 25 | from geostore.parameter_store import ParameterName, get_param 26 | from geostore.processing_assets_model import ProcessingAssetType, ProcessingAssetsModelBase 27 | from geostore.step_function import Outcome 28 | from geostore.step_function_keys import CURRENT_VERSION_EMPTY_VALUE 29 | 30 | from .aws_utils import get_s3_role_arn 31 | from .general_generators import any_program_name 32 | from .stac_generators import any_dataset_id, any_dataset_title, any_dataset_version_id 33 | 34 | 35 | @mark.infrastructure 36 | def should_log_missing_item(subtests: SubTests) -> None: 37 | # Given 38 | dataset_id = any_dataset_id() 39 | version_id = any_dataset_version_id() 40 | index = 0 41 | expected_log = { 42 | ERROR_KEY: {MESSAGE_KEY: ProcessingAssetsModelBase.DoesNotExist.msg}, 43 | "parameters": { 44 | "hash_key": ( 45 | f"{DATASET_ID_PREFIX}{dataset_id}" 46 | f"{DB_KEY_SEPARATOR}{VERSION_ID_PREFIX}{version_id}" 47 | ), 48 | "range_key": f"{ProcessingAssetType.DATA.value}{DB_KEY_SEPARATOR}{index}", 49 | }, 50 | } 51 | 52 | sys.argv = [ 53 | any_program_name(), 54 | f"{DATASET_ID_ARGUMENT}={dataset_id}", 55 | f"{NEW_VERSION_ID_ARGUMENT}={version_id}", 56 | f"{CURRENT_VERSION_ID_ARGUMENT}={CURRENT_VERSION_EMPTY_VALUE}", 57 | f"{DATASET_TITLE_ARGUMENT}={any_dataset_title()}", 58 | f"{FIRST_ITEM_ARGUMENT}={index}", 59 | f"{ASSETS_TABLE_NAME_ARGUMENT}={get_param(ParameterName.PROCESSING_ASSETS_TABLE_NAME)}", 60 | ( 61 | f"{RESULTS_TABLE_NAME_ARGUMENT}" 62 | f"={get_param(ParameterName.STORAGE_VALIDATION_RESULTS_TABLE_NAME)}" 63 | ), 64 | f"{S3_ROLE_ARN_ARGUMENT}={get_s3_role_arn()}", 65 | ] 66 | 67 | # When/Then 68 | with patch("geostore.check_files_checksums.task.LOGGER.error") as logger_mock, patch.dict( 69 | environ, {ARRAY_INDEX_VARIABLE_NAME: "0"} 70 | ): 71 | with subtests.test(msg="Return code"), raises(DoesNotExist): 72 | main() 73 | 74 | with subtests.test(msg="Log message"): 75 | logger_mock.assert_any_call( 76 | LOG_MESSAGE_VALIDATION_COMPLETE, 77 | extra={ 78 | "outcome": Outcome.FAILED, 79 | "error": expected_log, 80 | GIT_COMMIT: get_param(ParameterName.GIT_COMMIT), 81 | }, 82 | ) 83 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Configuration file for pre-commit (https://pre-commit.com/). 2 | # Please run `pre-commit run --all-files` when adding or changing entries. 
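# A single hook can also be run against specific files while iterating locally, for example:
#   pre-commit run black --files geostore/models.py tests/conftest.py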
3 | 4 | repos: 5 | - repo: local 6 | hooks: 7 | - id: black 8 | name: black 9 | entry: black 10 | language: system 11 | stages: [commit] 12 | types: [python] 13 | 14 | - id: gitlint 15 | name: gitlint 16 | entry: gitlint 17 | args: [--msg-filename] 18 | language: system 19 | stages: [commit-msg] 20 | 21 | - id: hadolint 22 | name: hadolint 23 | language: docker_image 24 | entry: hadolint/hadolint:v2.2.0 hadolint 25 | stages: [commit] 26 | types: [dockerfile] 27 | 28 | - id: isort 29 | name: isort 30 | entry: isort 31 | language: system 32 | stages: [commit] 33 | types: [python] 34 | 35 | - id: mypy 36 | name: mypy 37 | entry: mypy 38 | language: system 39 | stages: [commit] 40 | types_or: [python, pyi] 41 | require_serial: true 42 | 43 | - id: pathchk 44 | name: pathchk 45 | entry: pathchk 46 | args: [--portability] 47 | exclude: ^([a-zA-Z0-9._][a-zA-Z0-9._-]+)(/[a-zA-Z0-9._][a-zA-Z0-9._-]+)*$ # https://lists.gnu.org/archive/html/coreutils/2023-01/msg00006.html 48 | language: system 49 | stages: [commit] 50 | 51 | - id: pretty-format-ini 52 | name: Pretty format INI 53 | entry: pretty-format-ini 54 | args: [--autofix] 55 | language: system 56 | stages: [commit] 57 | types: [ini] 58 | 59 | - id: pretty-format-toml 60 | name: Pretty format TOML 61 | entry: pretty-format-toml 62 | args: [--autofix] 63 | language: system 64 | stages: [commit] 65 | types: [toml] 66 | exclude: ^.*\.lock 67 | 68 | - id: pylint 69 | name: pylint 70 | entry: pylint 71 | language: system 72 | stages: [commit] 73 | types: [python] 74 | 75 | - repo: https://github.com/koalaman/shellcheck-precommit 76 | rev: 3f77b826548d8dc2d26675f077361c92773b50a7 # frozen: v0.9.0 77 | hooks: 78 | - id: shellcheck 79 | stages: [commit] 80 | # TODO: Kill if https://github.com/pre-commit/identify/issues/350 is fixed 81 | - id: shellcheck 82 | files: ^\.envrc$ 83 | types: [] 84 | stages: [commit] 85 | 86 | - repo: https://github.com/scop/pre-commit-shfmt 87 | rev: f21b778d68a3930f77d7424821022e81e3ae17d7 # frozen: v3.6.0-1 88 | hooks: 89 | - id: shfmt 90 | stages: [commit] 91 | # TODO: Kill if https://github.com/pre-commit/identify/issues/350 is fixed 92 | - id: shfmt 93 | files: ^\.envrc$ 94 | types: [] 95 | stages: [commit] 96 | 97 | - repo: https://github.com/nix-community/nixpkgs-fmt 98 | rev: 6740ea881d3ac5942d4fbf124f5956b896666c76 # frozen: v1.3.0 99 | hooks: 100 | - id: nixpkgs-fmt 101 | stages: [commit] 102 | 103 | - repo: https://github.com/pre-commit/mirrors-prettier 104 | rev: cafd5506f18eea191804850dacc0a4264772d59d # frozen: v3.0.0-alpha.4 105 | hooks: 106 | - id: prettier 107 | stages: [commit] 108 | -------------------------------------------------------------------------------- /.github/workflows/mutation-test.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | 4 | jobs: 5 | mutation-test: 6 | runs-on: ubuntu-22.04 7 | permissions: 8 | id-token: write 9 | contents: read 10 | steps: 11 | - name: Check out repository 12 | uses: actions/checkout@v3.4.0 13 | with: 14 | submodules: true 15 | 16 | - name: Get Node.js version 17 | run: echo "NODE_VERSION=$(cat .nvmrc)" >> "$GITHUB_ENV" 18 | 19 | - name: Get Python version 20 | run: echo "PYTHON_VERSION=$(cat .python-version)" >> "$GITHUB_ENV" 21 | 22 | - name: Use Node.js ${{ env.NODE_VERSION }} 23 | uses: actions/setup-node@v3.6.0 24 | with: 25 | node-version: ${{ env.NODE_VERSION }} 26 | registry-url: https://registry.npmjs.org 27 | 28 | - name: Cache Node.js packages 29 | uses: actions/cache@v3.3.1 30 | 
with: 31 | path: ~/.npm 32 | key: 33 | ${{ runner.os }}-node-${{ secrets.CACHE_SEED }}-${{ hashFiles('**/package-lock.json') }} 34 | restore-keys: ${{ runner.os }}-node-${{ secrets.CACHE_SEED }}- 35 | 36 | - name: Install Node.js dependencies 37 | run: npm ci --production 38 | 39 | - name: Add local Node packages to PATH 40 | run: echo "./node_modules/.bin:$PATH" >> $GITHUB_PATH 41 | 42 | - name: Use Python ${{ env.PYTHON_VERSION }} 43 | uses: actions/setup-python@v4.5.0 44 | with: 45 | python-version: ${{ env.PYTHON_VERSION }} 46 | 47 | - name: Cache pip 48 | uses: actions/cache@v3.3.1 49 | with: 50 | path: ~/.cache/pip 51 | key: 52 | ${{ runner.os }}-pip-${{ secrets.CACHE_SEED }}-${{ env.PYTHON_VERSION }}-${{ 53 | hashFiles('./poetry.lock') }} 54 | restore-keys: ${{ runner.os }}-pip-${{ secrets.CACHE_SEED }}-${{ env.PYTHON_VERSION }}- 55 | 56 | - name: Upgrade pip 57 | run: python -m pip install --requirement=geostore/pip.txt 58 | 59 | - name: Install Poetry 60 | run: python -m pip install --requirement=geostore/poetry.txt 61 | 62 | - name: Install Python dependencies 63 | run: python -m poetry install --all-extras --no-root 64 | 65 | - name: Get Oidc deploy role arn 66 | run: cat .github/workflows/.env >> $GITHUB_ENV 67 | 68 | - name: Configure AWS credentials 69 | uses: aws-actions/configure-aws-credentials@v2.0.0 70 | with: 71 | aws-region: ap-southeast-2 72 | mask-aws-account-id: true 73 | role-to-assume: ${{ env.CiOidc }} 74 | 75 | - name: Set unique deployment environment type variable 76 | run: echo "GEOSTORE_ENV_NAME=ci${GITHUB_RUN_ID}" | tee -a $GITHUB_ENV 77 | 78 | - name: Deploy AWS stacks for testing 79 | run: 80 | poetry run cdk deploy --all --require-approval never --strict --change-set-name 81 | "ci-${GITHUB_RUN_ID}" 82 | 83 | - run: poetry run mutmut run 84 | - run: poetry run mutmut junitxml > mutmut.xml 85 | if: failure() 86 | - uses: actions/upload-artifact@v3.1.2 87 | with: 88 | name: mutation-test-report 89 | path: mutmut.xml 90 | if: failure() 91 | - uses: mikepenz/action-junit-report@v3.7.5 92 | if: failure() 93 | with: 94 | report_paths: mutmut.xml 95 | 96 | - name: Destroy AWS stacks used for testing 97 | run: poetry run cdk destroy --force --all 98 | if: always() # clean-up AWS stack after failure 99 | -------------------------------------------------------------------------------- /tests/general_generators.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | from http import HTTPStatus 3 | from os import urandom 4 | from random import choice, randrange 5 | from string import ascii_letters, ascii_uppercase, digits, printable 6 | from typing import Type 7 | from uuid import uuid4 8 | 9 | from mypy_boto3_lambda.type_defs import ResponseMetadataTypeDef 10 | 11 | REFERENCE_DATETIME = datetime(2000, 1, 1, tzinfo=timezone.utc) 12 | 13 | 14 | # General-purpose generators 15 | 16 | 17 | def random_string(length: int) -> str: 18 | """ 19 | Includes ASCII printable characters and the first printable character from several Unicode 20 | blocks . 
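    For example, random_string(3) could return something like "a¡7"; the output is random, so
    this is purely illustrative.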
21 | """ 22 | return _random_string_choices(f"{printable}¡ĀƀḂəʰͰἀЀ–⁰₠℀⅐←∀⌀①─▀■☀🬀✁ㄅff", length) 23 | 24 | 25 | def random_ascii_letter_string(length: int) -> str: 26 | return _random_string_choices(ascii_letters, length) 27 | 28 | 29 | def _random_string_choices(characters: str, length: int) -> str: 30 | return "".join(choice(characters) for _ in range(length)) 31 | 32 | 33 | def any_past_datetime() -> datetime: 34 | return REFERENCE_DATETIME - timedelta(seconds=randrange(30_000_000_000)) # Back to year 1049 35 | 36 | 37 | def any_past_datetime_string() -> str: 38 | return any_past_datetime().isoformat() 39 | 40 | 41 | def any_past_utc_datetime_string() -> str: 42 | return any_past_datetime().strftime("%Y-%m-%dT%H:%M:%SZ") 43 | 44 | 45 | def any_program_name() -> str: 46 | """Arbitrary-length string""" 47 | return random_string(20) 48 | 49 | 50 | def any_safe_file_path() -> str: 51 | paths = [any_safe_filename() for _ in range(randrange(1, 5))] 52 | return "/".join(paths) 53 | 54 | 55 | def any_safe_filename() -> str: 56 | return _random_string_choices(f"{digits}{ascii_letters}", 20) 57 | 58 | 59 | def any_host() -> str: 60 | return random_ascii_letter_string(20) 61 | 62 | 63 | def any_https_url() -> str: 64 | host = any_host() 65 | path = any_safe_file_path() 66 | return f"https://{host}/{path}" 67 | 68 | 69 | def any_file_contents(byte_count: int = 10) -> bytes: 70 | return urandom(byte_count) 71 | 72 | 73 | def any_request_id() -> str: 74 | """Arbitrary-length string""" 75 | return uuid4().hex 76 | 77 | 78 | def any_http_status_code() -> int: 79 | return choice(list(HTTPStatus)) 80 | 81 | 82 | def any_retry_attempts() -> int: 83 | """Arbitrary-length integer""" 84 | return randrange(10) 85 | 86 | 87 | def any_response_metadata() -> ResponseMetadataTypeDef: 88 | return { 89 | "RequestId": any_request_id(), 90 | "HostId": any_host(), 91 | "HTTPStatusCode": any_http_status_code(), 92 | "HTTPHeaders": {}, 93 | "RetryAttempts": any_retry_attempts(), 94 | } 95 | 96 | 97 | def any_error_message() -> str: 98 | """Arbitrary-length string""" 99 | return random_string(50) 100 | 101 | 102 | def any_class_name() -> str: 103 | return f"{choice(ascii_uppercase)}{random_ascii_letter_string(10)}Error" 104 | 105 | 106 | def any_exception_class() -> Type[Exception]: 107 | exception_class = type(any_class_name(), (Exception,), {}) 108 | return exception_class 109 | 110 | 111 | def any_dictionary_key() -> str: 112 | """Arbitrary-length string""" 113 | return random_string(20) 114 | 115 | 116 | def any_etag() -> str: 117 | """Arbitrary-length string""" 118 | return random_string(10) 119 | 120 | 121 | def any_name() -> str: 122 | return random_string(10) 123 | 124 | 125 | def any_description() -> str: 126 | return random_string(20) 127 | -------------------------------------------------------------------------------- /geostore/datasets_model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from os import environ 3 | from typing import Any, Dict, Optional, Tuple, Type 4 | 5 | from pynamodb.attributes import UTCDateTimeAttribute, UnicodeAttribute 6 | from pynamodb.indexes import AllProjection, GlobalSecondaryIndex 7 | from pynamodb.models import MetaModel, Model 8 | from ulid import ULID 9 | from ulid.base32 import encode_randomness 10 | from ulid.constants import TIMESTAMP_LEN 11 | 12 | from .aws_keys import AWS_DEFAULT_REGION_KEY 13 | from .clock import now 14 | from .models import DATASET_ID_PREFIX, DB_KEY_SEPARATOR 15 | from 
.parameter_store import ParameterName, get_param 16 | 17 | 18 | def human_readable_ulid(ulid: ULID) -> str: 19 | """ 20 | Formats the timestamp part of the ULID as a human readable datetime. Uses "T" as the date/time 21 | separator as per RFC3339, hyphen as the datetime field separator to ensure broad filesystem 22 | compatibility, and underscore as the datetime/randomness separator. 23 | 24 | ULIDs have millisecond timestamps, but strftime can only format microseconds, so we need to chop 25 | off the last three characters. 26 | """ 27 | datetime_string = ulid.datetime.strftime("%Y-%m-%dT%H-%M-%S-%f")[:-3] 28 | return f"{datetime_string}Z_{encode_randomness(ulid.bytes[TIMESTAMP_LEN :])}" 29 | 30 | 31 | class DatasetsTitleIdx(GlobalSecondaryIndex["DatasetsModelBase"]): # type: ignore[no-untyped-call] 32 | """Dataset title global index.""" 33 | 34 | @dataclass 35 | class Meta: 36 | """Meta class.""" 37 | 38 | index_name = "datasets_title" 39 | read_capacity_units = 1 40 | write_capacity_units = 1 41 | projection = AllProjection() 42 | 43 | title = UnicodeAttribute(hash_key=True) 44 | 45 | 46 | class DatasetsModelBase(Model): 47 | """Dataset model.""" 48 | 49 | id = UnicodeAttribute( 50 | hash_key=True, 51 | attr_name="pk", 52 | default_for_new=lambda: f"{DATASET_ID_PREFIX}{ULID()}", 53 | ) 54 | title = UnicodeAttribute() 55 | created_at = UTCDateTimeAttribute(default_for_new=now) 56 | updated_at = UTCDateTimeAttribute(default=now) 57 | current_dataset_version = UnicodeAttribute(null=True) 58 | 59 | datasets_title_idx: DatasetsTitleIdx 60 | 61 | def as_dict(self) -> Dict[str, Any]: 62 | serialized = self.serialize() 63 | result: Dict[str, Any] = {key: value["S"] for key, value in serialized.items()} 64 | result["id"] = self.dataset_id 65 | return result 66 | 67 | @property 68 | def dataset_id(self) -> str: 69 | """Dataset ID value.""" 70 | return str(self.id).split(DB_KEY_SEPARATOR)[1] 71 | 72 | 73 | class DatasetsModelMeta(MetaModel): 74 | def __new__( 75 | cls, 76 | name: str, 77 | bases: Tuple[Type[object], ...], 78 | namespace: Dict[str, Any], 79 | discriminator: Optional[Any] = None, 80 | ) -> "DatasetsModelMeta": 81 | namespace["Meta"] = type( 82 | "Meta", 83 | (), 84 | { 85 | "table_name": get_param(ParameterName.STORAGE_DATASETS_TABLE_NAME), 86 | "region": environ[AWS_DEFAULT_REGION_KEY], 87 | }, 88 | ) 89 | klass: "DatasetsModelMeta" = MetaModel.__new__( # type: ignore[no-untyped-call] 90 | cls, name, bases, namespace, discriminator=discriminator 91 | ) 92 | return klass 93 | 94 | 95 | def datasets_model_with_meta() -> Type[DatasetsModelBase]: 96 | class DatasetModel(DatasetsModelBase, metaclass=DatasetsModelMeta): 97 | datasets_title_idx = DatasetsTitleIdx() 98 | 99 | return DatasetModel 100 | -------------------------------------------------------------------------------- /tests/test_upload_status.py: -------------------------------------------------------------------------------- 1 | from typing import cast 2 | from unittest.mock import MagicMock, patch 3 | 4 | from jsonschema import ValidationError 5 | from pytest import raises 6 | 7 | from geostore.api_keys import SUCCESS_KEY 8 | from geostore.import_file_batch_job_id_keys import ASSET_JOB_ID_KEY, METADATA_JOB_ID_KEY 9 | from geostore.step_function import Outcome 10 | from geostore.step_function_keys import ( 11 | ASSET_UPLOAD_KEY, 12 | DATASET_ID_KEY, 13 | ERRORS_KEY, 14 | FAILED_TASKS_KEY, 15 | FAILURE_REASONS_KEY, 16 | IMPORT_DATASET_KEY, 17 | METADATA_UPLOAD_KEY, 18 | NEW_VERSION_ID_KEY, 19 | STATUS_KEY, 20 | VALIDATION_KEY, 
21 | ) 22 | from geostore.types import JsonObject 23 | from geostore.upload_status.task import lambda_handler 24 | 25 | from .aws_utils import any_account_id, any_batch_job_status, any_job_id, any_lambda_context 26 | from .stac_generators import any_dataset_id, any_dataset_version_id 27 | 28 | 29 | def should_raise_exception_when_missing_mandatory_execution_arn() -> None: 30 | with raises(ValidationError): 31 | lambda_handler({}, any_lambda_context()) 32 | 33 | 34 | @patch("geostore.step_function.get_step_function_validation_results") 35 | @patch("geostore.step_function.S3CONTROL_CLIENT.describe_job") 36 | @patch("geostore.step_function.get_account_number") 37 | def should_report_upload_statuses( 38 | get_account_number_mock: MagicMock, 39 | describe_job_mock: MagicMock, 40 | get_step_function_validation_results_mock: MagicMock, 41 | ) -> None: 42 | # Given 43 | account_id = any_account_id() 44 | get_account_number_mock.return_value = account_id 45 | asset_job_id = any_job_id() 46 | asset_job_status = any_batch_job_status() 47 | metadata_job_id = any_job_id() 48 | metadata_job_status = any_batch_job_status() 49 | 50 | get_step_function_validation_results_mock.return_value = [] 51 | 52 | def describe_job(AccountId: str, JobId: str) -> JsonObject: # pylint: disable=invalid-name 53 | assert AccountId == cast(str, account_id) 54 | return { 55 | asset_job_id: { 56 | "Job": { 57 | "Status": asset_job_status, 58 | "FailureReasons": [], 59 | "ProgressSummary": {"NumberOfTasksFailed": 0}, 60 | } 61 | }, 62 | metadata_job_id: { 63 | "Job": { 64 | "Status": metadata_job_status, 65 | "FailureReasons": [], 66 | "ProgressSummary": {"NumberOfTasksFailed": 0}, 67 | } 68 | }, 69 | }[JobId] 70 | 71 | describe_job_mock.side_effect = describe_job 72 | 73 | expected_response = { 74 | VALIDATION_KEY: {STATUS_KEY: Outcome.PASSED.value, ERRORS_KEY: []}, 75 | ASSET_UPLOAD_KEY: { 76 | STATUS_KEY: asset_job_status, 77 | ERRORS_KEY: {FAILED_TASKS_KEY: 0, FAILURE_REASONS_KEY: []}, 78 | }, 79 | METADATA_UPLOAD_KEY: { 80 | STATUS_KEY: metadata_job_status, 81 | ERRORS_KEY: {FAILED_TASKS_KEY: 0, FAILURE_REASONS_KEY: []}, 82 | }, 83 | } 84 | 85 | # When 86 | response = lambda_handler( 87 | { 88 | DATASET_ID_KEY: any_dataset_id(), 89 | NEW_VERSION_ID_KEY: any_dataset_version_id(), 90 | VALIDATION_KEY: {SUCCESS_KEY: True}, 91 | IMPORT_DATASET_KEY: { 92 | METADATA_JOB_ID_KEY: metadata_job_id, 93 | ASSET_JOB_ID_KEY: asset_job_id, 94 | }, 95 | }, 96 | any_lambda_context(), 97 | ) 98 | 99 | # Then 100 | assert response == expected_response 101 | -------------------------------------------------------------------------------- /geostore/validation_results_model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | from os import environ 4 | from typing import Any, Dict, Optional, Tuple, Type 5 | 6 | from pynamodb.attributes import MapAttribute, UnicodeAttribute 7 | from pynamodb.indexes import AllProjection, GlobalSecondaryIndex 8 | from pynamodb.models import MetaModel, Model 9 | 10 | from .aws_keys import AWS_DEFAULT_REGION_KEY 11 | from .check import Check 12 | from .models import CHECK_ID_PREFIX, DB_KEY_SEPARATOR, URL_ID_PREFIX 13 | from .parameter_store import ParameterName, get_param 14 | from .types import JsonObject 15 | 16 | 17 | class ValidationResult(Enum): 18 | FAILED = "Failed" 19 | PASSED = "Passed" 20 | 21 | 22 | class ValidationOutcomeIdx( # type: ignore[no-untyped-call] 23 | 
GlobalSecondaryIndex["ValidationResultsModelBase"] 24 | ): 25 | @dataclass 26 | class Meta: 27 | index_name = "validation_outcome" 28 | read_capacity_units = 1 29 | write_capacity_units = 1 30 | projection = AllProjection() 31 | 32 | pk = UnicodeAttribute(hash_key=True, attr_name="pk") 33 | result = UnicodeAttribute(range_key=True, attr_name="result") 34 | 35 | 36 | class ValidationResultsModelBase(Model): 37 | pk = UnicodeAttribute(hash_key=True) 38 | sk = UnicodeAttribute(range_key=True) 39 | result = UnicodeAttribute() 40 | # TODO: Remove type-arg when PynamoDB issue #920 is fixed pylint:disable=fixme 41 | details: MapAttribute[str, Any] = MapAttribute(null=True) # type: ignore[no-untyped-call] 42 | 43 | validation_outcome_index: ValidationOutcomeIdx 44 | 45 | 46 | def validation_results_model_with_meta( 47 | *, results_table_name: Optional[str] = None 48 | ) -> Type[ValidationResultsModelBase]: 49 | if results_table_name is None: 50 | results_table_name = get_param(ParameterName.STORAGE_VALIDATION_RESULTS_TABLE_NAME) 51 | 52 | class ValidationResultsModelMeta(MetaModel): 53 | def __new__( 54 | cls, 55 | name: str, 56 | bases: Tuple[Type[object], ...], 57 | namespace: Dict[str, Any], 58 | discriminator: Optional[Any] = None, 59 | ) -> "ValidationResultsModelMeta": 60 | namespace["Meta"] = type( 61 | "Meta", 62 | (), 63 | { 64 | "table_name": results_table_name, 65 | "region": environ[AWS_DEFAULT_REGION_KEY], 66 | }, 67 | ) 68 | klass: "ValidationResultsModelMeta" 69 | klass = MetaModel.__new__( # type: ignore[no-untyped-call] 70 | cls, name, bases, namespace, discriminator=discriminator 71 | ) 72 | return klass 73 | 74 | class ValidationResultsModel(ValidationResultsModelBase, metaclass=ValidationResultsModelMeta): 75 | validation_outcome_index = ValidationOutcomeIdx() 76 | 77 | return ValidationResultsModel 78 | 79 | 80 | class ValidationResultFactory: # pylint:disable=too-few-public-methods 81 | def __init__(self, hash_key: str, results_table_name: str): 82 | self.hash_key = hash_key 83 | self.validation_results_model = validation_results_model_with_meta( 84 | results_table_name=results_table_name 85 | ) 86 | 87 | def save( 88 | self, 89 | url: str, 90 | check: Check, 91 | result: ValidationResult, 92 | *, 93 | details: Optional[JsonObject] = None, 94 | ) -> None: 95 | self.validation_results_model( 96 | pk=self.hash_key, 97 | sk=f"{CHECK_ID_PREFIX}{check.value}{DB_KEY_SEPARATOR}{URL_ID_PREFIX}{url}", 98 | result=result.value, 99 | details=details, 100 | ).save() 101 | -------------------------------------------------------------------------------- /geostore/check_stac_metadata/task.py: -------------------------------------------------------------------------------- 1 | from logging import Logger 2 | 3 | from botocore.exceptions import ClientError 4 | from jsonschema import ValidationError, validate 5 | from linz_logger import get_log 6 | 7 | from ..api_keys import SUCCESS_KEY 8 | from ..error_response_keys import ERROR_MESSAGE_KEY 9 | from ..logging_keys import ( 10 | GIT_COMMIT, 11 | LOG_MESSAGE_LAMBDA_FAILURE, 12 | LOG_MESSAGE_LAMBDA_START, 13 | LOG_MESSAGE_VALIDATION_COMPLETE, 14 | ) 15 | from ..parameter_store import ParameterName, get_param 16 | from ..processing_assets_model import ProcessingAssetType 17 | from ..s3_utils import get_s3_url_reader 18 | from ..step_function import AssetGarbageCollector, Outcome, get_hash_key 19 | from ..step_function_keys import ( 20 | CURRENT_VERSION_ID_KEY, 21 | DATASET_ID_KEY, 22 | DATASET_TITLE_KEY, 23 | METADATA_URL_KEY, 24 | 
NEW_VERSION_ID_KEY, 25 | S3_ROLE_ARN_KEY, 26 | ) 27 | from ..types import JsonObject 28 | from ..validation_results_model import ValidationResultFactory 29 | from .utils import STACDatasetValidator 30 | 31 | LOGGER: Logger = get_log() 32 | 33 | 34 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 35 | LOGGER.debug( 36 | LOG_MESSAGE_LAMBDA_START, 37 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 38 | ) 39 | 40 | # validate input 41 | try: 42 | validate( 43 | event, 44 | { 45 | "type": "object", 46 | "properties": { 47 | CURRENT_VERSION_ID_KEY: {"type": "string"}, 48 | DATASET_ID_KEY: {"type": "string"}, 49 | DATASET_TITLE_KEY: {"type": "string"}, 50 | METADATA_URL_KEY: {"type": "string"}, 51 | NEW_VERSION_ID_KEY: {"type": "string"}, 52 | S3_ROLE_ARN_KEY: {"type": "string"}, 53 | }, 54 | "required": [ 55 | CURRENT_VERSION_ID_KEY, 56 | DATASET_ID_KEY, 57 | DATASET_TITLE_KEY, 58 | METADATA_URL_KEY, 59 | NEW_VERSION_ID_KEY, 60 | S3_ROLE_ARN_KEY, 61 | ], 62 | "additionalProperties": True, 63 | }, 64 | ) 65 | except ValidationError as error: 66 | LOGGER.warning( 67 | LOG_MESSAGE_VALIDATION_COMPLETE, 68 | extra={ 69 | "outcome": Outcome.FAILED, 70 | "error": error, 71 | GIT_COMMIT: get_param(ParameterName.GIT_COMMIT), 72 | }, 73 | ) 74 | return {ERROR_MESSAGE_KEY: error.message} 75 | 76 | try: 77 | s3_url_reader = get_s3_url_reader(event[S3_ROLE_ARN_KEY], event[DATASET_TITLE_KEY], LOGGER) 78 | except ClientError as error: 79 | LOGGER.warning( 80 | LOG_MESSAGE_LAMBDA_FAILURE, 81 | extra={"error": error, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 82 | ) 83 | return {ERROR_MESSAGE_KEY: str(error)} 84 | 85 | asset_garbage_collector = AssetGarbageCollector( 86 | event[DATASET_ID_KEY], 87 | event[CURRENT_VERSION_ID_KEY], 88 | ProcessingAssetType.METADATA, 89 | LOGGER, 90 | ) 91 | 92 | hash_key = get_hash_key(event[DATASET_ID_KEY], event[NEW_VERSION_ID_KEY]) 93 | 94 | validation_result_factory = ValidationResultFactory( 95 | hash_key, get_param(ParameterName.STORAGE_VALIDATION_RESULTS_TABLE_NAME) 96 | ) 97 | 98 | validator = STACDatasetValidator( 99 | hash_key, s3_url_reader, asset_garbage_collector, validation_result_factory 100 | ) 101 | 102 | validator.run(event[METADATA_URL_KEY]) 103 | return {SUCCESS_KEY: True} 104 | -------------------------------------------------------------------------------- /tests/test_import_status_logging.py: -------------------------------------------------------------------------------- 1 | from json import dumps 2 | from unittest.mock import MagicMock, patch 3 | 4 | from jsonschema import ValidationError 5 | 6 | from geostore.aws_keys import BODY_KEY, HTTP_METHOD_KEY 7 | from geostore.import_status.get import get_import_status 8 | from geostore.logging_keys import ( 9 | GIT_COMMIT, 10 | LOG_MESSAGE_LAMBDA_FAILURE, 11 | LOG_MESSAGE_LAMBDA_START, 12 | LOG_MESSAGE_STEP_FUNCTION_RESPONSE, 13 | ) 14 | from geostore.parameter_store import ParameterName, get_param 15 | from geostore.step_function_keys import DATASET_ID_KEY, EXECUTION_ARN_KEY, NEW_VERSION_ID_KEY 16 | 17 | from .aws_utils import any_arn_formatted_string 18 | from .general_generators import any_error_message 19 | from .stac_generators import any_dataset_id, any_dataset_version_id 20 | 21 | 22 | @patch("geostore.step_function.STEP_FUNCTIONS_CLIENT.describe_execution") 23 | def should_log_payload(describe_step_function_mock: MagicMock) -> None: 24 | # Given 25 | event = { 26 | HTTP_METHOD_KEY: "GET", 27 | BODY_KEY: {EXECUTION_ARN_KEY: 
any_arn_formatted_string()}, 28 | } 29 | 30 | describe_step_function_mock.return_value = { 31 | "status": "RUNNING", 32 | "input": dumps( 33 | {DATASET_ID_KEY: any_dataset_id(), NEW_VERSION_ID_KEY: any_dataset_version_id()} 34 | ), 35 | } 36 | 37 | with patch("geostore.import_status.get.LOGGER.debug") as logger_mock, patch( 38 | "geostore.step_function.get_step_function_validation_results" 39 | ) as validation_mock: 40 | validation_mock.return_value = [] 41 | 42 | # When 43 | get_import_status(event) 44 | 45 | # Then 46 | logger_mock.assert_any_call( 47 | LOG_MESSAGE_LAMBDA_START, 48 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 49 | ) 50 | 51 | 52 | @patch("geostore.import_status.get.validate") 53 | def should_log_schema_validation_warning(validate_schema_mock: MagicMock) -> None: 54 | # Given 55 | 56 | error_message = any_error_message() 57 | validate_schema_mock.side_effect = ValidationError(error_message) 58 | 59 | with patch("geostore.import_status.get.LOGGER.warning") as logger_mock: 60 | # When 61 | get_import_status( 62 | { 63 | HTTP_METHOD_KEY: "GET", 64 | BODY_KEY: {}, 65 | } 66 | ) 67 | 68 | # Then 69 | logger_mock.assert_any_call( 70 | LOG_MESSAGE_LAMBDA_FAILURE, 71 | extra={"error": error_message, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 72 | ) 73 | 74 | 75 | @patch("geostore.step_function.STEP_FUNCTIONS_CLIENT.describe_execution") 76 | def should_log_stepfunctions_status_response( 77 | describe_execution_mock: MagicMock, 78 | ) -> None: 79 | # Given 80 | describe_execution_mock.return_value = describe_execution_response = { 81 | "status": "Some Response", 82 | "input": dumps( 83 | {DATASET_ID_KEY: any_dataset_id(), NEW_VERSION_ID_KEY: any_dataset_version_id()} 84 | ), 85 | } 86 | 87 | with patch("geostore.step_function.LOGGER.debug") as logger_mock, patch( 88 | "geostore.step_function.get_account_number" 89 | ), patch("geostore.step_function.get_step_function_validation_results") as validation_mock: 90 | validation_mock.return_value = [] 91 | # When 92 | get_import_status({EXECUTION_ARN_KEY: any_arn_formatted_string()}) 93 | 94 | # Then 95 | logger_mock.assert_any_call( 96 | LOG_MESSAGE_STEP_FUNCTION_RESPONSE, 97 | extra={ 98 | "response": describe_execution_response, 99 | GIT_COMMIT: get_param(ParameterName.GIT_COMMIT), 100 | }, 101 | ) 102 | -------------------------------------------------------------------------------- /geostore/import_dataset_file.py: -------------------------------------------------------------------------------- 1 | from json import loads 2 | from logging import Logger 3 | from typing import TYPE_CHECKING, Callable, Optional 4 | from urllib.parse import unquote_plus 5 | 6 | from botocore.exceptions import ClientError 7 | from linz_logger import get_log 8 | 9 | from .aws_response import AWS_CODE_REQUEST_TIMEOUT 10 | from .import_dataset_keys import NEW_KEY_KEY, ORIGINAL_KEY_KEY, TARGET_BUCKET_NAME_KEY 11 | from .logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_START 12 | from .parameter_store import ParameterName, get_param 13 | from .s3 import get_s3_client_for_role 14 | from .step_function_keys import S3_ROLE_ARN_KEY 15 | from .types import JsonObject 16 | 17 | if TYPE_CHECKING: 18 | from mypy_boto3_s3 import S3Client 19 | from mypy_boto3_s3.type_defs import PutObjectOutputTypeDef 20 | else: 21 | PutObjectOutputTypeDef = JsonObject # pragma: no mutate 22 | S3Client = object # pragma: no mutate 23 | 24 | INVOCATION_ID_KEY = "invocationId" 25 | INVOCATION_SCHEMA_VERSION_KEY = "invocationSchemaVersion" 26 | 
RESULTS_KEY = "results" 27 | RESULT_CODE_KEY = "resultCode" 28 | RESULT_STRING_KEY = "resultString" 29 | S3_BUCKET_ARN_KEY = "s3BucketArn" 30 | S3_KEY_KEY = "s3Key" 31 | TASKS_KEY = "tasks" 32 | TASK_ID_KEY = "taskId" 33 | TREAT_MISSING_KEYS_AS_KEY = "treatMissingKeysAs" 34 | 35 | RESULT_CODE_PERMANENT_FAILURE = "PermanentFailure" 36 | RESULT_CODE_SUCCEEDED = "Succeeded" 37 | RESULT_CODE_TEMPORARY_FAILURE = "TemporaryFailure" 38 | 39 | EXCEPTION_PREFIX = "Exception" 40 | RETRY_RESULT_STRING = "Retry request to Amazon S3 due to timeout." 41 | 42 | LOGGER: Logger = get_log() 43 | LOG_MESSAGE_S3_BATCH_COPY_RESULT = "S3 Batch Result" 44 | 45 | 46 | def get_import_result( 47 | event: JsonObject, 48 | importer: Callable[[str, str, str, str, S3Client], Optional[PutObjectOutputTypeDef]], 49 | ) -> JsonObject: 50 | LOGGER.debug( 51 | LOG_MESSAGE_LAMBDA_START, 52 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 53 | ) 54 | 55 | task = event[TASKS_KEY][0] 56 | source_bucket_name = task[S3_BUCKET_ARN_KEY].split(":::", maxsplit=1)[-1] 57 | parameters = loads(unquote_plus(task[S3_KEY_KEY])) 58 | source_s3_client = get_s3_client_for_role(parameters[S3_ROLE_ARN_KEY]) 59 | 60 | try: 61 | response = importer( 62 | source_bucket_name, 63 | parameters[ORIGINAL_KEY_KEY], 64 | parameters[TARGET_BUCKET_NAME_KEY], 65 | parameters[NEW_KEY_KEY], 66 | source_s3_client, 67 | ) 68 | result_code = RESULT_CODE_SUCCEEDED 69 | result_string = str(response) 70 | except ClientError as error: 71 | error_code = error.response["Error"]["Code"] 72 | if error_code == AWS_CODE_REQUEST_TIMEOUT: 73 | result_code = RESULT_CODE_TEMPORARY_FAILURE 74 | result_string = RETRY_RESULT_STRING 75 | else: 76 | result_code = RESULT_CODE_PERMANENT_FAILURE 77 | error_message = error.response["Error"]["Message"] 78 | result_string = f"{error_code} when calling {error.operation_name}: {error_message}" 79 | except Exception as error: # pylint:disable=broad-except 80 | result_code = RESULT_CODE_PERMANENT_FAILURE 81 | result_string = f"{EXCEPTION_PREFIX}: {error}" 82 | 83 | result = { 84 | INVOCATION_SCHEMA_VERSION_KEY: event[INVOCATION_SCHEMA_VERSION_KEY], 85 | TREAT_MISSING_KEYS_AS_KEY: RESULT_CODE_PERMANENT_FAILURE, 86 | INVOCATION_ID_KEY: event[INVOCATION_ID_KEY], 87 | RESULTS_KEY: [ 88 | { 89 | TASK_ID_KEY: task[TASK_ID_KEY], 90 | RESULT_CODE_KEY: result_code, 91 | RESULT_STRING_KEY: result_string, 92 | } 93 | ], 94 | } 95 | LOGGER.debug( 96 | LOG_MESSAGE_S3_BATCH_COPY_RESULT, 97 | extra={"result": result, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 98 | ) 99 | return result 100 | -------------------------------------------------------------------------------- /infrastructure/constructs/notify.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | 3 | from aws_cdk import ( 4 | aws_events, 5 | aws_events_targets, 6 | aws_iam, 7 | aws_lambda_python_alpha, 8 | aws_sns, 9 | aws_ssm, 10 | aws_stepfunctions, 11 | ) 12 | from constructs import Construct 13 | 14 | from geostore.environment import ENV_NAME_VARIABLE_NAME 15 | from geostore.notify_status_update.task import SLACK_URL_ENV_NAME 16 | from geostore.parameter_store import ParameterName 17 | from geostore.resources import Resource 18 | 19 | from .bundled_lambda_function import BundledLambdaFunction 20 | from .common import grant_parameter_read_access 21 | from .s3_policy import ALLOW_DESCRIBE_ANY_S3_JOB 22 | from .table import Table 23 | 24 | 25 | class Notify(Construct): 26 | def __init__( 27 | 
self, 28 | scope: Construct, 29 | stack_id: str, 30 | *, 31 | botocore_lambda_layer: aws_lambda_python_alpha.PythonLayerVersion, 32 | env_name: str, 33 | state_machine: aws_stepfunctions.StateMachine, 34 | validation_results_table: Table, 35 | git_commit_parameter: aws_ssm.StringParameter, 36 | ) -> None: 37 | super().__init__(scope, stack_id) 38 | 39 | slack_notify_function = BundledLambdaFunction( 40 | scope, 41 | "GetStatusUpdate", 42 | lambda_directory="notify_status_update", 43 | extra_environment={ 44 | ENV_NAME_VARIABLE_NAME: env_name, 45 | }, 46 | botocore_lambda_layer=botocore_lambda_layer, 47 | ) 48 | if SLACK_URL_ENV_NAME in environ: 49 | slack_notify_function.add_environment( 50 | SLACK_URL_ENV_NAME, 51 | environ[SLACK_URL_ENV_NAME], 52 | ) 53 | 54 | validation_results_table.grant_read_data(slack_notify_function) 55 | validation_results_table.grant(slack_notify_function, "dynamodb:DescribeTable") 56 | state_machine.grant_read(slack_notify_function) 57 | 58 | slack_notify_function.add_to_role_policy(ALLOW_DESCRIBE_ANY_S3_JOB) 59 | 60 | # Allow anyone to subscribe to topic 61 | step_function_topic = aws_sns.Topic( 62 | scope, 63 | "geostore-stepfunction-status-topic", 64 | topic_name=Resource.SNS_TOPIC_NAME.resource_name, 65 | ) 66 | sns_topic_arn_parameter = aws_ssm.StringParameter( 67 | self, 68 | "status-sns-topic-arn", 69 | string_value=step_function_topic.topic_arn, 70 | description=f"Status SNS Topic ARN for {env_name}", 71 | parameter_name=ParameterName.STATUS_SNS_TOPIC_ARN.value, 72 | ) 73 | 74 | # Allow access to any validations 75 | grant_parameter_read_access( 76 | { 77 | sns_topic_arn_parameter: [slack_notify_function], 78 | validation_results_table.name_parameter: [ 79 | slack_notify_function, 80 | ], 81 | git_commit_parameter: [slack_notify_function], 82 | } 83 | ) 84 | step_function_topic.grant_publish(slack_notify_function) 85 | 86 | step_function_topic.add_to_resource_policy( 87 | aws_iam.PolicyStatement( 88 | actions=["sns:Subscribe", "sns:Receive"], 89 | principals=[aws_iam.AnyPrincipal()], 90 | resources=[step_function_topic.topic_arn], 91 | ) 92 | ) 93 | 94 | aws_events.Rule( 95 | scope, 96 | "geostore-cloudwatch-stepfunctions-rule", 97 | enabled=True, 98 | rule_name=Resource.CLOUDWATCH_RULE_NAME.resource_name, 99 | description="Cloudwatch rule to detect import status updates", 100 | event_pattern=aws_events.EventPattern( 101 | source=["aws.states"], 102 | detail_type=["Step Functions Execution Status Change"], 103 | detail={"stateMachineArn": [state_machine.state_machine_arn]}, 104 | ), 105 | targets=[aws_events_targets.LambdaFunction(slack_notify_function)], 106 | ) 107 | -------------------------------------------------------------------------------- /geostore/check_stac_metadata/stac_validators.py: -------------------------------------------------------------------------------- 1 | from functools import cached_property, lru_cache 2 | from json import load 3 | from os import scandir 4 | from os.path import dirname, join 5 | from re import fullmatch 6 | 7 | from jsonschema import Draft7Validator, FormatChecker, RefResolver 8 | from jsonschema._utils import URIDict 9 | from jsonschema.validators import extend 10 | from packaging.version import parse 11 | 12 | from ..stac_format import LINZ_STAC_EXTENSIONS_LOCAL_PATH 13 | from ..types import JsonObject 14 | 15 | 16 | class Schema: 17 | def __init__(self, path: str): 18 | self.path = path 19 | 20 | @cached_property 21 | def as_dict(self) -> JsonObject: 22 | with open(join(dirname(__file__), self.path), 
encoding="utf-8") as file_pointer: 23 | result: JsonObject = load(file_pointer) 24 | return result 25 | 26 | @cached_property 27 | def schema_id(self) -> str: 28 | id_: str = self.as_dict["$id"] 29 | return id_ 30 | 31 | @cached_property 32 | def uri(self) -> str: 33 | uri_: str = URIDict().normalize(self.schema_id) 34 | return uri_ 35 | 36 | 37 | @lru_cache 38 | def get_latest_extension_schema_version(extension_path: str) -> str: 39 | directories = scandir(join(dirname(__file__), extension_path)) 40 | versions = [] 41 | for directory in directories: 42 | if directory.is_dir() and fullmatch(r"v\d+\.\d+\.\d+", directory.name): 43 | versions.append(directory.name[1:]) 44 | return sorted(versions, key=parse, reverse=True)[0] 45 | 46 | 47 | FILE_STAC_SCHEMA_PATH = "file/v2.0.0/schema.json" 48 | PROJECTION_STAC_SCHEMA_PATH = "projection/v1.0.0/schema.json" 49 | VERSION_STAC_SCHEMA_PATH = "version/v1.0.0/schema.json" 50 | FILE_SCHEMA = Schema(FILE_STAC_SCHEMA_PATH) 51 | 52 | STAC_SPEC_EXTENSION_PATH = "stac-spec" 53 | STAC_VERSION = get_latest_extension_schema_version(STAC_SPEC_EXTENSION_PATH) 54 | STAC_SPEC_PATH = f"{STAC_SPEC_EXTENSION_PATH}/v{STAC_VERSION}" 55 | CATALOG_SCHEMA = Schema(f"{STAC_SPEC_PATH}/catalog-spec/json-schema/catalog.json") 56 | LINZ_STAC_EXTENSIONS_URL_PATH = ( 57 | f"v{get_latest_extension_schema_version(LINZ_STAC_EXTENSIONS_LOCAL_PATH)}" 58 | ) 59 | LINZ_SCHEMA_URL_DIRECTORY = f"{LINZ_STAC_EXTENSIONS_URL_PATH}/linz" 60 | LINZ_SCHEMA_URL_PATH = f"{LINZ_SCHEMA_URL_DIRECTORY}/schema.json" 61 | LINZ_SCHEMA = Schema(join(LINZ_STAC_EXTENSIONS_LOCAL_PATH, LINZ_SCHEMA_URL_PATH)) 62 | STAC_ITEM_SPEC_PATH = f"{STAC_SPEC_PATH}/item-spec/json-schema" 63 | ITEM_SCHEMA = Schema(f"{STAC_ITEM_SPEC_PATH}/item.json") 64 | QUALITY_SCHEMA_PATH = f"{LINZ_STAC_EXTENSIONS_URL_PATH}/quality/schema.json" 65 | 66 | schema_store = {} 67 | for schema in [ 68 | CATALOG_SCHEMA, 69 | Schema(f"{STAC_SPEC_PATH}/collection-spec/json-schema/collection.json"), 70 | FILE_SCHEMA, 71 | Schema("geojson-spec/Feature.json"), 72 | Schema("geojson-spec/Geometry.json"), 73 | ITEM_SCHEMA, 74 | Schema(f"{STAC_ITEM_SPEC_PATH}/basics.json"), 75 | Schema(f"{STAC_ITEM_SPEC_PATH}/datetime.json"), 76 | Schema(f"{STAC_ITEM_SPEC_PATH}/instrument.json"), 77 | Schema(f"{STAC_ITEM_SPEC_PATH}/licensing.json"), 78 | Schema(f"{STAC_ITEM_SPEC_PATH}/provider.json"), 79 | LINZ_SCHEMA, 80 | Schema(PROJECTION_STAC_SCHEMA_PATH), 81 | Schema(VERSION_STAC_SCHEMA_PATH), 82 | Schema(join(LINZ_STAC_EXTENSIONS_LOCAL_PATH, QUALITY_SCHEMA_PATH)), 83 | ]: 84 | # Normalize URLs the same way as jsonschema does 85 | schema_store[schema.uri] = schema.as_dict 86 | 87 | BaseSTACValidator = extend(Draft7Validator) 88 | BaseSTACValidator.format_checker = FormatChecker() 89 | 90 | STACCatalogSchemaValidator = extend(BaseSTACValidator)( 91 | resolver=RefResolver.from_schema(CATALOG_SCHEMA.as_dict, store=schema_store), 92 | schema=CATALOG_SCHEMA.as_dict, 93 | ) 94 | 95 | STACCollectionSchemaValidator = extend(BaseSTACValidator)( 96 | resolver=RefResolver.from_schema(LINZ_SCHEMA.as_dict, store=schema_store), 97 | schema=LINZ_SCHEMA.as_dict, 98 | ) 99 | 100 | STACItemSchemaValidator = extend(BaseSTACValidator)( 101 | resolver=RefResolver.from_schema(LINZ_SCHEMA.as_dict, store=schema_store), 102 | schema=LINZ_SCHEMA.as_dict, 103 | ) 104 | -------------------------------------------------------------------------------- /tests/test_step_function.py: -------------------------------------------------------------------------------- 1 | from os.path import 
basename 2 | from unittest.mock import MagicMock, patch 3 | 4 | from pytest import mark 5 | from pytest_subtests import SubTests 6 | 7 | from geostore.logging_keys import GIT_COMMIT 8 | from geostore.models import DB_KEY_SEPARATOR 9 | from geostore.parameter_store import ParameterName, get_param 10 | from geostore.processing_assets_model import ProcessingAssetType, processing_assets_model_with_meta 11 | from geostore.step_function import AssetGarbageCollector, get_hash_key 12 | from geostore.step_function_keys import CURRENT_VERSION_EMPTY_VALUE 13 | from tests.aws_utils import ProcessingAsset, any_s3_url 14 | from tests.stac_generators import any_dataset_id, any_dataset_version_id 15 | 16 | 17 | @mark.infrastructure 18 | def should_mark_asset_as_replaced(subtests: SubTests) -> None: 19 | # Given 20 | 21 | dataset_id = any_dataset_id() 22 | current_version_id = any_dataset_version_id() 23 | url = any_s3_url() 24 | filename = basename(url) 25 | logger_mock = MagicMock() 26 | 27 | expected_log_message = ( 28 | f"Dataset: '{dataset_id}' " 29 | f"Version: '{current_version_id}' " 30 | f"Filename: '{filename}' has been marked as replaced" 31 | ) 32 | 33 | hash_key = get_hash_key(dataset_id, current_version_id) 34 | processing_assets_model = processing_assets_model_with_meta() 35 | expected_metadata_item = processing_assets_model( 36 | hash_key=hash_key, 37 | range_key=f"{ProcessingAssetType.METADATA.value}{DB_KEY_SEPARATOR}0", 38 | url=url, 39 | filename=filename, 40 | replaced_in_new_version=True, 41 | ) 42 | 43 | with ProcessingAsset( 44 | asset_id=hash_key, 45 | url=url, 46 | ): 47 | # When 48 | AssetGarbageCollector( 49 | dataset_id, current_version_id, ProcessingAssetType.METADATA, logger_mock 50 | ).mark_asset_as_replaced(filename) 51 | 52 | # Then 53 | with subtests.test(msg="Log is recorded"): 54 | logger_mock.debug.assert_called_once_with( 55 | expected_log_message, 56 | extra={ 57 | GIT_COMMIT: get_param(ParameterName.GIT_COMMIT), 58 | }, 59 | ) 60 | 61 | actual_first_version_metadata_item = processing_assets_model.query( 62 | hash_key, 63 | processing_assets_model.sk.startswith( 64 | f"{ProcessingAssetType.METADATA.value}{DB_KEY_SEPARATOR}" 65 | ), 66 | consistent_read=True, 67 | ).next() 68 | 69 | with subtests.test(msg=f"Metadata {actual_first_version_metadata_item.pk}"): 70 | assert ( 71 | actual_first_version_metadata_item.attribute_values 72 | == expected_metadata_item.attribute_values 73 | ) 74 | 75 | 76 | @mark.infrastructure 77 | def should_do_nothing_if_no_asset_returned(subtests: SubTests) -> None: 78 | # Given 79 | 80 | dataset_id = any_dataset_id() 81 | current_version_id = any_dataset_version_id() 82 | url = any_s3_url() 83 | filename = basename(url) 84 | logger_mock = MagicMock() 85 | 86 | # When 87 | AssetGarbageCollector( 88 | dataset_id, current_version_id, ProcessingAssetType.METADATA, logger_mock 89 | ).mark_asset_as_replaced(filename) 90 | 91 | # Then 92 | with subtests.test(msg="Log is recorded"): 93 | logger_mock.debug.assert_not_called() 94 | 95 | 96 | @patch("geostore.step_function.processing_assets_model_with_meta") 97 | def should_return_early_if_no_dataset_version( 98 | processing_assets_model_mock: MagicMock, subtests: SubTests 99 | ) -> None: 100 | # Given 101 | dataset_id = any_dataset_id() 102 | current_version_id = CURRENT_VERSION_EMPTY_VALUE 103 | url = any_s3_url() 104 | filename = basename(url) 105 | logger_mock = MagicMock() 106 | 107 | # When 108 | AssetGarbageCollector( 109 | dataset_id, current_version_id, ProcessingAssetType.METADATA, 
logger_mock 110 | ).mark_asset_as_replaced(filename) 111 | 112 | # Then 113 | with subtests.test(msg="db record is not queried"): 114 | processing_assets_model_mock.return_value.assert_not_called() 115 | 116 | with subtests.test(msg="Log is not recorded"): 117 | logger_mock.debug.assert_not_called() 118 | -------------------------------------------------------------------------------- /geostore/s3_utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from dataclasses import dataclass 3 | from logging import Logger 4 | from os.path import basename 5 | from typing import Callable, Optional, Tuple 6 | from urllib.parse import urlparse 7 | 8 | from botocore.exceptions import ClientError 9 | from botocore.response import StreamingBody 10 | 11 | from .logging_keys import GIT_COMMIT 12 | from .parameter_store import ParameterName, get_param 13 | from .resources import Resource 14 | from .s3 import get_s3_client_for_role 15 | 16 | KNOWN_ETAG_OF_EMPTY_FILE = '"d41d8cd98f00b204e9800998ecf8427e"' 17 | 18 | 19 | def get_bucket_and_key_from_url(url: str) -> Tuple[str, str]: 20 | parsed = urlparse(url) 21 | return parsed.netloc, parsed.path[1:] 22 | 23 | 24 | @dataclass 25 | class GeostoreS3Response: 26 | response: StreamingBody 27 | file_in_staging: bool 28 | 29 | 30 | def get_s3_url_reader( 31 | s3_role_arn: str, dataset_title: str, logger: Logger 32 | ) -> Callable[[str], GeostoreS3Response]: 33 | def s3_url_reader(staging_url: str) -> GeostoreS3Response: 34 | bucket_name, key = get_bucket_and_key_from_url(staging_url) 35 | 36 | try: 37 | staging_object = staging_s3_client.get_object(Bucket=bucket_name, Key=key) 38 | return GeostoreS3Response(staging_object["Body"], True) 39 | except ClientError as error: 40 | if error.response["Error"]["Code"] != "NoSuchKey": 41 | raise error 42 | 43 | geostore_key = f"{dataset_title}/{basename(urlparse(staging_url).path[1:])}" 44 | 45 | logger.debug( 46 | f"'{key}' is not present in the staging bucket." 
47 | f" Using '{geostore_key}' from the geostore bucket for validation instead.", 48 | extra={GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 49 | ) 50 | geostore_object = geostore_s3_client.get_object( 51 | Bucket=Resource.STORAGE_BUCKET_NAME.resource_name, Key=geostore_key 52 | ) 53 | return GeostoreS3Response(geostore_object["Body"], False) 54 | 55 | staging_s3_client = get_s3_client_for_role(s3_role_arn) 56 | geostore_s3_client = get_s3_client_for_role(get_param(ParameterName.S3_USERS_ROLE_ARN)) 57 | return s3_url_reader 58 | 59 | 60 | def get_s3_etag(s3_bucket: str, s3_object_key: str, logger: Logger) -> Optional[str]: 61 | geostore_s3_client = get_s3_client_for_role(get_param(ParameterName.S3_USERS_ROLE_ARN)) 62 | 63 | try: 64 | s3_response = geostore_s3_client.head_object(Bucket=s3_bucket, Key=s3_object_key) 65 | return s3_response["ETag"] 66 | except ClientError as error: 67 | if error.response["Error"]["Code"] != "404": 68 | logger.debug( 69 | f"Unable to fetch eTag for “{s3_object_key}” in s3://{s3_bucket} due to “{error}”", 70 | extra={GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 71 | ) 72 | # rather than raise, we return an empty string, indicating that the etag is different 73 | # thus allowing the next step to continue rather than stalling the entire process 74 | return None 75 | 76 | 77 | def calculate_s3_etag(body: bytes) -> str: 78 | # https://awscli.amazonaws.com/v2/documentation/api/latest/topic/s3-config.html#multipart-chunksize 79 | s3_default_chunk_size = 8_388_608 # Default value is 8 * 1024 * 1024 80 | 81 | if body == b"": 82 | return KNOWN_ETAG_OF_EMPTY_FILE 83 | 84 | chunk_hashes = [] 85 | 86 | for chunk_start in range(0, len(body), s3_default_chunk_size): 87 | chunk = body[chunk_start : chunk_start + s3_default_chunk_size] 88 | chunk_hashes.append(hashlib.md5(chunk, usedforsecurity=False)) 89 | 90 | # file smaller than s3_default_chunk_size has one chunk 91 | if len(chunk_hashes) == 1: 92 | # file at exactly s3_default_chunk_size is still one chunk 93 | # but etag is calculated as multi chunk file (e.g. 
"656dadd6d61e0ebfd29264e34d742df3-1") 94 | # where -1 suffix signifies 1 chunk 95 | if len(body) < s3_default_chunk_size: 96 | return f'"{chunk_hashes[0].hexdigest()}"' 97 | 98 | hash_object = hashlib.md5(usedforsecurity=False) 99 | for chunk_hash in chunk_hashes: 100 | hash_object.update(chunk_hash.digest()) 101 | 102 | return f'"{hash_object.hexdigest()}-{len(chunk_hashes)}"' 103 | -------------------------------------------------------------------------------- /tests/test_dataset_versions_endpoint_logging.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | from jsonschema import ValidationError 4 | from pynamodb.exceptions import DoesNotExist 5 | from pytest import mark 6 | 7 | from geostore.aws_keys import BODY_KEY, HTTP_METHOD_KEY 8 | from geostore.dataset_versions.create import create_dataset_version 9 | from geostore.logging_keys import ( 10 | GIT_COMMIT, 11 | LOG_MESSAGE_LAMBDA_FAILURE, 12 | LOG_MESSAGE_LAMBDA_START, 13 | LOG_MESSAGE_STEP_FUNCTION_RESPONSE, 14 | ) 15 | from geostore.parameter_store import ParameterName, get_param 16 | from geostore.step_function_keys import DATASET_ID_SHORT_KEY, METADATA_URL_KEY, S3_ROLE_ARN_KEY 17 | 18 | from .aws_utils import Dataset, any_role_arn, any_s3_url 19 | from .general_generators import any_error_message 20 | from .stac_generators import any_dataset_id, any_dataset_version_id 21 | 22 | 23 | @mark.infrastructure 24 | def should_log_payload() -> None: 25 | # Given 26 | with patch( 27 | "geostore.dataset_versions.create.STEP_FUNCTIONS_CLIENT.start_execution" 28 | ), Dataset() as dataset, patch("geostore.dataset_versions.create.LOGGER.debug") as logger_mock: 29 | event = { 30 | HTTP_METHOD_KEY: "POST", 31 | BODY_KEY: { 32 | METADATA_URL_KEY: any_s3_url(), 33 | DATASET_ID_SHORT_KEY: dataset.dataset_id, 34 | S3_ROLE_ARN_KEY: any_role_arn(), 35 | }, 36 | } 37 | 38 | # When 39 | create_dataset_version(event) 40 | 41 | # Then 42 | logger_mock.assert_any_call( 43 | LOG_MESSAGE_LAMBDA_START, 44 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 45 | ) 46 | 47 | 48 | @mark.infrastructure 49 | @patch("geostore.dataset_versions.create.STEP_FUNCTIONS_CLIENT.start_execution") 50 | def should_log_step_function_state_machine_response(start_execution_mock: MagicMock) -> None: 51 | # Given 52 | start_execution_mock.return_value = step_function_response = {"executionArn": "Some Response"} 53 | 54 | with Dataset(current_dataset_version=any_dataset_version_id()) as dataset, patch( 55 | "geostore.dataset_versions.create.LOGGER.debug" 56 | ) as logger_mock: 57 | event = { 58 | METADATA_URL_KEY: any_s3_url(), 59 | DATASET_ID_SHORT_KEY: dataset.dataset_id, 60 | S3_ROLE_ARN_KEY: any_role_arn(), 61 | } 62 | 63 | # When 64 | create_dataset_version(event) 65 | 66 | # Then 67 | logger_mock.assert_any_call( 68 | LOG_MESSAGE_STEP_FUNCTION_RESPONSE, 69 | extra={ 70 | "response": step_function_response, 71 | GIT_COMMIT: get_param(ParameterName.GIT_COMMIT), 72 | }, 73 | ) 74 | 75 | 76 | @patch("geostore.dataset_versions.create.validate") 77 | def should_log_missing_argument_warning(validate_schema_mock: MagicMock) -> None: 78 | # given 79 | error_message = any_error_message() 80 | validate_schema_mock.side_effect = ValidationError(error_message) 81 | 82 | payload = {HTTP_METHOD_KEY: "POST", BODY_KEY: {}} 83 | 84 | with patch("geostore.dataset_versions.create.LOGGER.warning") as logger_mock: 85 | # when 86 | create_dataset_version(payload) 87 | 88 | # then 
89 | logger_mock.assert_any_call( 90 | LOG_MESSAGE_LAMBDA_FAILURE, 91 | extra={"error": error_message, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 92 | ) 93 | 94 | 95 | @patch("geostore.dataset_versions.create.datasets_model_with_meta") 96 | def should_log_warning_if_dataset_does_not_exist(datasets_model_mock: MagicMock) -> None: 97 | # given 98 | error_message = any_error_message() 99 | datasets_model_mock.return_value.get.side_effect = DoesNotExist(error_message) 100 | 101 | payload = { 102 | METADATA_URL_KEY: any_s3_url(), 103 | DATASET_ID_SHORT_KEY: any_dataset_id(), 104 | S3_ROLE_ARN_KEY: any_role_arn(), 105 | } 106 | 107 | with patch("geostore.dataset_versions.create.LOGGER.warning") as logger_mock: 108 | # when 109 | create_dataset_version(payload) 110 | 111 | # then 112 | logger_mock.assert_any_call( 113 | LOG_MESSAGE_LAMBDA_FAILURE, 114 | extra={"error": error_message, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 115 | ) 116 | -------------------------------------------------------------------------------- /tests/stac_generators.py: -------------------------------------------------------------------------------- 1 | from hashlib import sha256 2 | from random import choice, randrange 3 | from uuid import uuid4 4 | 5 | from multihash import SHA2_256 6 | 7 | from geostore.dataset_properties import TITLE_CHARACTERS 8 | from geostore.stac_format import ( 9 | LINZ_STAC_CREATED_KEY, 10 | LINZ_STAC_UPDATED_KEY, 11 | STAC_MAXIMUM_KEY, 12 | STAC_MINIMUM_KEY, 13 | ) 14 | from geostore.types import JsonObject 15 | 16 | from .general_generators import ( 17 | _random_string_choices, 18 | any_description, 19 | any_https_url, 20 | any_name, 21 | any_past_datetime, 22 | random_string, 23 | ) 24 | 25 | 26 | def any_hex_multihash() -> str: 27 | hex_digest = any_sha256_hex_digest() 28 | return sha256_hex_digest_to_multihash(hex_digest) 29 | 30 | 31 | def any_sha256_hex_digest() -> str: 32 | return sha256(random_string(20).encode()).hexdigest() 33 | 34 | 35 | def sha256_hex_digest_to_multihash(hex_digest: str) -> str: 36 | return f"{SHA2_256:x}{32:x}{hex_digest}" 37 | 38 | 39 | def any_dataset_id() -> str: 40 | return uuid4().hex 41 | 42 | 43 | def any_dataset_version_id() -> str: 44 | """Arbitrary-length string""" 45 | return uuid4().hex 46 | 47 | 48 | def any_dataset_title() -> str: 49 | """Arbitrary-length string of valid dataset title characters""" 50 | return _random_string_choices(TITLE_CHARACTERS, 20) 51 | 52 | 53 | def any_asset_name() -> str: 54 | """Arbitrary-length string""" 55 | return random_string(20) 56 | 57 | 58 | def any_dataset_description() -> str: 59 | """Arbitrary-length string""" 60 | return random_string(100) 61 | 62 | 63 | def any_linz_asset_summaries() -> JsonObject: 64 | """ 65 | Semi-arbitrary dates: 66 | 67 | - The first creation date can't be after any of the other dates 68 | - The last created and first updated dates can be anywhere within the range 69 | - The last updated date can't be before any of the other dates 70 | """ 71 | datetimes = [any_past_datetime(), any_past_datetime(), any_past_datetime(), any_past_datetime()] 72 | return { 73 | LINZ_STAC_CREATED_KEY: { 74 | STAC_MINIMUM_KEY: min(datetimes).isoformat(), 75 | STAC_MAXIMUM_KEY: choice(datetimes).isoformat(), 76 | }, 77 | LINZ_STAC_UPDATED_KEY: { 78 | STAC_MINIMUM_KEY: choice(datetimes).isoformat(), 79 | STAC_MAXIMUM_KEY: max(datetimes).isoformat(), 80 | }, 81 | } 82 | 83 | 84 | def any_linz_geospatial_type() -> str: 85 | return choice( 86 | [ 87 | "black and white image", 88 | "circular string", 89 
| "color image", 90 | "compound curve", 91 | "curve polygon", 92 | "geometry", 93 | "geometry collection", 94 | "grayscale", 95 | "grid", 96 | "hyperspectral", 97 | "multicurve", 98 | "multilinestring", 99 | "multipoint", 100 | "multipolygon", 101 | "multispectral", 102 | "multisurface", 103 | "linestring", 104 | "point", 105 | "point cloud", 106 | "polygon", 107 | "polyhedral surface", 108 | "rgb", 109 | "tin", 110 | "triangle", 111 | ] 112 | ) 113 | 114 | 115 | def any_linz_history() -> str: 116 | """Arbitrary-length string""" 117 | return random_string(20) 118 | 119 | 120 | def any_linz_lifecycle() -> str: 121 | return choice(["under development", "preview", "ongoing", "completed", "deprecated"]) 122 | 123 | 124 | def any_provider(role: str) -> JsonObject: 125 | return { 126 | "name": any_name(), 127 | "description": any_description(), 128 | "roles": [role], 129 | "url": any_https_url(), 130 | } 131 | 132 | 133 | def any_linz_provider_custodian() -> JsonObject: 134 | return any_provider("custodian") 135 | 136 | 137 | def any_linz_provider_manager() -> JsonObject: 138 | return any_provider("manager") 139 | 140 | 141 | def any_provider_licensor() -> JsonObject: 142 | return any_provider("licensor") 143 | 144 | 145 | def any_provider_producer() -> JsonObject: 146 | return any_provider("producer") 147 | 148 | 149 | def any_epsg() -> int: 150 | return randrange(1_000_000) 151 | 152 | 153 | def any_version_version() -> str: 154 | return f"{randrange(1_000)}.{randrange(1_000)}.{randrange(1_000)}" 155 | -------------------------------------------------------------------------------- /geostore/populate_catalog/task.py: -------------------------------------------------------------------------------- 1 | from json import dumps 2 | from logging import Logger 3 | from typing import TYPE_CHECKING 4 | 5 | import boto3 6 | from linz_logger import get_log 7 | from pystac import read_file 8 | from pystac.catalog import Catalog, CatalogType 9 | from pystac.collection import Collection 10 | from pystac.item import Item 11 | from pystac.layout import HrefLayoutStrategy 12 | from pystac.stac_io import StacIO 13 | 14 | from ..api_keys import EVENT_KEY 15 | from ..aws_keys import BODY_KEY 16 | from ..boto3_config import CONFIG 17 | from ..logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_FAILURE 18 | from ..parameter_store import ParameterName, get_param 19 | from ..pystac_io_methods import S3StacIO 20 | from ..resources import Resource 21 | from ..s3 import S3_URL_PREFIX 22 | from ..types import JsonObject 23 | 24 | if TYPE_CHECKING: 25 | # When type checking we want to use the third party package's stub 26 | from mypy_boto3_s3 import S3Client 27 | else: 28 | # In production we want to avoid depending on a package which has no runtime impact 29 | S3Client = object # pragma: no mutate 30 | 31 | S3_CLIENT: S3Client = boto3.client("s3", config=CONFIG) 32 | 33 | ROOT_CATALOG_ID = "root_catalog" 34 | ROOT_CATALOG_TITLE = "Toitū Te Whenua Land Information New Zealand Geostore" 35 | ROOT_CATALOG_DESCRIPTION = ( 36 | "The Geospatial Data Store (Geostore) contains all the important " 37 | "geospatial data held by Toitū Te Whenua Land Information New Zealand.
" 38 | "Please browse this catalog to find and access our data." 39 | ) 40 | CATALOG_FILENAME = "catalog.json" 41 | CONTENTS_KEY = "Contents" 42 | RECORDS_KEY = "Records" 43 | 44 | LOGGER: Logger = get_log() 45 | 46 | StacIO.set_default(S3StacIO) 47 | 48 | 49 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 50 | """Main Lambda entry point.""" 51 | 52 | LOGGER.debug(dumps({EVENT_KEY: event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)})) 53 | 54 | for message in event[RECORDS_KEY]: 55 | handle_message(message[BODY_KEY]) 56 | 57 | return {} 58 | 59 | 60 | class GeostoreSTACLayoutStrategy(HrefLayoutStrategy): 61 | def get_catalog_href(self, cat: Catalog, parent_dir: str, is_root: bool) -> str: 62 | return str(cat.get_self_href()) 63 | 64 | def get_collection_href(self, col: Collection, parent_dir: str, is_root: bool) -> str: 65 | assert not is_root 66 | return str(col.get_self_href()) 67 | 68 | def get_item_href(self, item: Item, parent_dir: str) -> str: # pragma: no cover 69 | raise NotImplementedError() 70 | 71 | 72 | def handle_message(metadata_key: str) -> None: 73 | """Handle writing a new dataset to the root catalog""" 74 | 75 | storage_bucket_path = f"{S3_URL_PREFIX}{Resource.STORAGE_BUCKET_NAME.resource_name}" 76 | 77 | # there could be a myriad of problems preventing catalog from being populated 78 | # hence a rather broad try except exception clause is used 79 | # an exception thrown here indicates stuck message(s) in the sqs queue 80 | # logging is monitored by elasticsearch and alerting is set up to notify the team of a problem 81 | try: 82 | dataset_metadata = read_file(f"{storage_bucket_path}/{metadata_key}") 83 | assert isinstance(dataset_metadata, (Catalog, Collection)) 84 | 85 | results = S3_CLIENT.list_objects( 86 | Bucket=Resource.STORAGE_BUCKET_NAME.resource_name, Prefix=CATALOG_FILENAME 87 | ) 88 | 89 | # create root catalog if it doesn't exist 90 | if CONTENTS_KEY in results: 91 | root_catalog = Catalog.from_file(f"{storage_bucket_path}/{CATALOG_FILENAME}") 92 | 93 | else: 94 | root_catalog = Catalog( 95 | id=ROOT_CATALOG_ID, 96 | title=ROOT_CATALOG_TITLE, 97 | description=ROOT_CATALOG_DESCRIPTION, 98 | catalog_type=CatalogType.SELF_CONTAINED, 99 | ) 100 | root_catalog.set_self_href(f"{storage_bucket_path}/{CATALOG_FILENAME}") 101 | 102 | if root_catalog.get_child(dataset_metadata.id) is None: 103 | root_catalog.add_child(child=dataset_metadata, strategy=GeostoreSTACLayoutStrategy()) 104 | 105 | root_catalog.save(catalog_type=CatalogType.SELF_CONTAINED) 106 | 107 | except Exception as error: 108 | LOGGER.warning( 109 | f"{LOG_MESSAGE_LAMBDA_FAILURE}: Unable to populate catalog due to “{error}”", 110 | extra={GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 111 | ) 112 | raise 113 | -------------------------------------------------------------------------------- /geostore/update_root_catalog/task.py: -------------------------------------------------------------------------------- 1 | from logging import Logger 2 | from os.path import basename 3 | from typing import TYPE_CHECKING 4 | from urllib.parse import urlparse 5 | from uuid import uuid4 6 | 7 | import boto3 8 | from jsonschema import ValidationError, validate 9 | from linz_logger import get_log 10 | 11 | from ..boto3_config import CONFIG 12 | from ..datasets_model import datasets_model_with_meta 13 | from ..error_response_keys import ERROR_MESSAGE_KEY 14 | from ..logging_keys import ( 15 | GIT_COMMIT, 16 | LOG_MESSAGE_LAMBDA_FAILURE, 17 | LOG_MESSAGE_LAMBDA_START, 18 | 
LOG_MESSAGE_S3_DELETION_RESPONSE, 19 | ) 20 | from ..models import DATASET_ID_PREFIX 21 | from ..parameter_store import ParameterName, get_param 22 | from ..processing_assets_model import processing_assets_model_with_meta 23 | from ..resources import Resource 24 | from ..s3 import S3_URL_PREFIX 25 | from ..step_function import get_hash_key 26 | from ..step_function_keys import ( 27 | CURRENT_VERSION_ID_KEY, 28 | DATASET_ID_KEY, 29 | DATASET_TITLE_KEY, 30 | METADATA_URL_KEY, 31 | NEW_VERSION_ID_KEY, 32 | NEW_VERSION_S3_LOCATION, 33 | ) 34 | from ..types import JsonObject 35 | 36 | if TYPE_CHECKING: 37 | # When type checking we want to use the third party package's stub 38 | from mypy_boto3_s3 import S3Client 39 | from mypy_boto3_sqs import SQSServiceResource 40 | else: 41 | # In production we want to avoid depending on a package which has no runtime impact 42 | S3Client = SQSServiceResource = object # pragma: no mutate 43 | 44 | LOGGER: Logger = get_log() 45 | SQS_RESOURCE: SQSServiceResource = boto3.resource("sqs") 46 | S3_CLIENT: S3Client = boto3.client("s3", config=CONFIG) 47 | 48 | SQS_MESSAGE_GROUP_ID = "update_root_catalog_message_group" 49 | 50 | 51 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 52 | """Main Lambda entry point.""" 53 | LOGGER.debug( 54 | LOG_MESSAGE_LAMBDA_START, 55 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 56 | ) 57 | 58 | # validate input 59 | try: 60 | validate( 61 | event, 62 | { 63 | "type": "object", 64 | "properties": { 65 | CURRENT_VERSION_ID_KEY: {"type": "string"}, 66 | DATASET_ID_KEY: {"type": "string"}, 67 | DATASET_TITLE_KEY: {"type": "string"}, 68 | NEW_VERSION_ID_KEY: {"type": "string"}, 69 | METADATA_URL_KEY: {"type": "string"}, 70 | }, 71 | "required": [ 72 | CURRENT_VERSION_ID_KEY, 73 | DATASET_ID_KEY, 74 | DATASET_TITLE_KEY, 75 | METADATA_URL_KEY, 76 | NEW_VERSION_ID_KEY, 77 | ], 78 | }, 79 | ) 80 | except ValidationError as error: 81 | LOGGER.warning( 82 | LOG_MESSAGE_LAMBDA_FAILURE, 83 | extra={"error": error, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 84 | ) 85 | return {ERROR_MESSAGE_KEY: error.message} 86 | 87 | dataset_key = ( 88 | f"{event[DATASET_TITLE_KEY]}/{basename(urlparse(event[METADATA_URL_KEY]).path[1:])}" 89 | ) 90 | 91 | # add reference to root catalog 92 | SQS_RESOURCE.get_queue_by_name( 93 | QueueName=get_param(ParameterName.UPDATE_CATALOG_MESSAGE_QUEUE_NAME) 94 | ).send_message( 95 | MessageBody=dataset_key, 96 | MessageGroupId=SQS_MESSAGE_GROUP_ID, 97 | MessageDeduplicationId=uuid4().hex, 98 | ) 99 | 100 | processing_assets_model = processing_assets_model_with_meta() 101 | for item in processing_assets_model.query( 102 | get_hash_key(event[DATASET_ID_KEY], event[CURRENT_VERSION_ID_KEY]), 103 | filter_condition=processing_assets_model.replaced_in_new_version.does_not_exist(), 104 | ): 105 | s3_response = S3_CLIENT.delete_object( 106 | Bucket=Resource.STORAGE_BUCKET_NAME.resource_name, 107 | Key=f"{event[DATASET_TITLE_KEY]}/{item.filename}", 108 | ) 109 | LOGGER.debug( 110 | LOG_MESSAGE_S3_DELETION_RESPONSE, 111 | extra={"response": s3_response, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 112 | ) 113 | 114 | # Update dataset record with the latest version 115 | datasets_model = datasets_model_with_meta() 116 | dataset = datasets_model.get( 117 | hash_key=f"{DATASET_ID_PREFIX}{event[DATASET_ID_KEY]}", consistent_read=True 118 | ) 119 | dataset.update(actions=[datasets_model.current_dataset_version.set(event[NEW_VERSION_ID_KEY])]) 120 | 121 | return { 122 
| NEW_VERSION_S3_LOCATION: f"{S3_URL_PREFIX}" 123 | f"{Resource.STORAGE_BUCKET_NAME.resource_name}/" 124 | f"{dataset_key}" 125 | } 126 | -------------------------------------------------------------------------------- /infrastructure/application_stack.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import shutil 3 | from os import environ 4 | 5 | import constructs 6 | from aws_cdk import Environment, Stack, aws_iam 7 | 8 | from geostore.environment import environment_name 9 | from infrastructure.constructs.bundled_code import LambdaPackaging 10 | 11 | from .constructs.api import API 12 | from .constructs.lambda_layers import LambdaLayers 13 | from .constructs.lds import LDS 14 | from .constructs.notify import Notify 15 | from .constructs.opentopo import OpenTopography 16 | from .constructs.processing import Processing 17 | from .constructs.staging import Staging 18 | from .constructs.storage import Storage 19 | 20 | 21 | class Application(Stack): 22 | def __init__(self, scope: constructs.Construct, stack_id: str) -> None: 23 | environment = Environment( 24 | account=environ["CDK_DEFAULT_ACCOUNT"], region=environ["CDK_DEFAULT_REGION"] 25 | ) 26 | 27 | super().__init__(scope, stack_id, env=environment) 28 | 29 | env_name = environment_name() 30 | 31 | principal: aws_iam.PrincipalBase 32 | if saml_provider_arn := environ.get("GEOSTORE_SAML_IDENTITY_PROVIDER_ARN"): 33 | principal = aws_iam.FederatedPrincipal( 34 | federated=saml_provider_arn, 35 | assume_role_action="sts:AssumeRoleWithSAML", 36 | conditions={"StringEquals": {"SAML:aud": "https://signin.aws.amazon.com/saml"}}, 37 | ) 38 | else: 39 | open_id_connect_provider_arn = ( 40 | f"arn:aws:iam::" 41 | f"{aws_iam.AccountRootPrincipal().account_id}" 42 | f":oidc-provider/token.actions.githubusercontent.com" 43 | ) 44 | 45 | principal = aws_iam.CompositePrincipal( 46 | aws_iam.AccountPrincipal(account_id=aws_iam.AccountRootPrincipal().account_id), 47 | aws_iam.WebIdentityPrincipal( 48 | identity_provider=open_id_connect_provider_arn, 49 | conditions={ 50 | "StringLike": { 51 | "token.actions.githubusercontent.com:aud": ["sts.amazonaws.com"], 52 | "token.actions.githubusercontent.com:sub": ["repo:linz/geostore:*"], 53 | } 54 | }, 55 | ), 56 | ) 57 | 58 | storage = Storage(self, "storage", env_name=env_name) 59 | 60 | lambda_layers = LambdaLayers(self, "lambda-layers", env_name=env_name) 61 | 62 | processing = Processing( 63 | self, 64 | "processing", 65 | botocore_lambda_layer=lambda_layers.botocore, 66 | env_name=env_name, 67 | principal=principal, 68 | s3_role_arn_parameter=storage.s3_role_arn_parameter, 69 | storage_bucket=storage.storage_bucket, 70 | validation_results_table=storage.validation_results_table, 71 | datasets_table=storage.datasets_table, 72 | git_commit_parameter=storage.git_commit_parameter, 73 | ) 74 | Staging(self, "staging", users_role=processing.staging_users_role) 75 | 76 | API( 77 | self, 78 | "api", 79 | botocore_lambda_layer=lambda_layers.botocore, 80 | datasets_table=storage.datasets_table, 81 | env_name=env_name, 82 | processing_assets_table=processing.processing_assets_table, 83 | state_machine=processing.state_machine, 84 | state_machine_parameter=processing.state_machine_parameter, 85 | sqs_queue=processing.message_queue, 86 | sqs_queue_parameter=processing.message_queue_name_parameter, 87 | storage_bucket=storage.storage_bucket, 88 | validation_results_table=storage.validation_results_table, 89 | git_commit_parameter=storage.git_commit_parameter, 90 
| ) 91 | 92 | Notify( 93 | self, 94 | "notify", 95 | botocore_lambda_layer=lambda_layers.botocore, 96 | env_name=env_name, 97 | state_machine=processing.state_machine, 98 | validation_results_table=storage.validation_results_table, 99 | git_commit_parameter=storage.git_commit_parameter, 100 | ) 101 | 102 | if self.node.try_get_context("enableLDSAccess"): 103 | LDS(self, "lds", env_name=env_name, storage_bucket=storage.storage_bucket) 104 | 105 | if self.node.try_get_context("enableOpenTopographyAccess"): 106 | OpenTopography( 107 | self, "opentopography", env_name=env_name, storage_bucket=storage.storage_bucket 108 | ) 109 | 110 | # Remove temp lambda packaging directory at exit to purge pip packages 111 | # Reusing pip packages would speed things up, but also makes things 112 | # harder to troubleshoot when there is a change in one of the Python packages 113 | atexit.register(lambda: shutil.rmtree(LambdaPackaging.directory)) 114 | -------------------------------------------------------------------------------- /infrastructure/constructs/api.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import ( 2 | Tags, 3 | aws_iam, 4 | aws_lambda_python_alpha, 5 | aws_s3, 6 | aws_sqs, 7 | aws_ssm, 8 | aws_stepfunctions, 9 | ) 10 | from constructs import Construct 11 | 12 | from geostore.resources import Resource 13 | 14 | from .common import grant_parameter_read_access 15 | from .lambda_endpoint import LambdaEndpoint 16 | from .roles import LINZ_ORGANIZATION_ID, MAX_SESSION_DURATION 17 | from .s3_policy import ALLOW_DESCRIBE_ANY_S3_JOB 18 | from .table import Table 19 | 20 | 21 | class API(Construct): 22 | def __init__( # pylint: disable=too-many-locals 23 | self, 24 | scope: Construct, 25 | stack_id: str, 26 | *, 27 | botocore_lambda_layer: aws_lambda_python_alpha.PythonLayerVersion, 28 | datasets_table: Table, 29 | env_name: str, 30 | processing_assets_table: Table, 31 | state_machine: aws_stepfunctions.StateMachine, 32 | state_machine_parameter: aws_ssm.StringParameter, 33 | sqs_queue: aws_sqs.Queue, 34 | sqs_queue_parameter: aws_ssm.StringParameter, 35 | storage_bucket: aws_s3.Bucket, 36 | validation_results_table: Table, 37 | git_commit_parameter: aws_ssm.StringParameter, 38 | ) -> None: 39 | super().__init__(scope, stack_id) 40 | 41 | ############################################################################################ 42 | # ### API ENDPOINTS ######################################################################## 43 | ############################################################################################ 44 | 45 | api_users_role = aws_iam.Role( 46 | self, 47 | "api-users-role", 48 | role_name=Resource.API_USERS_ROLE_NAME.resource_name, 49 | assumed_by=aws_iam.OrganizationPrincipal(LINZ_ORGANIZATION_ID), 50 | max_session_duration=MAX_SESSION_DURATION, 51 | ) 52 | 53 | datasets_endpoint_lambda = LambdaEndpoint( 54 | self, 55 | Resource.DATASETS_ENDPOINT_FUNCTION_NAME.resource_name, 56 | package_name="datasets", 57 | env_name=env_name, 58 | users_role=api_users_role, 59 | botocore_lambda_layer=botocore_lambda_layer, 60 | ) 61 | 62 | dataset_versions_endpoint_lambda = LambdaEndpoint( 63 | self, 64 | Resource.DATASET_VERSIONS_ENDPOINT_FUNCTION_NAME.resource_name, 65 | package_name="dataset_versions", 66 | env_name=env_name, 67 | users_role=api_users_role, 68 | botocore_lambda_layer=botocore_lambda_layer, 69 | ) 70 | processing_assets_table.grant_read_write_data(dataset_versions_endpoint_lambda) 71 | 
processing_assets_table.grant(dataset_versions_endpoint_lambda, "dynamodb:DescribeTable") 72 | 73 | state_machine.grant_start_execution(dataset_versions_endpoint_lambda) 74 | 75 | storage_bucket.grant_read_write(datasets_endpoint_lambda) 76 | 77 | sqs_queue.grant_send_messages(datasets_endpoint_lambda) 78 | 79 | for function in [datasets_endpoint_lambda, dataset_versions_endpoint_lambda]: 80 | datasets_table.grant_read_write_data(function) 81 | datasets_table.grant(function, "dynamodb:DescribeTable") # required by pynamodb 82 | 83 | import_status_endpoint_lambda = LambdaEndpoint( 84 | self, 85 | Resource.IMPORT_STATUS_ENDPOINT_FUNCTION_NAME.resource_name, 86 | package_name="import_status", 87 | env_name=env_name, 88 | users_role=api_users_role, 89 | botocore_lambda_layer=botocore_lambda_layer, 90 | ) 91 | 92 | validation_results_table.grant_read_data(import_status_endpoint_lambda) 93 | validation_results_table.grant( 94 | import_status_endpoint_lambda, "dynamodb:DescribeTable" 95 | ) # required by pynamodb 96 | 97 | state_machine.grant_read(import_status_endpoint_lambda) 98 | import_status_endpoint_lambda.add_to_role_policy(ALLOW_DESCRIBE_ANY_S3_JOB) 99 | 100 | grant_parameter_read_access( 101 | { 102 | datasets_table.name_parameter: [ 103 | datasets_endpoint_lambda, 104 | dataset_versions_endpoint_lambda, 105 | ], 106 | processing_assets_table.name_parameter: [dataset_versions_endpoint_lambda], 107 | validation_results_table.name_parameter: [import_status_endpoint_lambda], 108 | state_machine_parameter: [dataset_versions_endpoint_lambda], 109 | sqs_queue_parameter: [datasets_endpoint_lambda], 110 | git_commit_parameter: [ 111 | datasets_endpoint_lambda, 112 | dataset_versions_endpoint_lambda, 113 | import_status_endpoint_lambda, 114 | ], 115 | } 116 | ) 117 | 118 | Tags.of(self).add("ApplicationLayer", "api") 119 | -------------------------------------------------------------------------------- /infrastructure/constructs/storage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Geostore AWS resources definitions. 
3 | """ 4 | from aws_cdk import Tags, aws_dynamodb, aws_iam, aws_s3, aws_ssm 5 | from constructs import Construct 6 | 7 | from geostore.datasets_model import DatasetsTitleIdx 8 | from geostore.parameter_store import ParameterName 9 | from geostore.resources import Resource 10 | from geostore.validation_results_model import ValidationOutcomeIdx 11 | 12 | from .removal_policy import REMOVAL_POLICY 13 | from .roles import LINZ_ORGANIZATION_ID, MAX_SESSION_DURATION 14 | from .table import Table 15 | from .version import GIT_BRANCH, GIT_COMMIT, GIT_TAG 16 | 17 | 18 | class Storage(Construct): 19 | def __init__(self, scope: Construct, stack_id: str, *, env_name: str) -> None: 20 | super().__init__(scope, stack_id) 21 | 22 | ############################################################################################ 23 | # ### DEPLOYMENT VERSION ################################################################### 24 | ############################################################################################ 25 | 26 | aws_ssm.StringParameter( 27 | self, 28 | "git-branch", 29 | parameter_name=f"/{env_name}/git_branch", 30 | string_value=GIT_BRANCH, 31 | description="Deployment git branch", 32 | ) 33 | 34 | self.git_commit_parameter = aws_ssm.StringParameter( 35 | self, 36 | "git-commit", 37 | parameter_name=f"/{env_name}/git_commit", 38 | string_value=GIT_COMMIT, 39 | description="Deployment git commit", 40 | ) 41 | 42 | aws_ssm.StringParameter( 43 | self, 44 | "git-tag", 45 | parameter_name=f"/{env_name}/version", 46 | string_value=GIT_TAG, 47 | description="Deployment version", 48 | ) 49 | 50 | ############################################################################################ 51 | # ### STORAGE S3 BUCKET #################################################################### 52 | ############################################################################################ 53 | self.storage_bucket = aws_s3.Bucket( 54 | self, 55 | "storage-bucket", 56 | bucket_name=Resource.STORAGE_BUCKET_NAME.resource_name, 57 | access_control=aws_s3.BucketAccessControl.PRIVATE, 58 | block_public_access=aws_s3.BlockPublicAccess.BLOCK_ALL, 59 | versioned=True, 60 | removal_policy=REMOVAL_POLICY, 61 | enforce_ssl=True, 62 | ) 63 | 64 | s3_users_role = aws_iam.Role( 65 | self, 66 | "s3-users-role", 67 | role_name=Resource.S3_USERS_ROLE_NAME.resource_name, 68 | assumed_by=aws_iam.OrganizationPrincipal(LINZ_ORGANIZATION_ID), 69 | max_session_duration=MAX_SESSION_DURATION, 70 | ) 71 | self.storage_bucket.grant_read(s3_users_role) 72 | 73 | self.s3_role_arn_parameter = aws_ssm.StringParameter( 74 | self, 75 | "s3-users-role-arn", 76 | string_value=s3_users_role.role_arn, 77 | parameter_name=ParameterName.S3_USERS_ROLE_ARN.value, 78 | ) 79 | 80 | ############################################################################################ 81 | # ### APPLICATION DB ####################################################################### 82 | ############################################################################################ 83 | self.datasets_table = Table( 84 | self, 85 | f"{env_name}-datasets", 86 | env_name=env_name, 87 | parameter_name=ParameterName.STORAGE_DATASETS_TABLE_NAME, 88 | ) 89 | 90 | self.datasets_table.add_global_secondary_index( 91 | index_name=DatasetsTitleIdx.Meta.index_name, 92 | partition_key=aws_dynamodb.Attribute( 93 | name="title", type=aws_dynamodb.AttributeType.STRING 94 | ), 95 | ) 96 | 97 | self.validation_results_table = Table( 98 | self, 99 | f"{env_name}-validation-results", 100 
| env_name=env_name, 101 | parameter_name=ParameterName.STORAGE_VALIDATION_RESULTS_TABLE_NAME, 102 | sort_key=aws_dynamodb.Attribute(name="sk", type=aws_dynamodb.AttributeType.STRING), 103 | ) 104 | 105 | self.validation_results_table.add_global_secondary_index( 106 | index_name=ValidationOutcomeIdx.Meta.index_name, 107 | partition_key=aws_dynamodb.Attribute( 108 | name=ValidationOutcomeIdx.pk.attr_name, type=aws_dynamodb.AttributeType.STRING 109 | ), 110 | sort_key=aws_dynamodb.Attribute( 111 | name=ValidationOutcomeIdx.result.attr_name, type=aws_dynamodb.AttributeType.STRING 112 | ), 113 | ) 114 | 115 | Tags.of(self).add("ApplicationLayer", "storage") 116 | --------------------------------------------------------------------------------
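
The following is an illustrative sketch, not a file in this repository, showing how the multipart ETag format produced by calculate_s3_etag in geostore/s3_utils.py above can be reproduced by hand. It assumes the same 8 MiB chunk size that the function uses internally; everything else is standard library.

import hashlib

from geostore.s3_utils import calculate_s3_etag

chunk_size = 8_388_608  # mirrors s3_default_chunk_size inside calculate_s3_etag

# A body spanning two chunks gets a "-2" suffix: the MD5 of the concatenated
# per-chunk MD5 digests, followed by the number of chunks.
body = b"x" * (chunk_size + 1)
first = hashlib.md5(body[:chunk_size], usedforsecurity=False).digest()
second = hashlib.md5(body[chunk_size:], usedforsecurity=False).digest()
combined = hashlib.md5(first + second, usedforsecurity=False).hexdigest()
assert calculate_s3_etag(body) == f'"{combined}-2"'

# A body smaller than one chunk is the plain quoted MD5 digest, with no suffix.
small = b"some bytes"
assert calculate_s3_etag(small) == f'"{hashlib.md5(small, usedforsecurity=False).hexdigest()}"'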
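
A second illustrative sketch, also not a file in this repository: one way a consumer could subscribe to the step function status topic created by the Notify construct, reading the topic ARN from the SSM parameter the construct publishes. The SQS queue ARN is a hypothetical placeholder, and the call assumes credentials for the deployed environment; the topic's resource policy permits sns:Subscribe for any principal, but an SQS endpoint would still need its own queue policy.

import boto3

from geostore.parameter_store import ParameterName, get_param

# Look up the topic ARN stored by the Notify construct, then attach a queue to it.
topic_arn = get_param(ParameterName.STATUS_SNS_TOPIC_ARN)
boto3.client("sns").subscribe(
    TopicArn=topic_arn,
    Protocol="sqs",
    Endpoint="arn:aws:sqs:ap-southeast-2:123456789012:example-status-queue",  # placeholder
)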