├── .nvmrc
├── tests
├── __init__.py
├── test_networking_stack.py
├── dynamodb_generators.py
├── test_get_latest_extension_schema_version.py
├── file_utils.py
├── test_datasets_model_base.py
├── test_prefix_non_prod_name.py
├── test_parameter_store.py
├── test_step_function_logging.py
├── conftest.py
├── test_storage_bucket.py
├── test_validation_summary.py
├── test_upload_status_logging.py
├── test_api_endpoint_handler.py
├── test_validation_summary_logging.py
├── test_check_files_checksums_logging.py
├── general_generators.py
├── test_upload_status.py
├── test_import_status_logging.py
├── test_step_function.py
├── test_dataset_versions_endpoint_logging.py
└── stac_generators.py
├── geostore
├── __init__.py
├── datasets
│   ├── __init__.py
│   ├── list.py
│   ├── entrypoint.py
│   ├── create.py
│   ├── update.py
│   ├── delete.py
│   └── get.py
├── content_iterator
│   ├── __init__.py
│   └── task.py
├── dataset_versions
│   ├── __init__.py
│   └── entrypoint.py
├── import_asset_file
│   ├── __init__.py
│   └── task.py
├── import_dataset
│   └── __init__.py
├── import_status
│   ├── __init__.py
│   ├── entrypoint.py
│   └── get.py
├── populate_catalog
│   ├── __init__.py
│   └── task.py
├── upload_status
│   ├── __init__.py
│   └── task.py
├── check_files_checksums
│   ├── __init__.py
│   └── task.py
├── check_stac_metadata
│   ├── __init__.py
│   ├── task.py
│   └── stac_validators.py
├── import_metadata_file
│   ├── __init__.py
│   └── task.py
├── notify_status_update
│   └── __init__.py
├── update_root_catalog
│   ├── __init__.py
│   └── task.py
├── validation_summary
│   ├── __init__.py
│   └── task.py
├── aws_response.py
├── error_response_keys.py
├── import_file_batch_job_id_keys.py
├── api_keys.py
├── boto3_config.py
├── types.py
├── clock.py
├── import_dataset_keys.py
├── aws_keys.py
├── dataset_properties.py
├── wheel.txt
├── models.py
├── pip.txt
├── environment.py
├── logging_keys.py
├── aws_message_attributes.py
├── sts.py
├── Dockerfile
├── check.py
├── resources.py
├── s3.py
├── step_function_keys.py
├── processing_assets_model.py
├── api_responses.py
├── pystac_io_methods.py
├── stac_format.py
├── parameter_store.py
├── datasets_model.py
├── validation_results_model.py
├── import_dataset_file.py
└── s3_utils.py
├── .python-version
├── infrastructure
├── __init__.py
├── constructs
│   ├── __init__.py
│   ├── backend.py
│   ├── lambda_layers
│   │   └── botocore
│   │   │   ├── .gitignore
│   │   │   ├── pyproject.toml
│   │   │   └── poetry.lock
│   ├── roles.py
│   ├── sts_policy.py
│   ├── s3_policy.py
│   ├── removal_policy.py
│   ├── lambda_config.py
│   ├── common.py
│   ├── version.py
│   ├── lambda_layers.py
│   ├── opentopo.py
│   ├── lds.py
│   ├── table.py
│   ├── staging.py
│   ├── lambda_task.py
│   ├── import_file_function.py
│   ├── lambda_endpoint.py
│   ├── bundled_lambda_function.py
│   ├── batch_submit_job_task.py
│   ├── task_job_definition.py
│   ├── bundled_code.py
│   ├── notify.py
│   ├── api.py
│   └── storage.py
├── networking_stack.py
└── application_stack.py
├── .hadolint.yaml
├── poetry.toml
├── .github
├── release.yml
├── codeql
│   └── codeql-config.yml
├── workflows
│   ├── .env
│   ├── update-license-year.yml
│   ├── codeql-analysis.yml
│   ├── package-cli.yml
│   └── mutation-test.yml
├── pull_request_template.md
├── dependabot.yml
└── ISSUE_TEMPLATE
│   ├── bug_report.md
│   ├── enabler_story.md
│   └── user_story.md
├── .dockerignore
├── package.json
├── .kodiak.toml
├── .editorconfig
├── cdk.json
├── .envrc
├── .gitignore
├── setup.cfg
├── nix
└── sources.json
├── generate-requirements-files.bash
├── .gitlint
├── app.py
├── .gitmodules
├── clear-s3-buckets.bash
├── LICENSE
├── .run
├── pytest.run.xml
├── pytest-infrastructure.run.xml
└── pytest-offline.run.xml
├── activate-dev-env.bash
├── shell.nix
├── reset-dev-env.bash
└── .pre-commit-config.yaml
/.nvmrc:
--------------------------------------------------------------------------------
1 | 18.14.1
2 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.9.15
2 |
--------------------------------------------------------------------------------
/infrastructure/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_networking_stack.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/content_iterator/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/dataset_versions/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/import_asset_file/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/import_dataset/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/import_status/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/populate_catalog/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/geostore/upload_status/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/infrastructure/constructs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.hadolint.yaml:
--------------------------------------------------------------------------------
1 | ignored:
2 | - DL3008
3 |
--------------------------------------------------------------------------------
/geostore/check_files_checksums/__init__.py:
--------------------------------------------------------------------------------
1 |
-------------------------------------------------------------------------------- /geostore/check_stac_metadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /geostore/import_metadata_file/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /geostore/notify_status_update/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /geostore/update_root_catalog/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /geostore/validation_summary/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /geostore/aws_response.py: -------------------------------------------------------------------------------- 1 | AWS_CODE_REQUEST_TIMEOUT = "RequestTimeout" 2 | -------------------------------------------------------------------------------- /infrastructure/constructs/backend.py: -------------------------------------------------------------------------------- 1 | BACKEND_DIRECTORY = "geostore" 2 | -------------------------------------------------------------------------------- /infrastructure/constructs/lambda_layers/botocore/.gitignore: -------------------------------------------------------------------------------- 1 | /requirements.txt 2 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- 1 | changelog: 2 | exclude: 3 | authors: 4 | - dependabot 5 | -------------------------------------------------------------------------------- /geostore/error_response_keys.py: -------------------------------------------------------------------------------- 1 | ERROR_KEY = "error" 2 | ERROR_MESSAGE_KEY = "error_message" 3 | -------------------------------------------------------------------------------- /.github/codeql/codeql-config.yml: -------------------------------------------------------------------------------- 1 | paths-ignore: 2 | - node_modules 3 | - tests 4 | - .venv 5 | -------------------------------------------------------------------------------- /geostore/import_file_batch_job_id_keys.py: -------------------------------------------------------------------------------- 1 | METADATA_JOB_ID_KEY = "metadata_job_id" 2 | ASSET_JOB_ID_KEY = "asset_job_id" 3 | -------------------------------------------------------------------------------- /geostore/api_keys.py: -------------------------------------------------------------------------------- 1 | MESSAGE_KEY = "message" 2 | STATUS_KEY = "status" 3 | SUCCESS_KEY = "success" 4 | 5 | EVENT_KEY = "event" 6 | -------------------------------------------------------------------------------- /geostore/boto3_config.py: -------------------------------------------------------------------------------- 1 | 
from botocore.config import Config 2 | 3 | CONFIG = Config(retries={"max_attempts": 5, "mode": "standard"}) 4 | -------------------------------------------------------------------------------- /geostore/types.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, MutableMapping 2 | 3 | JsonList = List[Any] 4 | JsonObject = MutableMapping[str, Any] 5 | -------------------------------------------------------------------------------- /geostore/clock.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | 3 | 4 | def now() -> datetime: 5 | return datetime.now(timezone.utc) 6 | -------------------------------------------------------------------------------- /geostore/import_dataset_keys.py: -------------------------------------------------------------------------------- 1 | ORIGINAL_KEY_KEY = "original_key" 2 | NEW_KEY_KEY = "new_key" 3 | TARGET_BUCKET_NAME_KEY = "target_bucket_name" 4 | -------------------------------------------------------------------------------- /infrastructure/constructs/roles.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Duration 2 | 3 | MAX_SESSION_DURATION = Duration.hours(12) 4 | LINZ_ORGANIZATION_ID = "o-g9kpx6ff4u" 5 | -------------------------------------------------------------------------------- /geostore/aws_keys.py: -------------------------------------------------------------------------------- 1 | AWS_DEFAULT_REGION_KEY = "AWS_DEFAULT_REGION" 2 | BODY_KEY = "body" 3 | HTTP_METHOD_KEY = "http_method" 4 | STATUS_CODE_KEY = "status_code" 5 | -------------------------------------------------------------------------------- /infrastructure/constructs/sts_policy.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import aws_iam 2 | 3 | ALLOW_ASSUME_ANY_ROLE = aws_iam.PolicyStatement(actions=["sts:AssumeRole"], resources=["*"]) 4 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | cdk.out 2 | .coverage 3 | .git 4 | .github 5 | .idea 6 | .mypy_cache 7 | node_modules 8 | *.pyc 9 | __pycache__ 10 | .pytest_cache 11 | .venv 12 | .vscode 13 | -------------------------------------------------------------------------------- /geostore/dataset_properties.py: -------------------------------------------------------------------------------- 1 | from string import ascii_letters, digits 2 | 3 | TITLE_CHARACTERS = f"āēīōūĀĒĪŌŪ{ascii_letters}{digits}_-" 4 | TITLE_PATTERN = f"^[{TITLE_CHARACTERS}]+$" 5 | -------------------------------------------------------------------------------- /infrastructure/constructs/s3_policy.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import aws_iam 2 | 3 | ALLOW_DESCRIBE_ANY_S3_JOB = aws_iam.PolicyStatement( 4 | resources=["*"], 5 | actions=["s3:DescribeJob"], 6 | ) 7 | -------------------------------------------------------------------------------- /geostore/wheel.txt: -------------------------------------------------------------------------------- 1 | wheel==0.40.0 \ 2 | --hash=sha256:cd1196f3faee2b31968d626e1731c94f99cbdb67cf5a46e4f5656cbee7738873 \ 3 | --hash=sha256:d236b20e7cb522daf2390fa84c55eea81c5c30190f90f29ae2ca1ad8355bf247 4 | -------------------------------------------------------------------------------- 
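A minimal usage sketch (not a file in the repository) of how the shared helpers above — boto3_config.CONFIG, types.JsonObject, clock.now and dataset_properties.TITLE_PATTERN — typically combine in a handler. It assumes the geostore package is importable and that AWS credentials and a default region are configured; the handler name and response shape are illustrative only.

import re

import boto3

from geostore.boto3_config import CONFIG
from geostore.clock import now
from geostore.dataset_properties import TITLE_PATTERN
from geostore.types import JsonObject

# Shared retry configuration: standard mode, at most 5 attempts
S3_CLIENT = boto3.client("s3", config=CONFIG)


def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject:
    # TITLE_PATTERN permits ASCII letters, digits, macrons, "_" and "-"
    title = event.get("title", "")
    if not re.match(TITLE_PATTERN, title):
        return {"error": f"invalid title: {title!r}"}
    return {"title": title, "received_at": now().isoformat()}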
/.github/workflows/.env: -------------------------------------------------------------------------------- 1 | AWS_DEFAULT_REGION=ap-southeast-2 2 | CiOidc=arn:aws:iam::586981104868:role/CiOidc 3 | NonProdOidc=arn:aws:iam::632223577832:role/NonProdOidc 4 | ProdOidc=arn:aws:iam::715898075157:role/ProdOidc 5 | -------------------------------------------------------------------------------- /geostore/models.py: -------------------------------------------------------------------------------- 1 | DB_KEY_SEPARATOR = "#" 2 | 3 | CHECK_ID_PREFIX = f"CHECK{DB_KEY_SEPARATOR}" 4 | DATASET_ID_PREFIX = f"DATASET{DB_KEY_SEPARATOR}" 5 | URL_ID_PREFIX = f"URL{DB_KEY_SEPARATOR}" 6 | VERSION_ID_PREFIX = f"VERSION{DB_KEY_SEPARATOR}" 7 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "aws-cdk": "*" 4 | }, 5 | "devDependencies": {}, 6 | "prettier": { 7 | "printWidth": 100, 8 | "proseWrap": "always", 9 | "singleQuote": true, 10 | "trailingComma": "all" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /tests/dynamodb_generators.py: -------------------------------------------------------------------------------- 1 | from geostore.step_function import get_hash_key 2 | 3 | from .stac_generators import any_dataset_id, any_dataset_version_id 4 | 5 | 6 | def any_hash_key() -> str: 7 | return get_hash_key(any_dataset_id(), any_dataset_version_id()) 8 | -------------------------------------------------------------------------------- /tests/test_get_latest_extension_schema_version.py: -------------------------------------------------------------------------------- 1 | from geostore.check_stac_metadata.stac_validators import get_latest_extension_schema_version 2 | 3 | 4 | def should_get_latest_stac_spec_version() -> None: 5 | assert get_latest_extension_schema_version("stac-spec") == "1.0.0" 6 | -------------------------------------------------------------------------------- /geostore/pip.txt: -------------------------------------------------------------------------------- 1 | 2 | # The following packages are considered to be unsafe in a requirements file: 3 | pip==23.0.1 \ 4 | --hash=sha256:236bcb61156d76c4b8a05821b988c7b8c35bf0da28a4b614e8d6ab5212c25c6f \ 5 | --hash=sha256:cd015ea1bfb0fcef59d8a286c1f8bebcb983f6317719d415dc5351efb7cd7024 6 | -------------------------------------------------------------------------------- /tests/file_utils.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from json import dumps 3 | from typing import BinaryIO 4 | 5 | from geostore.types import JsonObject 6 | 7 | 8 | def json_dict_to_file_object(value: JsonObject) -> BinaryIO: 9 | return BytesIO(initial_bytes=dumps(value).encode()) 10 | -------------------------------------------------------------------------------- /.kodiak.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | [approve] 4 | auto_approve_usernames = ["dependabot"] 5 | 6 | [merge] 7 | method = "squash" 8 | 9 | [merge.automerge_dependencies] 10 | usernames = ["dependabot"] 11 | versions = ["minor", "patch"] 12 | 13 | [merge.message] 14 | title = "pull_request_title" 15 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 
1 | # https://editorconfig.org 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | indent_size = 4 7 | indent_style = space 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | 11 | [*.{json,nix,yaml,yml}] 12 | indent_size = 2 13 | 14 | [*.md] 15 | indent_size = 3 16 | -------------------------------------------------------------------------------- /cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "context": { 4 | "@aws-cdk/core:newStyleStackSynthesis": true, 5 | "@aws-cdk/core:stackRelativeExports": true, 6 | "@aws-cdk:enableDiffNoFail": true, 7 | "enableLDSAccess": true, 8 | "enableOpenTopographyAccess": true 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /infrastructure/constructs/removal_policy.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | 3 | from aws_cdk import RemovalPolicy 4 | 5 | if environ.get("RESOURCE_REMOVAL_POLICY", "DESTROY").upper() == "RETAIN": 6 | REMOVAL_POLICY = RemovalPolicy.RETAIN 7 | 8 | else: 9 | REMOVAL_POLICY = RemovalPolicy.DESTROY 10 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # the shebang is ignored, but nice for editors 3 | 4 | if type -P lorri &>/dev/null; then 5 | eval "$(lorri direnv)" 6 | else 7 | echo 'while direnv evaluated .envrc, could not find the command "lorri" [https://github.com/nix-community/lorri]' 8 | use nix 9 | fi 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | /cdk.out/ 3 | /.coverage 4 | /dist/ 5 | html/ 6 | /htmlcov/ 7 | /.idea/ 8 | *.isorted 9 | /junit.xml 10 | /geostore/.lambda_out_*/ 11 | .mutmut-cache 12 | mutmut.xml 13 | /.mypy_cache/ 14 | /node_modules/ 15 | *.pyc 16 | __pycache__ 17 | /.pytest_cache 18 | Thumbs.db 19 | /.venv 20 | /.vscode 21 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [mutmut] 2 | dict_synonyms = JobManifestLocationTypeDef,JobManifestSpecTypeDef,JobManifestTypeDef,JobOperationTypeDef,JobReportTypeDef,JsonObject,LambdaInvokeOperationTypeDef,MessageAttributeValueTypeDef 3 | paths_to_mutate = geostore,infrastructure 4 | runner = python -m pytest --assert=plain --exitfirst -m 'not infrastructure' 5 | -------------------------------------------------------------------------------- /tests/test_datasets_model_base.py: -------------------------------------------------------------------------------- 1 | from pytest import mark 2 | 3 | from geostore.datasets_model import datasets_model_with_meta 4 | 5 | 6 | @mark.infrastructure 7 | def should_create_unique_id_per_dataset() -> None: 8 | model = datasets_model_with_meta() 9 | first = model() 10 | second = model() 11 | 12 | assert first.dataset_id != second.dataset_id 13 | -------------------------------------------------------------------------------- /geostore/environment.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | 3 | ENV_NAME_VARIABLE_NAME = "GEOSTORE_ENV_NAME" 4 | PRODUCTION_ENVIRONMENT_NAME = "prod" 5 | 6 | 7 | def environment_name() -> str: 8 | 
return environ.get(ENV_NAME_VARIABLE_NAME, PRODUCTION_ENVIRONMENT_NAME)
9 |
10 |
11 | def is_production() -> bool:
12 |     return environment_name() == PRODUCTION_ENVIRONMENT_NAME
13 |
--------------------------------------------------------------------------------
/geostore/logging_keys.py:
--------------------------------------------------------------------------------
1 | LOG_MESSAGE_LAMBDA_START = "Lambda Start"
2 | LOG_MESSAGE_LAMBDA_FAILURE = "Lambda Failure"
3 | LOG_MESSAGE_S3_BATCH_RESPONSE = "S3 Batch Response"
4 | LOG_MESSAGE_S3_DELETION_RESPONSE = "S3 Deletion Response"
5 | LOG_MESSAGE_STEP_FUNCTION_RESPONSE = "Step Function Response"
6 | LOG_MESSAGE_VALIDATION_COMPLETE = "Validation Complete"
7 | GIT_COMMIT = "git_commit"
8 |
--------------------------------------------------------------------------------
/.github/workflows/update-license-year.yml:
--------------------------------------------------------------------------------
1 | name: Update copyright year in license file
2 |
3 | on:
4 |   workflow_dispatch:
5 |
6 | jobs:
7 |   run:
8 |     runs-on: ubuntu-22.04
9 |     steps:
10 |       - uses: actions/checkout@v3.4.0
11 |         with:
12 |           fetch-depth: 0
13 |       - uses: FantasticFiasco/action-update-license-year@v2.3.0
14 |         with:
15 |           token: ${{ secrets.GITHUB_TOKEN }}
16 |
--------------------------------------------------------------------------------
/infrastructure/constructs/lambda_layers/botocore/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | build-backend = "poetry.core.masonry.api"
3 | requires = ["poetry-core>=1.0.0"]
4 |
5 | [tool.poetry]
6 | authors = ["Your Name "]
7 | description = ""
8 | name = "botocore-layer"
9 | version = "0.1.0"
10 |
11 | [tool.poetry.dependencies]
12 | botocore = "*"
13 | python = "^3.9,<3.10"
14 |
15 | [tool.poetry.dev-dependencies]
16 |
--------------------------------------------------------------------------------
/infrastructure/constructs/lambda_config.py:
--------------------------------------------------------------------------------
1 | from aws_cdk import Duration, aws_lambda, aws_logs
2 |
3 | from geostore.environment import is_production
4 |
5 | PYTHON_RUNTIME = aws_lambda.Runtime.PYTHON_3_9
6 |
7 | DEFAULT_LAMBDA_MAX_MEMORY_MEBIBYTES = 1024
8 | DEFAULT_LAMBDA_TIMEOUT = Duration.seconds(60)
9 |
10 | if is_production():
11 |     RETENTION_DAYS = aws_logs.RetentionDays.ONE_YEAR
12 | else:
13 |     RETENTION_DAYS = aws_logs.RetentionDays.THREE_MONTHS
14 |
--------------------------------------------------------------------------------
/infrastructure/constructs/common.py:
--------------------------------------------------------------------------------
1 | from logging import DEBUG, getLevelName
2 | from typing import Iterable, Mapping
3 |
4 | from aws_cdk import aws_iam, aws_ssm
5 |
6 | LOG_LEVEL = getLevelName(DEBUG)
7 |
8 |
9 | def grant_parameter_read_access(
10 |     parameter_readers: Mapping[aws_ssm.StringParameter, Iterable[aws_iam.IGrantable]]
11 | ) -> None:
12 |     for parameter, readers in parameter_readers.items():
13 |         for reader in readers:
14 |             parameter.grant_read(reader)
15 |
--------------------------------------------------------------------------------
/geostore/import_status/entrypoint.py:
--------------------------------------------------------------------------------
1 | """
2 | Import status endpoint Lambda function.
3 | """ 4 | from typing import Callable, Mapping 5 | 6 | from ..api_responses import handle_request 7 | from ..types import JsonObject 8 | from .get import get_import_status 9 | 10 | REQUEST_HANDLERS: Mapping[str, Callable[[JsonObject], JsonObject]] = { 11 | "GET": get_import_status, 12 | } 13 | 14 | 15 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 16 | return handle_request(event, REQUEST_HANDLERS) 17 | -------------------------------------------------------------------------------- /geostore/aws_message_attributes.py: -------------------------------------------------------------------------------- 1 | def decapitalize(key: str) -> str: 2 | """ 3 | This method will be used to lower case the first character of SQS 4 | message attributes being received by Lambda to resolve inconsistencies. 5 | Issue outlined here: https://github.com/boto/boto3/issues/2582 6 | """ 7 | return f"{key[:1].lower()}{key[1:]}" 8 | 9 | 10 | DATA_TYPE_KEY = "DataType" 11 | DATA_TYPE_STRING = "String" 12 | STRING_VALUE_KEY = "StringValue" 13 | STRING_VALUE_KEY_LOWER = decapitalize(STRING_VALUE_KEY) 14 | -------------------------------------------------------------------------------- /geostore/dataset_versions/entrypoint.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dataset-versions endpoint Lambda function. 3 | """ 4 | from typing import Callable, MutableMapping 5 | 6 | from ..api_responses import handle_request 7 | from ..types import JsonObject 8 | from .create import create_dataset_version 9 | 10 | REQUEST_HANDLERS: MutableMapping[str, Callable[[JsonObject], JsonObject]] = { 11 | "POST": create_dataset_version, 12 | } 13 | 14 | 15 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 16 | return handle_request(event, REQUEST_HANDLERS) 17 | -------------------------------------------------------------------------------- /nix/sources.json: -------------------------------------------------------------------------------- 1 | { 2 | "nixpkgs": { 3 | "branch": "release-22.11", 4 | "description": "Nix Packages collection", 5 | "homepage": "", 6 | "owner": "NixOS", 7 | "repo": "nixpkgs", 8 | "rev": "96e18717904dfedcd884541e5a92bf9ff632cf39", 9 | "sha256": "0zw1851mia86xqxdf8jgy1c6fm5lqw4rncv7v2lwxar3vhpn6c78", 10 | "type": "tarball", 11 | "url": "https://github.com/NixOS/nixpkgs/archive/96e18717904dfedcd884541e5a92bf9ff632cf39.tar.gz", 12 | "url_template": "https://github.com///archive/.tar.gz" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 6 | 7 | 12 | 13 | ## Reference 14 | 15 | [Code review checklist](https://github.com/linz/geostore/blob/master/CODING.md#Checklist) 16 | -------------------------------------------------------------------------------- /geostore/sts.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from typing import TYPE_CHECKING 3 | 4 | import boto3 5 | 6 | from .boto3_config import CONFIG 7 | 8 | if TYPE_CHECKING: 9 | from mypy_boto3_sts import STSClient 10 | else: 11 | STSClient = object # pragma: no mutate 12 | 13 | STS_CLIENT: STSClient = boto3.client("sts", config=CONFIG) 14 | 15 | 16 | @lru_cache 17 | def get_account_number() -> str: 18 | caller_identity = STS_CLIENT.get_caller_identity() 19 | assert "Account" in caller_identity, caller_identity 20 | return 
caller_identity["Account"] 21 | -------------------------------------------------------------------------------- /infrastructure/constructs/version.py: -------------------------------------------------------------------------------- 1 | from subprocess import PIPE, Popen 2 | 3 | with Popen(["git", "rev-parse", "--abbrev-ref", "HEAD"], stdout=PIPE) as branch_command: 4 | GIT_BRANCH = branch_command.communicate()[0].decode().strip() 5 | 6 | with Popen(["git", "rev-parse", "--short", "HEAD"], stdout=PIPE) as commit_command: 7 | GIT_COMMIT = commit_command.communicate()[0].decode().strip() 8 | 9 | with Popen(["git", "describe", "--tags", "--exact-match"], stdout=PIPE) as tag_command: 10 | GIT_TAG = tag_command.communicate()[0].decode().strip() 11 | if not GIT_TAG: 12 | GIT_TAG = "UNRELEASED" 13 | -------------------------------------------------------------------------------- /geostore/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG python_version 2 | FROM python:${python_version}-slim as build 3 | 4 | ARG task 5 | ARG packaging 6 | 7 | RUN python -m venv /opt/.venv 8 | 9 | COPY poetry.lock poetry.toml pyproject.toml /opt/ 10 | COPY ${packaging}/${task}.txt /opt/ 11 | 12 | RUN /opt/.venv/bin/pip install --no-cache-dir --no-deps --requirement=/opt/${task}.txt 13 | 14 | 15 | ARG python_version 16 | FROM python:${python_version}-slim 17 | 18 | ENTRYPOINT ["/opt/.venv/bin/python", "-bb", "-m", "src.task.task"] 19 | 20 | USER 10000:10000 21 | 22 | COPY --from=build /opt/.venv /opt/.venv 23 | 24 | COPY geostore/*.py /src/ 25 | ARG task 26 | COPY geostore/${task} /src/task/ 27 | -------------------------------------------------------------------------------- /geostore/check.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class Check(Enum): 5 | ASSETS_IN_DATASET = "assets in dataset" 6 | CHECKSUM = "checksum" 7 | DUPLICATE_OBJECT_KEY = "duplicate asset name" 8 | FILE_NOT_FOUND = "file not found in staging or storage" 9 | INVALID_STAC_ROOT_TYPE = "root type must be catalog or collection" 10 | JSON_PARSE = "JSON parse" 11 | JSON_SCHEMA = "JSON schema" 12 | NON_S3_URL = "not an s3 url" 13 | NO_ASSETS_IN_DATASET = "no assets in the dataset" 14 | SECURITY_CLASSIFICATION = "security classification" 15 | STAGING_ACCESS = "staging bucket access" 16 | UNKNOWN_CLIENT_ERROR = "unknown client error" 17 | UNKNOWN_MULTIHASH_ERROR = "unknown multihash error" 18 | -------------------------------------------------------------------------------- /infrastructure/constructs/lambda_layers.py: -------------------------------------------------------------------------------- 1 | import constructs 2 | from aws_cdk import aws_lambda_python_alpha 3 | from constructs import Construct 4 | 5 | from .lambda_config import PYTHON_RUNTIME 6 | 7 | 8 | class LambdaLayers(Construct): 9 | def __init__(self, scope: constructs.Construct, stack_id: str, *, env_name: str) -> None: 10 | super().__init__(scope, stack_id) 11 | 12 | self.botocore = aws_lambda_python_alpha.PythonLayerVersion( 13 | self, 14 | f"{env_name}-botocore-lambda-layer", 15 | entry="infrastructure/constructs/lambda_layers/botocore", 16 | compatible_runtimes=[PYTHON_RUNTIME], 17 | description="botocore library", 18 | ) 19 | -------------------------------------------------------------------------------- /tests/test_prefix_non_prod_name.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | 
from unittest.mock import patch 3 | 4 | from geostore.environment import ENV_NAME_VARIABLE_NAME, PRODUCTION_ENVIRONMENT_NAME 5 | from geostore.resources import prefix_non_prod_name 6 | 7 | 8 | def should_return_original_name_when_production() -> None: 9 | name = "any name" 10 | with patch.dict(environ, {ENV_NAME_VARIABLE_NAME: PRODUCTION_ENVIRONMENT_NAME}): 11 | assert prefix_non_prod_name(name) == name 12 | 13 | 14 | def should_return_prefixed_name_when_not_production() -> None: 15 | name = "any name" 16 | environment_name = f"not {PRODUCTION_ENVIRONMENT_NAME}" 17 | with patch.dict(environ, {ENV_NAME_VARIABLE_NAME: environment_name}): 18 | assert prefix_non_prod_name(name) == f"{environment_name}-{name}" 19 | -------------------------------------------------------------------------------- /geostore/datasets/list.py: -------------------------------------------------------------------------------- 1 | """List all datasets function.""" 2 | from http import HTTPStatus 3 | 4 | from ..api_responses import success_response 5 | from ..datasets_model import datasets_model_with_meta 6 | from ..models import DATASET_ID_PREFIX 7 | from ..types import JsonObject 8 | 9 | 10 | def list_datasets() -> JsonObject: 11 | """GET: List all Datasets.""" 12 | 13 | # list all datasets 14 | datasets_model_class = datasets_model_with_meta() 15 | datasets = datasets_model_class.scan( 16 | filter_condition=datasets_model_class.id.startswith(DATASET_ID_PREFIX) 17 | ) 18 | 19 | # return response 20 | resp_body = [] 21 | for dataset in datasets: 22 | resp_item = dataset.as_dict() 23 | resp_body.append(resp_item) 24 | 25 | return success_response(HTTPStatus.OK, resp_body) 26 | -------------------------------------------------------------------------------- /generate-requirements-files.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit -o noclobber -o nounset -o pipefail 4 | shopt -s failglob inherit_errexit 5 | 6 | if [[ $# -eq 0 ]]; then 7 | cat >&2 <<'EOF' 8 | Synopsis: ./generate-requirements-files.bash PATH [PATH…] 9 | 10 | Example: ./generate-requirements-files.bash geostore/poetry.txt 11 | 12 | Creates pip formatted requirements files (including dependencies and hashes) at each PATH with the package derived from the filename. 13 | 14 | This is used to work around Dependabot not knowing which package is the "main" one in a requirements file. 
15 | EOF 16 | exit 1 17 | fi 18 | 19 | for path; do 20 | package_name="$(basename "${path%.txt}")" 21 | pip-compile --allow-unsafe --generate-hashes --no-annotate --no-header --output-file="$path" --upgrade <(echo "$package_name") 22 | done 23 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: docker 4 | directory: /geostore 5 | schedule: 6 | interval: daily 7 | - package-ecosystem: github-actions 8 | directory: / 9 | schedule: 10 | interval: daily 11 | commit-message: 12 | prefix: build(deps) 13 | - package-ecosystem: gitsubmodule 14 | directory: / 15 | schedule: 16 | interval: daily 17 | - package-ecosystem: npm 18 | directory: / 19 | schedule: 20 | interval: daily 21 | - package-ecosystem: pip 22 | directory: / 23 | open-pull-requests-limit: 100 24 | schedule: 25 | interval: daily 26 | - package-ecosystem: pip 27 | directory: /geostore 28 | schedule: 29 | interval: daily 30 | - package-ecosystem: pip 31 | directory: /infrastructure/constructs/lambda_layers/botocore 32 | schedule: 33 | interval: daily 34 | -------------------------------------------------------------------------------- /.gitlint: -------------------------------------------------------------------------------- 1 | # Configuration file for gitlint, used via pre-commit 2 | # Configuration docs: http://jorisroovers.github.io/gitlint/configuration/ 3 | # Default rules: https://github.com/jorisroovers/gitlint/blob/master/docs/rules.md 4 | 5 | [general] 6 | # Ignore certain rules, you can reference them by their id or by their full name 7 | ignore = body-is-missing, body-max-line-length 8 | 9 | # Enable community contributed rule for conventional commits 10 | contrib = contrib-title-conventional-commits 11 | 12 | [title-max-length] 13 | line-length = 72 14 | 15 | # [title-match-regex] 16 | # Uncomment to ensure that there is an issue referenced in every commit title 17 | # regex=^.*?#[0-9]+\b.*?$ 18 | 19 | [contrib-title-conventional-commits] 20 | # Specify allowed commit types. For details see: https://www.conventionalcommits.org/ 21 | types = build, chore, ci, docs, feat, fix, perf, refactor, revert, style, test 22 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | """ 2 | CDK application entry point file. 3 | """ 4 | from aws_cdk import App, Tags 5 | 6 | from geostore.environment import environment_name 7 | from infrastructure.application_stack import Application 8 | from infrastructure.constructs.batch_job_queue import APPLICATION_NAME, APPLICATION_NAME_TAG_NAME 9 | 10 | 11 | def main() -> None: 12 | app = App() 13 | 14 | env_name = environment_name() 15 | Application(app, f"{env_name}-geostore") 16 | 17 | # tag all resources in stack 18 | Tags.of(app).add("CostCentre", "100005") 19 | Tags.of(app).add(APPLICATION_NAME_TAG_NAME, APPLICATION_NAME) 20 | Tags.of(app).add("Owner", "Bill M. 
Nelson") 21 | Tags.of(app).add("EnvironmentType", env_name) 22 | Tags.of(app).add("SupportType", "Dev") 23 | Tags.of(app).add("HoursOfOperation", "24x7") 24 | 25 | app.synth() 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "file"] 2 | path = geostore/check_stac_metadata/file 3 | url = git@github.com:stac-extensions/file.git 4 | branch = gh-pages 5 | [submodule "geojson-spec"] 6 | path = geostore/check_stac_metadata/geojson-spec 7 | url = https://github.com/geojson/schema.git 8 | branch = gh-pages 9 | [submodule "projection"] 10 | path = geostore/check_stac_metadata/projection 11 | url = https://github.com/stac-extensions/projection.git 12 | branch = gh-pages 13 | [submodule "stac"] 14 | path = geostore/check_stac_metadata/stac 15 | url = https://github.com/linz/stac.git 16 | branch = gh-pages 17 | [submodule "stac-spec"] 18 | path = geostore/check_stac_metadata/stac-spec 19 | url = https://github.com/radiantearth/stac-spec.git 20 | branch = gh-pages 21 | [submodule "version"] 22 | path = geostore/check_stac_metadata/version 23 | url = https://github.com/stac-extensions/version.git 24 | branch = gh-pages 25 | -------------------------------------------------------------------------------- /geostore/resources.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from .environment import environment_name, is_production 4 | 5 | 6 | def prefix_non_prod_name(name: str) -> str: 7 | if is_production(): 8 | return name 9 | 10 | return f"{environment_name()}-{name}" 11 | 12 | 13 | class Resource(Enum): 14 | @property 15 | def resource_name(self) -> str: 16 | return prefix_non_prod_name(self.value) 17 | 18 | API_USERS_ROLE_NAME = "api-users" 19 | CLOUDWATCH_RULE_NAME = "geostore-cloudwatch-rule" 20 | DATASETS_ENDPOINT_FUNCTION_NAME = "datasets" 21 | DATASET_VERSIONS_ENDPOINT_FUNCTION_NAME = "dataset-versions" 22 | IMPORT_STATUS_ENDPOINT_FUNCTION_NAME = "import-status" 23 | S3_USERS_ROLE_NAME = "s3-users" 24 | STAGING_USERS_ROLE_NAME = "staging-users" 25 | STAGING_BUCKET_NAME = "linz-geostore-staging" 26 | STORAGE_BUCKET_NAME = "linz-geostore" 27 | SNS_TOPIC_NAME = "geostore-import-status" 28 | -------------------------------------------------------------------------------- /tests/test_parameter_store.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | from pytest import mark, raises 4 | 5 | from geostore import parameter_store 6 | from geostore.parameter_store import ( 7 | LOG_MESSAGE_PARAMETER_NOT_FOUND, 8 | SSM_CLIENT, 9 | ParameterName, 10 | get_param, 11 | ) 12 | 13 | 14 | @mark.infrastructure 15 | @patch(f"{parameter_store.__name__}.{ParameterName.__name__}") 16 | def should_log_missing_parameter_name(parameter_name_mock: MagicMock) -> None: 17 | parameter_name = "invalid" 18 | parameter_name_mock.INVALID.value = parameter_name 19 | 20 | with patch(f"{parameter_store.__name__}.LOGGER.error") as logger_mock: 21 | with raises(SSM_CLIENT.exceptions.ParameterNotFound): 22 | get_param(parameter_name_mock.INVALID) 23 | 24 | logger_mock.assert_any_call( 25 | LOG_MESSAGE_PARAMETER_NOT_FOUND, extra={"parameter_value": parameter_name} 26 | ) 27 | -------------------------------------------------------------------------------- 
/infrastructure/constructs/opentopo.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Tags, aws_iam, aws_s3 2 | from constructs import Construct 3 | 4 | from .roles import MAX_SESSION_DURATION 5 | 6 | 7 | class OpenTopography(Construct): 8 | def __init__( 9 | self, scope: Construct, stack_id: str, *, env_name: str, storage_bucket: aws_s3.Bucket 10 | ) -> None: 11 | super().__init__(scope, stack_id) 12 | 13 | account_principal = aws_iam.AccountPrincipal(account_id="011766770214") 14 | external_id = "opentopography-bahX0" 15 | role = aws_iam.Role( 16 | self, 17 | "opentopography-read-role", 18 | role_name=f"opentopography-s3-access-read-{env_name}", 19 | assumed_by=account_principal, 20 | external_ids=[external_id], 21 | max_session_duration=MAX_SESSION_DURATION, 22 | ) 23 | storage_bucket.grant_read(role) 24 | 25 | Tags.of(self).add("ApplicationLayer", "opentopography") 26 | -------------------------------------------------------------------------------- /infrastructure/constructs/lds.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Tags, aws_iam, aws_s3 2 | from constructs import Construct 3 | 4 | from geostore.environment import is_production 5 | 6 | from .roles import MAX_SESSION_DURATION 7 | 8 | 9 | class LDS(Construct): 10 | def __init__( 11 | self, scope: Construct, stack_id: str, *, env_name: str, storage_bucket: aws_s3.Bucket 12 | ) -> None: 13 | super().__init__(scope, stack_id) 14 | 15 | account_principal = aws_iam.AccountPrincipal(account_id="276514628126") 16 | if is_production(): 17 | external_id = "koordinates-jAddR" 18 | else: 19 | external_id = "koordinates-4BnJQ" 20 | role = aws_iam.Role( 21 | self, 22 | "koordinates-read-role", 23 | role_name=f"koordinates-s3-access-read-{env_name}", 24 | assumed_by=account_principal, 25 | external_ids=[external_id], 26 | max_session_duration=MAX_SESSION_DURATION, 27 | ) 28 | storage_bucket.grant_read(role) 29 | 30 | Tags.of(self).add("ApplicationLayer", "lds") 31 | -------------------------------------------------------------------------------- /geostore/datasets/entrypoint.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dataset endpoint Lambda function. 
3 | """ 4 | from logging import Logger 5 | from typing import Callable, MutableMapping 6 | 7 | from linz_logger import get_log 8 | 9 | from ..api_responses import handle_request 10 | from ..logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_START 11 | from ..parameter_store import ParameterName, get_param 12 | from ..types import JsonObject 13 | from .create import create_dataset 14 | from .delete import delete_dataset 15 | from .get import handle_get 16 | from .update import update_dataset 17 | 18 | REQUEST_HANDLERS: MutableMapping[str, Callable[[JsonObject], JsonObject]] = { 19 | "DELETE": delete_dataset, 20 | "GET": handle_get, 21 | "PATCH": update_dataset, 22 | "POST": create_dataset, 23 | } 24 | 25 | LOGGER: Logger = get_log() 26 | 27 | 28 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 29 | LOGGER.debug( 30 | LOG_MESSAGE_LAMBDA_START, 31 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 32 | ) 33 | return handle_request(event, REQUEST_HANDLERS) 34 | -------------------------------------------------------------------------------- /geostore/s3.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | from uuid import uuid4 3 | 4 | import boto3 5 | 6 | from .boto3_config import CONFIG 7 | from .environment import environment_name 8 | 9 | if TYPE_CHECKING: 10 | from mypy_boto3_s3 import S3Client 11 | from mypy_boto3_sts import STSClient 12 | else: 13 | S3Client = STSClient = object # pragma: no mutate 14 | 15 | 16 | S3_SCHEMA = "s3" 17 | S3_URL_PREFIX = f"{S3_SCHEMA}://" 18 | 19 | CHUNK_SIZE = 1024 20 | 21 | STS_CLIENT: STSClient = boto3.client("sts", config=CONFIG) 22 | 23 | 24 | def get_s3_client_for_role(role_arn: str) -> S3Client: 25 | assume_role_response = STS_CLIENT.assume_role( 26 | RoleArn=role_arn, RoleSessionName=f"{environment_name()}_Geostore_{uuid4()}" 27 | ) 28 | credentials = assume_role_response["Credentials"] 29 | client: S3Client = boto3.client( 30 | "s3", 31 | config=CONFIG, 32 | aws_access_key_id=credentials["AccessKeyId"], 33 | aws_secret_access_key=credentials["SecretAccessKey"], 34 | aws_session_token=credentials["SessionToken"], 35 | ) 36 | return client 37 | -------------------------------------------------------------------------------- /geostore/import_asset_file/task.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | import boto3 4 | 5 | from ..boto3_config import CONFIG 6 | from ..import_dataset_file import get_import_result 7 | from ..types import JsonObject 8 | 9 | if TYPE_CHECKING: 10 | # When type checking we want to use the third party package's stub 11 | from mypy_boto3_s3 import S3Client 12 | else: 13 | # In production we want to avoid depending on a package which has no runtime impact 14 | S3Client = object # pragma: no mutate 15 | 16 | TARGET_S3_CLIENT: S3Client = boto3.client("s3", config=CONFIG) 17 | 18 | 19 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 20 | return get_import_result(event, importer) 21 | 22 | 23 | def importer( 24 | source_bucket_name: str, 25 | original_key: str, 26 | target_bucket_name: str, 27 | new_key: str, 28 | source_s3_client: S3Client, 29 | ) -> None: 30 | source_response = source_s3_client.get_object(Bucket=source_bucket_name, Key=original_key) 31 | 32 | TARGET_S3_CLIENT.upload_fileobj(source_response["Body"], Bucket=target_bucket_name, Key=new_key) 33 | 
-------------------------------------------------------------------------------- /clear-s3-buckets.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit -o noclobber -o nounset -o pipefail 4 | 5 | if [[ $# -eq 0 ]]; then 6 | cat >&2 <<'EOF' 7 | ./clear-s3-buckets.bash BUCKET [BUCKET…] 8 | 9 | Deletes *all* versions of *all* files in *all* given buckets. Only to be used in case of emergency! 10 | EOF 11 | exit 1 12 | fi 13 | 14 | read -n1 -p "THIS WILL DELETE EVERYTHING IN BUCKETS ${*}! Press Ctrl-c to cancel or anything else to continue: " -r 15 | 16 | delete_objects() { 17 | count="$(jq length <<<"$1")" 18 | 19 | if [[ $count -eq 0 ]]; then 20 | echo "No objects found; skipping" >&2 21 | return 22 | fi 23 | 24 | echo "Removing ${count} objects" 25 | jq --raw-output '.[] | [.Key, .VersionId] | @tsv' <<<"$1" | parallel --colsep='\t' --group aws s3api delete-object --bucket="$bucket" --key='{1}' --version-id='{2}' 26 | } 27 | 28 | for bucket; do 29 | versions="$(aws s3api list-object-versions --bucket="$bucket" | jq .Versions)" 30 | delete_objects "$versions" 31 | 32 | markers="$(aws s3api list-object-versions --bucket="$bucket" | jq .DeleteMarkers)" 33 | delete_objects "$markers" 34 | done 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-2022, 2021 Land Information New Zealand 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /infrastructure/constructs/table.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from aws_cdk import aws_dynamodb, aws_ssm 4 | from constructs import Construct 5 | 6 | from geostore.parameter_store import ParameterName 7 | 8 | from .removal_policy import REMOVAL_POLICY 9 | 10 | 11 | class Table(aws_dynamodb.Table): 12 | def __init__( 13 | self, 14 | scope: Construct, 15 | construct_id: str, 16 | *, 17 | env_name: str, 18 | parameter_name: ParameterName, 19 | sort_key: Optional[aws_dynamodb.Attribute] = None, 20 | ): 21 | super().__init__( 22 | scope, 23 | construct_id, 24 | partition_key=aws_dynamodb.Attribute(name="pk", type=aws_dynamodb.AttributeType.STRING), 25 | sort_key=sort_key, 26 | point_in_time_recovery=True, 27 | removal_policy=REMOVAL_POLICY, 28 | billing_mode=aws_dynamodb.BillingMode.PAY_PER_REQUEST, 29 | ) 30 | 31 | self.name_parameter = aws_ssm.StringParameter( 32 | self, 33 | f"{construct_id} table name for {env_name}", 34 | string_value=self.table_name, 35 | parameter_name=parameter_name.value, 36 | ) 37 | -------------------------------------------------------------------------------- /infrastructure/constructs/staging.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import RemovalPolicy, Tags, aws_iam, aws_s3 2 | from constructs import Construct 3 | 4 | from geostore.resources import Resource 5 | 6 | 7 | class Staging(Construct): 8 | def __init__(self, scope: Construct, stack_id: str, *, users_role: aws_iam.Role) -> None: 9 | super().__init__(scope, stack_id) 10 | 11 | ############################################################################################ 12 | # ### DATASET STAGING S3 BUCKET ############################################################ 13 | ############################################################################################ 14 | staging_bucket = aws_s3.Bucket( 15 | self, 16 | "dataset-staging-bucket", 17 | bucket_name=Resource.STAGING_BUCKET_NAME.resource_name, 18 | access_control=aws_s3.BucketAccessControl.PRIVATE, 19 | block_public_access=aws_s3.BlockPublicAccess.BLOCK_ALL, 20 | versioned=True, 21 | removal_policy=RemovalPolicy.DESTROY, 22 | enforce_ssl=True, 23 | ) 24 | staging_bucket.grant_read(users_role) 25 | 26 | Tags.of(self).add("ApplicationLayer", "staging") 27 | -------------------------------------------------------------------------------- /infrastructure/constructs/lambda_task.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Optional 2 | 3 | from aws_cdk import aws_lambda_python_alpha, aws_stepfunctions_tasks 4 | from aws_cdk.aws_stepfunctions import JsonPath 5 | from constructs import Construct 6 | 7 | from .bundled_lambda_function import BundledLambdaFunction 8 | 9 | 10 | class LambdaTask(aws_stepfunctions_tasks.LambdaInvoke): 11 | def __init__( 12 | self, 13 | scope: Construct, 14 | construct_id: str, 15 | *, 16 | lambda_directory: str, 17 | botocore_lambda_layer: aws_lambda_python_alpha.PythonLayerVersion, 18 | result_path: Optional[str] = JsonPath.DISCARD, 19 | extra_environment: Optional[Mapping[str, str]] = None, 20 | ): 21 | self.lambda_function = BundledLambdaFunction( 22 | scope, 23 | f"{construct_id}Function", 24 | lambda_directory=lambda_directory, 25 | extra_environment=extra_environment, 26 | botocore_lambda_layer=botocore_lambda_layer, 
27 | ) 28 | 29 | super().__init__( 30 | scope, 31 | construct_id, 32 | lambda_function=self.lambda_function, 33 | result_path=result_path, 34 | payload_response_only=True, 35 | ) 36 | -------------------------------------------------------------------------------- /.run/pytest.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | -------------------------------------------------------------------------------- /activate-dev-env.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit -o noclobber -o nounset -o pipefail 4 | 5 | usage() { 6 | cat >&2 <<'EOF' 7 | Usage: 8 | 9 | . activate-dev-env.bash 10 | EOF 11 | } 12 | 13 | if ! (return 0); then 14 | usage 15 | exit 2 16 | fi 17 | 18 | script_dir="$(dirname "${BASH_SOURCE[0]}")" 19 | 20 | if type nvm &>/dev/null; then 21 | nvm use 22 | fi 23 | PATH="${script_dir}/node_modules/.bin:${PATH}" 24 | 25 | if ! diff <(node --version | cut --delimiter=. --fields=1-2 | tr --delete v) <(cut --delimiter=. --fields=1-2 "${script_dir}/.nvmrc"); then 26 | # shellcheck disable=SC2016 27 | echo 'Wrong major/minor version of Node.js detected. Please run `nvm install` to update Node.js and then reset the dev env.' >&2 28 | exit 3 29 | fi 30 | 31 | set +o errexit +o nounset 32 | if [[ -e "${script_dir}/.venv/bin/activate" ]]; then 33 | # shellcheck source=/dev/null 34 | . "${script_dir}/.venv/bin/activate" 35 | fi 36 | 37 | if ! diff <(python <<<'import platform; print(platform.python_version())' | cut --delimiter=. --fields=1-2) <(cut --delimiter=. --fields=1-2 "${script_dir}/.python-version"); then 38 | # shellcheck disable=SC2016 39 | echo 'Wrong major/minor version of Python detected. Please run `pyenv install` to update Python and then reset the dev env.' 
>&2 40 | exit 4 41 | fi 42 | -------------------------------------------------------------------------------- /.run/pytest-infrastructure.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | -------------------------------------------------------------------------------- /.run/pytest-offline.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | -------------------------------------------------------------------------------- /tests/test_step_function_logging.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | from geostore.logging_keys import GIT_COMMIT, LOG_MESSAGE_S3_BATCH_RESPONSE 4 | from geostore.parameter_store import ParameterName, get_param 5 | from geostore.step_function import get_s3_batch_copy_status 6 | 7 | from .aws_utils import any_account_id 8 | 9 | 10 | @patch("geostore.step_function.S3CONTROL_CLIENT.describe_job") 11 | def should_log_s3_batch_response( 12 | describe_s3_job_mock: MagicMock, 13 | ) -> None: 14 | # Given 15 | describe_s3_job_mock.return_value = s3_batch_response = { 16 | "Job": { 17 | "Status": "Some Response", 18 | "FailureReasons": [], 19 | "ProgressSummary": {"NumberOfTasksFailed": 0}, 20 | } 21 | } 22 | 23 | with patch("geostore.step_function.LOGGER.debug") as logger_mock, patch( 24 | "geostore.step_function.get_account_number" 25 | ) as get_account_number_mock: 26 | get_account_number_mock.return_value = any_account_id() 27 | 28 | # When 29 | get_s3_batch_copy_status("test") 30 | 31 | # Then 32 | logger_mock.assert_any_call( 33 | LOG_MESSAGE_S3_BATCH_RESPONSE, 34 | extra={ 35 | "response": s3_batch_response, 36 | GIT_COMMIT: get_param(ParameterName.GIT_COMMIT), 37 | }, 38 | ) 39 | -------------------------------------------------------------------------------- /geostore/step_function_keys.py: -------------------------------------------------------------------------------- 1 | from typing import Final 2 | 3 | JOB_STATUS_FAILED = "FAILED" 4 | JOB_STATUS_RUNNING = "RUNNING" 5 | JOB_STATUS_SUCCEEDED = "SUCCEEDED" 6 | 7 | S3_BATCH_STATUS_FAILED: Final = "Failed" 8 | S3_BATCH_STATUS_CANCELLED: Final = "Cancelled" 9 | S3_BATCH_STATUS_COMPLETE: Final = "Complete" 10 | 11 | ASSET_UPLOAD_KEY = "asset_upload" 12 | CURRENT_VERSION_ID_KEY = "current_version_id" 13 | CURRENT_VERSION_EMPTY_VALUE = "None" 14 | DATASET_ID_KEY = "dataset_id" 15 | DATASET_ID_SHORT_KEY = "id" 16 | DESCRIPTION_KEY = "description" 17 | ERRORS_KEY = "errors" 18 | ERROR_CHECK_KEY = "check" 19 | ERROR_DETAILS_KEY = "details" 20 | ERROR_RESULT_KEY = "result" 21 | ERROR_URL_KEY = "url" 22 | EXECUTION_ARN_KEY = "execution_arn" 23 | FAILED_TASKS_KEY = "failed_tasks" 24 | FAILURE_REASONS_KEY = "failure_reasons" 25 | IMPORT_DATASET_KEY = "import_dataset" 26 | INPUT_KEY = "input" 27 | METADATA_UPLOAD_KEY = "metadata_upload" 28 | METADATA_URL_KEY = "metadata_url" 29 | NEW_VERSION_ID_KEY = "new_version_id" 30 | NEW_VERSION_S3_LOCATION = "new_version_s3_location" 31 | NOW_KEY = "now" 32 | OUTPUT_KEY = "output" 33 | S3_BATCH_RESPONSE_KEY = "s3_batch_response" 34 | S3_ROLE_ARN_KEY = "s3_role_arn" 35 | STATUS_KEY = "status" 36 | STEP_FUNCTION_KEY = "step_function" 37 | DATASET_TITLE_KEY = "title" 38 | UPDATE_DATASET_KEY = "update_root_catalog" 39 | UPLOAD_STATUS_KEY = "upload_status" 40 | VALIDATION_KEY = "validation" 41 | -------------------------------------------------------------------------------- 
/tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pytest configuration file. 3 | """ 4 | from logging import INFO, basicConfig 5 | 6 | import boto3 7 | import pytest 8 | from mypy_boto3_events import EventBridgeClient 9 | from mypy_boto3_lambda import LambdaClient 10 | from mypy_boto3_s3 import S3Client 11 | from mypy_boto3_s3control import S3ControlClient 12 | from mypy_boto3_sqs import SQSServiceResource 13 | from mypy_boto3_ssm import SSMClient 14 | from mypy_boto3_stepfunctions import SFNClient 15 | 16 | from geostore.boto3_config import CONFIG 17 | 18 | basicConfig(level=INFO) 19 | 20 | 21 | @pytest.fixture() 22 | def lambda_client() -> LambdaClient: 23 | return boto3.client("lambda", config=CONFIG) 24 | 25 | 26 | @pytest.fixture() 27 | def s3_client() -> S3Client: 28 | return boto3.client("s3", config=CONFIG) 29 | 30 | 31 | @pytest.fixture() 32 | def s3_control_client() -> S3ControlClient: 33 | return boto3.client("s3control", config=CONFIG) 34 | 35 | 36 | @pytest.fixture() 37 | def events_client() -> EventBridgeClient: 38 | return boto3.client("events", config=CONFIG) 39 | 40 | 41 | @pytest.fixture() 42 | def ssm_client() -> SSMClient: 43 | return boto3.client("ssm", config=CONFIG) 44 | 45 | 46 | @pytest.fixture() 47 | def step_functions_client() -> SFNClient: 48 | return boto3.client("stepfunctions", config=CONFIG) 49 | 50 | 51 | @pytest.fixture() 52 | def sqs_resource() -> SQSServiceResource: 53 | return boto3.resource("sqs") 54 | -------------------------------------------------------------------------------- /geostore/processing_assets_model.py: -------------------------------------------------------------------------------- 1 | """Dataset object DynamoDB model.""" 2 | from dataclasses import dataclass 3 | from enum import Enum 4 | from os import environ 5 | from typing import Optional, Type 6 | 7 | from pynamodb.attributes import BooleanAttribute, UnicodeAttribute 8 | from pynamodb.models import Model 9 | 10 | from .aws_keys import AWS_DEFAULT_REGION_KEY 11 | from .parameter_store import ParameterName, get_param 12 | 13 | 14 | class ProcessingAssetType(Enum): 15 | DATA = "DATA_ITEM_INDEX" 16 | METADATA = "METADATA_ITEM_INDEX" 17 | 18 | 19 | class ProcessingAssetsModelBase(Model): 20 | pk = UnicodeAttribute(hash_key=True) 21 | sk = UnicodeAttribute(range_key=True) 22 | url = UnicodeAttribute() 23 | filename = UnicodeAttribute() 24 | multihash = UnicodeAttribute(null=True) 25 | exists_in_staging = BooleanAttribute(null=True) 26 | replaced_in_new_version = BooleanAttribute(null=True) 27 | 28 | 29 | def processing_assets_model_with_meta( 30 | *, assets_table_name: Optional[str] = None 31 | ) -> Type[ProcessingAssetsModelBase]: 32 | if assets_table_name is None: 33 | assets_table_name = get_param(ParameterName.PROCESSING_ASSETS_TABLE_NAME) 34 | 35 | class ProcessingAssetsModel(ProcessingAssetsModelBase): 36 | @dataclass 37 | class Meta: 38 | table_name = assets_table_name 39 | region = environ[AWS_DEFAULT_REGION_KEY] 40 | 41 | return ProcessingAssetsModel 42 | -------------------------------------------------------------------------------- /infrastructure/constructs/import_file_function.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Duration, aws_iam, aws_lambda_python_alpha 2 | from constructs import Construct 3 | 4 | from geostore.environment import ENV_NAME_VARIABLE_NAME 5 | 6 | from .bundled_lambda_function import BundledLambdaFunction 7 | from 
.lambda_config import DEFAULT_LAMBDA_TIMEOUT 8 | from .sts_policy import ALLOW_ASSUME_ANY_ROLE 9 | 10 | 11 | class ImportFileFunction(BundledLambdaFunction): 12 | def __init__( 13 | self, 14 | scope: Construct, 15 | *, 16 | lambda_directory: str, 17 | invoker: aws_iam.Role, 18 | env_name: str, 19 | botocore_lambda_layer: aws_lambda_python_alpha.PythonLayerVersion, 20 | timeout: Duration = DEFAULT_LAMBDA_TIMEOUT, 21 | ): 22 | super().__init__( 23 | scope, 24 | lambda_directory.title().replace("_", ""), 25 | lambda_directory=lambda_directory, 26 | extra_environment={ENV_NAME_VARIABLE_NAME: env_name}, 27 | botocore_lambda_layer=botocore_lambda_layer, 28 | timeout=timeout, 29 | ) 30 | 31 | self.add_to_role_policy( 32 | aws_iam.PolicyStatement( 33 | actions=["s3:GetObject", "s3:GetObjectAcl", "s3:GetObjectTagging", "s3:ListBucket"], 34 | resources=["*"], 35 | ), 36 | ) 37 | self.add_to_role_policy(ALLOW_ASSUME_ANY_ROLE) 38 | 39 | self.grant_invoke(invoker) 40 | -------------------------------------------------------------------------------- /infrastructure/constructs/lambda_endpoint.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import aws_iam, aws_lambda, aws_lambda_python_alpha 2 | from constructs import Construct 3 | 4 | from geostore.environment import ENV_NAME_VARIABLE_NAME 5 | 6 | from .backend import BACKEND_DIRECTORY 7 | from .bundled_code import bundled_code 8 | from .lambda_config import ( 9 | DEFAULT_LAMBDA_MAX_MEMORY_MEBIBYTES, 10 | DEFAULT_LAMBDA_TIMEOUT, 11 | PYTHON_RUNTIME, 12 | RETENTION_DAYS, 13 | ) 14 | 15 | 16 | class LambdaEndpoint(aws_lambda.Function): 17 | def __init__( 18 | self, 19 | scope: Construct, 20 | construct_id: str, 21 | *, 22 | env_name: str, 23 | users_role: aws_iam.Role, 24 | package_name: str, 25 | botocore_lambda_layer: aws_lambda_python_alpha.PythonLayerVersion, 26 | ): 27 | super().__init__( 28 | scope, 29 | f"{construct_id}-function", 30 | function_name=construct_id, 31 | handler=f"{BACKEND_DIRECTORY}.{package_name}.entrypoint.lambda_handler", 32 | runtime=PYTHON_RUNTIME, 33 | timeout=DEFAULT_LAMBDA_TIMEOUT, 34 | code=bundled_code(package_name), 35 | layers=[botocore_lambda_layer], 36 | memory_size=DEFAULT_LAMBDA_MAX_MEMORY_MEBIBYTES, 37 | log_retention=RETENTION_DAYS, 38 | ) 39 | 40 | self.add_environment(ENV_NAME_VARIABLE_NAME, env_name) 41 | self.grant_invoke(users_role) 42 | -------------------------------------------------------------------------------- /tests/test_storage_bucket.py: -------------------------------------------------------------------------------- 1 | from mypy_boto3_s3 import S3Client 2 | from pytest import mark 3 | 4 | from geostore.resources import Resource 5 | 6 | 7 | @mark.infrastructure 8 | def should_create_storage_bucket_location_constraint(s3_client: S3Client) -> None: 9 | """Test if Geostore Storage S3 Bucket is created in correct region.""" 10 | response = s3_client.get_bucket_location(Bucket=Resource.STORAGE_BUCKET_NAME.resource_name) 11 | assert response["LocationConstraint"] == "ap-southeast-2" 12 | 13 | 14 | @mark.infrastructure 15 | def should_enable_storage_bucket_versioning(s3_client: S3Client) -> None: 16 | """Test if Geostore Storage S3 Bucket versioning is enabled.""" 17 | response = s3_client.get_bucket_versioning(Bucket=Resource.STORAGE_BUCKET_NAME.resource_name) 18 | assert response["Status"] == "Enabled" 19 | 20 | 21 | @mark.infrastructure 22 | def should_create_storage_bucket_public_access_block(s3_client: S3Client) -> None: 23 | """Test if Geostore 
Storage S3 Bucket access is blocked for public.""" 24 | response = s3_client.get_public_access_block(Bucket=Resource.STORAGE_BUCKET_NAME.resource_name) 25 | public_access_block_configuration = response["PublicAccessBlockConfiguration"] 26 | assert public_access_block_configuration["BlockPublicAcls"] is True 27 | assert public_access_block_configuration["IgnorePublicAcls"] is True 28 | assert public_access_block_configuration["BlockPublicPolicy"] is True 29 | assert public_access_block_configuration["RestrictPublicBuckets"] is True 30 | -------------------------------------------------------------------------------- /geostore/api_responses.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | from http.client import responses as http_responses 3 | from typing import Callable, Mapping, Union 4 | 5 | from jsonschema import ValidationError, validate 6 | 7 | from .api_keys import MESSAGE_KEY 8 | from .aws_keys import BODY_KEY, HTTP_METHOD_KEY, STATUS_CODE_KEY 9 | from .types import JsonList, JsonObject 10 | 11 | 12 | def error_response(code: int, message: str) -> JsonObject: 13 | return {STATUS_CODE_KEY: code, BODY_KEY: {MESSAGE_KEY: f"{http_responses[code]}: {message}"}} 14 | 15 | 16 | def success_response(code: int, body: Union[JsonList, JsonObject]) -> JsonObject: 17 | return {STATUS_CODE_KEY: code, BODY_KEY: body} 18 | 19 | 20 | def handle_request( 21 | event: JsonObject, request_handlers: Mapping[str, Callable[[JsonObject], JsonObject]] 22 | ) -> JsonObject: 23 | """Main Lambda entry point.""" 24 | 25 | # request validation 26 | try: 27 | validate( 28 | event, 29 | { 30 | "type": "object", 31 | "properties": { 32 | HTTP_METHOD_KEY: {"type": "string", "enum": list(request_handlers.keys())}, 33 | BODY_KEY: {"type": "object"}, 34 | }, 35 | "required": [HTTP_METHOD_KEY, BODY_KEY], 36 | }, 37 | ) 38 | except ValidationError as err: 39 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 40 | 41 | method = event[HTTP_METHOD_KEY] 42 | return request_handlers[method](event[BODY_KEY]) 43 | -------------------------------------------------------------------------------- /tests/test_validation_summary.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | from geostore.api_keys import SUCCESS_KEY 4 | from geostore.error_response_keys import ERROR_MESSAGE_KEY 5 | from geostore.step_function_keys import DATASET_ID_KEY, NEW_VERSION_ID_KEY 6 | from geostore.validation_summary.task import lambda_handler 7 | 8 | from .aws_utils import any_lambda_context 9 | from .stac_generators import any_dataset_id, any_dataset_version_id 10 | 11 | 12 | def should_require_dataset_id() -> None: 13 | response = lambda_handler({NEW_VERSION_ID_KEY: any_dataset_version_id()}, any_lambda_context()) 14 | 15 | assert response == {ERROR_MESSAGE_KEY: "'dataset_id' is a required property"} 16 | 17 | 18 | def should_require_dataset_version() -> None: 19 | response = lambda_handler({DATASET_ID_KEY: any_dataset_id()}, any_lambda_context()) 20 | 21 | assert response == {ERROR_MESSAGE_KEY: "'new_version_id' is a required property"} 22 | 23 | 24 | @patch("geostore.validation_summary.task.validation_results_model_with_meta") 25 | def should_return_success_false_if_any_validation_results_are_unsuccessful( 26 | validation_results_model_mock: MagicMock, 27 | ) -> None: 28 | # Given an unsuccessful result 29 | 
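    # (the mocked validation results index reports a FAILED entry for this dataset
    # version, so the summary handler should report overall success as False)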
validation_results_model_mock.return_value.validation_outcome_index.count.return_value = 1 30 | 31 | response = lambda_handler( 32 | {DATASET_ID_KEY: any_dataset_id(), NEW_VERSION_ID_KEY: any_dataset_version_id()}, 33 | any_lambda_context(), 34 | ) 35 | 36 | assert response == {SUCCESS_KEY: False} 37 | -------------------------------------------------------------------------------- /geostore/import_status/get.py: -------------------------------------------------------------------------------- 1 | """Import Status handler function.""" 2 | from http import HTTPStatus 3 | from logging import Logger 4 | 5 | from jsonschema import ValidationError, validate 6 | from linz_logger import get_log 7 | 8 | from ..api_responses import error_response, success_response 9 | from ..logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_FAILURE, LOG_MESSAGE_LAMBDA_START 10 | from ..parameter_store import ParameterName, get_param 11 | from ..step_function import get_import_status_given_arn 12 | from ..step_function_keys import EXECUTION_ARN_KEY 13 | from ..types import JsonObject 14 | 15 | LOGGER: Logger = get_log() 16 | 17 | 18 | def get_import_status(body: JsonObject) -> JsonObject: 19 | LOGGER.debug( 20 | LOG_MESSAGE_LAMBDA_START, 21 | extra={"lambda_input": body, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 22 | ) 23 | 24 | try: 25 | validate( 26 | body, 27 | { 28 | "type": "object", 29 | "properties": {EXECUTION_ARN_KEY: {"type": "string"}}, 30 | "required": [EXECUTION_ARN_KEY], 31 | }, 32 | ) 33 | except ValidationError as err: 34 | LOGGER.warning( 35 | LOG_MESSAGE_LAMBDA_FAILURE, 36 | extra={"error": err.message, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 37 | ) 38 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 39 | 40 | response_body = get_import_status_given_arn(body[EXECUTION_ARN_KEY]) 41 | 42 | return success_response(HTTPStatus.OK, response_body) 43 | -------------------------------------------------------------------------------- /tests/test_upload_status_logging.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | from geostore.api_keys import SUCCESS_KEY 4 | from geostore.import_file_batch_job_id_keys import ASSET_JOB_ID_KEY, METADATA_JOB_ID_KEY 5 | from geostore.logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_START 6 | from geostore.parameter_store import ParameterName, get_param 7 | from geostore.step_function_keys import ( 8 | DATASET_ID_KEY, 9 | IMPORT_DATASET_KEY, 10 | NEW_VERSION_ID_KEY, 11 | VALIDATION_KEY, 12 | ) 13 | from geostore.upload_status.task import lambda_handler 14 | 15 | from .aws_utils import any_job_id, any_lambda_context 16 | from .stac_generators import any_dataset_id, any_dataset_version_id 17 | 18 | 19 | @patch("geostore.upload_status.task.get_tasks_status") 20 | def should_log_event(get_tasks_status_mock: MagicMock) -> None: 21 | # Given 22 | get_tasks_status_mock.return_value = {} 23 | 24 | event = { 25 | DATASET_ID_KEY: any_dataset_id(), 26 | NEW_VERSION_ID_KEY: any_dataset_version_id(), 27 | VALIDATION_KEY: {SUCCESS_KEY: True}, 28 | IMPORT_DATASET_KEY: { 29 | METADATA_JOB_ID_KEY: any_job_id(), 30 | ASSET_JOB_ID_KEY: any_job_id(), 31 | }, 32 | } 33 | 34 | with patch("geostore.upload_status.task.LOGGER.debug") as logger_mock: 35 | # When 36 | lambda_handler(event, any_lambda_context()) 37 | 38 | # Then 39 | logger_mock.assert_any_call( 40 | LOG_MESSAGE_LAMBDA_START, 41 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 42 | ) 
43 | -------------------------------------------------------------------------------- /infrastructure/networking_stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Stack, Tags, aws_ec2 2 | from constructs import Construct 3 | 4 | from geostore.environment import is_production 5 | 6 | 7 | class NetworkingStack(Stack): 8 | def __init__(self, scope: Construct, stack_id: str) -> None: 9 | super().__init__(scope, stack_id) 10 | 11 | ############################################################################################ 12 | # ### NETWORKING ########################################################################### 13 | ############################################################################################ 14 | 15 | # create new VPC 16 | aws_ec2.Vpc( 17 | self, 18 | "geostore", 19 | # cidr='10.0.0.0/16', # TODO: use specific CIDR pylint:disable=fixme 20 | subnet_configuration=[ 21 | aws_ec2.SubnetConfiguration( 22 | cidr_mask=27, name="public", subnet_type=aws_ec2.SubnetType.PUBLIC 23 | ), 24 | aws_ec2.SubnetConfiguration( 25 | cidr_mask=20, 26 | name="ecs-cluster", 27 | subnet_type=aws_ec2.SubnetType.PRIVATE_ISOLATED, 28 | ), 29 | aws_ec2.SubnetConfiguration( 30 | name="reserved", 31 | subnet_type=aws_ec2.SubnetType.PRIVATE_ISOLATED, 32 | reserved=True, 33 | ), 34 | ], 35 | max_azs=99 if is_production() else 1, 36 | ) 37 | 38 | Tags.of(self).add("ApplicationLayer", "networking") 39 | -------------------------------------------------------------------------------- /infrastructure/constructs/bundled_lambda_function.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Optional 2 | 3 | from aws_cdk import Duration, aws_lambda, aws_lambda_python_alpha 4 | from constructs import Construct 5 | 6 | from .backend import BACKEND_DIRECTORY 7 | from .bundled_code import bundled_code 8 | from .common import LOG_LEVEL 9 | from .lambda_config import ( 10 | DEFAULT_LAMBDA_MAX_MEMORY_MEBIBYTES, 11 | DEFAULT_LAMBDA_TIMEOUT, 12 | PYTHON_RUNTIME, 13 | ) 14 | 15 | 16 | class BundledLambdaFunction(aws_lambda.Function): 17 | def __init__( 18 | self, 19 | scope: Construct, 20 | construct_id: str, 21 | *, 22 | lambda_directory: str, 23 | extra_environment: Optional[Mapping[str, str]], 24 | botocore_lambda_layer: aws_lambda_python_alpha.PythonLayerVersion, 25 | timeout: Duration = DEFAULT_LAMBDA_TIMEOUT, 26 | reserved_concurrent_executions: Optional[int] = None, 27 | ): 28 | environment = {"LOGLEVEL": LOG_LEVEL} 29 | if extra_environment is not None: 30 | environment.update(extra_environment) 31 | 32 | super().__init__( 33 | scope, 34 | construct_id, 35 | code=bundled_code(lambda_directory), 36 | handler=f"{BACKEND_DIRECTORY}.{lambda_directory}.task.lambda_handler", 37 | runtime=PYTHON_RUNTIME, 38 | environment=environment, 39 | layers=[botocore_lambda_layer], 40 | timeout=timeout, 41 | memory_size=DEFAULT_LAMBDA_MAX_MEMORY_MEBIBYTES, 42 | reserved_concurrent_executions=reserved_concurrent_executions, 43 | ) 44 | -------------------------------------------------------------------------------- /geostore/pystac_io_methods.py: -------------------------------------------------------------------------------- 1 | from logging import Logger 2 | from typing import TYPE_CHECKING, Any, Union 3 | 4 | import boto3 5 | from linz_logger import get_log 6 | from pystac.link import Link 7 | from pystac.stac_io import StacIO 8 | 9 | from .boto3_config import CONFIG 10 | from .s3_utils import 
calculate_s3_etag, get_bucket_and_key_from_url, get_s3_etag 11 | 12 | if TYPE_CHECKING: 13 | # When type checking we want to use the third party package's stub 14 | from mypy_boto3_s3 import S3Client 15 | else: 16 | # In production we want to avoid depending on a package which has no runtime impact 17 | S3Client = object # pragma: no mutate 18 | 19 | S3_CLIENT: S3Client = boto3.client("s3", config=CONFIG) 20 | LOGGER: Logger = get_log() 21 | 22 | 23 | class S3StacIO(StacIO): 24 | def read_text( # type: ignore[override] 25 | self, source: Union[str, Link], *_args: Any, **_kwargs: Any 26 | ) -> str: 27 | url = source.href if isinstance(source, Link) else source 28 | bucket, key = get_bucket_and_key_from_url(url) 29 | obj = S3_CLIENT.get_object(Bucket=bucket, Key=key) 30 | result: str = obj["Body"].read().decode("utf-8") 31 | 32 | return result 33 | 34 | def write_text( # type: ignore[override] 35 | self, dest: Union[str, Link], txt: str, *_args: Any, **_kwargs: Any 36 | ) -> None: 37 | url = dest.href if isinstance(dest, Link) else dest 38 | bucket, key = get_bucket_and_key_from_url(url) 39 | 40 | s3_etag = get_s3_etag(bucket, key, LOGGER) 41 | local_etag = calculate_s3_etag(txt.encode()) 42 | 43 | if s3_etag != local_etag: 44 | S3_CLIENT.put_object(Bucket=bucket, Key=key, Body=txt.encode()) 45 | -------------------------------------------------------------------------------- /geostore/datasets/create.py: -------------------------------------------------------------------------------- 1 | """Create dataset function.""" 2 | from http import HTTPStatus 3 | 4 | from jsonschema import ValidationError, validate 5 | from pystac.stac_io import StacIO 6 | 7 | from ..api_responses import error_response, success_response 8 | from ..dataset_properties import TITLE_PATTERN 9 | from ..datasets_model import datasets_model_with_meta 10 | from ..pystac_io_methods import S3StacIO 11 | from ..step_function_keys import DATASET_TITLE_KEY, DESCRIPTION_KEY 12 | from ..types import JsonObject 13 | 14 | StacIO.set_default(S3StacIO) 15 | 16 | 17 | def create_dataset(body: JsonObject) -> JsonObject: 18 | """POST: Create Dataset.""" 19 | 20 | body_schema = { 21 | "type": "object", 22 | "properties": { 23 | DATASET_TITLE_KEY: {"type": "string", "pattern": TITLE_PATTERN}, 24 | DESCRIPTION_KEY: {"type": "string"}, 25 | }, 26 | "required": [DATASET_TITLE_KEY, DESCRIPTION_KEY], 27 | } 28 | 29 | # request body validation 30 | try: 31 | validate(body, body_schema) 32 | except ValidationError as err: 33 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 34 | 35 | # check for duplicate type/title 36 | datasets_model_class = datasets_model_with_meta() 37 | dataset_title = body[DATASET_TITLE_KEY] 38 | if datasets_model_class.datasets_title_idx.count(hash_key=dataset_title): 39 | return error_response(HTTPStatus.CONFLICT, f"dataset '{dataset_title}' already exists") 40 | 41 | # create dataset 42 | dataset = datasets_model_class(title=dataset_title) 43 | dataset.save() 44 | dataset.refresh(consistent_read=True) 45 | 46 | # return response 47 | resp_body = dataset.as_dict() 48 | 49 | return success_response(HTTPStatus.CREATED, resp_body) 50 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: CodeQL Analysis 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ 
github.event.pull_request.number || github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | CodeQL-Build: 15 | runs-on: ubuntu-22.04 16 | 17 | steps: 18 | - name: Checkout repository 19 | if: ${{ github.event_name == 'push' }} 20 | uses: actions/checkout@v3.4.0 21 | 22 | - name: Checkout repository 23 | if: ${{ github.event_name == 'pull_request' }} 24 | uses: actions/checkout@v3.4.0 25 | with: 26 | ref: ${{ github.event.pull_request.head.sha }} 27 | 28 | - name: Get configuration 29 | run: | 30 | echo "PYTHON_VERSION=$(cat .python-version)" | tee -a $GITHUB_ENV 31 | 32 | - name: Use Python ${{ env.PYTHON_VERSION }} 33 | uses: actions/setup-python@v4.5.0 34 | with: 35 | python-version: ${{ env.PYTHON_VERSION }} 36 | 37 | - name: Install dependencies 38 | run: | 39 | python -m pip install --requirement=geostore/pip.txt 40 | python -m pip install --requirement=geostore/poetry.txt 41 | python -m poetry install --all-extras --no-root --only=main 42 | echo "CODEQL_PYTHON=$(python -m poetry run which python)" >> $GITHUB_ENV 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v2.2.7 47 | with: 48 | config-file: ./.github/codeql/codeql-config.yml 49 | setup-python-dependencies: false 50 | languages: python 51 | 52 | - name: Perform CodeQL Analysis 53 | uses: github/codeql-action/analyze@v2.2.7 54 | -------------------------------------------------------------------------------- /tests/test_api_endpoint_handler.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | from typing import Callable, MutableMapping 3 | from unittest.mock import MagicMock 4 | 5 | from pytest_subtests import SubTests 6 | 7 | from geostore.api_keys import MESSAGE_KEY 8 | from geostore.api_responses import handle_request 9 | from geostore.aws_keys import BODY_KEY, HTTP_METHOD_KEY, STATUS_CODE_KEY 10 | from geostore.types import JsonObject 11 | 12 | 13 | def should_return_required_property_error_when_missing_http_method() -> None: 14 | response = handle_request({BODY_KEY: {}}, MagicMock()) 15 | 16 | assert response == { 17 | STATUS_CODE_KEY: HTTPStatus.BAD_REQUEST, 18 | BODY_KEY: {MESSAGE_KEY: f"Bad Request: '{HTTP_METHOD_KEY}' is a required property"}, 19 | } 20 | 21 | 22 | def should_return_required_property_error_when_missing_body() -> None: 23 | response = handle_request({HTTP_METHOD_KEY: "GET"}, MagicMock()) 24 | 25 | assert response == { 26 | STATUS_CODE_KEY: HTTPStatus.BAD_REQUEST, 27 | BODY_KEY: {MESSAGE_KEY: f"Bad Request: '{BODY_KEY}' is a required property"}, 28 | } 29 | 30 | 31 | def should_call_relevant_http_method(subtests: SubTests) -> None: 32 | post_mock = MagicMock() 33 | 34 | get_mock = MagicMock() 35 | get_mock.return_value = expected_response = {"some key": "some value"} 36 | 37 | request_handlers: MutableMapping[str, Callable[[JsonObject], JsonObject]] = { 38 | "POST": post_mock, 39 | "GET": get_mock, 40 | } 41 | 42 | response = handle_request({HTTP_METHOD_KEY: "GET", BODY_KEY: {}}, request_handlers) 43 | 44 | with subtests.test("Should return response"): 45 | assert response == expected_response 46 | with subtests.test("Should call GET method"): 47 | assert get_mock.called 48 | with subtests.test("Should not call POST method"): 49 | assert not post_mock.called 50 | -------------------------------------------------------------------------------- /shell.nix: -------------------------------------------------------------------------------- 1 | let 2 | sources = 
import ./nix/sources.nix; 3 | pkgs = import sources.nixpkgs { }; 4 | python = pkgs.python39; 5 | projectDir = builtins.path { path = ./.; name = "geostore"; }; 6 | 7 | poetryEnv = pkgs.poetry2nix.mkPoetryEnv { 8 | inherit python projectDir; 9 | overrides = pkgs.poetry2nix.overrides.withDefaults (self: super: { 10 | filelock = super.filelock.overridePythonAttrs ( 11 | # In poetry2nix >1.39.1 12 | old: { 13 | nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ self.hatchling self.hatch-vcs ]; 14 | } 15 | ); 16 | python-ulid = super.python-ulid.overridePythonAttrs ( 17 | # In poetry2nix >1.39.1 18 | old: { 19 | nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ self.setuptools-scm ]; 20 | } 21 | ); 22 | virtualenv = super.virtualenv.overridePythonAttrs ( 23 | # https://github.com/nix-community/poetry2nix/pull/985 24 | old: { 25 | nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ self.hatchling self.hatch-vcs ]; 26 | } 27 | ); 28 | }); 29 | }; 30 | in 31 | poetryEnv.env.overrideAttrs ( 32 | oldAttrs: { 33 | buildInputs = [ 34 | pkgs.cacert 35 | pkgs.cargo 36 | pkgs.docker 37 | pkgs.gitFull 38 | pkgs.go 39 | pkgs.niv 40 | pkgs.nodejs 41 | pkgs.python39Packages.pip 42 | pkgs.python39Packages.pip-tools 43 | (pkgs.poetry.override { 44 | inherit python; 45 | }) 46 | pkgs.which 47 | ]; 48 | shellHook = '' 49 | . ${projectDir + "/activate-dev-env.bash"} 50 | ln --force --no-dereference --symbolic ${poetryEnv} .venv 51 | cat <<'EOF' 52 | Welcome to the Geostore development environment! 53 | 54 | Please run `npm install` to install Node.js packages, if you haven't already. 55 | 56 | You should now be able to run `cdk` and `pytest`. 57 | EOF 58 | ''; 59 | } 60 | ) 61 | -------------------------------------------------------------------------------- /geostore/stac_format.py: -------------------------------------------------------------------------------- 1 | STAC_ASSETS_KEY = "assets" 2 | STAC_DESCRIPTION_KEY = "description" 3 | STAC_EXTENSIONS_KEY = "stac_extensions" 4 | STAC_EXTENT_BBOX_KEY = "bbox" 5 | STAC_EXTENT_KEY = "extent" 6 | STAC_EXTENT_SPATIAL_KEY = "spatial" 7 | STAC_EXTENT_TEMPORAL_INTERVAL_KEY = "interval" 8 | STAC_EXTENT_TEMPORAL_KEY = "temporal" 9 | STAC_FILE_CHECKSUM_KEY = "file:checksum" 10 | STAC_GEOMETRY_KEY = "geometry" 11 | STAC_HREF_KEY = "href" 12 | STAC_ID_KEY = "id" 13 | STAC_LICENSE_KEY = "license" 14 | STAC_LINKS_KEY = "links" 15 | STAC_MAXIMUM_KEY = "maximum" 16 | STAC_MEDIA_TYPE_GEOJSON = "application/geo+json" 17 | STAC_MEDIA_TYPE_JSON = "application/json" 18 | STAC_MINIMUM_KEY = "minimum" 19 | STAC_PROPERTIES_DATETIME_KEY = "datetime" 20 | STAC_PROPERTIES_KEY = "properties" 21 | STAC_PROVIDERS_KEY = "providers" 22 | STAC_REL_CHILD = "child" 23 | STAC_REL_ITEM = "item" 24 | STAC_REL_KEY = "rel" 25 | STAC_REL_PARENT = "parent" 26 | STAC_REL_ROOT = "root" 27 | STAC_REL_SELF = "self" 28 | STAC_TITLE_KEY = "title" 29 | STAC_TYPE_CATALOG = "Catalog" 30 | STAC_TYPE_COLLECTION = "Collection" 31 | STAC_TYPE_ITEM = "Feature" 32 | STAC_TYPE_KEY = "type" 33 | STAC_VERSION_KEY = "stac_version" 34 | 35 | LINZ_STAC_EXTENSIONS_BASE_URL = "https://stac.linz.govt.nz" 36 | LINZ_STAC_EXTENSIONS_LOCAL_PATH = "stac" 37 | 38 | LINZ_STAC_EXTENSION_KEY_PREFIX = "linz" 39 | LINZ_STAC_CREATED_KEY = "created" 40 | LINZ_STAC_ASSET_SUMMARIES_KEY = f"{LINZ_STAC_EXTENSION_KEY_PREFIX}:asset_summaries" 41 | LINZ_STAC_GEOSPATIAL_TYPE_KEY = f"{LINZ_STAC_EXTENSION_KEY_PREFIX}:geospatial_type" 42 | LINZ_STAC_HISTORY_KEY = f"{LINZ_STAC_EXTENSION_KEY_PREFIX}:history" 43 | LINZ_STAC_LIFECYCLE_KEY 
= f"{LINZ_STAC_EXTENSION_KEY_PREFIX}:lifecycle" 44 | LINZ_STAC_PROVIDERS_KEY = f"{LINZ_STAC_EXTENSION_KEY_PREFIX}:providers" 45 | LINZ_STAC_SECURITY_CLASSIFICATION_KEY = f"{LINZ_STAC_EXTENSION_KEY_PREFIX}:security_classification" 46 | LINZ_STAC_SECURITY_CLASSIFICATION_UNCLASSIFIED = "unclassified" 47 | LINZ_STAC_UPDATED_KEY = "updated" 48 | 49 | PROJECTION_EPSG_KEY = "proj:epsg" 50 | 51 | VERSION_VERSION_KEY = "version" 52 | -------------------------------------------------------------------------------- /.github/workflows/package-cli.yml: -------------------------------------------------------------------------------- 1 | name: Package CLI for PyPI release 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | types: [opened, reopened, synchronize] 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | build: 15 | runs-on: ubuntu-22.04 16 | steps: 17 | - name: Checkout repository 18 | if: ${{ github.event_name == 'push' }} 19 | uses: actions/checkout@v3.4.0 20 | with: 21 | submodules: 'true' 22 | 23 | - name: Checkout repository 24 | if: ${{ github.event_name == 'pull_request' }} 25 | uses: actions/checkout@v3.4.0 26 | with: 27 | ref: ${{ github.event.pull_request.head.sha }} 28 | submodules: 'true' 29 | 30 | - name: Get Python version 31 | run: echo "PYTHON_VERSION=$(cat .python-version)" >> "$GITHUB_ENV" 32 | 33 | - name: Use Python ${{ env.PYTHON_VERSION }} 34 | uses: actions/setup-python@v4.5.0 35 | with: 36 | python-version: ${{ env.PYTHON_VERSION }} 37 | 38 | - name: Cache pip 39 | uses: actions/cache@v3.3.1 40 | with: 41 | path: ~/.cache/pip 42 | key: 43 | ${{ runner.os }}-pip-${{ secrets.CACHE_SEED }}-${{ env.PYTHON_VERSION }}-${{ 44 | hashFiles('./poetry.lock') }} 45 | restore-keys: | 46 | ${{ runner.os }}-pip-${{ secrets.CACHE_SEED }}-${{ env.PYTHON_VERSION }}- 47 | 48 | - name: Install Python dependencies 49 | run: | 50 | python -m pip install --requirement=geostore/pip.txt 51 | python -m pip install --requirement=geostore/poetry.txt 52 | python -m poetry install --no-root --only=main 53 | 54 | - name: Build 55 | run: poetry build 56 | 57 | - name: Archive build artifacts 58 | uses: actions/upload-artifact@v3.1.2 59 | with: 60 | name: packages 61 | path: dist/* 62 | if-no-files-found: error 63 | -------------------------------------------------------------------------------- /geostore/parameter_store.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | from functools import lru_cache 3 | from logging import Logger 4 | from typing import TYPE_CHECKING, Sequence 5 | 6 | import boto3 7 | from linz_logger import get_log 8 | 9 | from .boto3_config import CONFIG 10 | from .environment import environment_name 11 | 12 | if TYPE_CHECKING: 13 | # When type checking we want to use the third party package's stub 14 | from mypy_boto3_ssm import SSMClient 15 | else: 16 | # In production we want to avoid depending on a package which has no runtime impact 17 | SSMClient = object # pragma: no mutate 18 | 19 | LOGGER: Logger = get_log() 20 | SSM_CLIENT: SSMClient = boto3.client("ssm", config=CONFIG) 21 | LOG_MESSAGE_PARAMETER_NOT_FOUND = "Parameter:DoesNotExist" 22 | 23 | 24 | class ParameterName(Enum): 25 | # Use @staticmethod instead of all the ignores on the next line once we move to Python 3.9 26 | # . 
27 | def _generate_next_value_( # type: ignore[misc,override] # pylint:disable=no-self-argument,no-member 28 | name: str, _start: int, _count: int, _last_values: Sequence[str] 29 | ) -> str: 30 | return f"/{environment_name()}/{name.lower()}" 31 | 32 | GIT_COMMIT = auto() 33 | PROCESSING_ASSETS_TABLE_NAME = auto() 34 | PROCESSING_DATASET_VERSION_CREATION_STEP_FUNCTION_ARN = auto() 35 | PROCESSING_IMPORT_ASSET_FILE_FUNCTION_TASK_ARN = auto() 36 | PROCESSING_IMPORT_DATASET_ROLE_ARN = auto() 37 | PROCESSING_IMPORT_METADATA_FILE_FUNCTION_TASK_ARN = auto() 38 | UPDATE_CATALOG_MESSAGE_QUEUE_NAME = auto() 39 | S3_USERS_ROLE_ARN = auto() 40 | STATUS_SNS_TOPIC_ARN = auto() 41 | STORAGE_DATASETS_TABLE_NAME = auto() 42 | STORAGE_VALIDATION_RESULTS_TABLE_NAME = auto() 43 | 44 | 45 | @lru_cache 46 | def get_param(parameter: ParameterName) -> str: 47 | try: 48 | return SSM_CLIENT.get_parameter(Name=parameter.value)["Parameter"]["Value"] 49 | except SSM_CLIENT.exceptions.ParameterNotFound: 50 | LOGGER.error(LOG_MESSAGE_PARAMETER_NOT_FOUND, extra={"parameter_value": parameter.value}) 51 | raise 52 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: 'Bug: ' 5 | labels: 'bug' 6 | --- 7 | 8 | 15 | 16 | ## Bug Description 17 | 18 | 19 | 20 | ## How to Reproduce 21 | 22 | 23 | 24 | 1. Do … 25 | 1. Run `…` 26 | 27 | What did you expect to happen? 28 | 29 | What actually happened? 30 | 31 | ## Software Context 32 | 33 | Operating system: 34 | 35 | Environment: 36 | 37 | Relevant software versions: 38 | 39 | - AWS CLI: 40 | - Poetry: 41 | 42 | 43 | ## Additional context 44 | 45 | 46 | 47 | #### Definition of Done 48 | 49 | - [ ] This bug is **done**: 50 | - [ ] Bug resolved to **user's** satisfaction 51 | - [ ] Automated tests are passing 52 | - [ ] Code is peer reviewed and pushed to master 53 | - [ ] Deployed successfully to test environment 54 | - [ ] Checked against 55 | [CODING guidelines](https://github.com/linz/geostore/blob/master/CODING.md) 56 | - [ ] Relevant new tasks are added to backlog and communicated to the team 57 | - [ ] Important decisions recorded in the issue ticket 58 | - [ ] Readme/Changelog/Diagrams are updated 59 | - [ ] Product Owner has approved as complete 60 | - [ ] No regression to functional or 61 | [non-functional](https://github.com/linz/geostore/blob/master/.github/ISSUE_TEMPLATE/user_story.md) 62 | requirements 63 | -------------------------------------------------------------------------------- /infrastructure/constructs/batch_submit_job_task.py: -------------------------------------------------------------------------------- 1 | from typing import List, Mapping, Optional 2 | 3 | from aws_cdk import aws_batch_alpha, aws_iam, aws_stepfunctions, aws_stepfunctions_tasks 4 | from constructs import Construct 5 | 6 | from .common import LOG_LEVEL 7 | from .task_job_definition import TaskJobDefinition 8 | 9 | 10 | class BatchSubmitJobTask(Construct): 11 | def __init__( 12 | self, 13 | scope: Construct, 14 | construct_id: str, 15 | *, 16 | env_name: str, 17 | directory: str, 18 | s3_policy: aws_iam.IManagedPolicy, 19 | job_queue: aws_batch_alpha.JobQueue, 20 | payload_object: Mapping[str, str], 21 | container_overrides_command: List[str], 22 | array_size: Optional[int] = None, 23 | ): 24 | super().__init__(scope, construct_id) 25 | 26 | self.job_role = 
aws_iam.Role( 27 | self, 28 | f"{construct_id}-batch-job-role", 29 | assumed_by=aws_iam.ServicePrincipal("ecs-tasks.amazonaws.com"), 30 | managed_policies=[s3_policy], 31 | ) 32 | 33 | job_definition_arn = TaskJobDefinition( 34 | self, 35 | f"{construct_id}-task-definition", 36 | env_name=env_name, 37 | directory=directory, 38 | job_role=self.job_role, 39 | ).job_definition_arn 40 | 41 | container_overrides = aws_stepfunctions_tasks.BatchContainerOverrides( 42 | command=container_overrides_command, 43 | environment={"LOGLEVEL": LOG_LEVEL}, 44 | ) 45 | payload = aws_stepfunctions.TaskInput.from_object(payload_object) 46 | self.batch_submit_job = aws_stepfunctions_tasks.BatchSubmitJob( 47 | scope, 48 | f"{construct_id}-batch-submit-job", 49 | job_name=f"{construct_id}-job", 50 | job_definition_arn=job_definition_arn, 51 | job_queue_arn=job_queue.job_queue_arn, 52 | array_size=array_size, 53 | result_path=aws_stepfunctions.JsonPath.DISCARD, 54 | container_overrides=container_overrides, 55 | payload=payload, 56 | ) 57 | -------------------------------------------------------------------------------- /geostore/validation_summary/task.py: -------------------------------------------------------------------------------- 1 | from logging import Logger 2 | 3 | from jsonschema import ValidationError, validate 4 | from linz_logger import get_log 5 | 6 | from ..api_keys import SUCCESS_KEY 7 | from ..error_response_keys import ERROR_MESSAGE_KEY 8 | from ..logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_START, LOG_MESSAGE_VALIDATION_COMPLETE 9 | from ..models import DATASET_ID_PREFIX, DB_KEY_SEPARATOR, VERSION_ID_PREFIX 10 | from ..parameter_store import ParameterName, get_param 11 | from ..step_function import Outcome 12 | from ..step_function_keys import DATASET_ID_KEY, NEW_VERSION_ID_KEY 13 | from ..types import JsonObject 14 | from ..validation_results_model import ValidationResult, validation_results_model_with_meta 15 | 16 | LOGGER: Logger = get_log() 17 | 18 | 19 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 20 | LOGGER.debug( 21 | LOG_MESSAGE_LAMBDA_START, 22 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 23 | ) 24 | 25 | try: 26 | validate( 27 | event, 28 | { 29 | "type": "object", 30 | "properties": { 31 | DATASET_ID_KEY: {"type": "string"}, 32 | NEW_VERSION_ID_KEY: {"type": "string"}, 33 | }, 34 | "required": [DATASET_ID_KEY, NEW_VERSION_ID_KEY], 35 | }, 36 | ) 37 | except ValidationError as error: 38 | return {ERROR_MESSAGE_KEY: error.message} 39 | 40 | validation_results_model = validation_results_model_with_meta() 41 | success = not bool( 42 | validation_results_model.validation_outcome_index.count( 43 | ( 44 | f"{DATASET_ID_PREFIX}{event[DATASET_ID_KEY]}" 45 | f"{DB_KEY_SEPARATOR}{VERSION_ID_PREFIX}{event[NEW_VERSION_ID_KEY]}" 46 | ), 47 | range_key_condition=validation_results_model.result == ValidationResult.FAILED.value, 48 | limit=1, 49 | ) 50 | ) 51 | 52 | result = {SUCCESS_KEY: success} 53 | LOGGER.debug( 54 | LOG_MESSAGE_VALIDATION_COMPLETE, 55 | extra={"outcome": Outcome.PASSED, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 56 | ) 57 | return result 58 | -------------------------------------------------------------------------------- /infrastructure/constructs/task_job_definition.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | from pathlib import Path 3 | from subprocess import check_call 4 | 5 | from aws_cdk import aws_batch_alpha, aws_ecs, 
aws_iam 6 | from constructs import Construct 7 | 8 | from geostore.aws_keys import AWS_DEFAULT_REGION_KEY 9 | from geostore.environment import ENV_NAME_VARIABLE_NAME, is_production 10 | from infrastructure.constructs.bundled_code import LambdaPackaging 11 | 12 | from .backend import BACKEND_DIRECTORY 13 | 14 | 15 | class TaskJobDefinition(aws_batch_alpha.JobDefinition): 16 | def __init__( 17 | self, 18 | scope: Construct, 19 | construct_id: str, 20 | *, 21 | env_name: str, 22 | directory: str, 23 | job_role: aws_iam.Role, 24 | ): 25 | if is_production(): 26 | batch_job_definition_memory_limit = 3900 27 | else: 28 | batch_job_definition_memory_limit = 500 29 | 30 | python_version_path = Path(__file__).parent / "../../.python-version" 31 | with python_version_path.open() as python_version: 32 | docker_python_version = python_version.read().rstrip() 33 | 34 | check_call( 35 | [ 36 | "poetry", 37 | "export", 38 | f"--extras={directory}", 39 | "--without-hashes", 40 | f"--output={LambdaPackaging.directory}/{directory}.txt", 41 | ] 42 | ) 43 | 44 | image = aws_ecs.ContainerImage.from_asset( 45 | directory=".", 46 | build_args={ 47 | "python_version": docker_python_version, 48 | "task": directory, 49 | "packaging": LambdaPackaging.directory, 50 | }, 51 | file=join(BACKEND_DIRECTORY, "Dockerfile"), 52 | ) 53 | 54 | container = aws_batch_alpha.JobDefinitionContainer( 55 | image=image, 56 | job_role=job_role, 57 | memory_limit_mib=batch_job_definition_memory_limit, 58 | vcpus=1, 59 | environment={ 60 | AWS_DEFAULT_REGION_KEY: job_role.stack.region, 61 | ENV_NAME_VARIABLE_NAME: env_name, 62 | }, 63 | ) 64 | 65 | super().__init__(scope, construct_id, container=container) 66 | -------------------------------------------------------------------------------- /geostore/datasets/update.py: -------------------------------------------------------------------------------- 1 | """Update dataset function.""" 2 | from http import HTTPStatus 3 | 4 | from jsonschema import ValidationError, validate 5 | from pynamodb.exceptions import DoesNotExist 6 | 7 | from ..api_responses import error_response, success_response 8 | from ..datasets_model import DatasetsModelBase, datasets_model_with_meta 9 | from ..models import DATASET_ID_PREFIX 10 | from ..step_function_keys import DATASET_ID_SHORT_KEY, DATASET_TITLE_KEY 11 | from ..types import JsonObject 12 | 13 | 14 | def update_dataset(body: JsonObject) -> JsonObject: 15 | """PATCH: Update Dataset.""" 16 | 17 | body_schema = { 18 | "type": "object", 19 | "properties": { 20 | DATASET_ID_SHORT_KEY: {"type": "string"}, 21 | DATASET_TITLE_KEY: {"type": "string"}, 22 | }, 23 | "required": [DATASET_ID_SHORT_KEY, DATASET_TITLE_KEY], 24 | } 25 | 26 | # request body validation 27 | try: 28 | validate(body, body_schema) 29 | except ValidationError as err: 30 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 31 | 32 | # check for duplicate type/title 33 | datasets_model_class = datasets_model_with_meta() 34 | dataset_title = body[DATASET_TITLE_KEY] 35 | if datasets_model_class.datasets_title_idx.count(hash_key=dataset_title): 36 | return error_response(HTTPStatus.CONFLICT, f"dataset '{dataset_title}' already exists") 37 | 38 | # get dataset to update 39 | dataset_id = body[DATASET_ID_SHORT_KEY] 40 | try: 41 | dataset = datasets_model_class.get( 42 | hash_key=f"{DATASET_ID_PREFIX}{dataset_id}", consistent_read=True 43 | ) 44 | except DoesNotExist: 45 | return error_response(HTTPStatus.NOT_FOUND, f"dataset '{dataset_id}' does not exist") 46 | 47 | # update dataset 48 | 
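    # (update_dataset_attributes, defined below, copies each model attribute present
    # in the request body onto the record, skipping the immutable "id" attribute)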
update_dataset_attributes(dataset, body) 49 | dataset.save() 50 | dataset.refresh(consistent_read=True) 51 | 52 | # return response 53 | resp_body = dataset.as_dict() 54 | 55 | return success_response(HTTPStatus.OK, resp_body) 56 | 57 | 58 | def update_dataset_attributes(dataset: DatasetsModelBase, req_body: JsonObject) -> None: 59 | for attr in DatasetsModelBase.get_attributes(): 60 | if attr in req_body and attr != "id": 61 | setattr(dataset, attr, req_body[attr]) 62 | -------------------------------------------------------------------------------- /geostore/import_metadata_file/task.py: -------------------------------------------------------------------------------- 1 | from json import dumps, load 2 | from os.path import basename 3 | from typing import TYPE_CHECKING, Dict, Iterable, List 4 | 5 | import boto3 6 | 7 | from ..boto3_config import CONFIG 8 | from ..import_dataset_file import get_import_result 9 | from ..stac_format import ( 10 | STAC_ASSETS_KEY, 11 | STAC_HREF_KEY, 12 | STAC_LINKS_KEY, 13 | STAC_REL_KEY, 14 | STAC_REL_SELF, 15 | ) 16 | from ..types import JsonObject 17 | 18 | S3_BODY_KEY = "Body" 19 | 20 | if TYPE_CHECKING: 21 | from mypy_boto3_s3 import S3Client 22 | from mypy_boto3_s3.type_defs import PutObjectOutputTypeDef 23 | else: 24 | PutObjectOutputTypeDef = JsonObject # pragma: no mutate 25 | S3Client = object # pragma: no mutate 26 | 27 | TARGET_S3_CLIENT: S3Client = boto3.client("s3", config=CONFIG) 28 | 29 | 30 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 31 | return get_import_result(event, importer) 32 | 33 | 34 | def importer( 35 | source_bucket_name: str, 36 | original_key: str, 37 | target_bucket_name: str, 38 | new_key: str, 39 | source_s3_client: S3Client, 40 | ) -> PutObjectOutputTypeDef: 41 | get_object_response = source_s3_client.get_object(Bucket=source_bucket_name, Key=original_key) 42 | assert S3_BODY_KEY in get_object_response, get_object_response 43 | 44 | metadata = load(get_object_response["Body"]) 45 | 46 | assets = metadata.get(STAC_ASSETS_KEY, {}).values() 47 | change_href_to_basename(assets) 48 | 49 | links = metadata.get(STAC_LINKS_KEY, []) 50 | delete_self_links(links) 51 | change_href_to_basename(links) 52 | update_root_link(links) 53 | 54 | return TARGET_S3_CLIENT.put_object( 55 | Bucket=target_bucket_name, 56 | Key=new_key, 57 | Body=dumps(metadata).encode(), 58 | ) 59 | 60 | 61 | def change_href_to_basename(items: Iterable[Dict[str, str]]) -> None: 62 | for item in items: 63 | item[STAC_HREF_KEY] = basename(item[STAC_HREF_KEY]) 64 | 65 | 66 | def delete_self_links(items: List[Dict[str, str]]) -> None: 67 | items[:] = [item for item in items if item[STAC_REL_KEY] != STAC_REL_SELF] 68 | 69 | 70 | def update_root_link(items: List[Dict[str, str]]) -> None: 71 | for item in items: 72 | if item[STAC_REL_KEY] == "root": 73 | item[STAC_HREF_KEY] = "../catalog.json" 74 | -------------------------------------------------------------------------------- /geostore/datasets/delete.py: -------------------------------------------------------------------------------- 1 | """Delete dataset function.""" 2 | from http import HTTPStatus 3 | from typing import TYPE_CHECKING 4 | 5 | import boto3 6 | from jsonschema import ValidationError, validate 7 | from pynamodb.exceptions import DoesNotExist 8 | 9 | from ..api_responses import error_response, success_response 10 | from ..boto3_config import CONFIG 11 | from ..datasets_model import datasets_model_with_meta 12 | from ..models import DATASET_ID_PREFIX 13 | from ..resources import 
Resource 14 | from ..step_function_keys import DATASET_ID_SHORT_KEY 15 | from ..types import JsonObject 16 | 17 | if TYPE_CHECKING: 18 | # When type checking we want to use the third party package's stub 19 | from mypy_boto3_s3 import S3Client 20 | else: 21 | # In production we want to avoid depending on a package which has no runtime impact 22 | S3Client = object # pragma: no mutate 23 | 24 | S3_CLIENT: S3Client = boto3.client("s3", config=CONFIG) 25 | 26 | 27 | def delete_dataset(body: JsonObject) -> JsonObject: 28 | """DELETE: Delete Dataset.""" 29 | 30 | body_schema = { 31 | "type": "object", 32 | "properties": {DATASET_ID_SHORT_KEY: {"type": "string"}}, 33 | "required": [DATASET_ID_SHORT_KEY], 34 | } 35 | 36 | # request body validation 37 | try: 38 | validate(body, body_schema) 39 | except ValidationError as err: 40 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 41 | 42 | datasets_model_class = datasets_model_with_meta() 43 | 44 | # get dataset to delete 45 | dataset_id = body[DATASET_ID_SHORT_KEY] 46 | try: 47 | dataset = datasets_model_class.get( 48 | hash_key=f"{DATASET_ID_PREFIX}{dataset_id}", consistent_read=True 49 | ) 50 | except DoesNotExist: 51 | return error_response(HTTPStatus.NOT_FOUND, f"dataset '{dataset_id}' does not exist") 52 | 53 | # Verify that the dataset is empty 54 | list_objects_response = S3_CLIENT.list_objects_v2( 55 | Bucket=Resource.STORAGE_BUCKET_NAME.resource_name, MaxKeys=1, Prefix=f"{dataset_id}/" 56 | ) 57 | if list_objects_response["KeyCount"]: 58 | return error_response( 59 | HTTPStatus.CONFLICT, 60 | f"Can’t delete dataset “{dataset_id}”: dataset versions still exist", 61 | ) 62 | 63 | # delete dataset 64 | dataset.delete() 65 | 66 | return success_response(HTTPStatus.NO_CONTENT, {}) 67 | -------------------------------------------------------------------------------- /geostore/upload_status/task.py: -------------------------------------------------------------------------------- 1 | from logging import Logger 2 | 3 | from jsonschema import validate 4 | from linz_logger import get_log 5 | 6 | from ..api_keys import SUCCESS_KEY 7 | from ..import_file_batch_job_id_keys import ASSET_JOB_ID_KEY, METADATA_JOB_ID_KEY 8 | from ..logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_START 9 | from ..parameter_store import ParameterName, get_param 10 | from ..step_function import get_tasks_status 11 | from ..step_function_keys import ( 12 | ASSET_UPLOAD_KEY, 13 | DATASET_ID_KEY, 14 | IMPORT_DATASET_KEY, 15 | JOB_STATUS_RUNNING, 16 | METADATA_UPLOAD_KEY, 17 | NEW_VERSION_ID_KEY, 18 | VALIDATION_KEY, 19 | ) 20 | from ..types import JsonObject 21 | 22 | INPUT_KEY = "input" 23 | EXECUTION_ID_KEY = "execution_id" 24 | 25 | LOGGER: Logger = get_log() 26 | 27 | 28 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 29 | LOGGER.debug( 30 | LOG_MESSAGE_LAMBDA_START, 31 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 32 | ) 33 | 34 | validate( 35 | event, 36 | { 37 | "type": "object", 38 | "properties": { 39 | DATASET_ID_KEY: {"type": "string"}, 40 | NEW_VERSION_ID_KEY: {"type": "string"}, 41 | VALIDATION_KEY: { 42 | "type": "object", 43 | "properties": {SUCCESS_KEY: {"type": "boolean"}}, 44 | "required": [SUCCESS_KEY], 45 | }, 46 | IMPORT_DATASET_KEY: { 47 | "type": "object", 48 | "properties": { 49 | METADATA_JOB_ID_KEY: {"type": "string"}, 50 | ASSET_JOB_ID_KEY: {"type": "string"}, 51 | }, 52 | "required": [METADATA_JOB_ID_KEY, ASSET_JOB_ID_KEY], 53 | }, 54 | }, 55 | "required": [DATASET_ID_KEY, 
NEW_VERSION_ID_KEY, VALIDATION_KEY, IMPORT_DATASET_KEY], 56 | }, 57 | ) 58 | 59 | raw_import_status = get_tasks_status( 60 | JOB_STATUS_RUNNING, 61 | event[DATASET_ID_KEY], 62 | event[NEW_VERSION_ID_KEY], 63 | event[VALIDATION_KEY][SUCCESS_KEY], 64 | { 65 | METADATA_JOB_ID_KEY: event[IMPORT_DATASET_KEY][METADATA_JOB_ID_KEY], 66 | ASSET_JOB_ID_KEY: event[IMPORT_DATASET_KEY][ASSET_JOB_ID_KEY], 67 | }, 68 | ) 69 | return { 70 | key: raw_import_status[key] 71 | for key in [VALIDATION_KEY, ASSET_UPLOAD_KEY, METADATA_UPLOAD_KEY] 72 | if key in raw_import_status 73 | } 74 | -------------------------------------------------------------------------------- /geostore/check_files_checksums/task.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from logging import Logger 3 | from optparse import OptionParser, Values # pylint: disable=deprecated-module 4 | 5 | from linz_logger import get_log 6 | 7 | from ..models import DB_KEY_SEPARATOR 8 | from ..processing_assets_model import ProcessingAssetType 9 | from ..s3_utils import get_s3_url_reader 10 | from ..step_function import AssetGarbageCollector, get_hash_key 11 | from ..validation_results_model import ValidationResultFactory 12 | from .utils import ChecksumUtils, get_job_offset 13 | 14 | ASSETS_TABLE_NAME_ARGUMENT = "--assets-table-name" 15 | CURRENT_VERSION_ID_ARGUMENT = "--current-version-id" 16 | DATASET_ID_ARGUMENT = "--dataset-id" 17 | DATASET_TITLE_ARGUMENT = "--dataset-title" 18 | FIRST_ITEM_ARGUMENT = "--first-item" 19 | NEW_VERSION_ID_ARGUMENT = "--new-version-id" 20 | RESULTS_TABLE_NAME_ARGUMENT = "--results-table-name" 21 | S3_ROLE_ARN_ARGUMENT = "--s3-role-arn" 22 | 23 | LOGGER: Logger = get_log() 24 | 25 | 26 | def parse_arguments() -> Values: 27 | parser = OptionParser() 28 | parser.add_option(DATASET_ID_ARGUMENT) 29 | parser.add_option(NEW_VERSION_ID_ARGUMENT) 30 | parser.add_option(CURRENT_VERSION_ID_ARGUMENT) 31 | parser.add_option(DATASET_TITLE_ARGUMENT) 32 | parser.add_option(FIRST_ITEM_ARGUMENT, type=int) 33 | parser.add_option(RESULTS_TABLE_NAME_ARGUMENT) 34 | parser.add_option(ASSETS_TABLE_NAME_ARGUMENT) 35 | parser.add_option(S3_ROLE_ARN_ARGUMENT) 36 | (options, _args) = parser.parse_args() 37 | 38 | for option in parser.option_list: 39 | if option.dest is not None: 40 | assert hasattr(options, option.dest) 41 | 42 | return options 43 | 44 | 45 | def main() -> None: 46 | arguments = parse_arguments() 47 | 48 | index = arguments.first_item + get_job_offset() 49 | hash_key = get_hash_key(arguments.dataset_id, arguments.new_version_id) 50 | range_key = f"{ProcessingAssetType.DATA.value}{DB_KEY_SEPARATOR}{index}" 51 | validation_result_factory = ValidationResultFactory(hash_key, arguments.results_table_name) 52 | s3_url_reader = get_s3_url_reader(arguments.s3_role_arn, arguments.dataset_title, LOGGER) 53 | 54 | asset_garbage_collector = AssetGarbageCollector( 55 | arguments.dataset_id, 56 | arguments.current_version_id, 57 | ProcessingAssetType.DATA, 58 | LOGGER, 59 | arguments.assets_table_name, 60 | ) 61 | 62 | utils = ChecksumUtils( 63 | arguments.assets_table_name, 64 | validation_result_factory, 65 | s3_url_reader, 66 | asset_garbage_collector, 67 | LOGGER, 68 | ) 69 | utils.run(hash_key, range_key) 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | -------------------------------------------------------------------------------- /tests/test_validation_summary_logging.py: -------------------------------------------------------------------------------- 1 | 
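# Logging tests for the validation summary Lambda handler: it should log the incoming
# event on entry and a validation-complete message carrying the outcome.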
from unittest.mock import MagicMock, patch 2 | 3 | from geostore.logging_keys import ( 4 | GIT_COMMIT, 5 | LOG_MESSAGE_LAMBDA_START, 6 | LOG_MESSAGE_VALIDATION_COMPLETE, 7 | ) 8 | from geostore.parameter_store import ParameterName, get_param 9 | from geostore.step_function import Outcome 10 | from geostore.step_function_keys import DATASET_ID_KEY, NEW_VERSION_ID_KEY 11 | from geostore.validation_summary import task 12 | 13 | from .aws_utils import any_lambda_context 14 | from .stac_generators import any_dataset_id, any_dataset_version_id 15 | 16 | 17 | def should_log_event() -> None: 18 | # Given 19 | event = {DATASET_ID_KEY: any_dataset_id(), NEW_VERSION_ID_KEY: any_dataset_version_id()} 20 | 21 | with patch("geostore.validation_summary.task.validation_results_model_with_meta"), patch( 22 | "geostore.validation_summary.task.LOGGER.debug" 23 | ) as logger_mock: 24 | # When 25 | task.lambda_handler(event, any_lambda_context()) 26 | 27 | # Then 28 | logger_mock.assert_any_call( 29 | LOG_MESSAGE_LAMBDA_START, 30 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 31 | ) 32 | 33 | 34 | @patch("geostore.validation_summary.task.validation_results_model_with_meta") 35 | def should_log_failure_result(validation_results_model_mock: MagicMock) -> None: 36 | # Given 37 | event = {DATASET_ID_KEY: any_dataset_id(), NEW_VERSION_ID_KEY: any_dataset_version_id()} 38 | validation_results_model_mock.return_value.validation_outcome_index.count.return_value = 1 39 | 40 | with patch("geostore.validation_summary.task.LOGGER.debug") as logger_mock: 41 | # When 42 | task.lambda_handler(event, any_lambda_context()) 43 | 44 | # Then 45 | logger_mock.assert_any_call( 46 | LOG_MESSAGE_VALIDATION_COMPLETE, 47 | extra={"outcome": Outcome.PASSED, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 48 | ) 49 | 50 | 51 | @patch("geostore.validation_summary.task.validation_results_model_with_meta") 52 | def should_log_success_result(validation_results_model_mock: MagicMock) -> None: 53 | # Given 54 | event = {DATASET_ID_KEY: any_dataset_id(), NEW_VERSION_ID_KEY: any_dataset_version_id()} 55 | validation_results_model_mock.return_value.validation_outcome_index.count.return_value = 0 56 | 57 | with patch("geostore.validation_summary.task.LOGGER.debug") as logger_mock: 58 | # When 59 | task.lambda_handler(event, any_lambda_context()) 60 | 61 | # Then 62 | logger_mock.assert_any_call( 63 | LOG_MESSAGE_VALIDATION_COMPLETE, 64 | extra={"outcome": Outcome.PASSED, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 65 | ) 66 | -------------------------------------------------------------------------------- /reset-dev-env.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit -o noclobber -o nounset -o pipefail 4 | 5 | usage() { 6 | cat >&2 <<'EOF' 7 | ./reset-dev-env.bash --all 8 | ./reset-dev-env.bash [--delete] [--hooks] [--node] [--python] [--submodule] 9 | ./reset-dev-env.bash --help 10 | 11 | `--all` implies `--delete --hooks --node --python --submodule`. 
12 | EOF 13 | } 14 | 15 | arguments="$(getopt --options '' \ 16 | --longoptions all,delete,help,hooks,node,python,submodule --name "$0" -- "$@")" 17 | eval set -- "$arguments" 18 | unset arguments 19 | 20 | while true; do 21 | case "$1" in 22 | --all) 23 | delete=1 24 | hooks=1 25 | node=1 26 | python=1 27 | submodule=1 28 | shift 29 | ;; 30 | --delete) 31 | delete=1 32 | shift 33 | ;; 34 | --help) 35 | usage 36 | exit 37 | ;; 38 | --hooks) 39 | hooks=1 40 | shift 41 | ;; 42 | --node) 43 | node=1 44 | shift 45 | ;; 46 | --python) 47 | python=1 48 | shift 49 | ;; 50 | --submodule) 51 | submodule=1 52 | shift 53 | ;; 54 | --) 55 | shift 56 | break 57 | ;; 58 | *) 59 | printf 'Not implemented: %q\n' "$1" >&2 60 | exit 1 61 | ;; 62 | esac 63 | done 64 | 65 | if [[ -z ${hooks-} ]] && 66 | [[ -z ${node-} ]] && 67 | [[ -z ${python-} ]] && 68 | [[ -z ${submodule-} ]]; then 69 | usage 70 | exit 1 71 | fi 72 | 73 | cd "$(dirname "${BASH_SOURCE[0]}")" 74 | 75 | if [[ -n ${delete-} ]]; then 76 | echo "Cleaning Git repository" 77 | git clean -d --exclude='.idea' --force -x 78 | fi 79 | 80 | if [[ -n ${submodule-} ]]; then 81 | echo "Updating submodules" 82 | git submodule update --init 83 | fi 84 | 85 | if [[ -n ${node-} ]]; then 86 | if [[ -n ${delete-} ]]; then 87 | echo "Removing Node.js packages" 88 | rm --force --recursive ./node_modules 89 | fi 90 | 91 | echo "Installing Node.js packages" 92 | npm ci 93 | fi 94 | 95 | if [[ -n ${python-} ]]; then 96 | if [[ -n ${delete-} ]]; then 97 | echo "Removing Python packages" 98 | rm --force --recursive ./.venv 99 | fi 100 | 101 | echo "Installing Python packages" 102 | poetry env use "$(cat .python-version)" 103 | poetry install --all-extras --no-root --sync 104 | fi 105 | 106 | if [[ -n ${hooks-} ]]; then 107 | echo "Installing Git hooks" 108 | 109 | # shellcheck source=/dev/null 110 | . .venv/bin/activate 111 | 112 | pre-commit install --hook-type=commit-msg --overwrite 113 | pre-commit install --hook-type=pre-commit --overwrite 114 | fi 115 | -------------------------------------------------------------------------------- /infrastructure/constructs/bundled_code.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from dataclasses import dataclass 3 | from re import sub 4 | from subprocess import check_call, check_output 5 | from sys import executable 6 | from typing import List 7 | 8 | from aws_cdk import BundlingOptions, aws_lambda 9 | 10 | from .backend import BACKEND_DIRECTORY 11 | from .lambda_config import PYTHON_RUNTIME 12 | 13 | 14 | def poetry_export_extras(lambda_directory: str) -> List[str]: 15 | # There isn't an elegant way of getting poetry to install package dependencies in a bespoke 16 | # target lambda_directory within Python, so we export a requirements file and install using pip. 
17 | # This has been raised and discussed by the community as below: 18 | # https://github.com/python-poetry/poetry/issues/1937 19 | 20 | export_extras = check_output( 21 | ["poetry", "export", f"--extras={lambda_directory}", "--without-hashes"] 22 | ) 23 | # Remove botocore as this is already installed in the lambda layer 24 | export_extras = sub(b"botocore==.*\n", b"", export_extras) 25 | 26 | return export_extras.decode("utf-8").splitlines() 27 | 28 | 29 | def pip_install_requirements(lambda_directory: str, export_extras: List[str]) -> None: 30 | # Documentation recommend against calling pip internal api; rather, via command line 31 | # https://pip.pypa.io/en/latest/user_guide/#using-pip-from-your-program 32 | 33 | check_call( 34 | [ 35 | executable, 36 | "-m", 37 | "pip", 38 | "install", 39 | "--no-deps", 40 | "--quiet", 41 | f"--cache-dir={LambdaPackaging.directory}/cache", 42 | f"--target={LambdaPackaging.directory}/{lambda_directory}", 43 | *export_extras, 44 | ] 45 | ) 46 | 47 | 48 | @dataclass 49 | class LambdaPackaging: 50 | directory = tempfile.mkdtemp(dir=BACKEND_DIRECTORY, prefix=".lambda_out_") 51 | 52 | 53 | def bundled_code(lambda_directory: str) -> aws_lambda.Code: 54 | export_extras = poetry_export_extras(lambda_directory) 55 | pip_install_requirements(lambda_directory, export_extras) 56 | bundling_options = BundlingOptions( 57 | image=PYTHON_RUNTIME.bundling_image, # pylint:disable=no-member 58 | command=[ 59 | "bash", 60 | "-c", 61 | f"""mkdir --parents /asset-output/geostore/{lambda_directory} && \ 62 | cp --archive --update {LambdaPackaging.directory}/{lambda_directory}/* /asset-output/ && \ 63 | cp --archive --update /asset-input/geostore/*.py /asset-output/geostore/ && \ 64 | cp --archive --update /asset-input/geostore/{lambda_directory} /asset-output/geostore/""", # pylint: disable=line-too-long 65 | ], 66 | ) 67 | return aws_lambda.Code.from_asset(path=".", bundling=bundling_options) 68 | -------------------------------------------------------------------------------- /geostore/datasets/get.py: -------------------------------------------------------------------------------- 1 | """Get datasets functions.""" 2 | from http import HTTPStatus 3 | 4 | from jsonschema import ValidationError, validate 5 | from pynamodb.exceptions import DoesNotExist 6 | 7 | from ..api_responses import error_response, success_response 8 | from ..datasets_model import datasets_model_with_meta 9 | from ..models import DATASET_ID_PREFIX 10 | from ..step_function_keys import DATASET_ID_SHORT_KEY, DATASET_TITLE_KEY 11 | from ..types import JsonObject 12 | from .list import list_datasets 13 | 14 | 15 | def handle_get(body: JsonObject) -> JsonObject: 16 | if DATASET_ID_SHORT_KEY in body: 17 | return get_dataset_single(body) 18 | 19 | if DATASET_TITLE_KEY in body: 20 | return get_dataset_filter(body) 21 | 22 | if body == {}: 23 | return list_datasets() 24 | 25 | return error_response(HTTPStatus.BAD_REQUEST, "Unhandled request") 26 | 27 | 28 | def get_dataset_single(body: JsonObject) -> JsonObject: 29 | """GET: Get single Dataset.""" 30 | 31 | body_schema = { 32 | "type": "object", 33 | "properties": {DATASET_ID_SHORT_KEY: {"type": "string"}}, 34 | "required": [DATASET_ID_SHORT_KEY], 35 | } 36 | 37 | # request body validation 38 | try: 39 | validate(body, body_schema) 40 | except ValidationError as err: 41 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 42 | 43 | datasets_model_class = datasets_model_with_meta() 44 | 45 | # get dataset 46 | try: 47 | dataset = 
datasets_model_class.get( 48 | hash_key=f"{DATASET_ID_PREFIX}{body[DATASET_ID_SHORT_KEY]}", consistent_read=True 49 | ) 50 | except DoesNotExist: 51 | return error_response( 52 | HTTPStatus.NOT_FOUND, f"dataset '{body[DATASET_ID_SHORT_KEY]}' does not exist" 53 | ) 54 | 55 | # return response 56 | resp_body = dataset.as_dict() 57 | 58 | return success_response(HTTPStatus.OK, resp_body) 59 | 60 | 61 | def get_dataset_filter(body: JsonObject) -> JsonObject: 62 | """GET: Get Datasets by filter.""" 63 | 64 | body_schema = { 65 | "type": "object", 66 | "properties": {DATASET_TITLE_KEY: {"type": "string"}}, 67 | "required": [DATASET_TITLE_KEY], 68 | } 69 | 70 | # request body validation 71 | try: 72 | validate(body, body_schema) 73 | except ValidationError as err: 74 | return error_response(HTTPStatus.BAD_REQUEST, err.message) 75 | 76 | # dataset query by filter 77 | datasets_model_class = datasets_model_with_meta() 78 | datasets = datasets_model_class.datasets_title_idx.query(hash_key=body[DATASET_TITLE_KEY]) 79 | 80 | # return response 81 | resp_body = [] 82 | for dataset in datasets: 83 | resp_item = dataset.as_dict() 84 | resp_body.append(resp_item) 85 | 86 | return success_response(HTTPStatus.OK, resp_body) 87 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enabler_story.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Enabler story 3 | about: Suggest an idea to enable the team to deliver a better product 4 | labels: enabler story 5 | --- 6 | 7 | ### Enabler 8 | 9 | 10 | 11 | So that [some reason], we want to [do something] 12 | 13 | #### Acceptance Criteria 14 | 15 | 16 | 17 | - [ ] ... 18 | - [ ] ... 19 | 20 | #### Additional context 21 | 22 | 23 | 24 | #### Tasks 25 | 26 | 27 | 28 | - [ ] ... 29 | - [ ] ... 30 | 31 | #### Definition of Ready 32 | 33 | - [ ] This story is **ready** to work on 34 | - [ ] Negotiable (team can decide how to design and implement) 35 | - [ ] Valuable (from a user perspective) 36 | - [ ] Estimate value applied (agreed by team) 37 | - [ ] Small (so as to fit within an iteration) 38 | - [ ] Testable (in principle, even if there isn't a test for it yet) 39 | - [ ] Environments are ready to meet definition of done 40 | - [ ] Resources required to implement will be ready 41 | - [ ] Everyone understands and agrees with the tasks to complete the story 42 | - [ ] Release value (e.g. Iteration 3) applied 43 | - [ ] Sprint value (e.g. Aug 1 - Aug 15) applied 44 | 45 | #### Definition of Done 46 | 47 | - [ ] This story is **done**: 48 | - [ ] Acceptance criteria completed 49 | - [ ] Automated tests are passing 50 | - [ ] Code is peer reviewed and pushed to master 51 | - [ ] Deployed successfully to test environment 52 | - [ ] Checked against 53 | [CODING guidelines](https://github.com/linz/geostore/blob/master/CODING.md) 54 | - [ ] Relevant new tasks are added to backlog and communicated to the team 55 | - [ ] Important decisions recorded in the issue ticket 56 | - [ ] Readme/Changelog/Diagrams are updated 57 | - [ ] Product Owner has approved acceptance criteria as complete 58 | - [ ] Meets non-functional requirements: 59 | - [ ] Scalability (data): Can scale to 300TB of data and 100,000,000 files and ability to 60 | increase 10% every year 61 | - [ ] Scability (users): Can scale to 100 concurrent users 62 | - [ ] Cost: Data can be stored at < 0.5 NZD per GB per year 63 | - [ ] Performance: A large dataset (500 GB and 50,000 files - e.g. 
Akl aerial imagery) can be 64 | validated, imported and stored within 24 hours 65 | - [ ] Accessibility: Can be used from LINZ networks and the public internet 66 | - [ ] Availability: System available 24 hours a day and 7 days a week, this does not include 67 | maintenance windows < 4 hours and does not include operational support 68 | - [ ] Recoverability: RPO of fully imported datasets < 4 hours, RTO of a single 3 TB dataset 69 | < 12 hours 70 | 71 | 72 | -------------------------------------------------------------------------------- /infrastructure/constructs/lambda_layers/botocore/poetry.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Poetry and should not be changed by hand. 2 | 3 | [[package]] 4 | name = "botocore" 5 | version = "1.29.91" 6 | description = "Low-level, data-driven core of boto 3." 7 | category = "main" 8 | optional = false 9 | python-versions = ">= 3.7" 10 | files = [ 11 | {file = "botocore-1.29.91-py3-none-any.whl", hash = "sha256:4ed6a488aee1b42367eace71f7d0993dda05b02eebd7dcdd78db5c9ce3d80da5"}, 12 | {file = "botocore-1.29.91.tar.gz", hash = "sha256:a8a800a2a945da807758cace539fc5b5ec1d5082ce363799d3a3870c2c4ed6fc"}, 13 | ] 14 | 15 | [package.dependencies] 16 | jmespath = ">=0.7.1,<2.0.0" 17 | python-dateutil = ">=2.1,<3.0.0" 18 | urllib3 = ">=1.25.4,<1.27" 19 | 20 | [package.extras] 21 | crt = ["awscrt (==0.16.9)"] 22 | 23 | [[package]] 24 | name = "jmespath" 25 | version = "0.10.0" 26 | description = "JSON Matching Expressions" 27 | category = "main" 28 | optional = false 29 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 30 | files = [ 31 | {file = "jmespath-0.10.0-py2.py3-none-any.whl", hash = "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"}, 32 | {file = "jmespath-0.10.0.tar.gz", hash = "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9"}, 33 | ] 34 | 35 | [[package]] 36 | name = "python-dateutil" 37 | version = "2.8.1" 38 | description = "Extensions to the standard Python datetime module" 39 | category = "main" 40 | optional = false 41 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" 42 | files = [ 43 | {file = "python-dateutil-2.8.1.tar.gz", hash = "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c"}, 44 | {file = "python_dateutil-2.8.1-py2.py3-none-any.whl", hash = "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"}, 45 | ] 46 | 47 | [package.dependencies] 48 | six = ">=1.5" 49 | 50 | [[package]] 51 | name = "six" 52 | version = "1.15.0" 53 | description = "Python 2 and 3 compatibility utilities" 54 | category = "main" 55 | optional = false 56 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 57 | files = [ 58 | {file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"}, 59 | {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"}, 60 | ] 61 | 62 | [[package]] 63 | name = "urllib3" 64 | version = "1.26.5" 65 | description = "HTTP library with thread-safe connection pooling, file post, and more." 
66 | category = "main" 67 | optional = false 68 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" 69 | files = [ 70 | {file = "urllib3-1.26.5-py2.py3-none-any.whl", hash = "sha256:753a0374df26658f99d826cfe40394a686d05985786d946fbe4165b5148f5a7c"}, 71 | {file = "urllib3-1.26.5.tar.gz", hash = "sha256:a7acd0977125325f516bda9735fa7142b909a8d01e8b2e4c8108d0984e6e0098"}, 72 | ] 73 | 74 | [package.extras] 75 | brotli = ["brotlipy (>=0.6.0)"] 76 | secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)"] 77 | socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] 78 | 79 | [metadata] 80 | lock-version = "2.0" 81 | python-versions = "^3.9,<3.10" 82 | content-hash = "92176b3eafd22453239cf3e9e7fcefb9e0a3c0af572a1d46403aee00f8e7d931" 83 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/user_story.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: User story 3 | about: Suggest an idea to give the user a more valuable product 4 | labels: user story 5 | --- 6 | 7 | ### User Story 8 | 9 | 10 | 11 | So that [some reason], as a [role], I want to [do something]. 12 | 13 | 14 | 15 | #### Acceptance Criteria 16 | 17 | 18 | 19 | - [ ] Given [a pre-condition], when [an event happens], then [an expected outcome occurs] 20 | - [ ] Given [a pre-condition], when [an event happens], then [an expected outcome occurs] 21 | 22 | #### Additional context 23 | 24 | 25 | 26 | #### Tasks 27 | 28 | 29 | 30 | - [ ] ... 31 | - [ ] ... 32 | 33 | #### Definition of Ready 34 | 35 | - [ ] This story is **ready** to work on 36 | - [ ] Independent (story is independent of all other tasks) 37 | - [ ] Negotiable (team can decide how to design and implement) 38 | - [ ] Valuable (from a user perspective) 39 | - [ ] Estimate value applied (agreed by team) 40 | - [ ] Small (so as to fit within an iteration) 41 | - [ ] Testable (in principle, even if there isn't a test for it yet) 42 | - [ ] Environments are ready to meet definition of done 43 | - [ ] Resources required to implement will be ready 44 | - [ ] Everyone understands and agrees with the tasks to complete the story 45 | - [ ] Release value (e.g. Iteration 3) applied 46 | - [ ] Sprint value (e.g. Aug 1 - Aug 15) applied 47 | 48 | #### Definition of Done 49 | 50 | - [ ] This story is **done**: 51 | - [ ] Acceptance criteria completed 52 | - [ ] Automated tests are passing 53 | - [ ] Code is peer reviewed and pushed to master 54 | - [ ] Deployed successfully to test environment 55 | - [ ] Checked against 56 | [CODING guidelines](https://github.com/linz/geostore/blob/master/CODING.md) 57 | - [ ] Relevant new tasks are added to backlog and communicated to the team 58 | - [ ] Important decisions recorded in the issue ticket 59 | - [ ] Readme/Changelog/Diagrams are updated 60 | - [ ] Product Owner has approved acceptance criteria as complete 61 | - [ ] Meets non-functional requirements: 62 | - [ ] Scalability (data): Can scale to 300TB of data and 100,000,000 files and ability to 63 | increase 10% every year 64 | - [ ] Scability (users): Can scale to 100 concurrent users 65 | - [ ] Cost: Data can be stored at < 0.5 NZD per GB per year 66 | - [ ] Performance: A large dataset (500 GB and 50,000 files - e.g. 
Akl aerial imagery) can be 67 | validated, imported and stored within 24 hours 68 | - [ ] Accessibility: Can be used from LINZ networks and the public internet 69 | - [ ] Availability: System available 24 hours a day and 7 days a week, this does not include 70 | maintenance windows < 4 hours and does not include operational support 71 | - [ ] Recoverability: RPO of fully imported datasets < 4 hours, RTO of a single 3 TB dataset 72 | < 12 hours 73 | -------------------------------------------------------------------------------- /geostore/content_iterator/task.py: -------------------------------------------------------------------------------- 1 | from jsonschema import validate 2 | 3 | from ..models import DATASET_ID_PREFIX, DB_KEY_SEPARATOR, VERSION_ID_PREFIX 4 | from ..parameter_store import ParameterName, get_param 5 | from ..processing_assets_model import ProcessingAssetType, processing_assets_model_with_meta 6 | from ..step_function_keys import DATASET_ID_KEY, METADATA_URL_KEY, NEW_VERSION_ID_KEY 7 | from ..types import JsonObject 8 | 9 | MAX_ITERATION_SIZE = 10_000 10 | 11 | ASSETS_TABLE_NAME_KEY = "assets_table_name" 12 | CONTENT_KEY = "content" 13 | FIRST_ITEM_KEY = "first_item" 14 | ITERATION_SIZE_KEY = "iteration_size" 15 | NEXT_ITEM_KEY = "next_item" 16 | RESULTS_TABLE_NAME_KEY = "results_table_name" 17 | 18 | EVENT_SCHEMA = { 19 | "type": "object", 20 | "properties": { 21 | CONTENT_KEY: { 22 | "type": "object", 23 | "properties": { 24 | FIRST_ITEM_KEY: {"type": "string", "pattern": r"^\d+$"}, 25 | ITERATION_SIZE_KEY: { 26 | "type": "integer", 27 | "minimum": 1, 28 | "maximum": MAX_ITERATION_SIZE, 29 | }, 30 | NEXT_ITEM_KEY: { 31 | "type": "integer", 32 | "minimum": MAX_ITERATION_SIZE, 33 | "multipleOf": MAX_ITERATION_SIZE, 34 | }, 35 | }, 36 | "required": [FIRST_ITEM_KEY, ITERATION_SIZE_KEY, NEXT_ITEM_KEY], 37 | "additionalProperties": False, 38 | }, 39 | DATASET_ID_KEY: {"type": "string"}, 40 | METADATA_URL_KEY: {"type": "string"}, 41 | NEW_VERSION_ID_KEY: {"type": "string"}, 42 | }, 43 | "required": [DATASET_ID_KEY, METADATA_URL_KEY, NEW_VERSION_ID_KEY], 44 | "additionalProperties": True, 45 | } 46 | 47 | 48 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 49 | validate(event, EVENT_SCHEMA) 50 | 51 | if CONTENT_KEY in event.keys(): 52 | assert int(event[CONTENT_KEY][FIRST_ITEM_KEY]) % MAX_ITERATION_SIZE == 0 53 | first_item_index = event[CONTENT_KEY][NEXT_ITEM_KEY] 54 | else: 55 | first_item_index = 0 56 | 57 | dataset_id = event[DATASET_ID_KEY] 58 | version_id = event[NEW_VERSION_ID_KEY] 59 | 60 | processing_assets_model = processing_assets_model_with_meta() 61 | 62 | asset_count = processing_assets_model.count( 63 | hash_key=( 64 | f"{DATASET_ID_PREFIX}{dataset_id}{DB_KEY_SEPARATOR}{VERSION_ID_PREFIX}{version_id}" 65 | ), 66 | range_key_condition=processing_assets_model.sk.startswith( 67 | f"{ProcessingAssetType.DATA.value}{DB_KEY_SEPARATOR}" 68 | ), 69 | ) 70 | 71 | remaining_assets = asset_count - first_item_index 72 | if remaining_assets > MAX_ITERATION_SIZE: 73 | next_item_index = first_item_index + MAX_ITERATION_SIZE 74 | iteration_size = MAX_ITERATION_SIZE 75 | else: 76 | next_item_index = -1 77 | iteration_size = remaining_assets 78 | 79 | return { 80 | FIRST_ITEM_KEY: str(first_item_index), 81 | ITERATION_SIZE_KEY: iteration_size, 82 | NEXT_ITEM_KEY: next_item_index, 83 | ASSETS_TABLE_NAME_KEY: get_param(ParameterName.PROCESSING_ASSETS_TABLE_NAME), 84 | RESULTS_TABLE_NAME_KEY: get_param(ParameterName.STORAGE_VALIDATION_RESULTS_TABLE_NAME), 85 
| } 86 | -------------------------------------------------------------------------------- /tests/test_check_files_checksums_logging.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from os import environ 3 | from unittest.mock import patch 4 | 5 | from pynamodb.exceptions import DoesNotExist 6 | from pytest import mark, raises 7 | from pytest_subtests import SubTests 8 | 9 | from geostore.api_keys import MESSAGE_KEY 10 | from geostore.check_files_checksums.task import ( 11 | ASSETS_TABLE_NAME_ARGUMENT, 12 | CURRENT_VERSION_ID_ARGUMENT, 13 | DATASET_ID_ARGUMENT, 14 | DATASET_TITLE_ARGUMENT, 15 | FIRST_ITEM_ARGUMENT, 16 | NEW_VERSION_ID_ARGUMENT, 17 | RESULTS_TABLE_NAME_ARGUMENT, 18 | S3_ROLE_ARN_ARGUMENT, 19 | main, 20 | ) 21 | from geostore.check_files_checksums.utils import ARRAY_INDEX_VARIABLE_NAME 22 | from geostore.error_response_keys import ERROR_KEY 23 | from geostore.logging_keys import GIT_COMMIT, LOG_MESSAGE_VALIDATION_COMPLETE 24 | from geostore.models import DATASET_ID_PREFIX, DB_KEY_SEPARATOR, VERSION_ID_PREFIX 25 | from geostore.parameter_store import ParameterName, get_param 26 | from geostore.processing_assets_model import ProcessingAssetType, ProcessingAssetsModelBase 27 | from geostore.step_function import Outcome 28 | from geostore.step_function_keys import CURRENT_VERSION_EMPTY_VALUE 29 | 30 | from .aws_utils import get_s3_role_arn 31 | from .general_generators import any_program_name 32 | from .stac_generators import any_dataset_id, any_dataset_title, any_dataset_version_id 33 | 34 | 35 | @mark.infrastructure 36 | def should_log_missing_item(subtests: SubTests) -> None: 37 | # Given 38 | dataset_id = any_dataset_id() 39 | version_id = any_dataset_version_id() 40 | index = 0 41 | expected_log = { 42 | ERROR_KEY: {MESSAGE_KEY: ProcessingAssetsModelBase.DoesNotExist.msg}, 43 | "parameters": { 44 | "hash_key": ( 45 | f"{DATASET_ID_PREFIX}{dataset_id}" 46 | f"{DB_KEY_SEPARATOR}{VERSION_ID_PREFIX}{version_id}" 47 | ), 48 | "range_key": f"{ProcessingAssetType.DATA.value}{DB_KEY_SEPARATOR}{index}", 49 | }, 50 | } 51 | 52 | sys.argv = [ 53 | any_program_name(), 54 | f"{DATASET_ID_ARGUMENT}={dataset_id}", 55 | f"{NEW_VERSION_ID_ARGUMENT}={version_id}", 56 | f"{CURRENT_VERSION_ID_ARGUMENT}={CURRENT_VERSION_EMPTY_VALUE}", 57 | f"{DATASET_TITLE_ARGUMENT}={any_dataset_title()}", 58 | f"{FIRST_ITEM_ARGUMENT}={index}", 59 | f"{ASSETS_TABLE_NAME_ARGUMENT}={get_param(ParameterName.PROCESSING_ASSETS_TABLE_NAME)}", 60 | ( 61 | f"{RESULTS_TABLE_NAME_ARGUMENT}" 62 | f"={get_param(ParameterName.STORAGE_VALIDATION_RESULTS_TABLE_NAME)}" 63 | ), 64 | f"{S3_ROLE_ARN_ARGUMENT}={get_s3_role_arn()}", 65 | ] 66 | 67 | # When/Then 68 | with patch("geostore.check_files_checksums.task.LOGGER.error") as logger_mock, patch.dict( 69 | environ, {ARRAY_INDEX_VARIABLE_NAME: "0"} 70 | ): 71 | with subtests.test(msg="Return code"), raises(DoesNotExist): 72 | main() 73 | 74 | with subtests.test(msg="Log message"): 75 | logger_mock.assert_any_call( 76 | LOG_MESSAGE_VALIDATION_COMPLETE, 77 | extra={ 78 | "outcome": Outcome.FAILED, 79 | "error": expected_log, 80 | GIT_COMMIT: get_param(ParameterName.GIT_COMMIT), 81 | }, 82 | ) 83 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Configuration file for pre-commit (https://pre-commit.com/). 2 | # Please run `pre-commit run --all-files` when adding or changing entries. 
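# A single hook can also be run against specific files while iterating locally, for example:
#   pre-commit run black --files geostore/models.py tests/conftest.py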
3 | 4 | repos: 5 | - repo: local 6 | hooks: 7 | - id: black 8 | name: black 9 | entry: black 10 | language: system 11 | stages: [commit] 12 | types: [python] 13 | 14 | - id: gitlint 15 | name: gitlint 16 | entry: gitlint 17 | args: [--msg-filename] 18 | language: system 19 | stages: [commit-msg] 20 | 21 | - id: hadolint 22 | name: hadolint 23 | language: docker_image 24 | entry: hadolint/hadolint:v2.2.0 hadolint 25 | stages: [commit] 26 | types: [dockerfile] 27 | 28 | - id: isort 29 | name: isort 30 | entry: isort 31 | language: system 32 | stages: [commit] 33 | types: [python] 34 | 35 | - id: mypy 36 | name: mypy 37 | entry: mypy 38 | language: system 39 | stages: [commit] 40 | types_or: [python, pyi] 41 | require_serial: true 42 | 43 | - id: pathchk 44 | name: pathchk 45 | entry: pathchk 46 | args: [--portability] 47 | exclude: ^([a-zA-Z0-9._][a-zA-Z0-9._-]+)(/[a-zA-Z0-9._][a-zA-Z0-9._-]+)*$ # https://lists.gnu.org/archive/html/coreutils/2023-01/msg00006.html 48 | language: system 49 | stages: [commit] 50 | 51 | - id: pretty-format-ini 52 | name: Pretty format INI 53 | entry: pretty-format-ini 54 | args: [--autofix] 55 | language: system 56 | stages: [commit] 57 | types: [ini] 58 | 59 | - id: pretty-format-toml 60 | name: Pretty format TOML 61 | entry: pretty-format-toml 62 | args: [--autofix] 63 | language: system 64 | stages: [commit] 65 | types: [toml] 66 | exclude: ^.*\.lock 67 | 68 | - id: pylint 69 | name: pylint 70 | entry: pylint 71 | language: system 72 | stages: [commit] 73 | types: [python] 74 | 75 | - repo: https://github.com/koalaman/shellcheck-precommit 76 | rev: 3f77b826548d8dc2d26675f077361c92773b50a7 # frozen: v0.9.0 77 | hooks: 78 | - id: shellcheck 79 | stages: [commit] 80 | # TODO: Kill if https://github.com/pre-commit/identify/issues/350 is fixed 81 | - id: shellcheck 82 | files: ^\.envrc$ 83 | types: [] 84 | stages: [commit] 85 | 86 | - repo: https://github.com/scop/pre-commit-shfmt 87 | rev: f21b778d68a3930f77d7424821022e81e3ae17d7 # frozen: v3.6.0-1 88 | hooks: 89 | - id: shfmt 90 | stages: [commit] 91 | # TODO: Kill if https://github.com/pre-commit/identify/issues/350 is fixed 92 | - id: shfmt 93 | files: ^\.envrc$ 94 | types: [] 95 | stages: [commit] 96 | 97 | - repo: https://github.com/nix-community/nixpkgs-fmt 98 | rev: 6740ea881d3ac5942d4fbf124f5956b896666c76 # frozen: v1.3.0 99 | hooks: 100 | - id: nixpkgs-fmt 101 | stages: [commit] 102 | 103 | - repo: https://github.com/pre-commit/mirrors-prettier 104 | rev: cafd5506f18eea191804850dacc0a4264772d59d # frozen: v3.0.0-alpha.4 105 | hooks: 106 | - id: prettier 107 | stages: [commit] 108 | -------------------------------------------------------------------------------- /.github/workflows/mutation-test.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | 4 | jobs: 5 | mutation-test: 6 | runs-on: ubuntu-22.04 7 | permissions: 8 | id-token: write 9 | contents: read 10 | steps: 11 | - name: Check out repository 12 | uses: actions/checkout@v3.4.0 13 | with: 14 | submodules: true 15 | 16 | - name: Get Node.js version 17 | run: echo "NODE_VERSION=$(cat .nvmrc)" >> "$GITHUB_ENV" 18 | 19 | - name: Get Python version 20 | run: echo "PYTHON_VERSION=$(cat .python-version)" >> "$GITHUB_ENV" 21 | 22 | - name: Use Node.js ${{ env.NODE_VERSION }} 23 | uses: actions/setup-node@v3.6.0 24 | with: 25 | node-version: ${{ env.NODE_VERSION }} 26 | registry-url: https://registry.npmjs.org 27 | 28 | - name: Cache Node.js packages 29 | uses: actions/cache@v3.3.1 30 | 
with: 31 | path: ~/.npm 32 | key: 33 | ${{ runner.os }}-node-${{ secrets.CACHE_SEED }}-${{ hashFiles('**/package-lock.json') }} 34 | restore-keys: ${{ runner.os }}-node-${{ secrets.CACHE_SEED }}- 35 | 36 | - name: Install Node.js dependencies 37 | run: npm ci --production 38 | 39 | - name: Add local Node packages to PATH 40 | run: echo "./node_modules/.bin:$PATH" >> $GITHUB_PATH 41 | 42 | - name: Use Python ${{ env.PYTHON_VERSION }} 43 | uses: actions/setup-python@v4.5.0 44 | with: 45 | python-version: ${{ env.PYTHON_VERSION }} 46 | 47 | - name: Cache pip 48 | uses: actions/cache@v3.3.1 49 | with: 50 | path: ~/.cache/pip 51 | key: 52 | ${{ runner.os }}-pip-${{ secrets.CACHE_SEED }}-${{ env.PYTHON_VERSION }}-${{ 53 | hashFiles('./poetry.lock') }} 54 | restore-keys: ${{ runner.os }}-pip-${{ secrets.CACHE_SEED }}-${{ env.PYTHON_VERSION }}- 55 | 56 | - name: Upgrade pip 57 | run: python -m pip install --requirement=geostore/pip.txt 58 | 59 | - name: Install Poetry 60 | run: python -m pip install --requirement=geostore/poetry.txt 61 | 62 | - name: Install Python dependencies 63 | run: python -m poetry install --all-extras --no-root 64 | 65 | - name: Get Oidc deploy role arn 66 | run: cat .github/workflows/.env >> $GITHUB_ENV 67 | 68 | - name: Configure AWS credentials 69 | uses: aws-actions/configure-aws-credentials@v2.0.0 70 | with: 71 | aws-region: ap-southeast-2 72 | mask-aws-account-id: true 73 | role-to-assume: ${{ env.CiOidc }} 74 | 75 | - name: Set unique deployment environment type variable 76 | run: echo "GEOSTORE_ENV_NAME=ci${GITHUB_RUN_ID}" | tee -a $GITHUB_ENV 77 | 78 | - name: Deploy AWS stacks for testing 79 | run: 80 | poetry run cdk deploy --all --require-approval never --strict --change-set-name 81 | "ci-${GITHUB_RUN_ID}" 82 | 83 | - run: poetry run mutmut run 84 | - run: poetry run mutmut junitxml > mutmut.xml 85 | if: failure() 86 | - uses: actions/upload-artifact@v3.1.2 87 | with: 88 | name: mutation-test-report 89 | path: mutmut.xml 90 | if: failure() 91 | - uses: mikepenz/action-junit-report@v3.7.5 92 | if: failure() 93 | with: 94 | report_paths: mutmut.xml 95 | 96 | - name: Destroy AWS stacks used for testing 97 | run: poetry run cdk destroy --force --all 98 | if: always() # clean-up AWS stack after failure 99 | -------------------------------------------------------------------------------- /tests/general_generators.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | from http import HTTPStatus 3 | from os import urandom 4 | from random import choice, randrange 5 | from string import ascii_letters, ascii_uppercase, digits, printable 6 | from typing import Type 7 | from uuid import uuid4 8 | 9 | from mypy_boto3_lambda.type_defs import ResponseMetadataTypeDef 10 | 11 | REFERENCE_DATETIME = datetime(2000, 1, 1, tzinfo=timezone.utc) 12 | 13 | 14 | # General-purpose generators 15 | 16 | 17 | def random_string(length: int) -> str: 18 | """ 19 | Includes ASCII printable characters and the first printable character from several Unicode 20 | blocks . 
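    For example, random_string(3) could return something like "a¡7"; the output is random, so
    this is purely illustrative.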
21 | """ 22 | return _random_string_choices(f"{printable}¡ĀƀḂəʰͰἀЀ–⁰₠℀⅐←∀⌀①─▀■☀🬀✁ㄅff", length) 23 | 24 | 25 | def random_ascii_letter_string(length: int) -> str: 26 | return _random_string_choices(ascii_letters, length) 27 | 28 | 29 | def _random_string_choices(characters: str, length: int) -> str: 30 | return "".join(choice(characters) for _ in range(length)) 31 | 32 | 33 | def any_past_datetime() -> datetime: 34 | return REFERENCE_DATETIME - timedelta(seconds=randrange(30_000_000_000)) # Back to year 1049 35 | 36 | 37 | def any_past_datetime_string() -> str: 38 | return any_past_datetime().isoformat() 39 | 40 | 41 | def any_past_utc_datetime_string() -> str: 42 | return any_past_datetime().strftime("%Y-%m-%dT%H:%M:%SZ") 43 | 44 | 45 | def any_program_name() -> str: 46 | """Arbitrary-length string""" 47 | return random_string(20) 48 | 49 | 50 | def any_safe_file_path() -> str: 51 | paths = [any_safe_filename() for _ in range(randrange(1, 5))] 52 | return "/".join(paths) 53 | 54 | 55 | def any_safe_filename() -> str: 56 | return _random_string_choices(f"{digits}{ascii_letters}", 20) 57 | 58 | 59 | def any_host() -> str: 60 | return random_ascii_letter_string(20) 61 | 62 | 63 | def any_https_url() -> str: 64 | host = any_host() 65 | path = any_safe_file_path() 66 | return f"https://{host}/{path}" 67 | 68 | 69 | def any_file_contents(byte_count: int = 10) -> bytes: 70 | return urandom(byte_count) 71 | 72 | 73 | def any_request_id() -> str: 74 | """Arbitrary-length string""" 75 | return uuid4().hex 76 | 77 | 78 | def any_http_status_code() -> int: 79 | return choice(list(HTTPStatus)) 80 | 81 | 82 | def any_retry_attempts() -> int: 83 | """Arbitrary-length integer""" 84 | return randrange(10) 85 | 86 | 87 | def any_response_metadata() -> ResponseMetadataTypeDef: 88 | return { 89 | "RequestId": any_request_id(), 90 | "HostId": any_host(), 91 | "HTTPStatusCode": any_http_status_code(), 92 | "HTTPHeaders": {}, 93 | "RetryAttempts": any_retry_attempts(), 94 | } 95 | 96 | 97 | def any_error_message() -> str: 98 | """Arbitrary-length string""" 99 | return random_string(50) 100 | 101 | 102 | def any_class_name() -> str: 103 | return f"{choice(ascii_uppercase)}{random_ascii_letter_string(10)}Error" 104 | 105 | 106 | def any_exception_class() -> Type[Exception]: 107 | exception_class = type(any_class_name(), (Exception,), {}) 108 | return exception_class 109 | 110 | 111 | def any_dictionary_key() -> str: 112 | """Arbitrary-length string""" 113 | return random_string(20) 114 | 115 | 116 | def any_etag() -> str: 117 | """Arbitrary-length string""" 118 | return random_string(10) 119 | 120 | 121 | def any_name() -> str: 122 | return random_string(10) 123 | 124 | 125 | def any_description() -> str: 126 | return random_string(20) 127 | -------------------------------------------------------------------------------- /geostore/datasets_model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from os import environ 3 | from typing import Any, Dict, Optional, Tuple, Type 4 | 5 | from pynamodb.attributes import UTCDateTimeAttribute, UnicodeAttribute 6 | from pynamodb.indexes import AllProjection, GlobalSecondaryIndex 7 | from pynamodb.models import MetaModel, Model 8 | from ulid import ULID 9 | from ulid.base32 import encode_randomness 10 | from ulid.constants import TIMESTAMP_LEN 11 | 12 | from .aws_keys import AWS_DEFAULT_REGION_KEY 13 | from .clock import now 14 | from .models import DATASET_ID_PREFIX, DB_KEY_SEPARATOR 15 | from 
.parameter_store import ParameterName, get_param 16 | 17 | 18 | def human_readable_ulid(ulid: ULID) -> str: 19 | """ 20 | Formats the timestamp part of the ULID as a human readable datetime. Uses "T" as the date/time 21 | separator as per RFC3339, hyphen as the datetime field separator to ensure broad filesystem 22 | compatibility, and underscore as the datetime/randomness separator. 23 | 24 | ULIDs have millisecond timestamps, but strftime can only format microseconds, so we need to chop 25 | off the last three characters. 26 | """ 27 | datetime_string = ulid.datetime.strftime("%Y-%m-%dT%H-%M-%S-%f")[:-3] 28 | return f"{datetime_string}Z_{encode_randomness(ulid.bytes[TIMESTAMP_LEN :])}" 29 | 30 | 31 | class DatasetsTitleIdx(GlobalSecondaryIndex["DatasetsModelBase"]): # type: ignore[no-untyped-call] 32 | """Dataset title global index.""" 33 | 34 | @dataclass 35 | class Meta: 36 | """Meta class.""" 37 | 38 | index_name = "datasets_title" 39 | read_capacity_units = 1 40 | write_capacity_units = 1 41 | projection = AllProjection() 42 | 43 | title = UnicodeAttribute(hash_key=True) 44 | 45 | 46 | class DatasetsModelBase(Model): 47 | """Dataset model.""" 48 | 49 | id = UnicodeAttribute( 50 | hash_key=True, 51 | attr_name="pk", 52 | default_for_new=lambda: f"{DATASET_ID_PREFIX}{ULID()}", 53 | ) 54 | title = UnicodeAttribute() 55 | created_at = UTCDateTimeAttribute(default_for_new=now) 56 | updated_at = UTCDateTimeAttribute(default=now) 57 | current_dataset_version = UnicodeAttribute(null=True) 58 | 59 | datasets_title_idx: DatasetsTitleIdx 60 | 61 | def as_dict(self) -> Dict[str, Any]: 62 | serialized = self.serialize() 63 | result: Dict[str, Any] = {key: value["S"] for key, value in serialized.items()} 64 | result["id"] = self.dataset_id 65 | return result 66 | 67 | @property 68 | def dataset_id(self) -> str: 69 | """Dataset ID value.""" 70 | return str(self.id).split(DB_KEY_SEPARATOR)[1] 71 | 72 | 73 | class DatasetsModelMeta(MetaModel): 74 | def __new__( 75 | cls, 76 | name: str, 77 | bases: Tuple[Type[object], ...], 78 | namespace: Dict[str, Any], 79 | discriminator: Optional[Any] = None, 80 | ) -> "DatasetsModelMeta": 81 | namespace["Meta"] = type( 82 | "Meta", 83 | (), 84 | { 85 | "table_name": get_param(ParameterName.STORAGE_DATASETS_TABLE_NAME), 86 | "region": environ[AWS_DEFAULT_REGION_KEY], 87 | }, 88 | ) 89 | klass: "DatasetsModelMeta" = MetaModel.__new__( # type: ignore[no-untyped-call] 90 | cls, name, bases, namespace, discriminator=discriminator 91 | ) 92 | return klass 93 | 94 | 95 | def datasets_model_with_meta() -> Type[DatasetsModelBase]: 96 | class DatasetModel(DatasetsModelBase, metaclass=DatasetsModelMeta): 97 | datasets_title_idx = DatasetsTitleIdx() 98 | 99 | return DatasetModel 100 | -------------------------------------------------------------------------------- /tests/test_upload_status.py: -------------------------------------------------------------------------------- 1 | from typing import cast 2 | from unittest.mock import MagicMock, patch 3 | 4 | from jsonschema import ValidationError 5 | from pytest import raises 6 | 7 | from geostore.api_keys import SUCCESS_KEY 8 | from geostore.import_file_batch_job_id_keys import ASSET_JOB_ID_KEY, METADATA_JOB_ID_KEY 9 | from geostore.step_function import Outcome 10 | from geostore.step_function_keys import ( 11 | ASSET_UPLOAD_KEY, 12 | DATASET_ID_KEY, 13 | ERRORS_KEY, 14 | FAILED_TASKS_KEY, 15 | FAILURE_REASONS_KEY, 16 | IMPORT_DATASET_KEY, 17 | METADATA_UPLOAD_KEY, 18 | NEW_VERSION_ID_KEY, 19 | STATUS_KEY, 20 | VALIDATION_KEY, 
21 | ) 22 | from geostore.types import JsonObject 23 | from geostore.upload_status.task import lambda_handler 24 | 25 | from .aws_utils import any_account_id, any_batch_job_status, any_job_id, any_lambda_context 26 | from .stac_generators import any_dataset_id, any_dataset_version_id 27 | 28 | 29 | def should_raise_exception_when_missing_mandatory_execution_arn() -> None: 30 | with raises(ValidationError): 31 | lambda_handler({}, any_lambda_context()) 32 | 33 | 34 | @patch("geostore.step_function.get_step_function_validation_results") 35 | @patch("geostore.step_function.S3CONTROL_CLIENT.describe_job") 36 | @patch("geostore.step_function.get_account_number") 37 | def should_report_upload_statuses( 38 | get_account_number_mock: MagicMock, 39 | describe_job_mock: MagicMock, 40 | get_step_function_validation_results_mock: MagicMock, 41 | ) -> None: 42 | # Given 43 | account_id = any_account_id() 44 | get_account_number_mock.return_value = account_id 45 | asset_job_id = any_job_id() 46 | asset_job_status = any_batch_job_status() 47 | metadata_job_id = any_job_id() 48 | metadata_job_status = any_batch_job_status() 49 | 50 | get_step_function_validation_results_mock.return_value = [] 51 | 52 | def describe_job(AccountId: str, JobId: str) -> JsonObject: # pylint: disable=invalid-name 53 | assert AccountId == cast(str, account_id) 54 | return { 55 | asset_job_id: { 56 | "Job": { 57 | "Status": asset_job_status, 58 | "FailureReasons": [], 59 | "ProgressSummary": {"NumberOfTasksFailed": 0}, 60 | } 61 | }, 62 | metadata_job_id: { 63 | "Job": { 64 | "Status": metadata_job_status, 65 | "FailureReasons": [], 66 | "ProgressSummary": {"NumberOfTasksFailed": 0}, 67 | } 68 | }, 69 | }[JobId] 70 | 71 | describe_job_mock.side_effect = describe_job 72 | 73 | expected_response = { 74 | VALIDATION_KEY: {STATUS_KEY: Outcome.PASSED.value, ERRORS_KEY: []}, 75 | ASSET_UPLOAD_KEY: { 76 | STATUS_KEY: asset_job_status, 77 | ERRORS_KEY: {FAILED_TASKS_KEY: 0, FAILURE_REASONS_KEY: []}, 78 | }, 79 | METADATA_UPLOAD_KEY: { 80 | STATUS_KEY: metadata_job_status, 81 | ERRORS_KEY: {FAILED_TASKS_KEY: 0, FAILURE_REASONS_KEY: []}, 82 | }, 83 | } 84 | 85 | # When 86 | response = lambda_handler( 87 | { 88 | DATASET_ID_KEY: any_dataset_id(), 89 | NEW_VERSION_ID_KEY: any_dataset_version_id(), 90 | VALIDATION_KEY: {SUCCESS_KEY: True}, 91 | IMPORT_DATASET_KEY: { 92 | METADATA_JOB_ID_KEY: metadata_job_id, 93 | ASSET_JOB_ID_KEY: asset_job_id, 94 | }, 95 | }, 96 | any_lambda_context(), 97 | ) 98 | 99 | # Then 100 | assert response == expected_response 101 | -------------------------------------------------------------------------------- /geostore/validation_results_model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | from os import environ 4 | from typing import Any, Dict, Optional, Tuple, Type 5 | 6 | from pynamodb.attributes import MapAttribute, UnicodeAttribute 7 | from pynamodb.indexes import AllProjection, GlobalSecondaryIndex 8 | from pynamodb.models import MetaModel, Model 9 | 10 | from .aws_keys import AWS_DEFAULT_REGION_KEY 11 | from .check import Check 12 | from .models import CHECK_ID_PREFIX, DB_KEY_SEPARATOR, URL_ID_PREFIX 13 | from .parameter_store import ParameterName, get_param 14 | from .types import JsonObject 15 | 16 | 17 | class ValidationResult(Enum): 18 | FAILED = "Failed" 19 | PASSED = "Passed" 20 | 21 | 22 | class ValidationOutcomeIdx( # type: ignore[no-untyped-call] 23 | 
GlobalSecondaryIndex["ValidationResultsModelBase"] 24 | ): 25 | @dataclass 26 | class Meta: 27 | index_name = "validation_outcome" 28 | read_capacity_units = 1 29 | write_capacity_units = 1 30 | projection = AllProjection() 31 | 32 | pk = UnicodeAttribute(hash_key=True, attr_name="pk") 33 | result = UnicodeAttribute(range_key=True, attr_name="result") 34 | 35 | 36 | class ValidationResultsModelBase(Model): 37 | pk = UnicodeAttribute(hash_key=True) 38 | sk = UnicodeAttribute(range_key=True) 39 | result = UnicodeAttribute() 40 | # TODO: Remove type-arg when PynamoDB issue #920 is fixed pylint:disable=fixme 41 | details: MapAttribute[str, Any] = MapAttribute(null=True) # type: ignore[no-untyped-call] 42 | 43 | validation_outcome_index: ValidationOutcomeIdx 44 | 45 | 46 | def validation_results_model_with_meta( 47 | *, results_table_name: Optional[str] = None 48 | ) -> Type[ValidationResultsModelBase]: 49 | if results_table_name is None: 50 | results_table_name = get_param(ParameterName.STORAGE_VALIDATION_RESULTS_TABLE_NAME) 51 | 52 | class ValidationResultsModelMeta(MetaModel): 53 | def __new__( 54 | cls, 55 | name: str, 56 | bases: Tuple[Type[object], ...], 57 | namespace: Dict[str, Any], 58 | discriminator: Optional[Any] = None, 59 | ) -> "ValidationResultsModelMeta": 60 | namespace["Meta"] = type( 61 | "Meta", 62 | (), 63 | { 64 | "table_name": results_table_name, 65 | "region": environ[AWS_DEFAULT_REGION_KEY], 66 | }, 67 | ) 68 | klass: "ValidationResultsModelMeta" 69 | klass = MetaModel.__new__( # type: ignore[no-untyped-call] 70 | cls, name, bases, namespace, discriminator=discriminator 71 | ) 72 | return klass 73 | 74 | class ValidationResultsModel(ValidationResultsModelBase, metaclass=ValidationResultsModelMeta): 75 | validation_outcome_index = ValidationOutcomeIdx() 76 | 77 | return ValidationResultsModel 78 | 79 | 80 | class ValidationResultFactory: # pylint:disable=too-few-public-methods 81 | def __init__(self, hash_key: str, results_table_name: str): 82 | self.hash_key = hash_key 83 | self.validation_results_model = validation_results_model_with_meta( 84 | results_table_name=results_table_name 85 | ) 86 | 87 | def save( 88 | self, 89 | url: str, 90 | check: Check, 91 | result: ValidationResult, 92 | *, 93 | details: Optional[JsonObject] = None, 94 | ) -> None: 95 | self.validation_results_model( 96 | pk=self.hash_key, 97 | sk=f"{CHECK_ID_PREFIX}{check.value}{DB_KEY_SEPARATOR}{URL_ID_PREFIX}{url}", 98 | result=result.value, 99 | details=details, 100 | ).save() 101 | -------------------------------------------------------------------------------- /geostore/check_stac_metadata/task.py: -------------------------------------------------------------------------------- 1 | from logging import Logger 2 | 3 | from botocore.exceptions import ClientError 4 | from jsonschema import ValidationError, validate 5 | from linz_logger import get_log 6 | 7 | from ..api_keys import SUCCESS_KEY 8 | from ..error_response_keys import ERROR_MESSAGE_KEY 9 | from ..logging_keys import ( 10 | GIT_COMMIT, 11 | LOG_MESSAGE_LAMBDA_FAILURE, 12 | LOG_MESSAGE_LAMBDA_START, 13 | LOG_MESSAGE_VALIDATION_COMPLETE, 14 | ) 15 | from ..parameter_store import ParameterName, get_param 16 | from ..processing_assets_model import ProcessingAssetType 17 | from ..s3_utils import get_s3_url_reader 18 | from ..step_function import AssetGarbageCollector, Outcome, get_hash_key 19 | from ..step_function_keys import ( 20 | CURRENT_VERSION_ID_KEY, 21 | DATASET_ID_KEY, 22 | DATASET_TITLE_KEY, 23 | METADATA_URL_KEY, 24 | 
NEW_VERSION_ID_KEY, 25 | S3_ROLE_ARN_KEY, 26 | ) 27 | from ..types import JsonObject 28 | from ..validation_results_model import ValidationResultFactory 29 | from .utils import STACDatasetValidator 30 | 31 | LOGGER: Logger = get_log() 32 | 33 | 34 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 35 | LOGGER.debug( 36 | LOG_MESSAGE_LAMBDA_START, 37 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 38 | ) 39 | 40 | # validate input 41 | try: 42 | validate( 43 | event, 44 | { 45 | "type": "object", 46 | "properties": { 47 | CURRENT_VERSION_ID_KEY: {"type": "string"}, 48 | DATASET_ID_KEY: {"type": "string"}, 49 | DATASET_TITLE_KEY: {"type": "string"}, 50 | METADATA_URL_KEY: {"type": "string"}, 51 | NEW_VERSION_ID_KEY: {"type": "string"}, 52 | S3_ROLE_ARN_KEY: {"type": "string"}, 53 | }, 54 | "required": [ 55 | CURRENT_VERSION_ID_KEY, 56 | DATASET_ID_KEY, 57 | DATASET_TITLE_KEY, 58 | METADATA_URL_KEY, 59 | NEW_VERSION_ID_KEY, 60 | S3_ROLE_ARN_KEY, 61 | ], 62 | "additionalProperties": True, 63 | }, 64 | ) 65 | except ValidationError as error: 66 | LOGGER.warning( 67 | LOG_MESSAGE_VALIDATION_COMPLETE, 68 | extra={ 69 | "outcome": Outcome.FAILED, 70 | "error": error, 71 | GIT_COMMIT: get_param(ParameterName.GIT_COMMIT), 72 | }, 73 | ) 74 | return {ERROR_MESSAGE_KEY: error.message} 75 | 76 | try: 77 | s3_url_reader = get_s3_url_reader(event[S3_ROLE_ARN_KEY], event[DATASET_TITLE_KEY], LOGGER) 78 | except ClientError as error: 79 | LOGGER.warning( 80 | LOG_MESSAGE_LAMBDA_FAILURE, 81 | extra={"error": error, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 82 | ) 83 | return {ERROR_MESSAGE_KEY: str(error)} 84 | 85 | asset_garbage_collector = AssetGarbageCollector( 86 | event[DATASET_ID_KEY], 87 | event[CURRENT_VERSION_ID_KEY], 88 | ProcessingAssetType.METADATA, 89 | LOGGER, 90 | ) 91 | 92 | hash_key = get_hash_key(event[DATASET_ID_KEY], event[NEW_VERSION_ID_KEY]) 93 | 94 | validation_result_factory = ValidationResultFactory( 95 | hash_key, get_param(ParameterName.STORAGE_VALIDATION_RESULTS_TABLE_NAME) 96 | ) 97 | 98 | validator = STACDatasetValidator( 99 | hash_key, s3_url_reader, asset_garbage_collector, validation_result_factory 100 | ) 101 | 102 | validator.run(event[METADATA_URL_KEY]) 103 | return {SUCCESS_KEY: True} 104 | -------------------------------------------------------------------------------- /tests/test_import_status_logging.py: -------------------------------------------------------------------------------- 1 | from json import dumps 2 | from unittest.mock import MagicMock, patch 3 | 4 | from jsonschema import ValidationError 5 | 6 | from geostore.aws_keys import BODY_KEY, HTTP_METHOD_KEY 7 | from geostore.import_status.get import get_import_status 8 | from geostore.logging_keys import ( 9 | GIT_COMMIT, 10 | LOG_MESSAGE_LAMBDA_FAILURE, 11 | LOG_MESSAGE_LAMBDA_START, 12 | LOG_MESSAGE_STEP_FUNCTION_RESPONSE, 13 | ) 14 | from geostore.parameter_store import ParameterName, get_param 15 | from geostore.step_function_keys import DATASET_ID_KEY, EXECUTION_ARN_KEY, NEW_VERSION_ID_KEY 16 | 17 | from .aws_utils import any_arn_formatted_string 18 | from .general_generators import any_error_message 19 | from .stac_generators import any_dataset_id, any_dataset_version_id 20 | 21 | 22 | @patch("geostore.step_function.STEP_FUNCTIONS_CLIENT.describe_execution") 23 | def should_log_payload(describe_step_function_mock: MagicMock) -> None: 24 | # Given 25 | event = { 26 | HTTP_METHOD_KEY: "GET", 27 | BODY_KEY: {EXECUTION_ARN_KEY: 
any_arn_formatted_string()}, 28 | } 29 | 30 | describe_step_function_mock.return_value = { 31 | "status": "RUNNING", 32 | "input": dumps( 33 | {DATASET_ID_KEY: any_dataset_id(), NEW_VERSION_ID_KEY: any_dataset_version_id()} 34 | ), 35 | } 36 | 37 | with patch("geostore.import_status.get.LOGGER.debug") as logger_mock, patch( 38 | "geostore.step_function.get_step_function_validation_results" 39 | ) as validation_mock: 40 | validation_mock.return_value = [] 41 | 42 | # When 43 | get_import_status(event) 44 | 45 | # Then 46 | logger_mock.assert_any_call( 47 | LOG_MESSAGE_LAMBDA_START, 48 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 49 | ) 50 | 51 | 52 | @patch("geostore.import_status.get.validate") 53 | def should_log_schema_validation_warning(validate_schema_mock: MagicMock) -> None: 54 | # Given 55 | 56 | error_message = any_error_message() 57 | validate_schema_mock.side_effect = ValidationError(error_message) 58 | 59 | with patch("geostore.import_status.get.LOGGER.warning") as logger_mock: 60 | # When 61 | get_import_status( 62 | { 63 | HTTP_METHOD_KEY: "GET", 64 | BODY_KEY: {}, 65 | } 66 | ) 67 | 68 | # Then 69 | logger_mock.assert_any_call( 70 | LOG_MESSAGE_LAMBDA_FAILURE, 71 | extra={"error": error_message, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 72 | ) 73 | 74 | 75 | @patch("geostore.step_function.STEP_FUNCTIONS_CLIENT.describe_execution") 76 | def should_log_stepfunctions_status_response( 77 | describe_execution_mock: MagicMock, 78 | ) -> None: 79 | # Given 80 | describe_execution_mock.return_value = describe_execution_response = { 81 | "status": "Some Response", 82 | "input": dumps( 83 | {DATASET_ID_KEY: any_dataset_id(), NEW_VERSION_ID_KEY: any_dataset_version_id()} 84 | ), 85 | } 86 | 87 | with patch("geostore.step_function.LOGGER.debug") as logger_mock, patch( 88 | "geostore.step_function.get_account_number" 89 | ), patch("geostore.step_function.get_step_function_validation_results") as validation_mock: 90 | validation_mock.return_value = [] 91 | # When 92 | get_import_status({EXECUTION_ARN_KEY: any_arn_formatted_string()}) 93 | 94 | # Then 95 | logger_mock.assert_any_call( 96 | LOG_MESSAGE_STEP_FUNCTION_RESPONSE, 97 | extra={ 98 | "response": describe_execution_response, 99 | GIT_COMMIT: get_param(ParameterName.GIT_COMMIT), 100 | }, 101 | ) 102 | -------------------------------------------------------------------------------- /geostore/import_dataset_file.py: -------------------------------------------------------------------------------- 1 | from json import loads 2 | from logging import Logger 3 | from typing import TYPE_CHECKING, Callable, Optional 4 | from urllib.parse import unquote_plus 5 | 6 | from botocore.exceptions import ClientError 7 | from linz_logger import get_log 8 | 9 | from .aws_response import AWS_CODE_REQUEST_TIMEOUT 10 | from .import_dataset_keys import NEW_KEY_KEY, ORIGINAL_KEY_KEY, TARGET_BUCKET_NAME_KEY 11 | from .logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_START 12 | from .parameter_store import ParameterName, get_param 13 | from .s3 import get_s3_client_for_role 14 | from .step_function_keys import S3_ROLE_ARN_KEY 15 | from .types import JsonObject 16 | 17 | if TYPE_CHECKING: 18 | from mypy_boto3_s3 import S3Client 19 | from mypy_boto3_s3.type_defs import PutObjectOutputTypeDef 20 | else: 21 | PutObjectOutputTypeDef = JsonObject # pragma: no mutate 22 | S3Client = object # pragma: no mutate 23 | 24 | INVOCATION_ID_KEY = "invocationId" 25 | INVOCATION_SCHEMA_VERSION_KEY = "invocationSchemaVersion" 26 | 
RESULTS_KEY = "results" 27 | RESULT_CODE_KEY = "resultCode" 28 | RESULT_STRING_KEY = "resultString" 29 | S3_BUCKET_ARN_KEY = "s3BucketArn" 30 | S3_KEY_KEY = "s3Key" 31 | TASKS_KEY = "tasks" 32 | TASK_ID_KEY = "taskId" 33 | TREAT_MISSING_KEYS_AS_KEY = "treatMissingKeysAs" 34 | 35 | RESULT_CODE_PERMANENT_FAILURE = "PermanentFailure" 36 | RESULT_CODE_SUCCEEDED = "Succeeded" 37 | RESULT_CODE_TEMPORARY_FAILURE = "TemporaryFailure" 38 | 39 | EXCEPTION_PREFIX = "Exception" 40 | RETRY_RESULT_STRING = "Retry request to Amazon S3 due to timeout." 41 | 42 | LOGGER: Logger = get_log() 43 | LOG_MESSAGE_S3_BATCH_COPY_RESULT = "S3 Batch Result" 44 | 45 | 46 | def get_import_result( 47 | event: JsonObject, 48 | importer: Callable[[str, str, str, str, S3Client], Optional[PutObjectOutputTypeDef]], 49 | ) -> JsonObject: 50 | LOGGER.debug( 51 | LOG_MESSAGE_LAMBDA_START, 52 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 53 | ) 54 | 55 | task = event[TASKS_KEY][0] 56 | source_bucket_name = task[S3_BUCKET_ARN_KEY].split(":::", maxsplit=1)[-1] 57 | parameters = loads(unquote_plus(task[S3_KEY_KEY])) 58 | source_s3_client = get_s3_client_for_role(parameters[S3_ROLE_ARN_KEY]) 59 | 60 | try: 61 | response = importer( 62 | source_bucket_name, 63 | parameters[ORIGINAL_KEY_KEY], 64 | parameters[TARGET_BUCKET_NAME_KEY], 65 | parameters[NEW_KEY_KEY], 66 | source_s3_client, 67 | ) 68 | result_code = RESULT_CODE_SUCCEEDED 69 | result_string = str(response) 70 | except ClientError as error: 71 | error_code = error.response["Error"]["Code"] 72 | if error_code == AWS_CODE_REQUEST_TIMEOUT: 73 | result_code = RESULT_CODE_TEMPORARY_FAILURE 74 | result_string = RETRY_RESULT_STRING 75 | else: 76 | result_code = RESULT_CODE_PERMANENT_FAILURE 77 | error_message = error.response["Error"]["Message"] 78 | result_string = f"{error_code} when calling {error.operation_name}: {error_message}" 79 | except Exception as error: # pylint:disable=broad-except 80 | result_code = RESULT_CODE_PERMANENT_FAILURE 81 | result_string = f"{EXCEPTION_PREFIX}: {error}" 82 | 83 | result = { 84 | INVOCATION_SCHEMA_VERSION_KEY: event[INVOCATION_SCHEMA_VERSION_KEY], 85 | TREAT_MISSING_KEYS_AS_KEY: RESULT_CODE_PERMANENT_FAILURE, 86 | INVOCATION_ID_KEY: event[INVOCATION_ID_KEY], 87 | RESULTS_KEY: [ 88 | { 89 | TASK_ID_KEY: task[TASK_ID_KEY], 90 | RESULT_CODE_KEY: result_code, 91 | RESULT_STRING_KEY: result_string, 92 | } 93 | ], 94 | } 95 | LOGGER.debug( 96 | LOG_MESSAGE_S3_BATCH_COPY_RESULT, 97 | extra={"result": result, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 98 | ) 99 | return result 100 | -------------------------------------------------------------------------------- /infrastructure/constructs/notify.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | 3 | from aws_cdk import ( 4 | aws_events, 5 | aws_events_targets, 6 | aws_iam, 7 | aws_lambda_python_alpha, 8 | aws_sns, 9 | aws_ssm, 10 | aws_stepfunctions, 11 | ) 12 | from constructs import Construct 13 | 14 | from geostore.environment import ENV_NAME_VARIABLE_NAME 15 | from geostore.notify_status_update.task import SLACK_URL_ENV_NAME 16 | from geostore.parameter_store import ParameterName 17 | from geostore.resources import Resource 18 | 19 | from .bundled_lambda_function import BundledLambdaFunction 20 | from .common import grant_parameter_read_access 21 | from .s3_policy import ALLOW_DESCRIBE_ANY_S3_JOB 22 | from .table import Table 23 | 24 | 25 | class Notify(Construct): 26 | def __init__( 27 | 
self, 28 | scope: Construct, 29 | stack_id: str, 30 | *, 31 | botocore_lambda_layer: aws_lambda_python_alpha.PythonLayerVersion, 32 | env_name: str, 33 | state_machine: aws_stepfunctions.StateMachine, 34 | validation_results_table: Table, 35 | git_commit_parameter: aws_ssm.StringParameter, 36 | ) -> None: 37 | super().__init__(scope, stack_id) 38 | 39 | slack_notify_function = BundledLambdaFunction( 40 | scope, 41 | "GetStatusUpdate", 42 | lambda_directory="notify_status_update", 43 | extra_environment={ 44 | ENV_NAME_VARIABLE_NAME: env_name, 45 | }, 46 | botocore_lambda_layer=botocore_lambda_layer, 47 | ) 48 | if SLACK_URL_ENV_NAME in environ: 49 | slack_notify_function.add_environment( 50 | SLACK_URL_ENV_NAME, 51 | environ[SLACK_URL_ENV_NAME], 52 | ) 53 | 54 | validation_results_table.grant_read_data(slack_notify_function) 55 | validation_results_table.grant(slack_notify_function, "dynamodb:DescribeTable") 56 | state_machine.grant_read(slack_notify_function) 57 | 58 | slack_notify_function.add_to_role_policy(ALLOW_DESCRIBE_ANY_S3_JOB) 59 | 60 | # Allow anyone to subscribe to topic 61 | step_function_topic = aws_sns.Topic( 62 | scope, 63 | "geostore-stepfunction-status-topic", 64 | topic_name=Resource.SNS_TOPIC_NAME.resource_name, 65 | ) 66 | sns_topic_arn_parameter = aws_ssm.StringParameter( 67 | self, 68 | "status-sns-topic-arn", 69 | string_value=step_function_topic.topic_arn, 70 | description=f"Status SNS Topic ARN for {env_name}", 71 | parameter_name=ParameterName.STATUS_SNS_TOPIC_ARN.value, 72 | ) 73 | 74 | # Allow access to any validations 75 | grant_parameter_read_access( 76 | { 77 | sns_topic_arn_parameter: [slack_notify_function], 78 | validation_results_table.name_parameter: [ 79 | slack_notify_function, 80 | ], 81 | git_commit_parameter: [slack_notify_function], 82 | } 83 | ) 84 | step_function_topic.grant_publish(slack_notify_function) 85 | 86 | step_function_topic.add_to_resource_policy( 87 | aws_iam.PolicyStatement( 88 | actions=["sns:Subscribe", "sns:Receive"], 89 | principals=[aws_iam.AnyPrincipal()], 90 | resources=[step_function_topic.topic_arn], 91 | ) 92 | ) 93 | 94 | aws_events.Rule( 95 | scope, 96 | "geostore-cloudwatch-stepfunctions-rule", 97 | enabled=True, 98 | rule_name=Resource.CLOUDWATCH_RULE_NAME.resource_name, 99 | description="Cloudwatch rule to detect import status updates", 100 | event_pattern=aws_events.EventPattern( 101 | source=["aws.states"], 102 | detail_type=["Step Functions Execution Status Change"], 103 | detail={"stateMachineArn": [state_machine.state_machine_arn]}, 104 | ), 105 | targets=[aws_events_targets.LambdaFunction(slack_notify_function)], 106 | ) 107 | -------------------------------------------------------------------------------- /geostore/check_stac_metadata/stac_validators.py: -------------------------------------------------------------------------------- 1 | from functools import cached_property, lru_cache 2 | from json import load 3 | from os import scandir 4 | from os.path import dirname, join 5 | from re import fullmatch 6 | 7 | from jsonschema import Draft7Validator, FormatChecker, RefResolver 8 | from jsonschema._utils import URIDict 9 | from jsonschema.validators import extend 10 | from packaging.version import parse 11 | 12 | from ..stac_format import LINZ_STAC_EXTENSIONS_LOCAL_PATH 13 | from ..types import JsonObject 14 | 15 | 16 | class Schema: 17 | def __init__(self, path: str): 18 | self.path = path 19 | 20 | @cached_property 21 | def as_dict(self) -> JsonObject: 22 | with open(join(dirname(__file__), self.path), 
encoding="utf-8") as file_pointer: 23 | result: JsonObject = load(file_pointer) 24 | return result 25 | 26 | @cached_property 27 | def schema_id(self) -> str: 28 | id_: str = self.as_dict["$id"] 29 | return id_ 30 | 31 | @cached_property 32 | def uri(self) -> str: 33 | uri_: str = URIDict().normalize(self.schema_id) 34 | return uri_ 35 | 36 | 37 | @lru_cache 38 | def get_latest_extension_schema_version(extension_path: str) -> str: 39 | directories = scandir(join(dirname(__file__), extension_path)) 40 | versions = [] 41 | for directory in directories: 42 | if directory.is_dir() and fullmatch(r"v\d+\.\d+\.\d+", directory.name): 43 | versions.append(directory.name[1:]) 44 | return sorted(versions, key=parse, reverse=True)[0] 45 | 46 | 47 | FILE_STAC_SCHEMA_PATH = "file/v2.0.0/schema.json" 48 | PROJECTION_STAC_SCHEMA_PATH = "projection/v1.0.0/schema.json" 49 | VERSION_STAC_SCHEMA_PATH = "version/v1.0.0/schema.json" 50 | FILE_SCHEMA = Schema(FILE_STAC_SCHEMA_PATH) 51 | 52 | STAC_SPEC_EXTENSION_PATH = "stac-spec" 53 | STAC_VERSION = get_latest_extension_schema_version(STAC_SPEC_EXTENSION_PATH) 54 | STAC_SPEC_PATH = f"{STAC_SPEC_EXTENSION_PATH}/v{STAC_VERSION}" 55 | CATALOG_SCHEMA = Schema(f"{STAC_SPEC_PATH}/catalog-spec/json-schema/catalog.json") 56 | LINZ_STAC_EXTENSIONS_URL_PATH = ( 57 | f"v{get_latest_extension_schema_version(LINZ_STAC_EXTENSIONS_LOCAL_PATH)}" 58 | ) 59 | LINZ_SCHEMA_URL_DIRECTORY = f"{LINZ_STAC_EXTENSIONS_URL_PATH}/linz" 60 | LINZ_SCHEMA_URL_PATH = f"{LINZ_SCHEMA_URL_DIRECTORY}/schema.json" 61 | LINZ_SCHEMA = Schema(join(LINZ_STAC_EXTENSIONS_LOCAL_PATH, LINZ_SCHEMA_URL_PATH)) 62 | STAC_ITEM_SPEC_PATH = f"{STAC_SPEC_PATH}/item-spec/json-schema" 63 | ITEM_SCHEMA = Schema(f"{STAC_ITEM_SPEC_PATH}/item.json") 64 | QUALITY_SCHEMA_PATH = f"{LINZ_STAC_EXTENSIONS_URL_PATH}/quality/schema.json" 65 | 66 | schema_store = {} 67 | for schema in [ 68 | CATALOG_SCHEMA, 69 | Schema(f"{STAC_SPEC_PATH}/collection-spec/json-schema/collection.json"), 70 | FILE_SCHEMA, 71 | Schema("geojson-spec/Feature.json"), 72 | Schema("geojson-spec/Geometry.json"), 73 | ITEM_SCHEMA, 74 | Schema(f"{STAC_ITEM_SPEC_PATH}/basics.json"), 75 | Schema(f"{STAC_ITEM_SPEC_PATH}/datetime.json"), 76 | Schema(f"{STAC_ITEM_SPEC_PATH}/instrument.json"), 77 | Schema(f"{STAC_ITEM_SPEC_PATH}/licensing.json"), 78 | Schema(f"{STAC_ITEM_SPEC_PATH}/provider.json"), 79 | LINZ_SCHEMA, 80 | Schema(PROJECTION_STAC_SCHEMA_PATH), 81 | Schema(VERSION_STAC_SCHEMA_PATH), 82 | Schema(join(LINZ_STAC_EXTENSIONS_LOCAL_PATH, QUALITY_SCHEMA_PATH)), 83 | ]: 84 | # Normalize URLs the same way as jsonschema does 85 | schema_store[schema.uri] = schema.as_dict 86 | 87 | BaseSTACValidator = extend(Draft7Validator) 88 | BaseSTACValidator.format_checker = FormatChecker() 89 | 90 | STACCatalogSchemaValidator = extend(BaseSTACValidator)( 91 | resolver=RefResolver.from_schema(CATALOG_SCHEMA.as_dict, store=schema_store), 92 | schema=CATALOG_SCHEMA.as_dict, 93 | ) 94 | 95 | STACCollectionSchemaValidator = extend(BaseSTACValidator)( 96 | resolver=RefResolver.from_schema(LINZ_SCHEMA.as_dict, store=schema_store), 97 | schema=LINZ_SCHEMA.as_dict, 98 | ) 99 | 100 | STACItemSchemaValidator = extend(BaseSTACValidator)( 101 | resolver=RefResolver.from_schema(LINZ_SCHEMA.as_dict, store=schema_store), 102 | schema=LINZ_SCHEMA.as_dict, 103 | ) 104 | -------------------------------------------------------------------------------- /tests/test_step_function.py: -------------------------------------------------------------------------------- 1 | from os.path import 
basename 2 | from unittest.mock import MagicMock, patch 3 | 4 | from pytest import mark 5 | from pytest_subtests import SubTests 6 | 7 | from geostore.logging_keys import GIT_COMMIT 8 | from geostore.models import DB_KEY_SEPARATOR 9 | from geostore.parameter_store import ParameterName, get_param 10 | from geostore.processing_assets_model import ProcessingAssetType, processing_assets_model_with_meta 11 | from geostore.step_function import AssetGarbageCollector, get_hash_key 12 | from geostore.step_function_keys import CURRENT_VERSION_EMPTY_VALUE 13 | from tests.aws_utils import ProcessingAsset, any_s3_url 14 | from tests.stac_generators import any_dataset_id, any_dataset_version_id 15 | 16 | 17 | @mark.infrastructure 18 | def should_mark_asset_as_replaced(subtests: SubTests) -> None: 19 | # Given 20 | 21 | dataset_id = any_dataset_id() 22 | current_version_id = any_dataset_version_id() 23 | url = any_s3_url() 24 | filename = basename(url) 25 | logger_mock = MagicMock() 26 | 27 | expected_log_message = ( 28 | f"Dataset: '{dataset_id}' " 29 | f"Version: '{current_version_id}' " 30 | f"Filename: '{filename}' has been marked as replaced" 31 | ) 32 | 33 | hash_key = get_hash_key(dataset_id, current_version_id) 34 | processing_assets_model = processing_assets_model_with_meta() 35 | expected_metadata_item = processing_assets_model( 36 | hash_key=hash_key, 37 | range_key=f"{ProcessingAssetType.METADATA.value}{DB_KEY_SEPARATOR}0", 38 | url=url, 39 | filename=filename, 40 | replaced_in_new_version=True, 41 | ) 42 | 43 | with ProcessingAsset( 44 | asset_id=hash_key, 45 | url=url, 46 | ): 47 | # When 48 | AssetGarbageCollector( 49 | dataset_id, current_version_id, ProcessingAssetType.METADATA, logger_mock 50 | ).mark_asset_as_replaced(filename) 51 | 52 | # Then 53 | with subtests.test(msg="Log is recorded"): 54 | logger_mock.debug.assert_called_once_with( 55 | expected_log_message, 56 | extra={ 57 | GIT_COMMIT: get_param(ParameterName.GIT_COMMIT), 58 | }, 59 | ) 60 | 61 | actual_first_version_metadata_item = processing_assets_model.query( 62 | hash_key, 63 | processing_assets_model.sk.startswith( 64 | f"{ProcessingAssetType.METADATA.value}{DB_KEY_SEPARATOR}" 65 | ), 66 | consistent_read=True, 67 | ).next() 68 | 69 | with subtests.test(msg=f"Metadata {actual_first_version_metadata_item.pk}"): 70 | assert ( 71 | actual_first_version_metadata_item.attribute_values 72 | == expected_metadata_item.attribute_values 73 | ) 74 | 75 | 76 | @mark.infrastructure 77 | def should_do_nothing_if_no_asset_returned(subtests: SubTests) -> None: 78 | # Given 79 | 80 | dataset_id = any_dataset_id() 81 | current_version_id = any_dataset_version_id() 82 | url = any_s3_url() 83 | filename = basename(url) 84 | logger_mock = MagicMock() 85 | 86 | # When 87 | AssetGarbageCollector( 88 | dataset_id, current_version_id, ProcessingAssetType.METADATA, logger_mock 89 | ).mark_asset_as_replaced(filename) 90 | 91 | # Then 92 | with subtests.test(msg="Log is recorded"): 93 | logger_mock.debug.assert_not_called() 94 | 95 | 96 | @patch("geostore.step_function.processing_assets_model_with_meta") 97 | def should_return_early_if_no_dataset_version( 98 | processing_assets_model_mock: MagicMock, subtests: SubTests 99 | ) -> None: 100 | # Given 101 | dataset_id = any_dataset_id() 102 | current_version_id = CURRENT_VERSION_EMPTY_VALUE 103 | url = any_s3_url() 104 | filename = basename(url) 105 | logger_mock = MagicMock() 106 | 107 | # When 108 | AssetGarbageCollector( 109 | dataset_id, current_version_id, ProcessingAssetType.METADATA, 
logger_mock 110 | ).mark_asset_as_replaced(filename) 111 | 112 | # Then 113 | with subtests.test(msg="db record is not queried"): 114 | processing_assets_model_mock.return_value.assert_not_called() 115 | 116 | with subtests.test(msg="Log is not recorded"): 117 | logger_mock.debug.assert_not_called() 118 | -------------------------------------------------------------------------------- /geostore/s3_utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from dataclasses import dataclass 3 | from logging import Logger 4 | from os.path import basename 5 | from typing import Callable, Optional, Tuple 6 | from urllib.parse import urlparse 7 | 8 | from botocore.exceptions import ClientError 9 | from botocore.response import StreamingBody 10 | 11 | from .logging_keys import GIT_COMMIT 12 | from .parameter_store import ParameterName, get_param 13 | from .resources import Resource 14 | from .s3 import get_s3_client_for_role 15 | 16 | KNOWN_ETAG_OF_EMPTY_FILE = '"d41d8cd98f00b204e9800998ecf8427e"' 17 | 18 | 19 | def get_bucket_and_key_from_url(url: str) -> Tuple[str, str]: 20 | parsed = urlparse(url) 21 | return parsed.netloc, parsed.path[1:] 22 | 23 | 24 | @dataclass 25 | class GeostoreS3Response: 26 | response: StreamingBody 27 | file_in_staging: bool 28 | 29 | 30 | def get_s3_url_reader( 31 | s3_role_arn: str, dataset_title: str, logger: Logger 32 | ) -> Callable[[str], GeostoreS3Response]: 33 | def s3_url_reader(staging_url: str) -> GeostoreS3Response: 34 | bucket_name, key = get_bucket_and_key_from_url(staging_url) 35 | 36 | try: 37 | staging_object = staging_s3_client.get_object(Bucket=bucket_name, Key=key) 38 | return GeostoreS3Response(staging_object["Body"], True) 39 | except ClientError as error: 40 | if error.response["Error"]["Code"] != "NoSuchKey": 41 | raise error 42 | 43 | geostore_key = f"{dataset_title}/{basename(urlparse(staging_url).path[1:])}" 44 | 45 | logger.debug( 46 | f"'{key}' is not present in the staging bucket." 
47 | f" Using '{geostore_key}' from the geostore bucket for validation instead.", 48 | extra={GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 49 | ) 50 | geostore_object = geostore_s3_client.get_object( 51 | Bucket=Resource.STORAGE_BUCKET_NAME.resource_name, Key=geostore_key 52 | ) 53 | return GeostoreS3Response(geostore_object["Body"], False) 54 | 55 | staging_s3_client = get_s3_client_for_role(s3_role_arn) 56 | geostore_s3_client = get_s3_client_for_role(get_param(ParameterName.S3_USERS_ROLE_ARN)) 57 | return s3_url_reader 58 | 59 | 60 | def get_s3_etag(s3_bucket: str, s3_object_key: str, logger: Logger) -> Optional[str]: 61 | geostore_s3_client = get_s3_client_for_role(get_param(ParameterName.S3_USERS_ROLE_ARN)) 62 | 63 | try: 64 | s3_response = geostore_s3_client.head_object(Bucket=s3_bucket, Key=s3_object_key) 65 | return s3_response["ETag"] 66 | except ClientError as error: 67 | if error.response["Error"]["Code"] != "404": 68 | logger.debug( 69 | f"Unable to fetch eTag for “{s3_object_key}” in s3://{s3_bucket} due to “{error}”", 70 | extra={GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 71 | ) 72 | # rather than raise, we return an empty string, indicating that the etag is different 73 | # thus allowing the next step to continue rather than stalling the entire process 74 | return None 75 | 76 | 77 | def calculate_s3_etag(body: bytes) -> str: 78 | # https://awscli.amazonaws.com/v2/documentation/api/latest/topic/s3-config.html#multipart-chunksize 79 | s3_default_chunk_size = 8_388_608 # Default value is 8 * 1024 * 1024 80 | 81 | if body == b"": 82 | return KNOWN_ETAG_OF_EMPTY_FILE 83 | 84 | chunk_hashes = [] 85 | 86 | for chunk_start in range(0, len(body), s3_default_chunk_size): 87 | chunk = body[chunk_start : chunk_start + s3_default_chunk_size] 88 | chunk_hashes.append(hashlib.md5(chunk, usedforsecurity=False)) 89 | 90 | # file smaller than s3_default_chunk_size has one chunk 91 | if len(chunk_hashes) == 1: 92 | # file at exactly s3_default_chunk_size is still one chunk 93 | # but etag is calculated as multi chunk file (e.g. 
"656dadd6d61e0ebfd29264e34d742df3-1") 94 | # where -1 suffix signifies 1 chunk 95 | if len(body) < s3_default_chunk_size: 96 | return f'"{chunk_hashes[0].hexdigest()}"' 97 | 98 | hash_object = hashlib.md5(usedforsecurity=False) 99 | for chunk_hash in chunk_hashes: 100 | hash_object.update(chunk_hash.digest()) 101 | 102 | return f'"{hash_object.hexdigest()}-{len(chunk_hashes)}"' 103 | -------------------------------------------------------------------------------- /tests/test_dataset_versions_endpoint_logging.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | from jsonschema import ValidationError 4 | from pynamodb.exceptions import DoesNotExist 5 | from pytest import mark 6 | 7 | from geostore.aws_keys import BODY_KEY, HTTP_METHOD_KEY 8 | from geostore.dataset_versions.create import create_dataset_version 9 | from geostore.logging_keys import ( 10 | GIT_COMMIT, 11 | LOG_MESSAGE_LAMBDA_FAILURE, 12 | LOG_MESSAGE_LAMBDA_START, 13 | LOG_MESSAGE_STEP_FUNCTION_RESPONSE, 14 | ) 15 | from geostore.parameter_store import ParameterName, get_param 16 | from geostore.step_function_keys import DATASET_ID_SHORT_KEY, METADATA_URL_KEY, S3_ROLE_ARN_KEY 17 | 18 | from .aws_utils import Dataset, any_role_arn, any_s3_url 19 | from .general_generators import any_error_message 20 | from .stac_generators import any_dataset_id, any_dataset_version_id 21 | 22 | 23 | @mark.infrastructure 24 | def should_log_payload() -> None: 25 | # Given 26 | with patch( 27 | "geostore.dataset_versions.create.STEP_FUNCTIONS_CLIENT.start_execution" 28 | ), Dataset() as dataset, patch("geostore.dataset_versions.create.LOGGER.debug") as logger_mock: 29 | event = { 30 | HTTP_METHOD_KEY: "POST", 31 | BODY_KEY: { 32 | METADATA_URL_KEY: any_s3_url(), 33 | DATASET_ID_SHORT_KEY: dataset.dataset_id, 34 | S3_ROLE_ARN_KEY: any_role_arn(), 35 | }, 36 | } 37 | 38 | # When 39 | create_dataset_version(event) 40 | 41 | # Then 42 | logger_mock.assert_any_call( 43 | LOG_MESSAGE_LAMBDA_START, 44 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 45 | ) 46 | 47 | 48 | @mark.infrastructure 49 | @patch("geostore.dataset_versions.create.STEP_FUNCTIONS_CLIENT.start_execution") 50 | def should_log_step_function_state_machine_response(start_execution_mock: MagicMock) -> None: 51 | # Given 52 | start_execution_mock.return_value = step_function_response = {"executionArn": "Some Response"} 53 | 54 | with Dataset(current_dataset_version=any_dataset_version_id()) as dataset, patch( 55 | "geostore.dataset_versions.create.LOGGER.debug" 56 | ) as logger_mock: 57 | event = { 58 | METADATA_URL_KEY: any_s3_url(), 59 | DATASET_ID_SHORT_KEY: dataset.dataset_id, 60 | S3_ROLE_ARN_KEY: any_role_arn(), 61 | } 62 | 63 | # When 64 | create_dataset_version(event) 65 | 66 | # Then 67 | logger_mock.assert_any_call( 68 | LOG_MESSAGE_STEP_FUNCTION_RESPONSE, 69 | extra={ 70 | "response": step_function_response, 71 | GIT_COMMIT: get_param(ParameterName.GIT_COMMIT), 72 | }, 73 | ) 74 | 75 | 76 | @patch("geostore.dataset_versions.create.validate") 77 | def should_log_missing_argument_warning(validate_schema_mock: MagicMock) -> None: 78 | # given 79 | error_message = any_error_message() 80 | validate_schema_mock.side_effect = ValidationError(error_message) 81 | 82 | payload = {HTTP_METHOD_KEY: "POST", BODY_KEY: {}} 83 | 84 | with patch("geostore.dataset_versions.create.LOGGER.warning") as logger_mock: 85 | # when 86 | create_dataset_version(payload) 87 | 88 | # then 
89 | logger_mock.assert_any_call( 90 | LOG_MESSAGE_LAMBDA_FAILURE, 91 | extra={"error": error_message, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 92 | ) 93 | 94 | 95 | @patch("geostore.dataset_versions.create.datasets_model_with_meta") 96 | def should_log_warning_if_dataset_does_not_exist(datasets_model_mock: MagicMock) -> None: 97 | # given 98 | error_message = any_error_message() 99 | datasets_model_mock.return_value.get.side_effect = DoesNotExist(error_message) 100 | 101 | payload = { 102 | METADATA_URL_KEY: any_s3_url(), 103 | DATASET_ID_SHORT_KEY: any_dataset_id(), 104 | S3_ROLE_ARN_KEY: any_role_arn(), 105 | } 106 | 107 | with patch("geostore.dataset_versions.create.LOGGER.warning") as logger_mock: 108 | # when 109 | create_dataset_version(payload) 110 | 111 | # then 112 | logger_mock.assert_any_call( 113 | LOG_MESSAGE_LAMBDA_FAILURE, 114 | extra={"error": error_message, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 115 | ) 116 | -------------------------------------------------------------------------------- /tests/stac_generators.py: -------------------------------------------------------------------------------- 1 | from hashlib import sha256 2 | from random import choice, randrange 3 | from uuid import uuid4 4 | 5 | from multihash import SHA2_256 6 | 7 | from geostore.dataset_properties import TITLE_CHARACTERS 8 | from geostore.stac_format import ( 9 | LINZ_STAC_CREATED_KEY, 10 | LINZ_STAC_UPDATED_KEY, 11 | STAC_MAXIMUM_KEY, 12 | STAC_MINIMUM_KEY, 13 | ) 14 | from geostore.types import JsonObject 15 | 16 | from .general_generators import ( 17 | _random_string_choices, 18 | any_description, 19 | any_https_url, 20 | any_name, 21 | any_past_datetime, 22 | random_string, 23 | ) 24 | 25 | 26 | def any_hex_multihash() -> str: 27 | hex_digest = any_sha256_hex_digest() 28 | return sha256_hex_digest_to_multihash(hex_digest) 29 | 30 | 31 | def any_sha256_hex_digest() -> str: 32 | return sha256(random_string(20).encode()).hexdigest() 33 | 34 | 35 | def sha256_hex_digest_to_multihash(hex_digest: str) -> str: 36 | return f"{SHA2_256:x}{32:x}{hex_digest}" 37 | 38 | 39 | def any_dataset_id() -> str: 40 | return uuid4().hex 41 | 42 | 43 | def any_dataset_version_id() -> str: 44 | """Arbitrary-length string""" 45 | return uuid4().hex 46 | 47 | 48 | def any_dataset_title() -> str: 49 | """Arbitrary-length string of valid dataset title characters""" 50 | return _random_string_choices(TITLE_CHARACTERS, 20) 51 | 52 | 53 | def any_asset_name() -> str: 54 | """Arbitrary-length string""" 55 | return random_string(20) 56 | 57 | 58 | def any_dataset_description() -> str: 59 | """Arbitrary-length string""" 60 | return random_string(100) 61 | 62 | 63 | def any_linz_asset_summaries() -> JsonObject: 64 | """ 65 | Semi-arbitrary dates: 66 | 67 | - The first creation date can't be after any of the other dates 68 | - The last created and first updated dates can be anywhere within the range 69 | - The last updated date can't be before any of the other dates 70 | """ 71 | datetimes = [any_past_datetime(), any_past_datetime(), any_past_datetime(), any_past_datetime()] 72 | return { 73 | LINZ_STAC_CREATED_KEY: { 74 | STAC_MINIMUM_KEY: min(datetimes).isoformat(), 75 | STAC_MAXIMUM_KEY: choice(datetimes).isoformat(), 76 | }, 77 | LINZ_STAC_UPDATED_KEY: { 78 | STAC_MINIMUM_KEY: choice(datetimes).isoformat(), 79 | STAC_MAXIMUM_KEY: max(datetimes).isoformat(), 80 | }, 81 | } 82 | 83 | 84 | def any_linz_geospatial_type() -> str: 85 | return choice( 86 | [ 87 | "black and white image", 88 | "circular string", 89 
| "color image", 90 | "compound curve", 91 | "curve polygon", 92 | "geometry", 93 | "geometry collection", 94 | "grayscale", 95 | "grid", 96 | "hyperspectral", 97 | "multicurve", 98 | "multilinestring", 99 | "multipoint", 100 | "multipolygon", 101 | "multispectral", 102 | "multisurface", 103 | "linestring", 104 | "point", 105 | "point cloud", 106 | "polygon", 107 | "polyhedral surface", 108 | "rgb", 109 | "tin", 110 | "triangle", 111 | ] 112 | ) 113 | 114 | 115 | def any_linz_history() -> str: 116 | """Arbitrary-length string""" 117 | return random_string(20) 118 | 119 | 120 | def any_linz_lifecycle() -> str: 121 | return choice(["under development", "preview", "ongoing", "completed", "deprecated"]) 122 | 123 | 124 | def any_provider(role: str) -> JsonObject: 125 | return { 126 | "name": any_name(), 127 | "description": any_description(), 128 | "roles": [role], 129 | "url": any_https_url(), 130 | } 131 | 132 | 133 | def any_linz_provider_custodian() -> JsonObject: 134 | return any_provider("custodian") 135 | 136 | 137 | def any_linz_provider_manager() -> JsonObject: 138 | return any_provider("manager") 139 | 140 | 141 | def any_provider_licensor() -> JsonObject: 142 | return any_provider("licensor") 143 | 144 | 145 | def any_provider_producer() -> JsonObject: 146 | return any_provider("producer") 147 | 148 | 149 | def any_epsg() -> int: 150 | return randrange(1_000_000) 151 | 152 | 153 | def any_version_version() -> str: 154 | return f"{randrange(1_000)}.{randrange(1_000)}.{randrange(1_000)}" 155 | -------------------------------------------------------------------------------- /geostore/populate_catalog/task.py: -------------------------------------------------------------------------------- 1 | from json import dumps 2 | from logging import Logger 3 | from typing import TYPE_CHECKING 4 | 5 | import boto3 6 | from linz_logger import get_log 7 | from pystac import read_file 8 | from pystac.catalog import Catalog, CatalogType 9 | from pystac.collection import Collection 10 | from pystac.item import Item 11 | from pystac.layout import HrefLayoutStrategy 12 | from pystac.stac_io import StacIO 13 | 14 | from ..api_keys import EVENT_KEY 15 | from ..aws_keys import BODY_KEY 16 | from ..boto3_config import CONFIG 17 | from ..logging_keys import GIT_COMMIT, LOG_MESSAGE_LAMBDA_FAILURE 18 | from ..parameter_store import ParameterName, get_param 19 | from ..pystac_io_methods import S3StacIO 20 | from ..resources import Resource 21 | from ..s3 import S3_URL_PREFIX 22 | from ..types import JsonObject 23 | 24 | if TYPE_CHECKING: 25 | # When type checking we want to use the third party package's stub 26 | from mypy_boto3_s3 import S3Client 27 | else: 28 | # In production we want to avoid depending on a package which has no runtime impact 29 | S3Client = object # pragma: no mutate 30 | 31 | S3_CLIENT: S3Client = boto3.client("s3", config=CONFIG) 32 | 33 | ROOT_CATALOG_ID = "root_catalog" 34 | ROOT_CATALOG_TITLE = "Toitū Te Whenua Land Information New Zealand Geostore" 35 | ROOT_CATALOG_DESCRIPTION = ( 36 | "The Geospatial Data Store (Geostore) contains all the important " 37 | "geospatial data held by Toitū Te Whenua Land Information New Zealand.
" 38 | "Please browse this catalog to find and access our data." 39 | ) 40 | CATALOG_FILENAME = "catalog.json" 41 | CONTENTS_KEY = "Contents" 42 | RECORDS_KEY = "Records" 43 | 44 | LOGGER: Logger = get_log() 45 | 46 | StacIO.set_default(S3StacIO) 47 | 48 | 49 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 50 | """Main Lambda entry point.""" 51 | 52 | LOGGER.debug(dumps({EVENT_KEY: event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)})) 53 | 54 | for message in event[RECORDS_KEY]: 55 | handle_message(message[BODY_KEY]) 56 | 57 | return {} 58 | 59 | 60 | class GeostoreSTACLayoutStrategy(HrefLayoutStrategy): 61 | def get_catalog_href(self, cat: Catalog, parent_dir: str, is_root: bool) -> str: 62 | return str(cat.get_self_href()) 63 | 64 | def get_collection_href(self, col: Collection, parent_dir: str, is_root: bool) -> str: 65 | assert not is_root 66 | return str(col.get_self_href()) 67 | 68 | def get_item_href(self, item: Item, parent_dir: str) -> str: # pragma: no cover 69 | raise NotImplementedError() 70 | 71 | 72 | def handle_message(metadata_key: str) -> None: 73 | """Handle writing a new dataset to the root catalog""" 74 | 75 | storage_bucket_path = f"{S3_URL_PREFIX}{Resource.STORAGE_BUCKET_NAME.resource_name}" 76 | 77 | # there could be a myriad of problems preventing catalog from being populated 78 | # hence a rather broad try except exception clause is used 79 | # an exception thrown here indicates stuck message(s) in the sqs queue 80 | # logging is monitored by elasticsearch and alerting is set up to notify the team of a problem 81 | try: 82 | dataset_metadata = read_file(f"{storage_bucket_path}/{metadata_key}") 83 | assert isinstance(dataset_metadata, (Catalog, Collection)) 84 | 85 | results = S3_CLIENT.list_objects( 86 | Bucket=Resource.STORAGE_BUCKET_NAME.resource_name, Prefix=CATALOG_FILENAME 87 | ) 88 | 89 | # create root catalog if it doesn't exist 90 | if CONTENTS_KEY in results: 91 | root_catalog = Catalog.from_file(f"{storage_bucket_path}/{CATALOG_FILENAME}") 92 | 93 | else: 94 | root_catalog = Catalog( 95 | id=ROOT_CATALOG_ID, 96 | title=ROOT_CATALOG_TITLE, 97 | description=ROOT_CATALOG_DESCRIPTION, 98 | catalog_type=CatalogType.SELF_CONTAINED, 99 | ) 100 | root_catalog.set_self_href(f"{storage_bucket_path}/{CATALOG_FILENAME}") 101 | 102 | if root_catalog.get_child(dataset_metadata.id) is None: 103 | root_catalog.add_child(child=dataset_metadata, strategy=GeostoreSTACLayoutStrategy()) 104 | 105 | root_catalog.save(catalog_type=CatalogType.SELF_CONTAINED) 106 | 107 | except Exception as error: 108 | LOGGER.warning( 109 | f"{LOG_MESSAGE_LAMBDA_FAILURE}: Unable to populate catalog due to “{error}”", 110 | extra={GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 111 | ) 112 | raise 113 | -------------------------------------------------------------------------------- /geostore/update_root_catalog/task.py: -------------------------------------------------------------------------------- 1 | from logging import Logger 2 | from os.path import basename 3 | from typing import TYPE_CHECKING 4 | from urllib.parse import urlparse 5 | from uuid import uuid4 6 | 7 | import boto3 8 | from jsonschema import ValidationError, validate 9 | from linz_logger import get_log 10 | 11 | from ..boto3_config import CONFIG 12 | from ..datasets_model import datasets_model_with_meta 13 | from ..error_response_keys import ERROR_MESSAGE_KEY 14 | from ..logging_keys import ( 15 | GIT_COMMIT, 16 | LOG_MESSAGE_LAMBDA_FAILURE, 17 | LOG_MESSAGE_LAMBDA_START, 18 | 
LOG_MESSAGE_S3_DELETION_RESPONSE, 19 | ) 20 | from ..models import DATASET_ID_PREFIX 21 | from ..parameter_store import ParameterName, get_param 22 | from ..processing_assets_model import processing_assets_model_with_meta 23 | from ..resources import Resource 24 | from ..s3 import S3_URL_PREFIX 25 | from ..step_function import get_hash_key 26 | from ..step_function_keys import ( 27 | CURRENT_VERSION_ID_KEY, 28 | DATASET_ID_KEY, 29 | DATASET_TITLE_KEY, 30 | METADATA_URL_KEY, 31 | NEW_VERSION_ID_KEY, 32 | NEW_VERSION_S3_LOCATION, 33 | ) 34 | from ..types import JsonObject 35 | 36 | if TYPE_CHECKING: 37 | # When type checking we want to use the third party package's stub 38 | from mypy_boto3_s3 import S3Client 39 | from mypy_boto3_sqs import SQSServiceResource 40 | else: 41 | # In production we want to avoid depending on a package which has no runtime impact 42 | S3Client = SQSServiceResource = object # pragma: no mutate 43 | 44 | LOGGER: Logger = get_log() 45 | SQS_RESOURCE: SQSServiceResource = boto3.resource("sqs") 46 | S3_CLIENT: S3Client = boto3.client("s3", config=CONFIG) 47 | 48 | SQS_MESSAGE_GROUP_ID = "update_root_catalog_message_group" 49 | 50 | 51 | def lambda_handler(event: JsonObject, _context: bytes) -> JsonObject: 52 | """Main Lambda entry point.""" 53 | LOGGER.debug( 54 | LOG_MESSAGE_LAMBDA_START, 55 | extra={"lambda_input": event, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 56 | ) 57 | 58 | # validate input 59 | try: 60 | validate( 61 | event, 62 | { 63 | "type": "object", 64 | "properties": { 65 | CURRENT_VERSION_ID_KEY: {"type": "string"}, 66 | DATASET_ID_KEY: {"type": "string"}, 67 | DATASET_TITLE_KEY: {"type": "string"}, 68 | NEW_VERSION_ID_KEY: {"type": "string"}, 69 | METADATA_URL_KEY: {"type": "string"}, 70 | }, 71 | "required": [ 72 | CURRENT_VERSION_ID_KEY, 73 | DATASET_ID_KEY, 74 | DATASET_TITLE_KEY, 75 | METADATA_URL_KEY, 76 | NEW_VERSION_ID_KEY, 77 | ], 78 | }, 79 | ) 80 | except ValidationError as error: 81 | LOGGER.warning( 82 | LOG_MESSAGE_LAMBDA_FAILURE, 83 | extra={"error": error, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 84 | ) 85 | return {ERROR_MESSAGE_KEY: error.message} 86 | 87 | dataset_key = ( 88 | f"{event[DATASET_TITLE_KEY]}/{basename(urlparse(event[METADATA_URL_KEY]).path[1:])}" 89 | ) 90 | 91 | # add reference to root catalog 92 | SQS_RESOURCE.get_queue_by_name( 93 | QueueName=get_param(ParameterName.UPDATE_CATALOG_MESSAGE_QUEUE_NAME) 94 | ).send_message( 95 | MessageBody=dataset_key, 96 | MessageGroupId=SQS_MESSAGE_GROUP_ID, 97 | MessageDeduplicationId=uuid4().hex, 98 | ) 99 | 100 | processing_assets_model = processing_assets_model_with_meta() 101 | for item in processing_assets_model.query( 102 | get_hash_key(event[DATASET_ID_KEY], event[CURRENT_VERSION_ID_KEY]), 103 | filter_condition=processing_assets_model.replaced_in_new_version.does_not_exist(), 104 | ): 105 | s3_response = S3_CLIENT.delete_object( 106 | Bucket=Resource.STORAGE_BUCKET_NAME.resource_name, 107 | Key=f"{event[DATASET_TITLE_KEY]}/{item.filename}", 108 | ) 109 | LOGGER.debug( 110 | LOG_MESSAGE_S3_DELETION_RESPONSE, 111 | extra={"response": s3_response, GIT_COMMIT: get_param(ParameterName.GIT_COMMIT)}, 112 | ) 113 | 114 | # Update dataset record with the latest version 115 | datasets_model = datasets_model_with_meta() 116 | dataset = datasets_model.get( 117 | hash_key=f"{DATASET_ID_PREFIX}{event[DATASET_ID_KEY]}", consistent_read=True 118 | ) 119 | dataset.update(actions=[datasets_model.current_dataset_version.set(event[NEW_VERSION_ID_KEY])]) 120 | 121 | return { 122 
| NEW_VERSION_S3_LOCATION: f"{S3_URL_PREFIX}" 123 | f"{Resource.STORAGE_BUCKET_NAME.resource_name}/" 124 | f"{dataset_key}" 125 | } 126 | -------------------------------------------------------------------------------- /infrastructure/application_stack.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import shutil 3 | from os import environ 4 | 5 | import constructs 6 | from aws_cdk import Environment, Stack, aws_iam 7 | 8 | from geostore.environment import environment_name 9 | from infrastructure.constructs.bundled_code import LambdaPackaging 10 | 11 | from .constructs.api import API 12 | from .constructs.lambda_layers import LambdaLayers 13 | from .constructs.lds import LDS 14 | from .constructs.notify import Notify 15 | from .constructs.opentopo import OpenTopography 16 | from .constructs.processing import Processing 17 | from .constructs.staging import Staging 18 | from .constructs.storage import Storage 19 | 20 | 21 | class Application(Stack): 22 | def __init__(self, scope: constructs.Construct, stack_id: str) -> None: 23 | environment = Environment( 24 | account=environ["CDK_DEFAULT_ACCOUNT"], region=environ["CDK_DEFAULT_REGION"] 25 | ) 26 | 27 | super().__init__(scope, stack_id, env=environment) 28 | 29 | env_name = environment_name() 30 | 31 | principal: aws_iam.PrincipalBase 32 | if saml_provider_arn := environ.get("GEOSTORE_SAML_IDENTITY_PROVIDER_ARN"): 33 | principal = aws_iam.FederatedPrincipal( 34 | federated=saml_provider_arn, 35 | assume_role_action="sts:AssumeRoleWithSAML", 36 | conditions={"StringEquals": {"SAML:aud": "https://signin.aws.amazon.com/saml"}}, 37 | ) 38 | else: 39 | open_id_connect_provider_arn = ( 40 | f"arn:aws:iam::" 41 | f"{aws_iam.AccountRootPrincipal().account_id}" 42 | f":oidc-provider/token.actions.githubusercontent.com" 43 | ) 44 | 45 | principal = aws_iam.CompositePrincipal( 46 | aws_iam.AccountPrincipal(account_id=aws_iam.AccountRootPrincipal().account_id), 47 | aws_iam.WebIdentityPrincipal( 48 | identity_provider=open_id_connect_provider_arn, 49 | conditions={ 50 | "StringLike": { 51 | "token.actions.githubusercontent.com:aud": ["sts.amazonaws.com"], 52 | "token.actions.githubusercontent.com:sub": ["repo:linz/geostore:*"], 53 | } 54 | }, 55 | ), 56 | ) 57 | 58 | storage = Storage(self, "storage", env_name=env_name) 59 | 60 | lambda_layers = LambdaLayers(self, "lambda-layers", env_name=env_name) 61 | 62 | processing = Processing( 63 | self, 64 | "processing", 65 | botocore_lambda_layer=lambda_layers.botocore, 66 | env_name=env_name, 67 | principal=principal, 68 | s3_role_arn_parameter=storage.s3_role_arn_parameter, 69 | storage_bucket=storage.storage_bucket, 70 | validation_results_table=storage.validation_results_table, 71 | datasets_table=storage.datasets_table, 72 | git_commit_parameter=storage.git_commit_parameter, 73 | ) 74 | Staging(self, "staging", users_role=processing.staging_users_role) 75 | 76 | API( 77 | self, 78 | "api", 79 | botocore_lambda_layer=lambda_layers.botocore, 80 | datasets_table=storage.datasets_table, 81 | env_name=env_name, 82 | processing_assets_table=processing.processing_assets_table, 83 | state_machine=processing.state_machine, 84 | state_machine_parameter=processing.state_machine_parameter, 85 | sqs_queue=processing.message_queue, 86 | sqs_queue_parameter=processing.message_queue_name_parameter, 87 | storage_bucket=storage.storage_bucket, 88 | validation_results_table=storage.validation_results_table, 89 | git_commit_parameter=storage.git_commit_parameter, 90 
| ) 91 | 92 | Notify( 93 | self, 94 | "notify", 95 | botocore_lambda_layer=lambda_layers.botocore, 96 | env_name=env_name, 97 | state_machine=processing.state_machine, 98 | validation_results_table=storage.validation_results_table, 99 | git_commit_parameter=storage.git_commit_parameter, 100 | ) 101 | 102 | if self.node.try_get_context("enableLDSAccess"): 103 | LDS(self, "lds", env_name=env_name, storage_bucket=storage.storage_bucket) 104 | 105 | if self.node.try_get_context("enableOpenTopographyAccess"): 106 | OpenTopography( 107 | self, "opentopography", env_name=env_name, storage_bucket=storage.storage_bucket 108 | ) 109 | 110 | # Remove temp lambda packaging directory at exit to purge pip packages 111 | # Reusing pip packages would speed things up, but also makes things 112 | # harder to troubleshoot when there is a change in one of the Python packages 113 | atexit.register(lambda: shutil.rmtree(LambdaPackaging.directory)) 114 | -------------------------------------------------------------------------------- /infrastructure/constructs/api.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import ( 2 | Tags, 3 | aws_iam, 4 | aws_lambda_python_alpha, 5 | aws_s3, 6 | aws_sqs, 7 | aws_ssm, 8 | aws_stepfunctions, 9 | ) 10 | from constructs import Construct 11 | 12 | from geostore.resources import Resource 13 | 14 | from .common import grant_parameter_read_access 15 | from .lambda_endpoint import LambdaEndpoint 16 | from .roles import LINZ_ORGANIZATION_ID, MAX_SESSION_DURATION 17 | from .s3_policy import ALLOW_DESCRIBE_ANY_S3_JOB 18 | from .table import Table 19 | 20 | 21 | class API(Construct): 22 | def __init__( # pylint: disable=too-many-locals 23 | self, 24 | scope: Construct, 25 | stack_id: str, 26 | *, 27 | botocore_lambda_layer: aws_lambda_python_alpha.PythonLayerVersion, 28 | datasets_table: Table, 29 | env_name: str, 30 | processing_assets_table: Table, 31 | state_machine: aws_stepfunctions.StateMachine, 32 | state_machine_parameter: aws_ssm.StringParameter, 33 | sqs_queue: aws_sqs.Queue, 34 | sqs_queue_parameter: aws_ssm.StringParameter, 35 | storage_bucket: aws_s3.Bucket, 36 | validation_results_table: Table, 37 | git_commit_parameter: aws_ssm.StringParameter, 38 | ) -> None: 39 | super().__init__(scope, stack_id) 40 | 41 | ############################################################################################ 42 | # ### API ENDPOINTS ######################################################################## 43 | ############################################################################################ 44 | 45 | api_users_role = aws_iam.Role( 46 | self, 47 | "api-users-role", 48 | role_name=Resource.API_USERS_ROLE_NAME.resource_name, 49 | assumed_by=aws_iam.OrganizationPrincipal(LINZ_ORGANIZATION_ID), 50 | max_session_duration=MAX_SESSION_DURATION, 51 | ) 52 | 53 | datasets_endpoint_lambda = LambdaEndpoint( 54 | self, 55 | Resource.DATASETS_ENDPOINT_FUNCTION_NAME.resource_name, 56 | package_name="datasets", 57 | env_name=env_name, 58 | users_role=api_users_role, 59 | botocore_lambda_layer=botocore_lambda_layer, 60 | ) 61 | 62 | dataset_versions_endpoint_lambda = LambdaEndpoint( 63 | self, 64 | Resource.DATASET_VERSIONS_ENDPOINT_FUNCTION_NAME.resource_name, 65 | package_name="dataset_versions", 66 | env_name=env_name, 67 | users_role=api_users_role, 68 | botocore_lambda_layer=botocore_lambda_layer, 69 | ) 70 | processing_assets_table.grant_read_write_data(dataset_versions_endpoint_lambda) 71 | 
processing_assets_table.grant(dataset_versions_endpoint_lambda, "dynamodb:DescribeTable") 72 | 73 | state_machine.grant_start_execution(dataset_versions_endpoint_lambda) 74 | 75 | storage_bucket.grant_read_write(datasets_endpoint_lambda) 76 | 77 | sqs_queue.grant_send_messages(datasets_endpoint_lambda) 78 | 79 | for function in [datasets_endpoint_lambda, dataset_versions_endpoint_lambda]: 80 | datasets_table.grant_read_write_data(function) 81 | datasets_table.grant(function, "dynamodb:DescribeTable") # required by pynamodb 82 | 83 | import_status_endpoint_lambda = LambdaEndpoint( 84 | self, 85 | Resource.IMPORT_STATUS_ENDPOINT_FUNCTION_NAME.resource_name, 86 | package_name="import_status", 87 | env_name=env_name, 88 | users_role=api_users_role, 89 | botocore_lambda_layer=botocore_lambda_layer, 90 | ) 91 | 92 | validation_results_table.grant_read_data(import_status_endpoint_lambda) 93 | validation_results_table.grant( 94 | import_status_endpoint_lambda, "dynamodb:DescribeTable" 95 | ) # required by pynamodb 96 | 97 | state_machine.grant_read(import_status_endpoint_lambda) 98 | import_status_endpoint_lambda.add_to_role_policy(ALLOW_DESCRIBE_ANY_S3_JOB) 99 | 100 | grant_parameter_read_access( 101 | { 102 | datasets_table.name_parameter: [ 103 | datasets_endpoint_lambda, 104 | dataset_versions_endpoint_lambda, 105 | ], 106 | processing_assets_table.name_parameter: [dataset_versions_endpoint_lambda], 107 | validation_results_table.name_parameter: [import_status_endpoint_lambda], 108 | state_machine_parameter: [dataset_versions_endpoint_lambda], 109 | sqs_queue_parameter: [datasets_endpoint_lambda], 110 | git_commit_parameter: [ 111 | datasets_endpoint_lambda, 112 | dataset_versions_endpoint_lambda, 113 | import_status_endpoint_lambda, 114 | ], 115 | } 116 | ) 117 | 118 | Tags.of(self).add("ApplicationLayer", "api") 119 | -------------------------------------------------------------------------------- /infrastructure/constructs/storage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Geostore AWS resources definitions. 
3 | """ 4 | from aws_cdk import Tags, aws_dynamodb, aws_iam, aws_s3, aws_ssm 5 | from constructs import Construct 6 | 7 | from geostore.datasets_model import DatasetsTitleIdx 8 | from geostore.parameter_store import ParameterName 9 | from geostore.resources import Resource 10 | from geostore.validation_results_model import ValidationOutcomeIdx 11 | 12 | from .removal_policy import REMOVAL_POLICY 13 | from .roles import LINZ_ORGANIZATION_ID, MAX_SESSION_DURATION 14 | from .table import Table 15 | from .version import GIT_BRANCH, GIT_COMMIT, GIT_TAG 16 | 17 | 18 | class Storage(Construct): 19 | def __init__(self, scope: Construct, stack_id: str, *, env_name: str) -> None: 20 | super().__init__(scope, stack_id) 21 | 22 | ############################################################################################ 23 | # ### DEPLOYMENT VERSION ################################################################### 24 | ############################################################################################ 25 | 26 | aws_ssm.StringParameter( 27 | self, 28 | "git-branch", 29 | parameter_name=f"/{env_name}/git_branch", 30 | string_value=GIT_BRANCH, 31 | description="Deployment git branch", 32 | ) 33 | 34 | self.git_commit_parameter = aws_ssm.StringParameter( 35 | self, 36 | "git-commit", 37 | parameter_name=f"/{env_name}/git_commit", 38 | string_value=GIT_COMMIT, 39 | description="Deployment git commit", 40 | ) 41 | 42 | aws_ssm.StringParameter( 43 | self, 44 | "git-tag", 45 | parameter_name=f"/{env_name}/version", 46 | string_value=GIT_TAG, 47 | description="Deployment version", 48 | ) 49 | 50 | ############################################################################################ 51 | # ### STORAGE S3 BUCKET #################################################################### 52 | ############################################################################################ 53 | self.storage_bucket = aws_s3.Bucket( 54 | self, 55 | "storage-bucket", 56 | bucket_name=Resource.STORAGE_BUCKET_NAME.resource_name, 57 | access_control=aws_s3.BucketAccessControl.PRIVATE, 58 | block_public_access=aws_s3.BlockPublicAccess.BLOCK_ALL, 59 | versioned=True, 60 | removal_policy=REMOVAL_POLICY, 61 | enforce_ssl=True, 62 | ) 63 | 64 | s3_users_role = aws_iam.Role( 65 | self, 66 | "s3-users-role", 67 | role_name=Resource.S3_USERS_ROLE_NAME.resource_name, 68 | assumed_by=aws_iam.OrganizationPrincipal(LINZ_ORGANIZATION_ID), 69 | max_session_duration=MAX_SESSION_DURATION, 70 | ) 71 | self.storage_bucket.grant_read(s3_users_role) 72 | 73 | self.s3_role_arn_parameter = aws_ssm.StringParameter( 74 | self, 75 | "s3-users-role-arn", 76 | string_value=s3_users_role.role_arn, 77 | parameter_name=ParameterName.S3_USERS_ROLE_ARN.value, 78 | ) 79 | 80 | ############################################################################################ 81 | # ### APPLICATION DB ####################################################################### 82 | ############################################################################################ 83 | self.datasets_table = Table( 84 | self, 85 | f"{env_name}-datasets", 86 | env_name=env_name, 87 | parameter_name=ParameterName.STORAGE_DATASETS_TABLE_NAME, 88 | ) 89 | 90 | self.datasets_table.add_global_secondary_index( 91 | index_name=DatasetsTitleIdx.Meta.index_name, 92 | partition_key=aws_dynamodb.Attribute( 93 | name="title", type=aws_dynamodb.AttributeType.STRING 94 | ), 95 | ) 96 | 97 | self.validation_results_table = Table( 98 | self, 99 | f"{env_name}-validation-results", 100 
| env_name=env_name, 101 | parameter_name=ParameterName.STORAGE_VALIDATION_RESULTS_TABLE_NAME, 102 | sort_key=aws_dynamodb.Attribute(name="sk", type=aws_dynamodb.AttributeType.STRING), 103 | ) 104 | 105 | self.validation_results_table.add_global_secondary_index( 106 | index_name=ValidationOutcomeIdx.Meta.index_name, 107 | partition_key=aws_dynamodb.Attribute( 108 | name=ValidationOutcomeIdx.pk.attr_name, type=aws_dynamodb.AttributeType.STRING 109 | ), 110 | sort_key=aws_dynamodb.Attribute( 111 | name=ValidationOutcomeIdx.result.attr_name, type=aws_dynamodb.AttributeType.STRING 112 | ), 113 | ) 114 | 115 | Tags.of(self).add("ApplicationLayer", "storage") 116 | --------------------------------------------------------------------------------
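
The following is an illustrative sketch, not a file in this repository, showing how the multipart ETag format produced by calculate_s3_etag in geostore/s3_utils.py above can be reproduced by hand. It assumes the same 8 MiB chunk size that the function uses internally; everything else is standard library.

import hashlib

from geostore.s3_utils import calculate_s3_etag

chunk_size = 8_388_608  # mirrors s3_default_chunk_size inside calculate_s3_etag

# A body spanning two chunks gets a "-2" suffix: the MD5 of the concatenated
# per-chunk MD5 digests, followed by the number of chunks.
body = b"x" * (chunk_size + 1)
first = hashlib.md5(body[:chunk_size], usedforsecurity=False).digest()
second = hashlib.md5(body[chunk_size:], usedforsecurity=False).digest()
combined = hashlib.md5(first + second, usedforsecurity=False).hexdigest()
assert calculate_s3_etag(body) == f'"{combined}-2"'

# A body smaller than one chunk is the plain quoted MD5 digest, with no suffix.
small = b"some bytes"
assert calculate_s3_etag(small) == f'"{hashlib.md5(small, usedforsecurity=False).hexdigest()}"'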
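
A second illustrative sketch, also not a file in this repository: one way a consumer could subscribe to the step function status topic created by the Notify construct, reading the topic ARN from the SSM parameter the construct publishes. The SQS queue ARN is a hypothetical placeholder, and the call assumes credentials for the deployed environment; the topic's resource policy permits sns:Subscribe for any principal, but an SQS endpoint would still need its own queue policy.

import boto3

from geostore.parameter_store import ParameterName, get_param

# Look up the topic ARN stored by the Notify construct, then attach a queue to it.
topic_arn = get_param(ParameterName.STATUS_SNS_TOPIC_ARN)
boto3.client("sns").subscribe(
    TopicArn=topic_arn,
    Protocol="sqs",
    Endpoint="arn:aws:sqs:ap-southeast-2:123456789012:example-status-queue",  # placeholder
)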