├── .codacy.yml ├── .codecov.yml ├── .coveragerc ├── .flake8 ├── .gitignore ├── .gitlab-ci-prod.yml ├── .gitlab-ci.yml ├── .gitmodules ├── .travis.yml ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE.md ├── LICENSE ├── Makefile ├── PULL_REQUEST_TEMPLATE.md ├── README.md ├── allspark.Dockerfile ├── chalice ├── .chalice │ └── config.json ├── .gitignore ├── Makefile ├── app.py ├── build_deploy_config.sh └── chalicelib │ ├── .keep │ └── index.html ├── codecov.yml ├── common.mk ├── daemons ├── .gitignore ├── Makefile ├── build_deploy_config.sh ├── build_tf_deploy_config.py ├── deploy_aws_elasticsearch.sh ├── dss-checkout-sfn │ ├── .chalice │ │ └── config.json │ ├── app.py │ └── domovoilib │ │ └── .keep ├── dss-dlq-reaper │ ├── .chalice │ │ └── config.json │ ├── README.md │ ├── app.py │ └── domovoilib │ │ └── .keep ├── dss-events-scribe │ ├── .chalice │ │ └── config.json │ ├── Readme.md │ ├── app.py │ └── domovoilib │ │ └── .keep ├── dss-gs-copy-sfn │ ├── .chalice │ │ └── config.json │ ├── app.py │ └── domovoilib │ │ └── .keep ├── dss-gs-copy-write-metadata-sfn │ ├── .chalice │ │ └── config.json │ ├── app.py │ └── domovoilib │ │ └── .keep ├── dss-gs-event-relay │ ├── Readme.md │ ├── main.py │ └── requirements.txt ├── dss-index │ ├── .chalice │ │ └── config.json │ ├── app.py │ └── domovoilib │ │ └── .keep ├── dss-notify-v2 │ ├── .chalice │ │ └── config.json │ ├── Readme.md │ ├── app.py │ └── domovoilib │ │ └── .keep ├── dss-notify │ ├── .chalice │ │ └── config.json │ ├── app.py │ └── domovoilib │ │ └── .keep ├── dss-operations │ ├── .chalice │ │ └── config.json │ ├── app.py │ └── domovoilib │ │ └── .keep ├── dss-s3-copy-sfn │ ├── .chalice │ │ └── config.json │ ├── app.py │ └── domovoilib │ │ └── .keep ├── dss-s3-copy-write-metadata-sfn │ ├── .chalice │ │ └── config.json │ ├── app.py │ └── domovoilib │ │ └── .keep ├── dss-scalability-test │ ├── .chalice │ │ └── config.json │ ├── README.md │ ├── app.py │ └── domovoilib │ │ └── json_generator │ │ └── __init__.py ├── dss-sfn-launcher │ ├── .chalice │ │ └── config.json │ ├── app.py │ └── domovoilib │ │ └── .keep ├── dss-sync-sfn │ ├── .chalice │ │ └── config.json │ ├── .gitignore │ ├── Readme.md │ ├── app.py │ ├── domovoilib │ │ └── .keep │ ├── dss-sync-sfn-workflow-graph.svg │ └── dss-sync-sfn.png ├── invoke_lambda.sh └── package_daemon.sh ├── docs ├── environment │ └── README.md └── resource_def.md ├── dss-api ├── dss-api.yml ├── dss ├── __init__.py ├── api │ ├── __init__.py │ ├── bundles │ │ ├── __init__.py │ │ └── checkout.py │ ├── collections.py │ ├── events.py │ ├── files.py │ ├── health.py │ ├── search.py │ ├── subscriptions.py │ ├── subscriptions_v1.py │ └── subscriptions_v2.py ├── collections │ ├── __init__.py │ └── owner_lookup.py ├── config.py ├── dynamodb │ └── __init__.py ├── error.py ├── events │ ├── __init__.py │ └── handlers │ │ ├── __init__.py │ │ ├── notify_v2.py │ │ └── sync.py ├── index │ ├── __init__.py │ ├── backend.py │ ├── bundle.py │ ├── es │ │ ├── __init__.py │ │ ├── backend.py │ │ ├── document.py │ │ ├── manager.py │ │ ├── mapping.json │ │ ├── schemainfo.py │ │ └── validator.py │ └── indexer.py ├── logging.py ├── notify │ ├── README.md │ ├── __init__.py │ ├── attachment.py │ ├── notification.py │ └── notifier.py ├── operations │ ├── __init__.py │ ├── checkout.py │ ├── elasticsearch.py │ ├── events.py │ ├── iam.py │ ├── lambda_params.py │ ├── secrets.py │ ├── stepfunctions.py │ ├── storage.py │ ├── sync.py │ └── util.py ├── stepfunctions │ ├── __init__.py │ ├── checkout │ │ ├── checkout_states.py │ │ └── constants.py │ ├── gscopyclient 
│ │ ├── __init__.py │ │ └── implementation.py │ ├── lambdaexecutor │ │ └── __init__.py │ └── s3copyclient │ │ ├── __init__.py │ │ └── implementation.py ├── storage │ ├── __init__.py │ ├── blobstore.py │ ├── bundles.py │ ├── checkout │ │ ├── __init__.py │ │ ├── bundle.py │ │ ├── cache_flow.py │ │ ├── common.py │ │ ├── error.py │ │ └── file.py │ ├── files.py │ ├── hcablobstore │ │ ├── __init__.py │ │ ├── gs.py │ │ └── s3.py │ └── identifiers.py ├── subscriptions_v2 │ └── __init__.py ├── test.log ├── util │ ├── __init__.py │ ├── async_state.py │ ├── aws │ │ ├── __init__.py │ │ ├── _boto3_loader.py │ │ ├── clients.py │ │ ├── cloudwatch_logging.py │ │ └── resources.py │ ├── email.py │ ├── iterators.py │ ├── json_gen │ │ ├── __init__.py │ │ ├── generator.py │ │ └── hca_generator.py │ ├── networking.py │ ├── parallel_worker.py │ ├── retry.py │ ├── s3urlcache.py │ ├── security.py │ ├── streaming.py │ ├── time.py │ ├── tracing.py │ ├── types.py │ └── version.py └── vendored │ ├── README.md │ ├── __init__.py │ └── frozendict │ └── __init__.py ├── environment ├── environment.integration ├── environment.local.example ├── environment.prod ├── environment.staging ├── iam └── policy-templates │ ├── ci-cd.json │ ├── dss-checkout-sfn-lambda.json │ ├── dss-dlq-reaper-lambda.json │ ├── dss-events-scribe-lambda.json │ ├── dss-gs-copy-sfn-lambda.json │ ├── dss-gs-copy-write-metadata-sfn-lambda.json │ ├── dss-index-lambda.json │ ├── dss-lambda.json │ ├── dss-notify-lambda.json │ ├── dss-notify-v2-lambda.json │ ├── dss-operations-lambda.json │ ├── dss-s3-copy-sfn-lambda.json │ ├── dss-s3-copy-write-metadata-sfn-lambda.json │ ├── dss-scalability-test-lambda.json │ ├── dss-sfn-launcher-lambda.json │ ├── dss-sync-sfn-lambda.json │ └── scheduled-ci-build-lambda.json ├── infra ├── Makefile ├── async_state_db │ └── main.tf ├── buckets │ ├── gs.tf │ └── s3.tf ├── build_deploy_config.py ├── collections_db │ └── main.tf ├── domain │ └── main.tf ├── dss-events-scribe │ └── main.tf ├── elasticsearch │ ├── access_ips.tf │ └── main.tf ├── gcp_service_account │ └── main.tf └── subscription_v2_db │ └── main.tf ├── requirements-dev.txt ├── requirements-dev.txt.in ├── requirements.txt ├── requirements.txt.in ├── roles.json ├── scripts ├── authorize_aws_deploy.sh ├── check_deployment_secrets.py ├── check_env.py ├── check_fusillade.py ├── create_config_aws_event_relay_user.py ├── deploy_gcf.py ├── deploy_scale_dashboard.py ├── deploy_scale_tables.py ├── dss-ops.py ├── dss-start-stop--think-before-you-invoke-me.py ├── envhook.py ├── find_missing_wheels.py ├── generate_upload_requirements_layer.sh ├── kibana-proxy.py ├── release.sh ├── set_apigateway_base_path_mapping.py ├── set_version.sh ├── smugglers-box │ ├── README.md │ ├── links_inspection.py │ └── requirements.txt ├── status.py ├── swagger_auth.py ├── tombstone_bundles.py └── update_collection_db.py ├── tests ├── README.md ├── __init__.py ├── daemons │ ├── a47b90b2-0967-4fbf-87bc-c6c12db3fedf.2017-07-12T055120.037644Z │ └── sample_s3_bundle_created_event.json.template ├── es │ └── __init__.py ├── fixtures │ ├── __init__.py │ ├── cloud_uploader.py │ ├── datafiles │ │ ├── 011c7340-9b3c-4d62-bf49-090d79daf198.2017-06-20T214506.766634Z │ │ ├── 7f8c686d-a439-4376-b367-ac93fc28df43.2019-02-21T184000.899031Z │ │ ├── 9cdc9050cecf59381fed55a2433140b69596fc861bee55abeafd1f9150f3e2da │ │ ├── ce55fd51-7833-469b-be0b-5da88ebebfcd.2017-06-16T193604.240704Z │ │ ├── ce55fd51-7833-469b-be0b-5da88ebebfcd.2017-06-18T075702.020366Z │ │ ├── 
cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30 │ │ ├── empty │ │ ├── example_bundle │ │ │ ├── cell_suspension_0.json │ │ │ ├── dissociation_protocol_0.json │ │ │ ├── donor_organism_0.json │ │ │ ├── enrichment_protocol_0.json │ │ │ ├── library_preparation_protocol_0.json │ │ │ ├── links.json │ │ │ ├── process_0.json │ │ │ ├── process_1.json │ │ │ ├── process_2.json │ │ │ ├── project_0.json │ │ │ ├── sequence_file_0.json │ │ │ ├── sequence_file_1.json │ │ │ ├── sequencing_protocol_0.json │ │ │ └── specimen_from_organism_0.json │ │ ├── indexing │ │ │ ├── text_data_file1.txt │ │ │ ├── text_data_file2.txt │ │ │ └── unparseable_json.json │ │ └── tombstones │ │ │ ├── deadbeef-0000-4a6b-8f0d-a7d2105c23be.2017-12-05T235728.441373Z │ │ │ ├── deadbeef-0000-4a6b-8f0d-a7d2105c23be.2017-12-05T235850.950361Z │ │ │ ├── deadbeef-0000-4a6b-8f0d-a7d2105c23be.dead │ │ │ ├── deadbeef-0001-4a6b-8f0d-a7d2105c23be.2017-12-05T235528.321679Z │ │ │ ├── deadbeef-0001-4a6b-8f0d-a7d2105c23be.2017-12-05T235728.441373Z │ │ │ ├── deadbeef-0001-4a6b-8f0d-a7d2105c23be.2017-12-05T235850.950361Z │ │ │ └── deadbeef-0001-4a6b-8f0d-a7d2105c23be.2017-12-05T235850.950361Z.dead │ ├── populate.py │ └── populate_lib.py ├── hcablobstore_base.py ├── infra │ ├── __init__.py │ ├── assert_mixin.py │ ├── auth_tests_mixin.py │ ├── base_smoketest.py │ ├── elasticsearch_test_case.py │ ├── mock_storage_handler.py │ ├── server.py │ ├── storage_mixin.py │ ├── testmode.py │ └── upload_mixin.py ├── sample_doc_tombstone.json ├── sample_search_queries.py ├── sample_v0_index_doc_tombstone.json ├── sample_vx_index_doc.json ├── sample_vx_index_doc_tombstone.json ├── scalability │ ├── __init__.py │ ├── scale_test_runner.py │ └── sns │ │ └── __init__.py ├── test_api.py ├── test_async_state.py ├── test_bundle.py ├── test_checkout.py ├── test_checkout_caching.py ├── test_cloudwatch_logging.py ├── test_collections.py ├── test_config.py ├── test_doctests.py ├── test_email.py ├── test_events.py ├── test_exptime.py ├── test_file.py ├── test_gscopyclient.py ├── test_gshcablobstore.py ├── test_hcagenerator.py ├── test_headers.py ├── test_health.py ├── test_indexer.py ├── test_indexer_daemon.py ├── test_jsongenerator.py ├── test_lambdaexecutor.py ├── test_modify_swagger_auth.py ├── test_notifier.py ├── test_notify_v2.py ├── test_object_identifier.py ├── test_operations.py ├── test_parallel_worker.py ├── test_prod_smoketest.py ├── test_s3hcablobstore.py ├── test_s3parallelcopy.py ├── test_s3urlcache.py ├── test_search.py ├── test_server.py ├── test_smoketest.py ├── test_standalone_script.py ├── test_storage.py ├── test_subscriptions.py ├── test_sync.py └── test_utils.py └── vendor.in └── .keep /.codacy.yml: -------------------------------------------------------------------------------- 1 | --- 2 | engines: 3 | pylint: 4 | enabled: true 5 | python_version: 3 -------------------------------------------------------------------------------- /.codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | notify: 3 | after_n_builds: 5 -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | # Regexes for lines to exclude from consideration 3 | exclude_lines = 4 | # 23 is ASCII for pound sign 5 | \x23 no coverage 6 | raise AssertionError 7 | raise NotImplementedError 8 | -------------------------------------------------------------------------------- /.flake8: 
-------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=120 3 | ignore: E301, E302, E305, E401, F401, W503, W504, W605, E252 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Reminder: 2 | # - A leading slash means the pattern is anchored at the root. 3 | # - No leading slash means the pattern matches at any depth. 4 | 5 | # Python files 6 | *.pyc 7 | __pycache__/ 8 | .mypy_cache/ 9 | .tox/ 10 | *.egg-info/ 11 | /build/ 12 | /dist/ 13 | /.eggs/ 14 | 15 | # Sphinx documentation 16 | /docs/_build/ 17 | 18 | # IDE project files 19 | /.pydevproject 20 | 21 | # vim python-mode plugin 22 | /.ropeproject 23 | 24 | # vsCode IDE files 25 | /.vscode 26 | 27 | # IntelliJ IDEA / PyCharm project files 28 | /.idea 29 | /*.iml 30 | 31 | # JS/node/npm/web dev files 32 | node_modules 33 | npm-debug.log 34 | 35 | # OS X metadata files 36 | .DS_Store 37 | 38 | # Python coverage 39 | .coverage 40 | .coverage.* 41 | 42 | # Credential files 43 | /gcp-credentials.json 44 | /application_secrets.json 45 | 46 | # Temporary folder 47 | /tmp 48 | 49 | # Local environment configuration 50 | /environment.local 51 | /environment.*.local 52 | 53 | # virtualenv folder 54 | venv 55 | v3nv 56 | 57 | # Terraform files 58 | .terraform 59 | backend.tf 60 | providers.tf 61 | variables.tf 62 | 63 | /smoketest-*.tmp 64 | -------------------------------------------------------------------------------- /.gitlab-ci-prod.yml: -------------------------------------------------------------------------------- 1 | image: humancellatlas/dss-build-box 2 | # The Docker image `humancellatlas/dss-build-box` is created through a manual process from 3 | # `${DSS_HOME}/allspark.Dockerfile`. See the contents of `${DSS_HOME}/allspark.Dockerfile` 4 | # creation and usage instructions. 5 | 6 | variables: 7 | GIT_SUBMODULE_STRATEGY: normal 8 | DSS_ES_TIMEOUT: 30 9 | DSS_UNITTEST_OPTS: "-v" 10 | GITHUB_API: "https://api.github.com" 11 | 12 | stages: 13 | - deploy 14 | - fusillade 15 | - test 16 | 17 | before_script: 18 | - date && date -u 19 | # TODO: figure out how to get the gitlab-runner to not clone the repo as root - Brian H 20 | - cp -r /HumanCellAtlas/data-store ~/data-store && cd ~/data-store 21 | - git reset --hard HEAD 22 | - virtualenv ~/venv 23 | - source ~/venv/bin/activate 24 | - pip install -r requirements-dev.txt 25 | - source environment 26 | - source environment.prod 27 | - scripts/dss-ops.py secrets get application_secrets.json > application_secrets.json 28 | - scripts/dss-ops.py secrets get gcp-credentials.json > gcp-credentials.json 29 | - export GOOGLE_APPLICATION_CREDENTIALS=$(pwd -P)/gcp-credentials.json 30 | 31 | setup_fusillade: 32 | stage: fusillade 33 | script: 34 | - git clone -b master https://github.com/HumanCellAtlas/dcp-fusillade.git 35 | - cd dcp-fusillade 36 | # currently, there is no environment.prod in the dcp-fusillade repo 37 | - source environment # && source environment.prod 38 | - cd .. 
39 | - source environment 40 | - if [[ -f "environment.$CI_COMMIT_REF_NAME" ]]; then 41 | - source environment.$CI_COMMIT_REF_NAME 42 | - fi 43 | - cat ./roles.json | sed "s/\${stage}/${DSS_DEPLOYMENT_STAGE}/g" > temp-roles.json 44 | - FUS_STAGE=$DSS_DEPLOYMENT_STAGE 45 | - python -m json.tool ./temp-roles.json > /dev/null || exit 1 46 | - dcp-fusillade/scripts/setup_fusillade.py --file temp-roles.json --force $FUS_STAGE 47 | - scripts/check_fusillade.py $FUS_STAGE 48 | except: 49 | - schedules 50 | only: 51 | - prod 52 | 53 | deploy: 54 | stage: deploy 55 | script: 56 | - make plan-infra 57 | - make deploy 58 | - scripts/set_version.sh 59 | environment: 60 | name: prod 61 | url: https://dss.data.humancellatlas.org 62 | only: 63 | - prod 64 | when: manual 65 | 66 | test: 67 | stage: test 68 | script: 69 | - make smoketest-prod 70 | dependencies: 71 | - deploy 72 | only: 73 | - prod 74 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/.gitmodules -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | cache: 4 | pip: true 5 | directories: 6 | - chalice/.chalice/venv 7 | - daemons/dss-sync/.chalice/venv 8 | - daemons/dss-index/.chalice/venv 9 | 10 | python: 3.6 11 | 12 | dist: trusty 13 | 14 | addons: 15 | apt: 16 | packages: 17 | - jq 18 | - moreutils 19 | - gettext 20 | 21 | install: 22 | - pip install -r requirements-dev.txt 23 | - wget -q ${ES_DOWNLOAD_URL} 24 | - tar -xzf elasticsearch-${ES_VERSION}.tar.gz 25 | - wget -q http://us-east-1.ec2.archive.ubuntu.com/ubuntu/pool/main/m/make-dfsg/make_4.1-6_amd64.deb 26 | - mkdir make4 27 | - dpkg -x make*.deb make4 28 | - export PATH=$(pwd)/make4/usr/bin:$PATH 29 | 30 | before_script: 31 | - export -n _JAVA_OPTIONS # https://github.com/travis-ci/travis-ci/issues/8408 32 | - source environment 33 | 34 | script: 35 | - python3 scripts/swagger_auth.py --travis 36 | - make $MAKE_ARGS 37 | 38 | after_success: 39 | - if [[ "$TRAVIS_EVENT_TYPE" == "push" ]]; then bash <(curl -s https://codecov.io/bash); fi 40 | 41 | if: tag IS blank # don't build tags 42 | 43 | stages: 44 | - name: test 45 | if: env(TRAVIS_DSS_INTEGRATION_MODE) != 1 46 | - name: integration_test 47 | if: env(TRAVIS_DSS_INTEGRATION_MODE) = 1 48 | 49 | jobs: 50 | include: 51 | - stage: test 52 | env: 53 | - MAKE_ARGS="-j4 parallel_test" 54 | - stage: test 55 | env: 56 | - MAKE_ARGS="-j1 tests/test_search.py" 57 | - stage: test 58 | env: 59 | - MAKE_ARGS="-j1 tests/test_indexer.py" 60 | - DSS_UNITTEST_OPTS="-b -v TestAWSIndexer" 61 | - stage: test 62 | env: 63 | - MAKE_ARGS="-j1 tests/test_indexer.py" 64 | - DSS_UNITTEST_OPTS="-b -v TestGCPIndexer" 65 | - stage: test 66 | env: 67 | - MAKE_ARGS="-j1 tests/test_subscriptions.py" 68 | - stage: integration_test 69 | env: 70 | - MAKE_ARGS="-j1 integration_test" 71 | 72 | env: 73 | global: 74 | - ES_VERSION=5.4.2 75 | - ES_DOWNLOAD_URL=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${ES_VERSION}.tar.gz 76 | - DSS_TEST_ES_PATH=./elasticsearch-${ES_VERSION}/bin/elasticsearch 77 | - DSS_ES_TIMEOUT=30 78 | - DSS_UNITTEST_OPTS="-v -b" 79 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: 
-------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Contributions to HCA working group repos are subject to overall HCA 4 | governance and technical guidance. In addition, contributors are 5 | expected to abide by the following guidelines: 6 | 7 | 1. Rough consensus and running code: instead of formal procedures for 8 | agreeing on everything, systems with working prototypes and existing 9 | users are prioritized as platforms for discussion. 10 | 11 | 1. Keep it simple: prioritize scalability and the ability to keep the 12 | project easy to understand and scale over features. Use 13 | well-supported upstream solutions where possible. Provide useful 14 | defaults and don't expose unnecessary configuration options. 15 | 16 | 1. Separation of concerns and layers: (TODO: elaborate) 17 | 18 | 1. Don't break the build: pull requests are expected to pass all 19 | automated CI checks. 20 | 21 | 1. Keep the build simple: Automated CI checks that are fragile or don't 22 | serve a clear agreed upon purpose will be removed. 23 | 24 | 1. All Pull Request comments must be addressed, even after merge. 25 | 26 | 1. All code must be reviewed by at least 1 other team member. 27 | 28 | 1. All pull requests relating to an issue must include "connected to 29 | \#123", where \#123 is the issue number. 30 | 31 | 1. Individual commit messages should clearly express the commit's purpose. 32 | -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 5 | 6 | 12 | 13 | 16 | 17 | 28 | 29 | 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 Human Cell Atlas, https://humancellatlas.org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | 23 | https://github.com/slezica/python-frozendict 24 | 25 | Copyright (c) 2012 Santiago Lezica 26 | 27 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 28 | documentation files (the "Software"), to deal in the Software without restriction, including without limitation the 29 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 30 | permit persons to whom the Software is furnished to do so, subject to the following conditions: 31 | 32 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the 33 | Software. 34 | 35 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 36 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS 37 | OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 38 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 39 | -------------------------------------------------------------------------------- /PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 5 | 6 | 11 | 12 | 18 | 19 | 25 | 26 | 32 | 33 | 37 | -------------------------------------------------------------------------------- /allspark.Dockerfile: -------------------------------------------------------------------------------- 1 | # This is the build image for the DSS, intended for use with the allspark GitLab server 2 | # It may be built and uploaded with the commands: 3 | # `docker login 4 | # `docker build -f allspark.Dockerfile -t {docker_username}/{tag_key}:{tag_value} .` 5 | # `docker push {docker_username}/{tag_key}:{tag_value}` 6 | # For example, 7 | # `docker login 8 | # `docker build -f allspark.Dockerfile -t humancellatlas/dss-build-box .` 9 | # `docker push humancellatlas/dss-build-box` 10 | # 11 | # Now reference the image in .gitlab-ci.yml with the line: 12 | # `image: {docker_username}/{tag_key}:{tag_value} 13 | # 14 | # Please see Docker startup guide for additional info: 15 | # https://docs.docker.com/get-started/ 16 | 17 | FROM ubuntu:18.04 18 | 19 | ENV DEBIAN_FRONTEND noninteractive 20 | 21 | RUN apt-get update --quiet \ 22 | && apt-get install --assume-yes --no-install-recommends \ 23 | ca-certificates \ 24 | build-essential \ 25 | default-jre \ 26 | gettext \ 27 | git \ 28 | httpie \ 29 | jq \ 30 | make \ 31 | moreutils \ 32 | openssl \ 33 | python3-pip \ 34 | python3.6-dev \ 35 | unzip \ 36 | wget \ 37 | xxd \ 38 | zlib1g-dev \ 39 | zip 40 | 41 | RUN apt-get update --quiet 42 | 43 | RUN python3 -m pip install --upgrade pip==10.0.1 44 | RUN python3 -m pip install virtualenv==16.0.0 45 | RUN ln -s /usr/bin/python3.6 /usr/bin/python 46 | RUN ln -s /usr/bin/pip3 /usr/bin/pip 47 | 48 | RUN useradd -d /home/hca_cicd -ms /bin/bash -g root -G sudo hca_cicd 49 | RUN mkdir /HumanCellAtlas && chown hca_cicd /HumanCellAtlas 50 | USER hca_cicd 51 | WORKDIR /home/hca_cicd 52 | 53 | ENV PATH /home/hca_cicd/bin:${PATH} 54 | RUN mkdir -p /home/hca_cicd/bin 55 | 56 | ENV ES_VERSION 5.4.2 57 | ENV DSS_TEST_ES_PATH=/home/hca_cicd/elasticsearch-${ES_VERSION}/bin/elasticsearch 58 | RUN wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${ES_VERSION}.tar.gz \ 59 | && tar -xzf 
elasticsearch-${ES_VERSION}.tar.gz -C /home/hca_cicd 60 | 61 | ENV TF_VERSION 0.12.16 62 | RUN wget https://releases.hashicorp.com/terraform/${TF_VERSION}/terraform_${TF_VERSION}_linux_amd64.zip \ 63 | && unzip terraform_${TF_VERSION}_linux_amd64.zip -d /home/hca_cicd/bin 64 | 65 | # Address locale problem, see "Python 3 Surrogate Handling": 66 | # http://click.pocoo.org/5/python3/ 67 | ENV LANG C.UTF-8 68 | ENV LC_ALL C.UTF-8 69 | -------------------------------------------------------------------------------- /chalice/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "app_name": "dss", 4 | "environment_variables": { 5 | "GOOGLE_APPLICATION_CREDENTIALS": "/var/task/chalicelib/gcp-credentials.json", 6 | "GOOGLE_APPLICATION_SECRETS": "/var/task/chalicelib/application_secrets.json" 7 | }, 8 | "stages": { 9 | "dev": { 10 | "api_gateway_stage": "", 11 | "environment_variables": { 12 | } 13 | } 14 | }, 15 | "lambda_timeout": 300, 16 | "lambda_memory_size": 1280 17 | } 18 | -------------------------------------------------------------------------------- /chalice/.gitignore: -------------------------------------------------------------------------------- 1 | .chalice/deployed.json 2 | .chalice/deployments/ 3 | .chalice/policy.json 4 | .chalice/policy-*.json 5 | .chalice/venv/ 6 | chalicelib 7 | vendor 8 | -------------------------------------------------------------------------------- /chalice/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | deploy: 4 | git clean -df chalicelib vendor 5 | shopt -s nullglob; for wheel in vendor.in/*/*.whl; do unzip -q -o -d vendor $$wheel; done 6 | cp -R ../dss ../dss-api.yml chalicelib 7 | aws secretsmanager get-secret-value --secret-id ${DSS_SECRETS_STORE}/${DSS_DEPLOYMENT_STAGE}/gcp-credentials.json | jq -r .SecretString > chalicelib/gcp-credentials.json 8 | aws secretsmanager get-secret-value --secret-id ${DSS_SECRETS_STORE}/${DSS_DEPLOYMENT_STAGE}/application_secrets.json | jq -r .SecretString > chalicelib/application_secrets.json 9 | ${DSS_HOME}/scripts/generate_upload_requirements_layer.sh 10 | ./build_deploy_config.sh 11 | chalice deploy --no-autogen-policy --stage $(DSS_DEPLOYMENT_STAGE) --api-gateway-stage $(DSS_DEPLOYMENT_STAGE) 12 | -------------------------------------------------------------------------------- /chalice/chalicelib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/chalice/chalicelib/.keep -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | threshold: 2.5% 6 | 7 | comment: off 8 | -------------------------------------------------------------------------------- /common.mk: -------------------------------------------------------------------------------- 1 | SHELL=/bin/bash 2 | 3 | ifndef DSS_HOME 4 | $(error Please run "source environment" in the data-store repo root directory before running make commands) 5 | endif 6 | 7 | ifeq ($(shell which jq),) 8 | $(error Please install jq using "apt-get install jq" or "brew install jq") 9 | endif 10 | 11 | ifeq ($(shell which sponge),) 12 | $(error Please install sponge using "apt-get install moreutils" or "brew install 
moreutils") 13 | endif 14 | 15 | ifeq ($(shell which envsubst),) 16 | $(error Please install envsubst using "apt-get install gettext" or "brew install gettext; brew link gettext") 17 | endif 18 | 19 | ifeq ($(findstring Python 3.6, $(shell python --version 2>&1)),) 20 | $(error Please run make commands from a Python 3.6 virtualenv) 21 | endif 22 | 23 | 24 | ifeq ($(findstring terraform, $(shell which terraform 2>&1)),) 25 | else ifeq ($(findstring Terraform v0.12.16, $(shell terraform --version 2>&1)),) 26 | $(error You must use Terraform v0.12.16, please check your terraform version.) 27 | endif 28 | -------------------------------------------------------------------------------- /daemons/.gitignore: -------------------------------------------------------------------------------- 1 | **/.chalice/deployed.json 2 | **/.chalice/deployments/ 3 | **/.chalice/policy.json 4 | **/.chalice/policy-*.json 5 | **/.chalice/venv/ 6 | **/domovoilib 7 | **/vendor 8 | -------------------------------------------------------------------------------- /daemons/Makefile: -------------------------------------------------------------------------------- 1 | include ../common.mk 2 | 3 | SERIAL_AWS_DAEMONS := \ 4 | dss-sync-sfn \ 5 | dss-notify-v2 \ 6 | dss-operations \ 7 | dss-index 8 | 9 | SERIAL_GCP_DAEMONS := 10 | 11 | PARALLEL_AWS_DAEMONS := \ 12 | dss-checkout-sfn \ 13 | dss-gs-copy-sfn \ 14 | dss-gs-copy-write-metadata-sfn \ 15 | dss-s3-copy-sfn \ 16 | dss-s3-copy-write-metadata-sfn \ 17 | dss-scalability-test \ 18 | dss-dlq-reaper \ 19 | dss-sfn-launcher \ 20 | dss-notify \ 21 | dss-events-scribe 22 | 23 | 24 | PARALLEL_GCP_DAEMONS := \ 25 | dss-gs-event-relay 26 | 27 | deploy: deploy-serial deploy-parallel 28 | deploy-serial: $(SERIAL_AWS_DAEMONS) $(SERIAL_GCP_DAEMONS) 29 | deploy-parallel: $(PARALLEL_AWS_DAEMONS) $(PARALLEL_GCP_DAEMONS) 30 | 31 | generate-dependencies: 32 | ${DSS_HOME}/scripts/generate_upload_requirements_layer.sh 33 | 34 | $(SERIAL_AWS_DAEMONS) $(PARALLEL_AWS_DAEMONS): generate-dependencies 35 | @for f in $@/*.tf; do \ 36 | echo "Terraforming $@"; \ 37 | if [ -e $$f ]; then \ 38 | rm -rf $@/.terraform/*.tfstate;\ 39 | ./build_tf_deploy_config.py $@;\ 40 | (cd $@ ; terraform init);\ 41 | (cd $@ ; TF_VAR_daemon=$@ terraform apply -auto-approve);\ 42 | fi;\ 43 | break;\ 44 | done 45 | ./package_daemon.sh $@ 46 | ./build_deploy_config.sh $@ 47 | cd $@ && domovoi deploy --stage $(DSS_DEPLOYMENT_STAGE) 48 | @if [[ $@ == "dss-sync-sfn" || $@ == "dss-index" || $@ == "dss-notify-v2" ]]; then \ 49 | ./invoke_lambda.sh $@ $(DSS_DEPLOYMENT_STAGE) \ 50 | ../tests/daemons/sample_s3_bundle_created_event.json.template \ 51 | ../tests/daemons/a47b90b2-0967-4fbf-87bc-c6c12db3fedf.2017-07-12T055120.037644Z; \ 52 | fi 53 | @if [[ $@ == "dss-notify" ]]; then \ 54 | cd $@ && python -c 'import app; app.deploy_notifier()' ; \ 55 | fi 56 | 57 | dss-gs-event-relay: 58 | $(DSS_HOME)/scripts/deploy_gcf.py $@ --entry-point "dss_gs_bucket_events_$(subst -,_,$(DSS_GS_BUCKET))" 59 | 60 | import-test: 61 | set -e; \ 62 | for daemon in $(SERIAL_AWS_DAEMONS) $(PARALLEL_AWS_DAEMONS); do \ 63 | ./package_daemon.sh $$daemon; \ 64 | python $$daemon/app.py; \ 65 | done 66 | python dss-gs-event-relay/main.py 67 | 68 | .PHONY: deploy deploy-serial deploy-parallel import-test $(SERIAL_AWS_DAEMONS) $(SERIAL_GCP_DAEMONS) $(PARALLEL_AWS_DAEMONS) $(PARALLEL_GCP_DAEMONS) 69 | -------------------------------------------------------------------------------- /daemons/build_tf_deploy_config.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This script generates Terraform scripting needed for daemons that deploy infrastructure. 4 | """ 5 | 6 | import os 7 | import glob 8 | import json 9 | import boto3 10 | import argparse 11 | 12 | 13 | daemons_root = os.path.abspath(os.path.dirname(__file__)) 14 | 15 | 16 | parser = argparse.ArgumentParser(description=__doc__) 17 | parser.add_argument("daemon") 18 | args = parser.parse_args() 19 | 20 | 21 | env_vars_to_lambda = os.environ['EXPORT_ENV_VARS_TO_LAMBDA'].split() 22 | 23 | 24 | terraform_backend_template = """terraform {{ 25 | backend "s3" {{ 26 | bucket = "{bucket}" 27 | key = "dss-{daemon}-{stage}.tfstate" 28 | region = "{region}" 29 | {profile_setting} 30 | }} 31 | }} 32 | """ 33 | 34 | terraform_providers_template = """ 35 | provider aws {{ 36 | region = "{aws_region}" 37 | }} 38 | """ 39 | 40 | account_id = boto3.client("sts").get_caller_identity()['Account'] 41 | backend_bucket = os.environ['DSS_TERRAFORM_BACKEND_BUCKET_TEMPLATE'].format(account_id=account_id) 42 | 43 | terraform_variable_info = {'variable': dict()} 44 | for key in env_vars_to_lambda: 45 | terraform_variable_info['variable'][key] = { 46 | 'default': os.environ[key] 47 | } 48 | 49 | with open(os.path.join(daemons_root, args.daemon, "backend.tf"), "w") as fp: 50 | if os.environ.get('AWS_PROFILE'): 51 | profile = os.environ['AWS_PROFILE'] 52 | profile_setting = f'profile = "{profile}"' 53 | else: 54 | profile_setting = '' 55 | fp.write(terraform_backend_template.format( 56 | bucket=backend_bucket, 57 | daemon=args.daemon, 58 | stage=os.environ['DSS_DEPLOYMENT_STAGE'], 59 | region=os.environ['AWS_DEFAULT_REGION'], 60 | profile_setting=profile_setting, 61 | )) 62 | 63 | with open(os.path.join(daemons_root, args.daemon, "variables.tf"), "w") as fp: 64 | fp.write(json.dumps(terraform_variable_info, indent=2)) 65 | 66 | with open(os.path.join(daemons_root, args.daemon, "providers.tf"), "w") as fp: 67 | fp.write(terraform_providers_template.format( 68 | aws_region=os.environ['AWS_DEFAULT_REGION'], 69 | )) 70 | -------------------------------------------------------------------------------- /daemons/dss-checkout-sfn/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "app_name": "dss-checkout-sfn", 4 | "environment_variables": { 5 | "GOOGLE_APPLICATION_CREDENTIALS": "/var/task/domovoilib/gcp-credentials.json" 6 | }, 7 | "stages": { 8 | "dev": { 9 | "api_gateway_stage": "dev", 10 | "environment_variables": {} 11 | } 12 | }, 13 | "lambda_timeout": 300, 14 | "lambda_memory_size": 1536 15 | } 16 | -------------------------------------------------------------------------------- /daemons/dss-checkout-sfn/app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 5 | import domovoi 6 | 7 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), 'domovoilib')) # noqa 8 | sys.path.insert(0, pkg_root) # noqa 9 | 10 | import dss 11 | from dss.logging import configure_lambda_logging 12 | from dss.stepfunctions.checkout.checkout_states import state_machine_def 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | configure_lambda_logging() 17 | app = domovoi.Domovoi(configure_logs=False) 18 | 19 | dss.Config.set_config(dss.BucketConfig.NORMAL) 20 | 21 | app.register_state_machine(state_machine_def) 22 | 
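The dss-checkout-sfn handler above is representative of all of the *-sfn daemons: it puts the vendored `domovoilib` directory on `sys.path`, configures Lambda logging, and registers a step-function definition with Domovoi. The object passed to `register_state_machine` is an Amazon States Language document expressed as a Python dict. The snippet below is only a hypothetical two-state sketch with placeholder state names and Lambda ARNs, not the actual checkout definition (that lives in `dss.stepfunctions.checkout.checkout_states`):

```python
# Hypothetical sketch of an Amazon States Language definition as a Python dict.
# State names and resource ARNs are placeholders, not the real checkout workflow.
example_state_machine_def = {
    "Comment": "Pre-check a bundle, then copy it to the checkout bucket",
    "StartAt": "PreExecutionCheck",
    "States": {
        "PreExecutionCheck": {
            "Type": "Task",
            "Resource": "arn:aws:lambda:us-east-1:123456789012:function:placeholder",
            "Next": "CopyFiles",
        },
        "CopyFiles": {
            "Type": "Task",
            "Resource": "arn:aws:lambda:us-east-1:123456789012:function:placeholder",
            "End": True,
        },
    },
}
```

When the daemons Makefile runs `domovoi deploy --stage $(DSS_DEPLOYMENT_STAGE)` for a daemon, the registered definition is deployed for that stage alongside the daemon's Lambda.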
-------------------------------------------------------------------------------- /daemons/dss-checkout-sfn/domovoilib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-checkout-sfn/domovoilib/.keep -------------------------------------------------------------------------------- /daemons/dss-dlq-reaper/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "app_name": "dss-dlq-reaper", 4 | "stages": { 5 | "dev": {} 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /daemons/dss-dlq-reaper/README.md: -------------------------------------------------------------------------------- 1 | # HCA DSS: Dead-letter queue based retry framework 2 | 3 | This daemon is part of the DLQ-based framework for reprocessing failed Lambda invocations. 4 | 5 | #### About the DLQ retry framework 6 | 7 | 1. The retry framework is based on the DLQ support built into AWS Lambda. When a Lambda invocation fails, the message that triggered the invocation is placed into the DLQ. 8 | 2. The daemon is triggered by a CloudWatch cron rule every minute. 9 | 3. The daemon retrieves messages from the SQS queue `dss-dlq-{stage}`. 10 | 4. The SNS messages are resent to the original SNS topic ARN with the original payload. 11 | 5. The SNS message attribute DSS-REAPER-RETRY-COUNT tracks the number of reprocess attempts on the given message. When this exceeds DSS_MAX_RETRY_COUNT (default 10), the daemon gives up and removes the message from the queue without retrying. 12 | 13 | #### Enabling DLQ-based retries for DSS daemon Lambdas 14 | 15 | To enable DLQ-based reprocessing for DSS daemons, each daemon needs to be configured individually: 16 | 17 | - Locate the `config.json` file in the daemon's `.chalice` directory. 18 | - Add the following entry to the `config.json` file: `"dead_letter_queue_target_arn": "",`. 19 | - The entry must be created at the top level of the JSON attribute hierarchy. During deployment, the value is replaced with the ARN of the appropriate SQS queue.
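To make the retry loop concrete, the sketch below shows roughly what the reaper has to do, using only plain boto3 calls. The queue name, the `DSS-REAPER-RETRY-COUNT` attribute, and the `DSS_MAX_RETRY_COUNT` limit come from the description above; the message layout (a dead-lettered invocation whose body still contains the original SNS event) and the overall structure are assumptions, so treat this as an illustration rather than the daemon's actual code, which is in `app.py`:

```python
import json
import os

import boto3

sqs = boto3.resource("sqs")
sns = boto3.client("sns")

MAX_RETRIES = int(os.environ.get("DSS_MAX_RETRY_COUNT", "10"))
queue = sqs.get_queue_by_name(QueueName=f"dss-dlq-{os.environ['DSS_DEPLOYMENT_STAGE']}")


def reap_once():
    """Drain one batch from the DLQ, resending each original SNS message or giving up."""
    for message in queue.receive_messages(MaxNumberOfMessages=10, WaitTimeSeconds=1):
        # Assumption: the dead-lettered body is the original Lambda invocation event,
        # which for an SNS-triggered daemon contains the topic ARN and payload.
        sns_record = json.loads(message.body)["Records"][0]["Sns"]
        attrs = sns_record.get("MessageAttributes") or {}
        retry_count = int(attrs.get("DSS-REAPER-RETRY-COUNT", {}).get("Value", 0)) + 1
        if retry_count <= MAX_RETRIES:
            sns.publish(
                TopicArn=sns_record["TopicArn"],
                Message=sns_record["Message"],
                MessageAttributes={
                    "DSS-REAPER-RETRY-COUNT": {
                        "DataType": "Number",
                        "StringValue": str(retry_count),
                    },
                },
            )
        # Retried or abandoned, the message is removed from the DLQ either way.
        message.delete()
```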
20 | -------------------------------------------------------------------------------- /daemons/dss-dlq-reaper/domovoilib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-dlq-reaper/domovoilib/.keep -------------------------------------------------------------------------------- /daemons/dss-events-scribe/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app_name": "dss-events-scribe", 3 | "version": "2.0", 4 | "environment_variables": { 5 | "GOOGLE_APPLICATION_CREDENTIALS": "/var/task/domovoilib/gcp-credentials.json" 6 | }, 7 | "stages": { 8 | "dev": { 9 | "api_gateway_stage": "dev", 10 | "tags": {}, 11 | "environment_variables": {} 12 | } 13 | }, 14 | "reserved_concurrency": 1, 15 | "lambda_timeout": 600, 16 | "lambda_memory_size": 2560 17 | } 18 | -------------------------------------------------------------------------------- /daemons/dss-events-scribe/Readme.md: -------------------------------------------------------------------------------- 1 | # DSS Event Journaling and Update Daemon 2 | 3 | The dss-events-scribe daemon compiles event data into journals, and applies event updates and deletes to existing 4 | journals. 5 | 6 | DSS events are managed with the [flashflood library](https://github.com/HumanCellAtlas/flash-flood). 7 | 8 | ## Concurrency 9 | 10 | The flashflood event journaling and update API is not concurrency safe, so this daemon uses a synchronous 11 | execution model by setting Lambda reserved concurrency to 1. For more information about Lambda reserved concurrency, 12 | see [AWS documentation](https://docs.aws.amazon.com/lambda/latest/dg/per-function-concurrency.html). 13 | 14 | ## Rate limiting 15 | 16 | Daemon invocation is rate limited similarly to the [token bucket algorithm](https://en.wikipedia.org/wiki/Token_bucket) as follows: 17 | 1) A message is added to a queue every N minutes. 18 | 2) Messages older than `M>N` minutes are discarded from the queue. 19 | 3) The event daemon is invoked non-concurrently on each message until the queue is empty. 20 | If messages cannot be processed quickly enough, the queue will grow to a maximum length of `M/N` items. 21 | 22 | `M` and `N` should be adjusted so the daemon does not constantly operate with a full queue. 23 | 24 | This algorithm is built on top of 25 | [scheduled AWS CloudWatch rules](https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/Create-CloudWatch-Events-Scheduled-Rule.html) 26 | and [AWS SQS queues](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/welcome.html). 27 | 28 | ## Configuration 29 | 30 | A scheduled CloudWatch rule is configured to send one message per replica to the dss-events-scribe SQS queue. The 31 | dss-events-scribe lambda is configured to process messages from the queue in batches of size 1. 32 | 33 | - The scheduled CloudWatch rule and SQS queue are configured in [infra/dss-events-scribe/main.tf](../../infra/dss-events-scribe/main.tf). 34 | - Reserved concurrency is configured in [daemons/dss-events-scribe/.chalice/config.json](.chalice/config.json). 35 | - CloudWatch-SQS integration requires an IAM policy on the SQS queue, managed in [infra/dss-events-scribe/main.tf](../../infra/dss-events-scribe/main.tf). 
36 | These permissions should be a superset of SQS permissions assigned to the Lambda execution role in 37 | [iam/policy-templates/dss-events-scribe-lambda.json](../../iam/policy-templates/dss-events-scribe-lambda.json). 38 | - The queue configurations in [daemons/dss-events-scribe/app.py](app.py) should match the values in 39 | [infra/dss-events-scribe/main.tf](../../infra/dss-events-scribe/main.tf), otherwise Domovoi will change the queue configuration upon deploy. 40 | 41 | -------------------------------------------------------------------------------- /daemons/dss-events-scribe/domovoilib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-events-scribe/domovoilib/.keep -------------------------------------------------------------------------------- /daemons/dss-gs-copy-sfn/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app_name": "dss-gs-copy-sfn", 3 | "environment_variables": { 4 | "GOOGLE_APPLICATION_CREDENTIALS": "/var/task/domovoilib/gcp-credentials.json" 5 | }, 6 | "stages": { 7 | "dev": { 8 | "api_gateway_stage": "dev", 9 | "environment_variables": {} 10 | } 11 | }, 12 | "version": "2.0", 13 | "lambda_timeout": 300, 14 | "lambda_memory_size": 1536 15 | } 16 | -------------------------------------------------------------------------------- /daemons/dss-gs-copy-sfn/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import domovoi 5 | 6 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), 'domovoilib')) # noqa 7 | sys.path.insert(0, pkg_root) # noqa 8 | 9 | from dss.logging import configure_lambda_logging 10 | import dss.stepfunctions.gscopyclient as gscopyclient 11 | from dss.util import tracing 12 | 13 | configure_lambda_logging() 14 | 15 | app = domovoi.Domovoi() 16 | app.register_state_machine(gscopyclient.sfn) 17 | -------------------------------------------------------------------------------- /daemons/dss-gs-copy-sfn/domovoilib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-gs-copy-sfn/domovoilib/.keep -------------------------------------------------------------------------------- /daemons/dss-gs-copy-write-metadata-sfn/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app_name": "dss-gs-copy-write-metadata-sfn", 3 | "environment_variables": { 4 | "GOOGLE_APPLICATION_CREDENTIALS": "/var/task/domovoilib/gcp-credentials.json" 5 | }, 6 | "stages": { 7 | "dev": { 8 | "api_gateway_stage": "dev", 9 | "environment_variables": {} 10 | } 11 | }, 12 | "version": "2.0", 13 | "lambda_timeout": 300, 14 | "lambda_memory_size": 1536 15 | } 16 | -------------------------------------------------------------------------------- /daemons/dss-gs-copy-write-metadata-sfn/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import domovoi 5 | 6 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), 'domovoilib')) # noqa 7 | sys.path.insert(0, pkg_root) # noqa 8 | 9 | from dss.logging import configure_lambda_logging 10 | import dss.stepfunctions.gscopyclient as gscopyclient 11 | from dss.util 
import tracing 12 | 13 | configure_lambda_logging() 14 | 15 | app = domovoi.Domovoi() 16 | app.register_state_machine(gscopyclient.copy_write_metadata_sfn) 17 | -------------------------------------------------------------------------------- /daemons/dss-gs-copy-write-metadata-sfn/domovoilib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-gs-copy-write-metadata-sfn/domovoilib/.keep -------------------------------------------------------------------------------- /daemons/dss-gs-event-relay/Readme.md: -------------------------------------------------------------------------------- 1 | #### Activating Google Cloud APIs 2 | 3 | ``` 4 | gcloud service-management enable cloudfunctions.googleapis.com 5 | gcloud service-management enable runtimeconfig.googleapis.com 6 | ``` 7 | 8 | #### Retrieving GCF logs 9 | 10 | ``` 11 | gcloud beta functions logs read dss-gs-event-relay 12 | ``` 13 | 14 | #### Example Google Storage event 15 | 16 | ``` 17 | { 18 | "timestamp": "2017-08-25T23:08:25.270Z", 19 | "eventType": "providers/cloud.storage/eventTypes/object.change", 20 | "resource": "projects/_/buckets/GS_BUCKET/objects/GS_BLOB_KEY#1503702505270802", 21 | "data": { 22 | "kind": "storage#object", 23 | "resourceState": "exists", 24 | "id": "GS_BUCKET/GS_BLOB_KEY/1503702505270802", 25 | "selfLink": "https://www.googleapis.com/storage/v1/b/GS_BUCKET/o/GS_BLOB_KEY", 26 | "name": "GS_BLOB_KEY", 27 | "bucket": "GS_BUCKET", 28 | "generation": "1503702505270802", 29 | "metageneration": "1", 30 | "contentType": "application/octet-stream", 31 | "timeCreated": "2017-08-25T23:08:25.245Z", 32 | "updated": "2017-08-25T23:08:25.245Z", 33 | "storageClass": "REGIONAL", 34 | "size": "1130", 35 | "md5Hash": "ZDllMDNlNjA0YzZiNjI4NWMxN2NlY2YxZDM4NWE3YzE=", 36 | "mediaLink": "https://www.googleapis.com/download/storage/v1/b/GS_BUCKET/o/GS_BLOB_KEY?generation=1503702505270802&alt=media", 37 | "crc32c": "yHkaXA==" 38 | } 39 | } 40 | ``` 41 | 42 | #### Environment variables in the GCF container 43 | 44 | ``` 45 | { 46 | "WORKER_PORT": "8091", 47 | "GCLOUD_PROJECT": "PROJECT_NAME", 48 | "FUNCTION_NAME": "dss_gs_event_relay", 49 | "SUPERVISOR_HOSTNAME": "192.168.1.1", 50 | "PATH": "/usr/local/bin:/usr/bin:/bin", 51 | "PWD": "/user_code", 52 | "FUNCTION_TRIGGER_TYPE": "CLOUD_STORAGE_TRIGGER", 53 | "NODE_ENV": "production", 54 | "SHLVL": "1", 55 | "CODE_LOCATION": "/user_code", 56 | "FUNCTION_MEMORY_MB": "256", 57 | "GCP_PROJECT": "PROJECT_NAME", 58 | "PORT": "8080", 59 | "SUPERVISOR_INTERNAL_PORT": "8081", 60 | "ENTRY_POINT": "dss_gs_event_relay", 61 | "OLDPWD": "/var/tmp/worker", 62 | "_": "/usr/bin/env", 63 | "HOME": "/tmp" 64 | } 65 | ``` 66 | -------------------------------------------------------------------------------- /daemons/dss-gs-event-relay/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3==1.9.253 2 | -------------------------------------------------------------------------------- /daemons/dss-index/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app_name": "dss-index", 3 | "environment_variables": { 4 | "GOOGLE_APPLICATION_CREDENTIALS": "/var/task/domovoilib/gcp-credentials.json" 5 | }, 6 | "stages": { 7 | "dev": { 8 | "api_gateway_stage": "dev", 9 | "environment_variables": {} 10 | } 11 | }, 12 | "version": "2.0", 13 | "lambda_timeout": 300, 
14 | "lambda_memory_size": 1536, 15 | "reserved_concurrent_executions": 200 16 | } 17 | -------------------------------------------------------------------------------- /daemons/dss-index/domovoilib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-index/domovoilib/.keep -------------------------------------------------------------------------------- /daemons/dss-notify-v2/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app_name": "dss-notify-v2", 3 | "version": "2.0", 4 | "environment_variables": { 5 | "GOOGLE_APPLICATION_CREDENTIALS": "/var/task/domovoilib/gcp-credentials.json" 6 | }, 7 | "stages": { 8 | "dev": { 9 | "api_gateway_stage": "dev", 10 | "tags": {}, 11 | "environment_variables": {} 12 | } 13 | }, 14 | "lambda_timeout": 900, 15 | "lambda_memory_size": 2560 16 | } 17 | -------------------------------------------------------------------------------- /daemons/dss-notify-v2/domovoilib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-notify-v2/domovoilib/.keep -------------------------------------------------------------------------------- /daemons/dss-notify/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app_name": "dss-notify", 3 | "environment_variables": { 4 | "GOOGLE_APPLICATION_CREDENTIALS": "/var/task/domovoilib/gcp-credentials.json" 5 | }, 6 | "stages": { 7 | "dev": { 8 | "api_gateway_stage": "dev", 9 | "environment_variables": {} 10 | } 11 | }, 12 | "version": "2.0", 13 | "lambda_timeout": 300, 14 | "lambda_memory_size": 1536 15 | } 16 | -------------------------------------------------------------------------------- /daemons/dss-notify/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import domovoi 5 | 6 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), 'domovoilib')) # noqa 7 | sys.path.insert(0, pkg_root) # noqa 8 | 9 | from dss import Config 10 | from dss.logging import configure_lambda_logging 11 | from dss.notify.notifier import Notifier 12 | from dss.util.time import RemainingLambdaContextTime, AdjustedRemainingTime 13 | from dss.util.types import LambdaContext 14 | 15 | configure_lambda_logging() 16 | 17 | 18 | def deploy_notifier(): 19 | notifier = Notifier.from_config() 20 | notifier.deploy() 21 | 22 | 23 | app = domovoi.Domovoi(configure_logs=False) 24 | 25 | 26 | @app.scheduled_function("rate(1 minute)", rule_name='run_notifier_' + Config.deployment_stage()) 27 | def run_notifier(event, context: LambdaContext): 28 | notifier = Notifier.from_config() 29 | shutdown_seconds = 5.0 30 | remaining_time = RemainingLambdaContextTime(context) 31 | adjusted_remaining_time = AdjustedRemainingTime(-shutdown_seconds, remaining_time) 32 | notifier.run(adjusted_remaining_time) 33 | -------------------------------------------------------------------------------- /daemons/dss-notify/domovoilib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-notify/domovoilib/.keep 
-------------------------------------------------------------------------------- /daemons/dss-operations/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app_name": "dss-operations", 3 | "version": "2.0", 4 | "environment_variables": { 5 | "GOOGLE_APPLICATION_CREDENTIALS": "/var/task/domovoilib/gcp-credentials.json" 6 | }, 7 | "stages": { 8 | "dev": { 9 | "api_gateway_stage": "dev", 10 | "tags": {}, 11 | "environment_variables": {} 12 | } 13 | }, 14 | "lambda_timeout": 900, 15 | "lambda_memory_size": 2560 16 | } 17 | -------------------------------------------------------------------------------- /daemons/dss-operations/app.py: -------------------------------------------------------------------------------- 1 | """ 2 | This Lambda executes commands forwarded by the DSS operations CLI. Commands are forwarded via SQS by embedding the 3 | command as plain text directly in the message body. 4 | 5 | For example, the command 6 | ``` 7 | scripts/dss-ops.py storage verify-referential integrity --replica aws --keys key1 key2 key 8 | ``` 9 | would be forwarded with the message 10 | ``` 11 | "storage verify-referential integrity --replica aws --keys key1 key2 key" 12 | ``` 13 | """ 14 | 15 | import os 16 | import sys 17 | import logging 18 | 19 | import domovoi 20 | 21 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), 'domovoilib')) # noqa 22 | sys.path.insert(0, pkg_root) # noqa 23 | 24 | import dss 25 | import dss.operations.storage 26 | import dss.operations.events 27 | from dss.operations import dispatch 28 | 29 | logging.basicConfig(stream=sys.stdout) 30 | logger = logging.getLogger(__name__) # noqa 31 | logger.setLevel(logging.WARNING) # noqa 32 | # TODO: Can log level be passed in through command arguments?
33 | 34 | dss.Config.set_config(dss.BucketConfig.NORMAL) 35 | 36 | app = domovoi.Domovoi() 37 | app.log.setLevel(logging.WARNING) # suppress domovoi info logs 38 | 39 | # Handle commands forwarded from the DSS Operations CLI 40 | @app.sqs_queue_subscriber( 41 | "dss-operations-" + os.environ['DSS_DEPLOYMENT_STAGE'], 42 | queue_attributes=dict( 43 | VisibilityTimeout="3600", # Retry every hour 44 | MessageRetentionPeriod=str(2 * 24 * 3600) # Retain messages for 2 days 45 | ) 46 | ) 47 | def launch_from_notification_queue(event, context): 48 | for event_record in event['Records']: 49 | dispatch(event_record['body'].split()) 50 | -------------------------------------------------------------------------------- /daemons/dss-operations/domovoilib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-operations/domovoilib/.keep -------------------------------------------------------------------------------- /daemons/dss-s3-copy-sfn/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app_name": "dss-s3-copy-sfn", 3 | "environment_variables": { 4 | "GOOGLE_APPLICATION_CREDENTIALS": "/var/task/domovoilib/gcp-credentials.json" 5 | }, 6 | "stages": { 7 | "dev": { 8 | "api_gateway_stage": "dev", 9 | "environment_variables": {} 10 | } 11 | }, 12 | "version": "2.0", 13 | "lambda_timeout": 300, 14 | "lambda_memory_size": 1536 15 | } 16 | -------------------------------------------------------------------------------- /daemons/dss-s3-copy-sfn/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import domovoi 5 | 6 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), 'domovoilib')) # noqa 7 | sys.path.insert(0, pkg_root) # noqa 8 | 9 | from dss.logging import configure_lambda_logging 10 | import dss.stepfunctions.s3copyclient as s3copyclient 11 | from dss.util import tracing 12 | 13 | 14 | configure_lambda_logging() 15 | 16 | app = domovoi.Domovoi(configure_logs=False) 17 | app.register_state_machine(s3copyclient.sfn) 18 | -------------------------------------------------------------------------------- /daemons/dss-s3-copy-sfn/domovoilib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-s3-copy-sfn/domovoilib/.keep -------------------------------------------------------------------------------- /daemons/dss-s3-copy-write-metadata-sfn/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app_name": "dss-s3-copy-write-metadata-sfn", 3 | "environment_variables": { 4 | "GOOGLE_APPLICATION_CREDENTIALS": "/var/task/domovoilib/gcp-credentials.json" 5 | }, 6 | "stages": { 7 | "dev": { 8 | "api_gateway_stage": "dev", 9 | "environment_variables": {} 10 | } 11 | }, 12 | "version": "2.0", 13 | "lambda_timeout": 300, 14 | "lambda_memory_size": 1536 15 | } 16 | -------------------------------------------------------------------------------- /daemons/dss-s3-copy-write-metadata-sfn/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import domovoi 5 | 6 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), 'domovoilib')) # noqa 7 | 
sys.path.insert(0, pkg_root) # noqa 8 | 9 | from dss.logging import configure_lambda_logging 10 | import dss.stepfunctions.s3copyclient as s3copyclient 11 | from dss.util import tracing 12 | 13 | 14 | configure_lambda_logging() 15 | app = domovoi.Domovoi(configure_logs=False) 16 | 17 | app.register_state_machine(s3copyclient.copy_write_metadata_sfn) 18 | -------------------------------------------------------------------------------- /daemons/dss-s3-copy-write-metadata-sfn/domovoilib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-s3-copy-write-metadata-sfn/domovoilib/.keep -------------------------------------------------------------------------------- /daemons/dss-scalability-test/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "app_name": "dss-scalability-test", 4 | "environment_variables": { 5 | "HOME": "/tmp", 6 | "HOME_comment": "HOME variable is necessary in order to create temp files/directories in the Lambda function" 7 | }, 8 | "stages": { 9 | "dev": { 10 | "api_gateway_stage": "api", 11 | "environment_variables": {} 12 | } 13 | }, 14 | "lambda_timeout": 300, 15 | "lambda_memory_size": 1536 16 | } 17 | -------------------------------------------------------------------------------- /daemons/dss-scalability-test/README.md: -------------------------------------------------------------------------------- 1 | # HCA DSS: Scalability Testing framework 2 | 3 | This is a scalability test framework for the replicated data storage system (aka the "blue box") of 4 | the [Human Cell Atlas](https://www.humancellatlas.org/). 5 | 6 | #### About the scalability testing framework 7 | 8 | 1. The scalability test framework is based on AWS Step Functions. The workflow definition resembles the DSS smoke test. 9 | 1. Execution is triggered by sending SNS messages to the `dss-scalability-test-run-{STAGE}` topic. 10 | 1. The scalability test writes the results of individual executions and aggregated run metrics into the 11 | following DynamoDB tables: `scalability_test_result`, `scalability_test` 12 | 1. Each SFN execution starts by entering a WAIT step. The wait is configured to end on 5-minute intervals 13 | to accommodate the AWS limit on starting SFN executions and to enable generation of bursts of load. 14 | 1. Once all parallel branches of execution are done, it writes a summary of the run to DynamoDB. 15 | 1. DynamoDB is configured to stream new records into a Lambda function, which aggregates the results and writes incremental metrics 16 | back into DynamoDB. 17 | 1. A CloudWatch dashboard is configured to display relevant execution metrics and is deployed automatically. The 18 | dashboard is named `Scalability{stage}`. 19 | 20 | #### Running the scale test locally 21 | 22 | * Run with the default configuration: `make scaletest` in the top-level `data-store` directory. 23 | * Run with a custom configuration: `./tests/scalability/scale_test_runner.py -r <rps> -d <duration_sec>` in the 24 | top-level `data-store` directory, where `<rps>` is the number of requests generated per second and 25 | `<duration_sec>` is the duration of the test in seconds. 26 | 27 | #### Adding new tests 28 | 29 | New tests can easily be added to the existing step function definition in `app.py`.
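Runs can also be kicked off without the runner script by publishing directly to the `dss-scalability-test-run-{STAGE}` topic described above, which is roughly what `scale_test_runner.py` automates. In the hedged sketch below, the topic name comes from this README, but the message payload is hypothetical; consult `tests/scalability/scale_test_runner.py` for the real message format:

```python
import json
import os

import boto3

region = os.environ["AWS_DEFAULT_REGION"]
stage = os.environ["DSS_DEPLOYMENT_STAGE"]
account_id = boto3.client("sts").get_caller_identity()["Account"]
topic_arn = f"arn:aws:sns:{region}:{account_id}:dss-scalability-test-run-{stage}"

# Hypothetical payload; the scale test runner defines the actual message format.
boto3.client("sns").publish(TopicArn=topic_arn, Message=json.dumps({"run_id": "example-run"}))
```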
30 | -------------------------------------------------------------------------------- /daemons/dss-scalability-test/domovoilib/json_generator/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import string 4 | 5 | from dss.util.json_gen.hca_generator import HCAJsonGenerator 6 | 7 | 8 | def id_generator(size=30, chars=string.ascii_uppercase + string.digits): 9 | return ''.join(random.choice(chars) for _ in range(size)) 10 | 11 | 12 | schema_urls = [ 13 | "https://schema.humancellatlas.org/bundle/5.1.0/project", 14 | "https://schema.humancellatlas.org/bundle/5.1.0/submission", 15 | "https://schema.humancellatlas.org/bundle/5.1.0/ingest_audit", 16 | ] 17 | 18 | json_faker = None 19 | 20 | def generate_sample() -> str: 21 | global json_faker 22 | if json_faker is None: 23 | json_faker = HCAJsonGenerator(schema_urls) 24 | return json_faker.generate() 25 | -------------------------------------------------------------------------------- /daemons/dss-sfn-launcher/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "app_name": "dss-sfn-launcher", 4 | "dead_letter_queue_target_arn": "", 5 | "stages": { 6 | "dev": {} 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /daemons/dss-sfn-launcher/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import sys 5 | 6 | import boto3 7 | import domovoi 8 | from botocore.exceptions import ClientError 9 | 10 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), 'domovoilib')) # noqa 11 | sys.path.insert(0, pkg_root) # noqa 12 | 13 | from dss import stepfunctions 14 | from dss.stepfunctions import SFN_TEMPLATE_KEY, SFN_EXECUTION_KEY, SFN_INPUT_KEY, sfn_sns_topic 15 | from dss.util import tracing 16 | from dss.logging import configure_lambda_logging 17 | 18 | logger = logging.getLogger(__name__) 19 | configure_lambda_logging() 20 | app = domovoi.Domovoi(configure_logs=False) 21 | sqs = boto3.resource('sqs') 22 | 23 | @app.sns_topic_subscriber(sfn_sns_topic) 24 | def launch_sfn_run(event, context): 25 | sns_msg = event["Records"][0]["Sns"] 26 | logger.debug(f'sns_message: {sns_msg}') 27 | msg = json.loads(sns_msg["Message"]) 28 | attrs = sns_msg["MessageAttributes"] 29 | 30 | if 'DSS-REAPER-RETRY-COUNT' in attrs: 31 | logger.info("Reprocessing attempts so far %s", attrs['DSS-REAPER-RETRY-COUNT']['Value']) 32 | 33 | sfn_name_template = msg[SFN_TEMPLATE_KEY] 34 | sfn_execution = msg[SFN_EXECUTION_KEY] 35 | sfn_input = msg[SFN_INPUT_KEY] 36 | logger.debug("Launching Step Function %s execution: %s input: %s}", sfn_name_template, sfn_execution, sfn_input) 37 | try: 38 | response = stepfunctions._step_functions_start_execution(sfn_name_template, sfn_execution, sfn_input) 39 | logger.debug(f"Started step function execution: %s", str(response)) 40 | except ClientError as e: 41 | if e.response.get('Error'): 42 | if e.response['Error'].get('Code') == 'ExecutionAlreadyExists': 43 | logger.warning("Execution id %s already exists for %s. 
Not retrying.", sfn_execution, sfn_name_template) 44 | else: 45 | logger.warning("Failed to start step function execution id %s: due to %s", sfn_execution, str(e)) 46 | raise e 47 | -------------------------------------------------------------------------------- /daemons/dss-sfn-launcher/domovoilib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-sfn-launcher/domovoilib/.keep -------------------------------------------------------------------------------- /daemons/dss-sync-sfn/.chalice/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "app_name": "dss-sync-sfn", 3 | "version": "2.0", 4 | "environment_variables": { 5 | "GOOGLE_APPLICATION_CREDENTIALS": "/var/task/domovoilib/gcp-credentials.json" 6 | }, 7 | "stages": { 8 | "dev": { 9 | "api_gateway_stage": "dev", 10 | "tags": {}, 11 | "environment_variables": {} 12 | } 13 | }, 14 | "lambda_timeout": 900, 15 | "lambda_memory_size": 1536 16 | } 17 | -------------------------------------------------------------------------------- /daemons/dss-sync-sfn/.gitignore: -------------------------------------------------------------------------------- 1 | .chalice/deployments/ 2 | .chalice/venv/ 3 | -------------------------------------------------------------------------------- /daemons/dss-sync-sfn/Readme.md: -------------------------------------------------------------------------------- 1 | # DSS sync daemon 2 | 3 | Event handlers in the dss-sync daemon use utility functions in 4 | [dss.events.handlers.sync](../../dss/events/handlers/sync.py). 5 | 6 | Events are first received from object storage: 7 | 8 | * S3 invokes the `dss-sync-sfn` Lambda via an S3 event notification forwarded via SQS (`app.py:launch_from_s3_event()`). 9 | 10 | * GS invokes the `dss-gs-event-relay` GCF via a PubSub event notification 11 | (`/daemons/dss-gs-event-relay-python/main.py:dss_gs_event_relay()`). The GCF then forwards the event to the 12 | `dss-sync-sfn` Lambda via SQS (`app.py:launch_from_forwarded_event()`). 13 | 14 | If the event concerns an object that already exists in the destination bucket, the process stops. Otherwise, the sync 15 | step function is started (defined in `app.py`): 16 | 17 | ![DSS Sync SFN diagram](dss-sync-sfn.png) 18 | 19 | The state function starts with the `DispatchSync` state (`app.py:dispatch_sync()`). This state processes the storage 20 | event notification and orchestrates the rest of the copying: 21 | 22 | - If the notification is for a file blob: 23 | - If the blob is under the one-shot threshold, sends the state machine to `OneshotCopy`, which immediately copies 24 | it and exits the state machine. 25 | - Otherwise, configures the state machine to run the threadpool (copy blob parts) and closer (compose the copy). 26 | - Otherwise, enters a loop between `WaitForDeps` (configurable sleep state) and `CheckDeps`, which checks if all 27 | referenced entities are already copied to the destination: 28 | - For a file manifest, checks that the blob is there. 29 | - For a bundle manifest, checks that file manifests for all files in the bundle are there. 30 | - For a collection manifest, checks that all collection contents are there. 31 | If the checks fail, `CheckDeps` sends the state machine to sleep in `WaitForDeps` for 8 seconds, then try again. 
32 | If the checks succeed: 33 | - If the blob is under the one-shot threshold, sends the state machine to `OneshotCopy`, which immediately copies 34 | it and exits the state machine. 35 | - Otherwise, configures the state machine to run the threadpool (copy blob parts) and closer (compose the copy). 36 | 37 | The one-shot threshold is equal to the part size (640MB on GS, 64MB on S3). 38 | 39 | All the event handler and SFN state lambdas above are actually the same lambda distribution (dss-sync), called via 40 | different entry points from different event notifications, orchestrated by the domovoi library. 41 | -------------------------------------------------------------------------------- /daemons/dss-sync-sfn/domovoilib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-sync-sfn/domovoilib/.keep -------------------------------------------------------------------------------- /daemons/dss-sync-sfn/dss-sync-sfn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/daemons/dss-sync-sfn/dss-sync-sfn.png -------------------------------------------------------------------------------- /daemons/invoke_lambda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | if [[ $# != 4 ]]; then 6 | echo "Usage: $(basename $0) daemon-name stage lambda-input-file bundle-file" 7 | exit 1 8 | fi 9 | 10 | lambda_name="$1-$2" 11 | lambda_input_file=$3 12 | bundle_file="$4" 13 | 14 | 15 | BUNDLE_KEY="bundles/$(basename "${bundle_file}")" 16 | 17 | if ! aws s3 ls s3://${DSS_S3_BUCKET}/"${BUNDLE_KEY}"; then 18 | aws s3 cp "${bundle_file}" s3://${DSS_S3_BUCKET}/"${BUNDLE_KEY}" 19 | fi 20 | BUNDLE_FILE_ETAG=$(aws s3api head-object --bucket ${DSS_S3_BUCKET} --key "${BUNDLE_KEY}" | jq -r '.ETag | fromjson') 21 | BUNDLE_FILE_SIZE=$(cat "${bundle_file}" | wc -c) 22 | 23 | # the wonky if-else is required because us-east-1 is represented as a null location constraint. weird, eh? 24 | DSS_S3_BUCKET_REGION=$(aws s3api get-bucket-location --bucket ${DSS_S3_BUCKET} | jq -r 'if (.LocationConstraint == null) then "us-east-1" else .LocationConstraint end') 25 | envsubst_vars='$BUNDLE_KEY $BUNDLE_FILE_ETAG $BUNDLE_FILE_SIZE $DSS_S3_BUCKET $DSS_S3_BUCKET_REGION' 26 | for varname in ${envsubst_vars}; do 27 | export ${varname##$} 28 | done 29 | 30 | raw_lambda_output="$(aws lambda invoke --function-name $lambda_name --invocation-type RequestResponse --payload "$(envsubst "${envsubst_vars}" < "${lambda_input_file}")" --log-type Tail /dev/stdout)" 31 | lambda_output="$(echo $raw_lambda_output | jq -r '. | select(.LogResult)')" 32 | 33 | # lambda output is occasionally malformed as appended JSON objects: {"wrong_obj": ... 
}{"LogResult": ...} 34 | # This selects the object we wish to examine for error 35 | echo "$lambda_output" | jq -r .LogResult | base64 --decode 36 | 37 | [[ $(echo "$lambda_output" | jq -r .FunctionError) == null ]] 38 | -------------------------------------------------------------------------------- /daemons/package_daemon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | if [[ $# != 1 ]]; then 6 | echo "Usage: $(basename $0) daemon-name" 7 | exit 1 8 | fi 9 | 10 | if [[ -z $DSS_DEPLOYMENT_STAGE ]]; then 11 | echo 'Please run "source environment" in the data-store repo root directory before running this command' 12 | exit 1 13 | fi 14 | 15 | daemon=$1 16 | 17 | echo "Running pre-packaging steps for $daemon" 18 | 19 | git clean -df $daemon/domovoilib $daemon/vendor 20 | 21 | shopt -s nullglob 22 | for wheel in $daemon/vendor.in/*/*.whl; do 23 | unzip -q -o -d $daemon/vendor $wheel 24 | done 25 | 26 | cp -R ../dss ../dss-api.yml $daemon/domovoilib 27 | aws secretsmanager get-secret-value --secret-id ${DSS_SECRETS_STORE}/${DSS_DEPLOYMENT_STAGE}/gcp-credentials.json \ 28 | | jq -r .SecretString > $daemon/domovoilib/gcp-credentials.json 29 | -------------------------------------------------------------------------------- /docs/resource_def.md: -------------------------------------------------------------------------------- 1 | |Actions| Resource| 2 | |-------|-----------| 3 | |dss:GetCheckout |arn:hca:dss:{stage}:{replica}:checkout/{checkout_job_id} 4 | |dss:DeleteBundle |arn:hca:dss:{stage}:{replica}:bundles/{uuid}.{version} 5 | |dss:GetBundle |arn:hca:dss:{stage}:{replica}:bundles/{uuid}.{version} 6 | |dss:PatchBundle |arn:hca:dss:{stage}:{replica}:bundles/{uuid}.{version} 7 | |dss:PutBundle |arn:hca:dss:{stage}:{replica}:bundles/{uuid}.{version} 8 | |dss:PostCheckout |arn:hca:dss:{stage}:{replica}:bundles/{uuid}.{version} 9 | |dss:GetCollections |arn:hca:dss:{stage}:{replica}:{user}/collections 10 | |dss:PutCollection |arn:hca:dss:{stage}:{replica}:collections/{uuid}.{version}, arn:hca:dss:{stage}:{replica}:{user}/collections/{uuid} 11 | |dss:DeleteCollection |arn:hca:dss:{stage}:{replica}:collections/{uuid}.{version}, arn:hca:dss:{stage}:{replica}:{user}/collections/{uuid} 12 | |dss:GetCollection |arn:hca:dss:{stage}:{replica}:collections/{uuid}.{version}, arn:hca:dss:{stage}:{replica}:{user}/collections/{uuid} 13 | |dss:PatchCollection |arn:hca:dss:{stage}:{replica}:collections/{uuid}.{version}, arn:hca:dss:{stage}:{replica}:{user}/collections/{uuid} 14 | |dss:GetFiles |arn:hca:dss:{stage}:{replica}:files/{uuid}.{version} 15 | |dss:HeadFiles |arn:hca:dss:{stage}:{replica}:files/{uuid}.{version} 16 | |dss:PutFiles |arn:hca:dss:{stage}:{replica}:files/{uuid}.{version} 17 | |dss:PostSearch |arn:hca:dss:{stage}:{replica}:query 18 | |dss:GetSubscriptions |arn:hca:dss:{stage}:{replica}:{user}/subscriptions 19 | |dss:PutSubscriptions |arn:hca:dss:{stage}:{replica}:subscription/{uuid}, arn:hca:dss:{stage}:{replica}:{user}/subscription/{uuid} 20 | |dss:GetSubscription |arn:hca:dss:{stage}:{replica}:subscription/{uuid}, arn:hca:dss:{stage}:{replica}:{user}/subscription/{uuid} 21 | |dss:DeleteSubscription |arn:hca:dss:{stage}:{replica}:subscription/{uuid}, arn:hca:dss:{stage}:{replica}:{user}/subscription/{uuid} -------------------------------------------------------------------------------- /dss-api: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 
""" 4 | Entry point for starting a local test HCA DSS API server. 5 | """ 6 | 7 | import os 8 | import sys 9 | import logging 10 | import argparse 11 | 12 | from chalice.cli import CLIFactory, run_local_server 13 | 14 | import dss 15 | 16 | parser = argparse.ArgumentParser(description=__doc__) 17 | parser.add_argument("--host", default="") 18 | parser.add_argument("--port", type=int, default=5000) 19 | parser.add_argument("--no-debug", dest="debug", action="store_false", 20 | help="Disable Chalice/Connexion/Flask debug mode") 21 | parser.add_argument("--project-dir", help=argparse.SUPPRESS, 22 | default=os.path.join(os.path.dirname(__file__), "chalice")) 23 | parser.add_argument("--log-level", 24 | help=str([logging.getLevelName(i) for i in range(0, 60, 10)]), 25 | choices={logging.getLevelName(i) for i in range(0, 60, 10)}, 26 | default=logging.DEBUG) 27 | args = parser.parse_args() 28 | 29 | if "DSS_HOME" not in os.environ: 30 | parser.exit('Please run "source environment" in the data-store repo root directory') 31 | 32 | logging.basicConfig(level=args.log_level, stream=sys.stderr) 33 | 34 | factory = CLIFactory(project_dir=args.project_dir, debug=args.debug) 35 | 36 | # The following code snippet is basically stolen from chalice/__init__py:run_local_server 37 | config = factory.create_config_obj( 38 | chalice_stage_name=os.environ["DSS_DEPLOYMENT_STAGE"] 39 | ) 40 | app_obj = factory.load_chalice_app() 41 | # When running `chalice local`, a stdout logger is configured 42 | # so you'll see the same stdout logging as you would when 43 | # running in lambda. This is configuring the root logger. 44 | # The app-specific logger (app.log) will still continue 45 | # to work. 46 | # FIXME: (hannes) I don't think this works. basicConfig() only does anything if there aren't any handlers, so a second 47 | # invocation is usually pointless unless something removed all handlers in between. 
48 | logging.basicConfig(stream=sys.stdout) 49 | server = factory.create_local_server(app_obj=app_obj, config=config, host=args.host, port=args.port) 50 | server.serve_forever() 51 | -------------------------------------------------------------------------------- /dss/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/dss/api/__init__.py -------------------------------------------------------------------------------- /dss/api/bundles/checkout.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from cloud_blobstore import BlobNotFoundError 3 | from flask import jsonify 4 | 5 | from dss import dss_handler, Replica 6 | from dss.error import DSSException 7 | from dss.storage.checkout import BundleNotFoundError 8 | from dss.storage.checkout.bundle import get_bundle_checkout_status, start_bundle_checkout 9 | 10 | 11 | @dss_handler 12 | def post(uuid: str, json_request_body: dict, replica: str, version: str = None): 13 | assert replica is not None 14 | _replica: Replica = Replica[replica] 15 | dst_bucket = json_request_body.get('destination', _replica.checkout_bucket) 16 | if '/' in dst_bucket: 17 | raise DSSException(400, "illegal_arguments", "Destination bucket invalid!") 18 | try: 19 | execution_id = start_bundle_checkout( 20 | _replica, 21 | uuid, 22 | version, 23 | dst_bucket=dst_bucket, 24 | email_address=json_request_body.get('email', None), 25 | ) 26 | except BundleNotFoundError: 27 | raise DSSException(404, "not_found", "Cannot find bundle!") 28 | return jsonify(dict(checkout_job_id=execution_id)), requests.codes.ok 29 | 30 | 31 | @dss_handler 32 | def get(replica: str, checkout_job_id: str): 33 | assert replica is not None 34 | _replica = Replica[replica] 35 | try: 36 | response = get_bundle_checkout_status(checkout_job_id, _replica, _replica.checkout_bucket) 37 | except BlobNotFoundError: 38 | raise DSSException(requests.codes.not_found, "not_found", "Cannot find checkout!") 39 | return response, requests.codes.ok 40 | -------------------------------------------------------------------------------- /dss/api/events.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from datetime import datetime 4 | 5 | import requests 6 | from flask import request, make_response, jsonify 7 | 8 | from dss.util.aws import resources 9 | from dss import Config, Replica 10 | from dss import events 11 | from dss.error import DSSException, dss_handler 12 | from dss.util import security, hashabledict, UrlBuilder 13 | from dss.util.version import datetime_from_timestamp 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | @dss_handler 20 | def list_events(replica: str, from_date: str=None, to_date: str=None, per_page: int=1, token: str=None): 21 | if token: 22 | fdate = datetime_from_timestamp(token) 23 | else: 24 | fdate = datetime_from_timestamp(from_date) if from_date else datetime.min 25 | tdate = datetime_from_timestamp(to_date) if to_date else datetime.max 26 | if fdate > tdate: 27 | raise DSSException(400, "bad_request", "to_date must be greater than from_date") 28 | ff = Config.get_flashflood_handle(Replica[replica].flashflood_prefix_read) 29 | event_streams = list() 30 | for i, event_stream in enumerate(ff.list_event_streams(fdate, tdate)): 31 | if datetime_from_timestamp(event_stream['from_date']) < tdate: 32 | 
event_streams.append(event_stream) 33 | else: 34 | break 35 | if i == per_page: 36 | break 37 | 38 | if len(event_streams) <= per_page: 39 | response = make_response(jsonify(dict(event_streams=event_streams)), requests.codes.ok) 40 | response.headers['X-OpenAPI-Pagination'] = 'false' 41 | else: 42 | next_url = UrlBuilder(request.url) 43 | next_url.replace_query("token", event_streams[-1]['from_date']) 44 | link = f"<{next_url}>; rel='next'" 45 | response = make_response(jsonify(dict(event_streams=event_streams[:-1])), requests.codes.partial) 46 | response.headers['Link'] = link 47 | response.headers['X-OpenAPI-Pagination'] = 'true' 48 | response.headers['X-OpenAPI-Paginated-Content-Key'] = 'event_streams' 49 | return response 50 | 51 | @dss_handler 52 | def get(uuid: str, replica: str, version: str = None): 53 | key = f"bundles/{uuid}.{version}" 54 | doc = events.get_bundle_metadata_document(Replica[replica], key) 55 | if doc is None: 56 | raise DSSException(404, "not_found", "Cannot find event!") 57 | return doc 58 | -------------------------------------------------------------------------------- /dss/api/subscriptions.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from dss.error import dss_handler 4 | from dss.api import subscriptions_v1, subscriptions_v2 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | @dss_handler 9 | def get(uuid: str, replica: str, subscription_type: str): 10 | if "elasticsearch" == subscription_type: 11 | return subscriptions_v1.get(uuid, replica) 12 | else: 13 | return subscriptions_v2.get(uuid, replica) 14 | 15 | @dss_handler 16 | def find(replica: str, subscription_type: str): 17 | if "elasticsearch" == subscription_type: 18 | return subscriptions_v1.find(replica) 19 | else: 20 | return subscriptions_v2.find(replica) 21 | 22 | @dss_handler 23 | def delete(uuid: str, replica: str, subscription_type: str): 24 | if "elasticsearch" == subscription_type: 25 | return subscriptions_v1.delete(uuid, replica) 26 | else: 27 | return subscriptions_v2.delete(uuid, replica) 28 | 29 | @dss_handler 30 | def put(json_request_body: dict, replica: str): 31 | if json_request_body.get('es_query'): 32 | return subscriptions_v1.put(json_request_body, replica) 33 | else: 34 | return subscriptions_v2.put(json_request_body, replica) 35 | -------------------------------------------------------------------------------- /dss/collections/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/dss/collections/__init__.py -------------------------------------------------------------------------------- /dss/collections/owner_lookup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from botocore.exceptions import ClientError 3 | 4 | from dss import dynamodb # type: ignore 5 | 6 | 7 | collection_db_table = f"dss-collections-db-{os.environ['DSS_DEPLOYMENT_STAGE']}" 8 | 9 | 10 | def put_collection(owner: str, collection_fqid: str, permission_level: str = 'owner'): 11 | try: 12 | dynamodb.put_item(table=collection_db_table, 13 | hash_key=owner, 14 | sort_key=collection_fqid, 15 | value=permission_level, 16 | dont_overwrite='sort_key') 17 | except ClientError as e: 18 | if e.response['Error']['Code'] != 'ConditionalCheckFailedException': 19 | raise 20 | 21 | 22 | def get_collection(owner: str, collection_fqid: str): 23 | return 
dynamodb.get_item(table=collection_db_table, 24 | hash_key=owner, 25 | sort_key=collection_fqid, 26 | return_key='sort_key') 27 | 28 | 29 | def get_collection_fqids_for_owner(owner: str): 30 | """Returns an Iterator of uuid strings.""" 31 | return dynamodb.get_primary_key_items(table=collection_db_table, 32 | key=owner, 33 | return_key='sort_key') 34 | 35 | 36 | def get_all_collection_keys(): 37 | """Returns an Iterator of (owner, uuid) for all items in the collections db table.""" 38 | return dynamodb.get_all_table_items(table=collection_db_table, both_keys=True) 39 | 40 | 41 | def delete_collection(owner: str, collection_fqid: str): 42 | """Deletes one collection item from a database.""" 43 | dynamodb.delete_item(table=collection_db_table, 44 | hash_key=owner, 45 | sort_key=collection_fqid) 46 | 47 | 48 | def delete_collection_uuid(owner: str, uuid: str): 49 | """Deletes all versions of a uuid in the database.""" 50 | for collection_fqid in get_collection_fqids_for_owner(owner): 51 | if collection_fqid.startswith(uuid): 52 | dynamodb.delete_item(table=collection_db_table, 53 | hash_key=owner, 54 | sort_key=collection_fqid) 55 | -------------------------------------------------------------------------------- /dss/events/handlers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/dss/events/handlers/__init__.py -------------------------------------------------------------------------------- /dss/index/__init__.py: -------------------------------------------------------------------------------- 1 | from dss.index.es.backend import ElasticsearchIndexBackend 2 | 3 | DEFAULT_BACKENDS = [ElasticsearchIndexBackend] 4 | 5 | __all__ = ['DEFAULT_BACKENDS'] 6 | -------------------------------------------------------------------------------- /dss/index/es/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "analysis": { 4 | "normalizer": { 5 | "keyword_lowercase": { 6 | "type": "custom", 7 | "filter": ["lowercase"] 8 | } 9 | } 10 | }, 11 | "index": { 12 | "percolator": { 13 | "map_unmapped_fields_as_string": true 14 | } 15 | } 16 | }, 17 | "mappings": { 18 | "doc": { 19 | "date_detection": true, 20 | "dynamic_date_formats": [ 21 | "yyyy-MM-dd HH:mm:ss", 22 | "strict_date_optional_time" 23 | ], 24 | "dynamic_templates": [ 25 | { 26 | "dates": { 27 | "match_mapping_type": "date", 28 | "mapping": { 29 | "type": "date" 30 | } 31 | } 32 | }, 33 | { 34 | "short_strings": { 35 | "match_mapping_type": "string", 36 | "mapping": { 37 | "normalizer": "keyword_lowercase", 38 | "type": "keyword", 39 | "ignore_above": 256, 40 | "fields": { 41 | "text": { 42 | "type": "text" 43 | } 44 | } 45 | } 46 | } 47 | } 48 | ], 49 | "properties": { 50 | "uuid": {"type": "keyword"}, 51 | "manifest": { 52 | "properties":{ 53 | "version": {"type": "keyword"} 54 | } 55 | } 56 | } 57 | }, 58 | "query":{ 59 | "properties":{ 60 | "query": { 61 | "type": "percolator" 62 | } 63 | } 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /dss/notify/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/dss/notify/__init__.py -------------------------------------------------------------------------------- /dss/notify/attachment.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | from typing import Mapping, Union 3 | 4 | import jmespath 5 | from jmespath.exceptions import JMESPathError 6 | import requests 7 | 8 | from dss import DSSException 9 | from dss.util.types import JSON 10 | 11 | # This 50% of the maximum size of an SQS message. Keep in mind that the payload will be gzipp'ed and base85 eoncoded. 12 | # 13 | size_limit = 128 * 1024 14 | 15 | 16 | # See Swagger schema for details on the structure of these 17 | # 18 | Definitions = Mapping[str, Mapping[str, str]] 19 | Attachments = Mapping[str, JSON] 20 | 21 | 22 | def validate(definitions: Definitions) -> None: 23 | """ 24 | Validate the given attachment definitions. This should be called in a request handling context as it raises 25 | DSSException referring to HTTP status code, as well as error code and description. 26 | """ 27 | for name, definition in definitions.items(): 28 | if name.startswith('_'): 29 | raise DSSException(requests.codes.bad_request, 30 | "invalid_attachment_name", 31 | f"Attachment names must not start with underscore ({name})") 32 | type_ = definition['type'] 33 | if type_ == 'jmespath': 34 | expression = definition['expression'] 35 | try: 36 | jmespath.compile(expression) 37 | except JMESPathError as e: 38 | raise DSSException(requests.codes.bad_request, 39 | "invalid_attachment_expression", 40 | f"Unable to compile JMESPath expression for attachment {name}") from e 41 | else: 42 | assert False, type_ 43 | 44 | 45 | def select(definitions: Definitions, document: JSON) -> Attachments: 46 | """ 47 | Return a defined subset of the given document for the pupose of attaching that subset to a notification about that 48 | document. 49 | """ 50 | attachments = {} 51 | errors = {} 52 | for name, attachment in definitions.items(): 53 | type_ = attachment['type'] 54 | if type_ == 'jmespath': 55 | try: 56 | expression = attachment['expression'] 57 | value = jmespath.search(expression, document) 58 | except BaseException as e: 59 | errors[name] = str(e) 60 | else: 61 | attachments[name] = value 62 | else: 63 | assert False, type_ 64 | if errors: 65 | attachments['_errors'] = errors 66 | size = len(json.dumps(attachments).encode('utf-8')) 67 | if size > size_limit: 68 | attachments = {'_errors': f"Attachments too large ({size} > {size_limit})"} 69 | return attachments 70 | -------------------------------------------------------------------------------- /dss/operations/stepfunctions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/dss/operations/stepfunctions.py -------------------------------------------------------------------------------- /dss/operations/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import argparse 5 | import logging 6 | import typing 7 | import time 8 | from functools import wraps 9 | from datetime import datetime, timedelta 10 | 11 | from cloud_blobstore import BlobStore 12 | from dcplib.aws.sqs import SQSMessenger 13 | 14 | from dss.util.aws.clients import sts # type: ignore 15 | from dss.config import Config, Replica 16 | from concurrent.futures import ThreadPoolExecutor, as_completed 17 | 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | _account_id = sts.get_caller_identity()['Account'] 23 | command_queue_url = 
"https://sqs.{}.amazonaws.com/{}/dss-operations-{}".format( 24 | os.environ['AWS_DEFAULT_REGION'], 25 | _account_id, 26 | os.environ['DSS_DEPLOYMENT_STAGE'] 27 | ) 28 | 29 | 30 | LOG_MONITOR_SLEEP_DURATION = 10 31 | 32 | 33 | def polite_print(quiet, msg): 34 | if not quiet: 35 | print(msg) 36 | 37 | def map_bucket_results(func: typing.Callable, handle: BlobStore, bucket: str, base_pfx: str, parallelization=10): 38 | """ 39 | Call `func` on an iterable of keys 40 | func is expected to be thread safe. 41 | """ 42 | with ThreadPoolExecutor(max_workers=parallelization) as e: 43 | futures = list() 44 | for pfx in "0123456789abcdef": 45 | f = e.submit(func, handle.list(bucket, prefix=f"{base_pfx}{pfx}")) 46 | futures.append(f) 47 | for f in as_completed(futures): 48 | try: 49 | yield f.result() 50 | except Exception: 51 | logger.error(traceback.format_exc()) 52 | 53 | def map_bucket(*args, **kwargs): 54 | for _ in map_bucket_results(*args, **kwargs): 55 | pass 56 | 57 | 58 | def monitor_logs(logs_client, job_id: str, start_time: datetime): 59 | start = new_start = int(1000 * (datetime.timestamp(datetime.utcnow()))) 60 | log_group = f"/aws/lambda/dss-operations-{os.environ['DSS_DEPLOYMENT_STAGE']}" 61 | paginator = logs_client.get_paginator('filter_log_events') 62 | while True: 63 | for info in paginator.paginate(logGroupName=log_group, startTime=start, filterPattern=f'"{job_id}"'): 64 | for e in info['events']: 65 | print(e['message']) 66 | new_start = e['timestamp'] + 1 67 | if start == new_start: 68 | sys.stderr.write(f"No new CloudWatch log messages, sleeping {LOG_MONITOR_SLEEP_DURATION}s" + os.linesep) 69 | time.sleep(LOG_MONITOR_SLEEP_DURATION) 70 | else: 71 | start = new_start 72 | -------------------------------------------------------------------------------- /dss/stepfunctions/checkout/constants.py: -------------------------------------------------------------------------------- 1 | STATE_MACHINE_NAME_TEMPLATE = "dss-checkout-sfn-{stage}" 2 | 3 | 4 | class EventConstants: 5 | """Externally visible constants used in the SFN messages.""" 6 | 7 | EXECUTION_ID = "execution_id" 8 | BUNDLE_UUID = "bundle" 9 | BUNDLE_VERSION = "version" 10 | DSS_BUCKET = "dss_bucket" 11 | DST_BUCKET = "bucket" 12 | STATUS_BUCKET = "checkout_status_bucket" 13 | REPLICA = "replica" 14 | STATUS = "status" 15 | EMAIL = "email" 16 | 17 | STATUS_COMPLETE_COUNT = "complete_count" 18 | STATUS_TOTAL_COUNT = "total_count" 19 | STATUS_CHECK_COUNT = "check_count" 20 | STATUS_OVERALL_STATUS = "checkout_status" 21 | -------------------------------------------------------------------------------- /dss/stepfunctions/gscopyclient/__init__.py: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | from .implementation import CopyWriteMetadataKey, Key, sfn, copy_write_metadata_sfn 4 | 5 | 6 | def copy_sfn_event( 7 | source_bucket: str, source_key: str, 8 | destination_bucket: str, destination_key: str 9 | ) -> typing.MutableMapping[str, str]: 10 | """Returns the initial event object to start the gs-gs copy stepfunction.""" 11 | return { 12 | Key.SOURCE_BUCKET: source_bucket, 13 | Key.SOURCE_KEY: source_key, 14 | Key.DESTINATION_BUCKET: destination_bucket, 15 | Key.DESTINATION_KEY: destination_key, 16 | } 17 | 18 | 19 | def copy_write_metadata_sfn_event( 20 | source_bucket: str, source_key: str, 21 | destination_bucket: str, destination_key: str, 22 | file_uuid: str, file_version: str, 23 | metadata: str, 24 | ) -> typing.MutableMapping[str, str]: 25 | """ 26 | Returns the 
initial event object to start the stepfunction that performs a s3-s3 copy and writes the HCA /files 27 | metadata file. 28 | """ 29 | base = copy_sfn_event(source_bucket, source_key, destination_bucket, destination_key) 30 | base[CopyWriteMetadataKey.FILE_UUID] = file_uuid 31 | base[CopyWriteMetadataKey.FILE_VERSION] = file_version 32 | base[CopyWriteMetadataKey.METADATA] = metadata 33 | 34 | return base 35 | -------------------------------------------------------------------------------- /dss/stepfunctions/lambdaexecutor/__init__.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import threading 3 | import typing 4 | 5 | 6 | StateType = typing.TypeVar('StateType') 7 | 8 | 9 | class TimedThread(typing.Generic[StateType]): 10 | """ 11 | This is a "Thread" class that runs a job for a maximum period of time. The class provides concurrency-safe methods 12 | to retrieve and persist a chunk of state. 13 | """ 14 | def __init__(self, timeout_seconds: float, state: StateType) -> None: 15 | self.timeout_seconds = timeout_seconds 16 | self.__state = copy.deepcopy(state) 17 | self.lock = threading.Lock() 18 | self._exception: Exception = None 19 | 20 | def run(self) -> StateType: 21 | raise NotImplementedError() 22 | 23 | def _run(self) -> None: 24 | try: 25 | state = self.run() 26 | except Exception as e: 27 | self._exception = e 28 | else: 29 | self.save_state(state) 30 | 31 | def _start_async(self) -> None: 32 | self.thread = threading.Thread(target=self._run, daemon=True) 33 | self.thread.start() 34 | 35 | def _join(self) -> StateType: 36 | self.thread.join(self.timeout_seconds) 37 | 38 | with self.lock: 39 | state = copy.deepcopy(self.__state) 40 | return state 41 | 42 | def start(self) -> StateType: 43 | self._start_async() 44 | state = self._join() 45 | if self._exception: 46 | raise self._exception 47 | return state 48 | 49 | def get_state_copy(self) -> StateType: 50 | with self.lock: 51 | state_copy = copy.deepcopy(self.__state) 52 | return state_copy 53 | 54 | def save_state(self, new_state: StateType) -> None: 55 | new_state = copy.deepcopy(new_state) 56 | with self.lock: 57 | self.__state = new_state 58 | -------------------------------------------------------------------------------- /dss/stepfunctions/s3copyclient/__init__.py: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | from .implementation import CopyWriteMetadataKey, Key, sfn, copy_write_metadata_sfn 4 | 5 | 6 | def copy_sfn_event( 7 | source_bucket: str, source_key: str, 8 | destination_bucket: str, destination_key: str 9 | ) -> typing.MutableMapping[str, str]: 10 | """Returns the initial event object to start the s3-s3 copy stepfunction.""" 11 | return { 12 | Key.SOURCE_BUCKET: source_bucket, 13 | Key.SOURCE_KEY: source_key, 14 | Key.DESTINATION_BUCKET: destination_bucket, 15 | Key.DESTINATION_KEY: destination_key, 16 | } 17 | 18 | 19 | def copy_write_metadata_sfn_event( 20 | source_bucket: str, source_key: str, 21 | destination_bucket: str, destination_key: str, 22 | file_uuid: str, file_version: str, 23 | metadata: str, 24 | ) -> typing.MutableMapping[str, str]: 25 | """ 26 | Returns the initial event object to start the stepfunction that performs a s3-s3 copy and writes the HCA /files 27 | metadata file. 
28 | """ 29 | base = copy_sfn_event(source_bucket, source_key, destination_bucket, destination_key) 30 | base[CopyWriteMetadataKey.FILE_UUID] = file_uuid 31 | base[CopyWriteMetadataKey.FILE_VERSION] = file_version 32 | base[CopyWriteMetadataKey.METADATA] = metadata 33 | 34 | return base 35 | -------------------------------------------------------------------------------- /dss/storage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/dss/storage/__init__.py -------------------------------------------------------------------------------- /dss/storage/checkout/cache_flow.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | """ 4 | These functions assist with the caching process and provide greater availability of heavily accessed files to the user. 5 | 6 | The criteria used to determine if a file should be cached or not is set with: CHECKOUT_CACHE_CRITERIA 7 | For example: CHECKOUT_CACHE_CRITERIA='[{"type":"application/json","max_size":12314}]' 8 | 9 | Uncached files are controlled by a lifecycle policy that deletes them regularly. Cached files are ignored by 10 | this lifecycle policy and are (currently) never deleted. 11 | 12 | For AWS object tagging is used to mark uncached files: TagSet=[{uncached:True}] 13 | For GCP object storage classes are used to indicate what is to be cached: STANDARD (MULTI_REGIONAL) are cached 14 | 15 | Metadata Caching RFC: https://docs.google.com/document/d/1PQBO5qYUVJFAXFNaMdgxq8j0y-OI_EF2b15I6fvEYjo 16 | """ 17 | 18 | 19 | def is_dss_bucket(dst_bucket: str): 20 | """Function checks if the passed bucket is managed by the DSS""" 21 | return dst_bucket in (os.environ['DSS_S3_CHECKOUT_BUCKET'], os.environ['DSS_GS_CHECKOUT_BUCKET']) 22 | 23 | 24 | def should_cache_file(content_type: str, size: int) -> bool: 25 | """Returns True if a file should be cached (marked as long-lived) for the dss checkout bucket.""" 26 | # Each file type may have a size limit that determines uncached status. 27 | cache_criteria = json.loads(os.getenv("CHECKOUT_CACHE_CRITERIA")) 28 | for file_criteria in cache_criteria: 29 | if content_type.startswith(file_criteria['type']) and file_criteria['max_size'] >= size: 30 | return True 31 | return False 32 | -------------------------------------------------------------------------------- /dss/storage/checkout/common.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import uuid 3 | 4 | from dss import stepfunctions 5 | from dss.config import Replica 6 | from dss.stepfunctions import s3copyclient, gscopyclient 7 | 8 | 9 | log = logging.getLogger(__package__) 10 | 11 | 12 | class CheckoutTokenKeys: 13 | """ 14 | When we are executing a request that involves a checkout, the client will periodically check back in to see if the 15 | checkout is complete. To avoid duplicating checkout requests, the client will check back using a token. These are 16 | keys that make up the token. 
17 | """ 18 | EXECUTION_ID = "execution_id" 19 | """Execution ID of the step function managing the checkout.""" 20 | 21 | START_TIME = "start_time" 22 | """Start time of the request.""" 23 | 24 | ATTEMPTS = "attempts" 25 | """Number of times the client has attempted to check on the state of a checkout.""" 26 | 27 | 28 | def get_execution_id() -> str: 29 | return str(uuid.uuid4()) 30 | 31 | 32 | def parallel_copy( 33 | replica: Replica, 34 | source_bucket: str, 35 | source_key: str, 36 | destination_bucket: str, 37 | destination_key: str) -> str: 38 | log.debug(f"Copy file from bucket {source_bucket} with key {source_key} to " 39 | f"bucket {destination_bucket} destination file: {destination_key}") 40 | 41 | if replica == Replica.aws: 42 | state = s3copyclient.copy_sfn_event( 43 | source_bucket, source_key, 44 | destination_bucket, destination_key, 45 | ) 46 | state_machine_name_template = "dss-s3-copy-sfn-{stage}" 47 | elif replica == Replica.gcp: 48 | state = gscopyclient.copy_sfn_event( 49 | source_bucket, source_key, 50 | destination_bucket, destination_key 51 | ) 52 | state_machine_name_template = "dss-gs-copy-sfn-{stage}" 53 | else: 54 | raise ValueError("Unsupported replica") 55 | 56 | execution_id = get_execution_id() 57 | stepfunctions.step_functions_invoke(state_machine_name_template, execution_id, state) 58 | return execution_id 59 | -------------------------------------------------------------------------------- /dss/storage/checkout/error.py: -------------------------------------------------------------------------------- 1 | class TokenError(Exception): 2 | """Raised when we can't parse the token or it is missing fields.""" 3 | pass 4 | 5 | 6 | class CheckoutError(Exception): 7 | """Raised when the checkout fails.""" 8 | pass 9 | 10 | 11 | class PreExecCheckoutError(CheckoutError): 12 | """Raised when one of the quick checks before we start the checkout fails.""" 13 | pass 14 | 15 | 16 | class BundleNotFoundError(PreExecCheckoutError): 17 | """Raised when we attempt to check out a non-existent bundle.""" 18 | pass 19 | 20 | 21 | class DestinationBucketNotFoundError(PreExecCheckoutError): 22 | """Raised when we attempt to check out to a non-existent bucket.""" 23 | pass 24 | 25 | 26 | class DestinationBucketNotWritableError(PreExecCheckoutError): 27 | """Raised when we attempt to check out to a bucket that we can't write to.""" 28 | pass 29 | -------------------------------------------------------------------------------- /dss/storage/checkout/file.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the module for file checkouts. 3 | """ 4 | import typing 5 | 6 | from dss.config import Replica 7 | from .common import parallel_copy 8 | 9 | 10 | def start_file_checkout(replica: Replica, blob_key, dst_bucket: typing.Optional[str] = None) -> str: 11 | """ 12 | Starts a file checkout. 13 | 14 | :param blob_key: The key of the blob that contains the file. 15 | :param replica: The replica to execute the checkout in. 16 | :param dst_bucket: If provided, check out to this bucket. If not provided, check out to the default checkout bucket 17 | for the replica. 18 | :return: The execution ID of the request. 19 | """ 20 | dst_bucket = dst_bucket or replica.checkout_bucket 21 | # change the assert below once the bug causing this to occur is found 22 | assert dst_bucket != replica.bucket, f'Cannot checkout a file from {dst_bucket} to itself!' 
23 | source_bucket = replica.bucket 24 | return parallel_copy(replica, source_bucket, blob_key, dst_bucket, get_dst_key(blob_key)) 25 | 26 | 27 | def get_dst_key(blob_key: str): 28 | """ 29 | Returns the destination key where a file checkout will be saved to. 30 | :param blob_key: The key for the file's data in the DSS bucket. 31 | :return: The key for the file's data in the checkout bucket. 32 | """ 33 | return f"{blob_key}" 34 | -------------------------------------------------------------------------------- /dss/storage/files.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | from cloud_blobstore import BlobAlreadyExistsError, BlobNotFoundError, BlobStore 4 | 5 | 6 | def write_file_metadata( 7 | handle: BlobStore, 8 | dst_bucket: str, 9 | file_uuid: str, 10 | file_version: str, 11 | document: str): 12 | # what's the target object name for the file metadata? 13 | metadata_key = f"files/{file_uuid}.{file_version}" 14 | 15 | # if it already exists, then it's a failure. 16 | try: 17 | handle.get_user_metadata(dst_bucket, metadata_key) 18 | except BlobNotFoundError: 19 | pass 20 | else: 21 | raise BlobAlreadyExistsError() 22 | 23 | handle.upload_file_handle( 24 | dst_bucket, 25 | metadata_key, 26 | io.BytesIO(document.encode("utf-8"))) 27 | -------------------------------------------------------------------------------- /dss/storage/hcablobstore/gs.py: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | from . import HCABlobStore 4 | 5 | 6 | class GSHCABlobStore(HCABlobStore): 7 | def verify_blob_checksum_from_staging_metadata( 8 | self, bucket: str, key: str, metadata: typing.Dict[str, str]) -> bool: 9 | """ 10 | Given a blob, verify that the checksum on the cloud store matches the checksum in the metadata dictionary. The 11 | keys to the metadata dictionary will be the items in ``MANDATORY_METADATA``. Each cloud-specific implementation 12 | of ``HCABlobStore`` should extract the correct field and check it against the cloud-provided checksum. 13 | :param bucket: 14 | :param key: 15 | :param metadata: 16 | :return: True iff the checksum is correct. 17 | """ 18 | checksum = self.handle.get_cloud_checksum(bucket, key) 19 | metadata_checksum_key = typing.cast(str, HCABlobStore.MANDATORY_STAGING_METADATA['CRC32C']['keyname']) 20 | return checksum.lower() == metadata[metadata_checksum_key].lower() 21 | 22 | def verify_blob_checksum_from_dss_metadata( 23 | self, bucket: str, key: str, dss_metadata: typing.Dict[str, str]) -> bool: 24 | """ 25 | Given a blob, verify that the checksum on the cloud store matches the checksum in the metadata stored in the 26 | DSS. Each cloud-specific implementation of ``HCABlobStore`` should extract the correct field and check it 27 | against the cloud-provided checksum. 28 | :param bucket: 29 | :param key: 30 | :param dss_metadata: 31 | :return: True iff the checksum is correct. 32 | """ 33 | checksum = self.handle.get_cloud_checksum(bucket, key) 34 | return checksum.lower() == dss_metadata["crc32c"].lower() 35 | -------------------------------------------------------------------------------- /dss/storage/hcablobstore/s3.py: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | from . 
import HCABlobStore 4 | 5 | 6 | class S3HCABlobStore(HCABlobStore): 7 | def verify_blob_checksum_from_staging_metadata( 8 | self, bucket: str, key: str, metadata: typing.Dict[str, str]) -> bool: 9 | """ 10 | Given a blob, verify that the checksum on the cloud store matches the checksum in the metadata dictionary. The 11 | keys to the metadata dictionary will be the items in ``MANDATORY_METADATA``. Each cloud-specific implementation 12 | of ``HCABlobStore`` should extract the correct field and check it against the cloud-provided checksum. 13 | :param bucket: 14 | :param key: 15 | :param metadata: 16 | :return: True iff the checksum is correct. 17 | """ 18 | checksum = self.handle.get_cloud_checksum(bucket, key) 19 | metadata_checksum_key = typing.cast(str, HCABlobStore.MANDATORY_STAGING_METADATA['S3_ETAG']['keyname']) 20 | return checksum.lower() == metadata[metadata_checksum_key].lower() 21 | 22 | def verify_blob_checksum_from_dss_metadata( 23 | self, bucket: str, key: str, dss_metadata: typing.Dict[str, str]) -> bool: 24 | """ 25 | Given a blob, verify that the checksum on the cloud store matches the checksum in the metadata stored in the 26 | DSS. Each cloud-specific implementation of ``HCABlobStore`` should extract the correct field and check it 27 | against the cloud-provided checksum. 28 | :param bucket: 29 | :param key: 30 | :param dss_metadata: 31 | :return: True iff the checksum is correct. 32 | """ 33 | checksum = self.handle.get_cloud_checksum(bucket, key) 34 | return checksum.lower() == dss_metadata["s3-etag"].lower() 35 | -------------------------------------------------------------------------------- /dss/subscriptions_v2/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | from dss.config import Replica 5 | from dss import dynamodb # type: ignore 6 | 7 | 8 | class SubscriptionData: 9 | REPLICA = 'replica' 10 | OWNER = 'owner' 11 | UUID = 'uuid' 12 | CALLBACK_URL = 'callback_url' 13 | JMESPATH_QUERY = 'jmespath_query' 14 | METHOD = 'method' 15 | ENCODING = 'encoding' 16 | FORM_FIELDS = 'form_fields' 17 | PAYLOAD_FORM_FIELD = 'payload_form_field' 18 | ATTACHMENTS = 'attachments' 19 | 20 | 21 | subscription_db_table = f"dss-subscriptions-v2-{{}}-{os.environ['DSS_DEPLOYMENT_STAGE']}" 22 | 23 | 24 | def put_subscription(doc: dict): 25 | dynamodb.put_item(table=subscription_db_table.format(doc[SubscriptionData.REPLICA]), 26 | hash_key=doc[SubscriptionData.OWNER], 27 | sort_key=doc[SubscriptionData.UUID], 28 | value=json.dumps(doc)) 29 | 30 | 31 | def get_subscription(replica: Replica, owner: str, uuid: str): 32 | try: 33 | item = dynamodb.get_item(table=subscription_db_table.format(replica.name), 34 | hash_key=owner, 35 | sort_key=uuid) 36 | return json.loads(item) 37 | except dynamodb.DynamoDBItemNotFound: 38 | return None 39 | 40 | 41 | def get_subscriptions_for_owner(replica: Replica, owner: str) -> list: 42 | items = dynamodb.get_primary_key_items(table=subscription_db_table.format(replica.name), 43 | key=owner) 44 | return [json.loads(item) for item in items] 45 | 46 | 47 | def count_subscriptions_for_owner(replica: Replica, owner: str) -> int: 48 | return dynamodb.get_primary_key_count(table=subscription_db_table.format(replica.name), 49 | key=owner) 50 | 51 | 52 | def get_subscriptions_for_replica(replica: Replica) -> list: 53 | items = dynamodb.get_all_table_items(table=subscription_db_table.format(replica.name)) 54 | return [json.loads(item) for item in items] 55 | 56 | 57 | def 
delete_subscription(replica: Replica, owner: str, uuid: str): 58 | dynamodb.delete_item(table=subscription_db_table.format(replica.name), 59 | hash_key=owner, 60 | sort_key=uuid) 61 | -------------------------------------------------------------------------------- /dss/test.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/dss/test.log -------------------------------------------------------------------------------- /dss/util/aws/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import botocore 3 | from . import clients, resources, cloudwatch_logging 4 | 5 | 6 | class ARN: 7 | fields = "arn partition service region account_id resource".split() 8 | _default_region, _default_account_id, _default_iam_username = None, None, None 9 | 10 | def __init__(self, arn="arn:aws::::", **kwargs): 11 | self.__dict__.update(dict(zip(self.fields, arn.split(":", 5)), **kwargs)) 12 | if "region" not in kwargs and not self.region: 13 | self.region = self.get_region() 14 | if "account_id" not in kwargs and not self.account_id: 15 | self.account_id = self.get_account_id() 16 | 17 | @classmethod 18 | def get_region(cls): 19 | if cls._default_region is None: 20 | cls._default_region = botocore.session.Session().get_config_variable("region") 21 | return cls._default_region 22 | 23 | @classmethod 24 | def get_account_id(cls): 25 | if cls._default_account_id is None: 26 | cls._default_account_id = clients.sts.get_caller_identity()["Account"] 27 | return cls._default_account_id 28 | 29 | def __str__(self): 30 | return ":".join(getattr(self, field) for field in self.fields) 31 | 32 | 33 | def send_sns_msg(topic_arn, message, attributes=None): 34 | sns_topic = resources.sns.Topic(str(topic_arn)) 35 | args = {'Message': json.dumps(message)} 36 | if attributes is not None: 37 | args['MessageAttributes'] = attributes 38 | sns_topic.publish(**args) 39 | -------------------------------------------------------------------------------- /dss/util/aws/_boto3_loader.py: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | class Loader: 4 | cache = dict(resource={}, client={}) # type: typing.Dict[str, dict] 5 | def __init__(self, factory): 6 | self.factory = factory 7 | 8 | def __getattr__(self, attr): 9 | if attr == "__all__": 10 | return list(self.cache[self.factory]) 11 | if attr == "__path__" or attr == "__loader__": 12 | return None 13 | if attr not in self.cache[self.factory]: 14 | if self.factory == "client" and attr in self.cache["resource"]: 15 | self.cache["client"][attr] = self.cache["resource"][attr].meta.client 16 | else: 17 | import boto3 18 | factory = getattr(boto3, self.factory) 19 | self.cache[self.factory][attr] = factory(attr) 20 | return self.cache[self.factory][attr] 21 | -------------------------------------------------------------------------------- /dss/util/aws/clients.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from ._boto3_loader import Loader 3 | 4 | sys.modules[__name__] = Loader("client") # type: ignore 5 | -------------------------------------------------------------------------------- /dss/util/aws/cloudwatch_logging.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import boto3 4 | import botocore.exceptions 5 | 6 | 7 | def 
log_message(log_group_name: str, log_stream_name: str, message: str): 8 | """Logs a message to cloudwatch.""" 9 | 10 | logs_client = boto3.client("logs") 11 | 12 | def get_sequence_token(): 13 | # try to get the upload sequence token 14 | paginator = logs_client.get_paginator('describe_log_streams') 15 | for page in paginator.paginate(logGroupName=log_group_name, logStreamNamePrefix=log_stream_name): 16 | for log_stream in page['logStreams']: 17 | if log_stream['logStreamName'] == log_stream_name: 18 | return log_stream.get('uploadSequenceToken', None) 19 | 20 | return None 21 | 22 | while True: 23 | try: 24 | logs_client.create_log_group(logGroupName=log_group_name) 25 | except logs_client.exceptions.ResourceAlreadyExistsException: 26 | pass 27 | try: 28 | logs_client.create_log_stream( 29 | logGroupName=log_group_name, logStreamName=log_stream_name) 30 | except logs_client.exceptions.ResourceAlreadyExistsException: 31 | pass 32 | 33 | sequence_token = get_sequence_token() 34 | 35 | try: 36 | kwargs = dict( 37 | logGroupName=log_group_name, 38 | logStreamName=log_stream_name, 39 | logEvents=[dict( 40 | timestamp=int(time.time() * 1000), 41 | message=message, 42 | )], 43 | ) 44 | if sequence_token is not None: 45 | kwargs['sequenceToken'] = sequence_token 46 | 47 | logs_client.put_log_events(**kwargs) 48 | break 49 | except logs_client.exceptions.InvalidSequenceTokenException: 50 | pass 51 | -------------------------------------------------------------------------------- /dss/util/aws/resources.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from ._boto3_loader import Loader 3 | 4 | sys.modules[__name__] = Loader("resource") # type: ignore 5 | -------------------------------------------------------------------------------- /dss/util/email.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | from dss import Replica 4 | 5 | # The character encoding for the email. 6 | CHARSET = "UTF-8" 7 | SUCCESS_SUBJECT = "Bundle checkout complete" 8 | FAILURE_SUBJECT = "Bundle checkout failed" 9 | 10 | # Create a new SES resource 11 | client = boto3.client('ses') 12 | 13 | def send_email(sender: str, to: str, subject: str, html: str, text: str) -> str: 14 | # Provide the contents of the email. 15 | response = client.send_email( 16 | Destination={ 17 | 'ToAddresses': [ 18 | to 19 | ], 20 | }, 21 | Message={ 22 | 'Body': { 23 | 'Html': { 24 | 'Charset': CHARSET, 25 | 'Data': html, 26 | }, 27 | 'Text': { 28 | 'Charset': CHARSET, 29 | 'Data': text, 30 | }, 31 | }, 32 | 'Subject': { 33 | 'Charset': CHARSET, 34 | 'Data': subject, 35 | }, 36 | }, 37 | Source=sender, 38 | ) 39 | return "Email sent! Message ID: {}".format(response['ResponseMetadata']['RequestId']) 40 | 41 | def send_checkout_success_email(sender: str, to: str, bucket: str, location: str, replica: Replica): 42 | text = "Hello, your checkout request has been processed. Your files are available at bucket {} location {}.".\ 43 | format(bucket, location) 44 | 45 | html = """ 46 | 47 | 48 |

Hello, 49 | 50 | Your checkout request has been processed. 51 | Your files are available at {}://{}/{} 52 |
53 | 54 | 55 | """.format(replica.storage_schema, bucket, location) 56 | return send_email(sender, to, SUCCESS_SUBJECT, html, text) 57 | 58 | 59 | def send_checkout_failure_email(sender: str, to: str, cause: str): 60 | text = "Hello, your checkout request has failed due to {}.".format(cause) 61 | html = """ 62 | 63 | 64 |

Hello, 65 | Your checkout request has failed due to {}.
66 | 67 | 68 | """.format(cause) 69 | return send_email(sender, to, FAILURE_SUBJECT, html, text) 70 | -------------------------------------------------------------------------------- /dss/util/json_gen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/dss/util/json_gen/__init__.py -------------------------------------------------------------------------------- /dss/util/networking.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import socket 3 | 4 | 5 | def unused_tcp_port(): 6 | with contextlib.closing(socket.socket()) as sock: 7 | sock.bind(('127.0.0.1', 0)) 8 | return sock.getsockname()[1] 9 | -------------------------------------------------------------------------------- /dss/util/time.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | import time 3 | 4 | from dss.util import require 5 | from dss.util.types import LambdaContext 6 | 7 | 8 | class RemainingTime(metaclass=ABCMeta): 9 | """ 10 | A monotonically decreasing, non-negative estimate of time remaining in a particular context 11 | """ 12 | 13 | @abstractmethod 14 | def get(self) -> float: 15 | """ 16 | Returns the estimated remaining time in seconds 17 | """ 18 | raise NotImplementedError() 19 | 20 | 21 | class RemainingLambdaContextTime(RemainingTime): 22 | """ 23 | The estimated running time in an AWS Lambda context 24 | """ 25 | 26 | def __init__(self, context: LambdaContext) -> None: 27 | super().__init__() 28 | self._context = context 29 | 30 | def get(self) -> float: 31 | return self._context.get_remaining_time_in_millis() / 1000 32 | 33 | 34 | class RemainingTimeUntil(RemainingTime): 35 | """ 36 | The remaining wall clock time up to a given absolute deadline in terms of time.time() 37 | """ 38 | 39 | def __init__(self, deadline: float) -> None: 40 | super().__init__() 41 | self._deadline = deadline 42 | 43 | def get(self) -> float: 44 | return max(0.0, self._deadline - time.time()) 45 | 46 | 47 | class SpecificRemainingTime(RemainingTimeUntil): 48 | """ 49 | A specific relative amount of wall clock time in seconds 50 | """ 51 | 52 | def __init__(self, amount: float) -> None: 53 | require(amount >= 0, "Inital remaining time must be non-negative") 54 | super().__init__(time.time() + amount) 55 | 56 | 57 | class AdjustedRemainingTime(RemainingTime): 58 | """ 59 | Some other estimate of remaining time, adjusted by a fixed offset. Use a negative offset to reduce the remaining 60 | time or a positive offset to increase it. 
61 | """ 62 | 63 | def __init__(self, offset: float, actual: RemainingTime) -> None: 64 | super().__init__() 65 | self._offset = offset 66 | self._actual = actual 67 | 68 | def get(self) -> float: 69 | return max(0.0, self._actual.get() + self._offset) 70 | -------------------------------------------------------------------------------- /dss/util/types.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Mapping, Any, List 2 | 3 | JSONObj = Mapping[str, Any] 4 | 5 | # Strictly speaking, this is the generic JSON type: 6 | 7 | AnyJSON = Union[str, int, float, bool, None, JSONObj, List[Any]] 8 | 9 | # Most JSON structures, however, start with an JSON object, so we'll use the shorter name for that type: 10 | 11 | JSON = Mapping[str, AnyJSON] 12 | 13 | 14 | # A stub for the AWS Lambda context 15 | 16 | class LambdaContext(object): 17 | 18 | @property 19 | def aws_request_id(self) -> str: 20 | raise NotImplementedError 21 | 22 | @property 23 | def log_group_name(self) -> str: 24 | raise NotImplementedError 25 | 26 | @property 27 | def log_stream_name(self) -> str: 28 | raise NotImplementedError 29 | 30 | @property 31 | def function_name(self) -> str: 32 | raise NotImplementedError 33 | 34 | @property 35 | def memory_limit_in_mb(self) -> str: 36 | raise NotImplementedError 37 | 38 | @property 39 | def function_version(self) -> str: 40 | raise NotImplementedError 41 | 42 | @property 43 | def invoked_function_arn(self) -> str: 44 | raise NotImplementedError 45 | 46 | def get_remaining_time_in_millis(self) -> int: 47 | raise NotImplementedError 48 | 49 | def log(self, msg: str) -> None: 50 | raise NotImplementedError 51 | -------------------------------------------------------------------------------- /dss/util/version.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | _datetime_format = "%Y-%m-%dT%H%M%S.%fZ" 4 | 5 | def datetime_to_version_format(timestamp: datetime.datetime) -> str: 6 | return timestamp.strftime(_datetime_format) 7 | 8 | def datetime_from_timestamp(ts: str) -> datetime.datetime: 9 | return datetime.datetime.strptime(ts, _datetime_format) 10 | -------------------------------------------------------------------------------- /dss/vendored/README.md: -------------------------------------------------------------------------------- 1 | This directory contains vendored distributions. 2 | 3 | Please link each vendored distribution along with a link to the tree at the 4 | exact commit when the tree was copied. 5 | 6 | * [frozendict](https://github.com/slezica/python-frozendict/tree/7e078bf084ee734367dde8db2c8a2f00ec37375f) 7 | 8 | Place the vendored distributions's license into LICENSE at the project root. 9 | 10 | Try maintain the package path. 
If the unvendored distribution introduces the 11 | package `foo.bar`, the vendored package should be at `dss.vendored.foo.bar` 12 | -------------------------------------------------------------------------------- /dss/vendored/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/dss/vendored/__init__.py -------------------------------------------------------------------------------- /dss/vendored/frozendict/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict, Mapping 2 | 3 | 4 | class frozendict(Mapping): 5 | """ 6 | An immutable wrapper around dictionaries that implements the complete :py:class:`collections.Mapping` 7 | interface. It can be used as a drop-in replacement for dictionaries where immutability is desired. 8 | 9 | >>> fd1 = frozendict(a=1, b=2) 10 | 11 | Frozendicts are hashable and thus can be used in sets and as keys on dictionaries: 12 | 13 | >>> {fd1:1, fd1:2}[fd1] 14 | 2 15 | 16 | They can be copied and have pass-by-value semantics (as opposed to pass-by-reference sematics): 17 | 18 | >>> fd2=fd1.copy() 19 | >>> fd1 == fd2 20 | True 21 | >>> fd1 is fd2 22 | False 23 | >>> {fd1:1, fd2:2}[fd1] 24 | 2 25 | 26 | They are also immutable: 27 | 28 | >>> fd1['a'] = 3 29 | Traceback (most recent call last): 30 | ... 31 | TypeError: 'frozendict' object does not support item assignment 32 | >>> del fd1['a'] 33 | Traceback (most recent call last): 34 | ... 35 | TypeError: 'frozendict' object does not support item deletion 36 | >>> fd1.keys().remove('a') 37 | Traceback (most recent call last): 38 | ... 39 | AttributeError: 'KeysView' object has no attribute 'remove' 40 | """ 41 | 42 | dict_cls = dict 43 | 44 | def __init__(self, *args, **kwargs): 45 | self._dict = self.dict_cls(*args, **kwargs) 46 | self._hash = None 47 | 48 | def __getitem__(self, key): 49 | return self._dict[key] 50 | 51 | def __contains__(self, key): 52 | return key in self._dict 53 | 54 | def copy(self, **add_or_replace): 55 | return self.__class__(self, **add_or_replace) 56 | 57 | def __iter__(self): 58 | return iter(self._dict) 59 | 60 | def __len__(self): 61 | return len(self._dict) 62 | 63 | def __repr__(self): 64 | return '<%s %r>' % (self.__class__.__name__, self._dict) 65 | 66 | def __hash__(self): 67 | if self._hash is None: 68 | h = 0 69 | for key, value in self._dict.items(): 70 | h ^= hash((key, value)) 71 | self._hash = h 72 | return self._hash 73 | 74 | 75 | class FrozenOrderedDict(frozendict): 76 | """ 77 | A frozendict subclass that maintains key order 78 | """ 79 | 80 | dict_cls = OrderedDict 81 | -------------------------------------------------------------------------------- /environment.integration: -------------------------------------------------------------------------------- 1 | # HCA DSS environment variables: integration deployment 2 | 3 | set -a 4 | DSS_DEPLOYMENT_STAGE=integration 5 | DSS_S3_BUCKET=$DSS_S3_BUCKET_INTEGRATION 6 | DSS_GS_BUCKET=$DSS_GS_BUCKET_INTEGRATION 7 | DSS_S3_CHECKOUT_BUCKET=$DSS_S3_CHECKOUT_BUCKET_INTEGRATION 8 | DSS_GS_CHECKOUT_BUCKET=$DSS_GS_CHECKOUT_BUCKET_INTEGRATION 9 | DSS_ES_DOMAIN="dss-index-$DSS_DEPLOYMENT_STAGE" 10 | DCP_DOMAIN=${DSS_DEPLOYMENT_STAGE}.data.humancellatlas.org 11 | # TODO remove https://dev.data.humancellatlas.org/ from OIDC_AUDIENCE 12 | OIDC_AUDIENCE=https://dev.data.humancellatlas.org/,https://${DCP_DOMAIN}/ 13 | 
API_DOMAIN_NAME="dss.${DCP_DOMAIN}" 14 | DSS_GCP_SERVICE_ACCOUNT_NAME="org-humancellatlas-integration" 15 | DSS_ZONE_NAME="integration.data.humancellatlas.org." 16 | ACM_CERTIFICATE_IDENTIFIER="390e82ea-a684-49f4-a4b3-857f22cee874" 17 | DSS_CHECKOUT_BUCKET_OBJECT_VIEWERS="serviceAccount:1037839730885-compute@developer.gserviceaccount.com,serviceAccount:caas-account@broad-dsde-mint-test.iam.gserviceaccount.com,serviceAccount:caas-prod-account-for-int@broad-dsde-mint-integration.iam.gserviceaccount.com,serviceAccount:cromwell-metadata-uploader@broad-dsde-mint-integration.iam.gserviceaccount.com,serviceAccount:bluebox-subscription-manager@broad-dsde-mint-integration.iam.gserviceaccount.com" 18 | DSS_TERRAFORM_BACKEND_BUCKET_TEMPLATE="org-humancellatlas-dss-{account_id}-${DSS_DEPLOYMENT_STAGE}-terraform" 19 | DSS_FLASHFLOOD_BUCKET=$DSS_FLASHFLOOD_BUCKET_INTEGRATION 20 | AUTH_URL=https://auth.integration.data.humancellatlas.org 21 | DSS_AWS_FLASHFLOOD_PREFIX_READ=$DSS_AWS_FLASHFLOOD_PREFIX_READ_INTEGRATION 22 | DSS_AWS_FLASHFLOOD_PREFIX_WRITE=$DSS_AWS_FLASHFLOOD_PREFIX_WRITE_INTEGRATION 23 | DSS_GCP_FLASHFLOOD_PREFIX_READ=$DSS_GCP_FLASHFLOOD_PREFIX_READ_INTEGRATION 24 | DSS_GCP_FLASHFLOOD_PREFIX_WRITE=$DSS_GCP_FLASHFLOOD_PREFIX_WRITE_INTEGRATION 25 | set +a 26 | 27 | if [[ -f "${DSS_HOME}/environment.integration.local" ]]; then 28 | source "${DSS_HOME}/environment.integration.local" 29 | fi 30 | -------------------------------------------------------------------------------- /environment.local.example: -------------------------------------------------------------------------------- 1 | # HCA DSS environment variables: local configuration file 2 | 3 | # Copy this file to environment.local then set your site-specific or deploy-specific environment variable values there. 4 | # These settings override the values in the "environment" file in this directory. 5 | # This file is sourced when you run "source environment". 6 | 7 | set -a 8 | # Environment variable assignments (var=value) in "set -a" mode are 9 | # automatically exported. 10 | set +a 11 | -------------------------------------------------------------------------------- /environment.prod: -------------------------------------------------------------------------------- 1 | # HCA DSS environment variables: production deployment 2 | 3 | set -a 4 | DSS_DEPLOYMENT_STAGE=prod 5 | DSS_S3_BUCKET=$DSS_S3_BUCKET_PROD 6 | DSS_GS_BUCKET=$DSS_GS_BUCKET_PROD 7 | DSS_S3_CHECKOUT_BUCKET=$DSS_S3_CHECKOUT_BUCKET_PROD 8 | DSS_GS_CHECKOUT_BUCKET=$DSS_GS_CHECKOUT_BUCKET_PROD 9 | DSS_ES_DOMAIN="dss-index-$DSS_DEPLOYMENT_STAGE" 10 | DCP_DOMAIN=data.humancellatlas.org 11 | # TODO remove https://dev.data.humancellatlas.org/ from OIDC_AUDIENCE 12 | OIDC_AUDIENCE=https://dev.data.humancellatlas.org/,https://${DCP_DOMAIN}/ 13 | API_DOMAIN_NAME="dss.${DCP_DOMAIN}" 14 | ACM_CERTIFICATE_IDENTIFIER="8dac5c5d-2742-4564-bc85-b7b3e251a51c" 15 | DSS_GCP_SERVICE_ACCOUNT_NAME="org-humancellatlas-prod" 16 | DSS_TERRAFORM_BACKEND_BUCKET_TEMPLATE="org-humancellatlas-dss-109067257620-${DSS_DEPLOYMENT_STAGE}-terraform" 17 | DSS_ZONE_NAME="${DCP_DOMAIN}." 
18 | DSS_ES_INSTANCE_TYPE="m4.2xlarge.elasticsearch" 19 | DSS_ES_INSTANCE_COUNT="3" 20 | DSS_ES_VOLUME_SIZE="512" # Maximum volume size for m4.large.elasticsearch 21 | # human-cell-atlas-travis-test has been removed to not allow CI systems to push data to the DSS 22 | DSS_AUTHORIZED_GOOGLE_PROJECT_DOMAIN_ARRAY=( 23 | broad-dsde-mint-{dev,test,staging}.iam.gserviceaccount.com 24 | ) 25 | DSS_AUTHORIZED_DOMAINS=${DSS_AUTHORIZED_GOOGLE_PROJECT_DOMAIN_ARRAY[*]} 26 | DSS_AUTHORIZED_DOMAINS="hca-dcp-production.iam.gserviceaccount.com hca-dcp-pipelines-prod.iam.gserviceaccount.com ${DSS_AUTHORIZED_DOMAINS}" 27 | DSS_CHECKOUT_BUCKET_OBJECT_VIEWERS="serviceAccount:619310558212-compute@developer.gserviceaccount.com,serviceAccount:caas-account@broad-dsde-mint-dev.iam.gserviceaccount.com,serviceAccount:caas-prod-account-for-dev@broad-dsde-mint-dev.iam.gserviceaccount.com,group:GROUP_All_Users@firecloud.org,serviceAccount:cromwell-metadata-uploader@hca-dcp-pipelines-prod.iam.gserviceaccount.com" 28 | DSS_TERRAFORM_BACKEND_BUCKET_TEMPLATE="org-humancellatlas-dss-{account_id}-${DSS_DEPLOYMENT_STAGE}-terraform" 29 | DSS_FLASHFLOOD_BUCKET=$DSS_FLASHFLOOD_BUCKET_PROD 30 | AUTH_URL=https://auth.data.humancellatlas.org 31 | DSS_AWS_FLASHFLOOD_PREFIX_READ=$DSS_AWS_FLASHFLOOD_PREFIX_READ_PROD 32 | DSS_AWS_FLASHFLOOD_PREFIX_WRITE=$DSS_AWS_FLASHFLOOD_PREFIX_WRITE_PROD 33 | DSS_GCP_FLASHFLOOD_PREFIX_READ=$DSS_GCP_FLASHFLOOD_PREFIX_READ_PROD 34 | DSS_GCP_FLASHFLOOD_PREFIX_WRITE=$DSS_GCP_FLASHFLOOD_PREFIX_WRITE_PROD 35 | GCP_PROJECT_NAME="hca-dcp-dss-prod" 36 | set +a 37 | 38 | if [[ -f "${DSS_HOME}/environment.prod.local" ]]; then 39 | source "${DSS_HOME}/environment.prod.local" 40 | fi 41 | -------------------------------------------------------------------------------- /environment.staging: -------------------------------------------------------------------------------- 1 | # HCA DSS environment variables: staging deployment 2 | 3 | set -a 4 | DSS_DEPLOYMENT_STAGE=staging 5 | DSS_S3_BUCKET=$DSS_S3_BUCKET_STAGING 6 | DSS_GS_BUCKET=$DSS_GS_BUCKET_STAGING 7 | DSS_S3_CHECKOUT_BUCKET=$DSS_S3_CHECKOUT_BUCKET_STAGING 8 | DSS_GS_CHECKOUT_BUCKET=$DSS_GS_CHECKOUT_BUCKET_STAGING 9 | # `staging` currently shares the ES domain with `dev` 10 | DSS_ES_DOMAIN="dss-index-$DSS_DEPLOYMENT_STAGE" 11 | DSS_ES_DOMAIN_INDEX_LOGS_ENABLED="false" 12 | DCP_DOMAIN=${DSS_DEPLOYMENT_STAGE}.data.humancellatlas.org 13 | # TODO remove https://dev.data.humancellatlas.org/ from OIDC_AUDIENCE 14 | OIDC_AUDIENCE=https://dev.data.humancellatlas.org/,https://${DCP_DOMAIN}/ 15 | API_DOMAIN_NAME="dss.${DCP_DOMAIN}" 16 | DSS_ZONE_NAME="staging.data.humancellatlas.org." 
17 | ACM_CERTIFICATE_IDENTIFIER="99fff90e-6ff5-44a5-852e-67c78f88c1f1" 18 | DSS_GCP_SERVICE_ACCOUNT_NAME="org-humancellatlas-staging" 19 | DSS_CHECKOUT_BUCKET_OBJECT_VIEWERS="serviceAccount:154609999906-compute@developer.gserviceaccount.com,serviceAccount:caas-account@broad-dsde-mint-staging.iam.gserviceaccount.com,serviceAccount:caas-prod-account-for-staging@broad-dsde-mint-staging.iam.gserviceaccount.com,serviceAccount:cromwell-metadata-uploader@broad-dsde-mint-staging.iam.gserviceaccount.com" 20 | DSS_TERRAFORM_BACKEND_BUCKET_TEMPLATE="org-humancellatlas-dss-{account_id}-${DSS_DEPLOYMENT_STAGE}-terraform" 21 | DSS_FLASHFLOOD_BUCKET=$DSS_FLASHFLOOD_BUCKET_STAGING 22 | AUTH_URL=https://auth.staging.data.humancellatlas.org 23 | DSS_AWS_FLASHFLOOD_PREFIX_READ=$DSS_AWS_FLASHFLOOD_PREFIX_READ_STAGING 24 | DSS_AWS_FLASHFLOOD_PREFIX_WRITE=$DSS_AWS_FLASHFLOOD_PREFIX_WRITE_STAGING 25 | DSS_GCP_FLASHFLOOD_PREFIX_READ=$DSS_GCP_FLASHFLOOD_PREFIX_READ_STAGING 26 | DSS_GCP_FLASHFLOOD_PREFIX_WRITE=$DSS_GCP_FLASHFLOOD_PREFIX_WRITE_STAGING 27 | set +a 28 | 29 | if [[ -f "${DSS_HOME}/environment.staging.local" ]]; then 30 | source "${DSS_HOME}/environment.staging.local" 31 | fi 32 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-dlq-reaper-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents" 10 | ], 11 | "Resource": "arn:aws:logs:*:*:*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": [ 16 | "sqs:Publish", 17 | "sqs:ReceiveMessage", 18 | "sqs:SendMessage", 19 | "sqs:GetQueueUrl", 20 | "sqs:DeleteMessage" 21 | ], 22 | "Resource": [ 23 | "arn:aws:sqs:*:$account_id:dss-dlq-$stage" 24 | ] 25 | }, 26 | { 27 | "Effect": "Allow", 28 | "Action": "sns:Publish", 29 | "Resource": [ 30 | "arn:aws:sns:*:$account_id:dss-*-$stage" 31 | ] 32 | }, 33 | { 34 | "Effect": "Allow", 35 | "Action": [ 36 | "xray:PutTelemetryRecords", 37 | "xray:PutTraceSegments" 38 | ], 39 | "Resource": "*" 40 | } 41 | ] 42 | } 43 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-events-scribe-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents" 10 | ], 11 | "Resource": "arn:aws:logs:*:*:*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": "s3:*", 16 | "Resource": [ 17 | "arn:aws:s3:::*", 18 | "arn:aws:s3:::*/*" 19 | ] 20 | }, 21 | { 22 | "Effect": "Allow", 23 | "Action": [ 24 | "sqs:GetQueueUrl", 25 | "sqs:ReceiveMessage", 26 | "sqs:SendMessage", 27 | "sqs:DeleteMessage", 28 | "sqs:GetQueueAttributes" 29 | ], 30 | "Resource": "*" 31 | } 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-gs-copy-sfn-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents" 10 | ], 11 | "Resource": "arn:aws:logs:*:*:*" 12 | }, 13 | { 14 | "Action": [ 15 | "lambda:*" 16 | ], 17 | "Resource": [ 18 | 
"arn:aws:lambda:*:$account_id:function:dss-gs-copy-sfn-$stage", 19 | "arn:aws:lambda:*:$account_id:function:dss-gs-copy-sfn-$stage:*" 20 | ], 21 | "Effect": "Allow" 22 | }, 23 | { 24 | "Effect": "Allow", 25 | "Action": [ 26 | "xray:PutTelemetryRecords", 27 | "xray:PutTraceSegments" 28 | ], 29 | "Resource": "*" 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-gs-copy-write-metadata-sfn-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents" 10 | ], 11 | "Resource": "arn:aws:logs:*:*:*" 12 | }, 13 | { 14 | "Action": [ 15 | "lambda:*" 16 | ], 17 | "Resource": [ 18 | "arn:aws:lambda:*:$account_id:function:dss-gs-copy-write-metadata-sfn-$stage", 19 | "arn:aws:lambda:*:$account_id:function:dss-gs-copy-write-metadata-sfn-$stage:*" 20 | ], 21 | "Effect": "Allow" 22 | }, 23 | { 24 | "Effect": "Allow", 25 | "Action": [ 26 | "xray:PutTelemetryRecords", 27 | "xray:PutTraceSegments" 28 | ], 29 | "Resource": "*" 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-index-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents" 10 | ], 11 | "Resource": "arn:aws:logs:*:*:*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": "s3:*", 16 | "Resource": [ 17 | "arn:aws:s3:::$DSS_S3_BUCKET", 18 | "arn:aws:s3:::$DSS_S3_BUCKET/*" 19 | ] 20 | }, 21 | { 22 | "Effect": "Allow", 23 | "Action": [ 24 | "es:ESHttpDelete", 25 | "es:ESHttpGet", 26 | "es:ESHttpHead", 27 | "es:ESHttpPost", 28 | "es:ESHttpPut" 29 | ], 30 | "Resource": "arn:aws:es:*:$account_id:domain/$dss_es_domain/*" 31 | }, 32 | { 33 | "Effect": "Allow", 34 | "Action": [ 35 | "sqs:GetQueueUrl", 36 | "sqs:SendMessage" 37 | ], 38 | "Resource": [ 39 | "arn:aws:sqs:*:$account_id:dss-notify-$stage-*" 40 | ] 41 | }, 42 | { 43 | "Effect": "Allow", 44 | "Action": [ 45 | "sqs:ListQueues", 46 | "sqs:ReceiveMessage", 47 | "sqs:DeleteMessage", 48 | "sqs:GetQueueAttributes" 49 | ], 50 | "Resource": "*" 51 | }, 52 | { 53 | "Effect": "Allow", 54 | "Action": [ 55 | "xray:PutTelemetryRecords", 56 | "xray:PutTraceSegments" 57 | ], 58 | "Resource": "*" 59 | } 60 | ] 61 | } 62 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:DescribeLogGroups", 10 | "logs:DescribeLogStreams", 11 | "logs:PutLogEvents" 12 | ], 13 | "Resource": "arn:aws:logs:*:*:*" 14 | }, 15 | { 16 | "Effect": "Allow", 17 | "Action": "s3:*", 18 | "Resource": [ 19 | "arn:aws:s3:::$DSS_S3_BUCKET", 20 | "arn:aws:s3:::$DSS_S3_BUCKET/*", 21 | "arn:aws:s3:::$DSS_S3_CHECKOUT_BUCKET", 22 | "arn:aws:s3:::$DSS_S3_CHECKOUT_BUCKET/*" 23 | ] 24 | }, 25 | { 26 | "Effect": "Allow", 27 | "Action": [ 28 | "s3:Get*", 29 | "s3:List*" 30 | ], 31 | "Resource": "*" 32 | }, 33 | { 34 | "Effect": "Allow", 35 | "Action": "sns:Publish", 36 
| "Resource": [ 37 | "arn:aws:sns:*:$account_id:dss-*-$stage" 38 | ] 39 | }, 40 | { 41 | "Effect": "Allow", 42 | "Action": [ 43 | "es:ESHttpGet", 44 | "es:ESHttpHead", 45 | "es:ESHttpPut", 46 | "es:ESHttpPost", 47 | "es:ESHttpDelete" 48 | ], 49 | "Resource": "arn:aws:es:*:$account_id:domain/$dss_es_domain/*" 50 | }, 51 | { 52 | "Effect": "Allow", 53 | "Action": [ 54 | "states:StartExecution", 55 | "states:DescribeExecution" 56 | ], 57 | "Resource": "arn:aws:states:*:$account_id:*:dss-*" 58 | }, 59 | { 60 | "Effect": "Allow", 61 | "Action": [ 62 | "xray:PutTelemetryRecords", 63 | "xray:PutTraceSegments" 64 | ], 65 | "Resource": "*" 66 | }, 67 | { 68 | "Effect": "Allow", 69 | "Action": [ 70 | "dynamodb:*" 71 | ], 72 | "Resource": "arn:aws:dynamodb:*:$account_id:*" 73 | }, 74 | { 75 | "Effect": "Allow", 76 | "Action": [ 77 | "tag:GetTagKeys", 78 | "tag:GetResources", 79 | "tag:GetTagValues" 80 | ], 81 | "Resource": "*" 82 | } 83 | ] 84 | } 85 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-notify-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:DescribeLogGroups", 10 | "logs:DescribeLogStreams", 11 | "logs:PutLogEvents" 12 | ], 13 | "Resource": "arn:aws:logs:*:*:*" 14 | }, 15 | { 16 | "Effect": "Allow", 17 | "Action": [ 18 | "sqs:CreateQueue", 19 | "sqs:DeleteQueue", 20 | "sqs:ChangeMessageVisibility*", 21 | "sqs:DeleteMessage*", 22 | "sqs:GetQueueAttributes", 23 | "sqs:GetQueueUrl", 24 | "sqs:ReceiveMessage", 25 | "sqs:SendMessage", 26 | "sqs:SetQueueAttributes" 27 | ], 28 | "Resource": [ 29 | "arn:aws:sqs:*:$account_id:dss-notify-$stage-*" 30 | ] 31 | }, 32 | { 33 | "Effect": "Allow", 34 | "Action": [ 35 | "sqs:ListQueues" 36 | ], 37 | "Resource": [ 38 | "arn:aws:sqs:*:$account_id:*" 39 | ] 40 | }, 41 | { 42 | "Effect": "Allow", 43 | "Action": [ 44 | "xray:PutTelemetryRecords", 45 | "xray:PutTraceSegments" 46 | ], 47 | "Resource": "*" 48 | } 49 | ] 50 | } 51 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-notify-v2-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents" 10 | ], 11 | "Resource": "arn:aws:logs:*:*:*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": "s3:*", 16 | "Resource": [ 17 | "arn:aws:s3:::*", 18 | "arn:aws:s3:::*/*" 19 | ] 20 | }, 21 | { 22 | "Effect": "Allow", 23 | "Action": [ 24 | "sqs:GetQueueUrl", 25 | "sqs:ReceiveMessage", 26 | "sqs:SendMessage", 27 | "sqs:DeleteMessage", 28 | "sqs:GetQueueAttributes" 29 | ], 30 | "Resource": "*" 31 | }, 32 | { 33 | "Effect": "Allow", 34 | "Action": [ 35 | "xray:PutTelemetryRecords", 36 | "xray:PutTraceSegments" 37 | ], 38 | "Resource": "*" 39 | }, 40 | { 41 | "Effect": "Allow", 42 | "Action": [ 43 | "dynamodb:*" 44 | ], 45 | "Resource": "arn:aws:dynamodb:*:$account_id:*" 46 | } 47 | ] 48 | } 49 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-operations-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | 
"Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents" 10 | ], 11 | "Resource": "arn:aws:logs:*:*:*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": "s3:*", 16 | "Resource": [ 17 | "arn:aws:s3:::*", 18 | "arn:aws:s3:::*/*" 19 | ] 20 | }, 21 | { 22 | "Effect": "Allow", 23 | "Action": [ 24 | "sqs:GetQueueUrl", 25 | "sqs:ReceiveMessage", 26 | "sqs:SendMessage", 27 | "sqs:DeleteMessage", 28 | "sqs:GetQueueAttributes" 29 | ], 30 | "Resource": "*" 31 | }, 32 | { 33 | "Effect": "Allow", 34 | "Action": [ 35 | "xray:PutTelemetryRecords", 36 | "xray:PutTraceSegments" 37 | ], 38 | "Resource": "*" 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-s3-copy-sfn-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents" 10 | ], 11 | "Resource": "arn:aws:logs:*:*:*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": [ 16 | "s3:AbortMultipartUpload", 17 | "s3:PutObject*" 18 | ], 19 | "Resource": [ 20 | "arn:aws:s3:::$DSS_S3_BUCKET", 21 | "arn:aws:s3:::$DSS_S3_BUCKET/*", 22 | "arn:aws:s3:::$DSS_S3_BUCKET_TEST", 23 | "arn:aws:s3:::$DSS_S3_BUCKET_TEST/*", 24 | "arn:aws:s3:::$DSS_S3_CHECKOUT_BUCKET", 25 | "arn:aws:s3:::$DSS_S3_CHECKOUT_BUCKET/*", 26 | "arn:aws:s3:::$DSS_S3_CHECKOUT_BUCKET_TEST", 27 | "arn:aws:s3:::$DSS_S3_CHECKOUT_BUCKET_TEST/*", 28 | "arn:aws:s3:::$DSS_S3_CHECKOUT_BUCKET_TEST_USER", 29 | "arn:aws:s3:::$DSS_S3_CHECKOUT_BUCKET_TEST_USER/*" 30 | ] 31 | }, 32 | { 33 | "Effect": "Allow", 34 | "Action": [ 35 | "s3:Get*", 36 | "s3:List*" 37 | ], 38 | "Resource": "*" 39 | }, 40 | { 41 | "Action": [ 42 | "lambda:*" 43 | ], 44 | "Resource": [ 45 | "arn:aws:lambda:*:$account_id:function:dss-s3-copy-sfn-$stage", 46 | "arn:aws:lambda:*:$account_id:function:dss-s3-copy-sfn-$stage:*" 47 | ], 48 | "Effect": "Allow" 49 | }, 50 | { 51 | "Effect": "Allow", 52 | "Action": [ 53 | "xray:PutTelemetryRecords", 54 | "xray:PutTraceSegments" 55 | ], 56 | "Resource": "*" 57 | }, 58 | { 59 | "Effect": "Allow", 60 | "Action": [ 61 | "dynamodb:*" 62 | ], 63 | "Resource": "arn:aws:dynamodb:*:$account_id:table/dss-async-state-$stage" 64 | } 65 | ] 66 | } 67 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-s3-copy-write-metadata-sfn-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents" 10 | ], 11 | "Resource": "arn:aws:logs:*:*:*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": [ 16 | "s3:AbortMultipartUpload", 17 | "s3:PutObject*" 18 | ], 19 | "Resource": [ 20 | "arn:aws:s3:::$DSS_S3_BUCKET", 21 | "arn:aws:s3:::$DSS_S3_BUCKET/*", 22 | "arn:aws:s3:::$DSS_S3_BUCKET_TEST", 23 | "arn:aws:s3:::$DSS_S3_BUCKET_TEST/*" 24 | ] 25 | }, 26 | { 27 | "Effect": "Allow", 28 | "Action": [ 29 | "s3:Get*", 30 | "s3:List*" 31 | ], 32 | "Resource": "*" 33 | }, 34 | { 35 | "Action": [ 36 | "lambda:*" 37 | ], 38 | "Resource": [ 39 | "arn:aws:lambda:*:$account_id:function:dss-s3-copy-write-metadata-sfn-$stage", 40 | "arn:aws:lambda:*:$account_id:function:dss-s3-copy-write-metadata-sfn-$stage:*" 41 | ], 42 | "Effect": "Allow" 43 | }, 44 
| { 45 | "Effect": "Allow", 46 | "Action": [ 47 | "xray:PutTelemetryRecords", 48 | "xray:PutTraceSegments" 49 | ], 50 | "Resource": "*" 51 | }, 52 | { 53 | "Effect": "Allow", 54 | "Action": [ 55 | "dynamodb:*" 56 | ], 57 | "Resource": "arn:aws:dynamodb:*:$account_id:table/dss-async-state-$stage" 58 | } 59 | ] 60 | } 61 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-scalability-test-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:DescribeLogGroups", 10 | "logs:DescribeLogStreams", 11 | "logs:PutLogEvents" 12 | ], 13 | "Resource": "arn:aws:logs:*:*:*" 14 | }, 15 | { 16 | "Effect": "Allow", 17 | "Action": [ 18 | "s3:List*", 19 | "s3:Get*", 20 | "s3:PutObject*" 21 | ], 22 | "Resource": [ 23 | "arn:aws:s3:::$DSS_S3_BUCKET", 24 | "arn:aws:s3:::$DSS_S3_BUCKET/*", 25 | "arn:aws:s3:::$DSS_S3_BUCKET_TEST", 26 | "arn:aws:s3:::$DSS_S3_BUCKET_TEST/*", 27 | "arn:aws:s3:::$DSS_S3_CHECKOUT_BUCKET", 28 | "arn:aws:s3:::$DSS_S3_CHECKOUT_BUCKET/*", 29 | "arn:aws:s3:::$DSS_S3_CHECKOUT_BUCKET_TEST", 30 | "arn:aws:s3:::$DSS_S3_CHECKOUT_BUCKET_TEST/*", 31 | "arn:aws:s3:::$DSS_S3_BUCKET_TEST_FIXTURES", 32 | "arn:aws:s3:::$DSS_S3_BUCKET_TEST_FIXTURES/*" 33 | ] 34 | }, 35 | { 36 | "Effect": "Allow", 37 | "Action": "sns:Publish", 38 | "Resource": [ 39 | "arn:aws:sns:*:$account_id:dss-*-$stage", 40 | "arn:aws:sns:*:$account_id:*-*-$stage" 41 | ] 42 | }, 43 | { 44 | "Effect": "Allow", 45 | "Action": "lambda:*", 46 | "Resource": [ 47 | "arn:aws:lambda:*:$account_id:function:dss-*" 48 | ] 49 | }, 50 | { 51 | "Effect": "Allow", 52 | "Action": "dynamodb:*", 53 | "Resource": [ 54 | "arn:aws:dynamodb:*:$account_id:table/scalability_test", 55 | "arn:aws:dynamodb:*:$account_id:table/scalability_test_result", 56 | "arn:aws:dynamodb:*:$account_id:table/scalability_test/stream/*" 57 | ] 58 | }, 59 | { 60 | "Effect": "Allow", 61 | "Action": [ 62 | "states:ListExecutions", 63 | "states:StartExecution", 64 | "states:DescribeExecution" 65 | ], 66 | "Resource": [ 67 | "arn:aws:states:*:$account_id:stateMachine:dss-scalability*", 68 | "arn:aws:states:*:$account_id:execution:dss-scalability*" 69 | ] 70 | }, 71 | { 72 | "Effect": "Allow", 73 | "Action": [ 74 | "xray:PutTelemetryRecords", 75 | "xray:PutTraceSegments" 76 | ], 77 | "Resource": "*" 78 | } 79 | ] 80 | } 81 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-sfn-launcher-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents" 10 | ], 11 | "Resource": "arn:aws:logs:*:*:*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": [ 16 | "sqs:Publish", 17 | "sqs:ReceiveMessage", 18 | "sqs:SendMessage", 19 | "sqs:GetQueueUrl", 20 | "sqs:DeleteMessage" 21 | ], 22 | "Resource": [ 23 | "arn:aws:sqs:*:$account_id:dss-dlq-$stage" 24 | ] 25 | }, 26 | { 27 | "Effect": "Allow", 28 | "Action": "sns:Publish", 29 | "Resource": [ 30 | "arn:aws:sns:*:$account_id:dss-sfn-$stage" 31 | ] 32 | }, 33 | { 34 | "Effect": "Allow", 35 | "Action": [ 36 | "states:StartExecution" 37 | ], 38 | "Resource": "arn:aws:states:*:$account_id:*:dss-*" 39 | }, 40 | { 41 | "Effect": 
"Allow", 42 | "Action": [ 43 | "xray:PutTelemetryRecords", 44 | "xray:PutTraceSegments" 45 | ], 46 | "Resource": "*" 47 | } 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /iam/policy-templates/dss-sync-sfn-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents" 10 | ], 11 | "Resource": "arn:aws:logs:*:*:*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": "s3:*", 16 | "Resource": [ 17 | "arn:aws:s3:::*", 18 | "arn:aws:s3:::*/*" 19 | ] 20 | }, 21 | { 22 | "Effect": "Allow", 23 | "Action": [ 24 | "states:StartExecution" 25 | ], 26 | "Resource": "arn:aws:states:*:$account_id:*:dss-*" 27 | }, 28 | { 29 | "Effect": "Allow", 30 | "Action": "sns:Publish", 31 | "Resource": [ 32 | "arn:aws:sns:*:*:*" 33 | ] 34 | }, 35 | { 36 | "Effect": "Allow", 37 | "Action": [ 38 | "sqs:ReceiveMessage", 39 | "sqs:DeleteMessage", 40 | "sqs:GetQueueAttributes" 41 | ], 42 | "Resource": "*" 43 | }, 44 | { 45 | "Effect": "Allow", 46 | "Action": "lambda:InvokeFunction", 47 | "Resource": [ 48 | "arn:aws:lambda:*:$account_id:function:dss-*" 49 | ] 50 | }, 51 | { 52 | "Effect": "Allow", 53 | "Action": [ 54 | "xray:PutTelemetryRecords", 55 | "xray:PutTraceSegments" 56 | ], 57 | "Resource": "*" 58 | } 59 | ] 60 | } 61 | -------------------------------------------------------------------------------- /iam/policy-templates/scheduled-ci-build-lambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "logs:CreateLogGroup", 8 | "logs:CreateLogStream", 9 | "logs:PutLogEvents" 10 | ], 11 | "Resource": "arn:aws:logs:*:*:*" 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /infra/Makefile: -------------------------------------------------------------------------------- 1 | DIRS=${shell find . 
-name "*.tf" -exec dirname {} \; | sort --unique} 2 | COMPONENTS=${shell for d in $(DIRS); do basename $$d; done} 3 | 4 | all: init-all 5 | 6 | init-all: 7 | @for c in $(COMPONENTS); do \ 8 | $(MAKE) init COMPONENT=$$c || exit 1; \ 9 | done 10 | 11 | plan-all: 12 | @for c in $(COMPONENTS); do \ 13 | $(MAKE) plan COMPONENT=$$c || exit 1; \ 14 | done 15 | 16 | apply-all: 17 | @for c in $(COMPONENTS); do \ 18 | $(MAKE) apply COMPONENT=$$c || exit 1; \ 19 | done 20 | 21 | destroy-all: 22 | @for c in $(COMPONENTS); do \ 23 | $(MAKE) destroy COMPONENT=$$c || exit 1; \ 24 | done 25 | 26 | clean-all: 27 | @for c in $(COMPONENTS); do \ 28 | $(MAKE) clean COMPONENT=$$c || exit 1; \ 29 | done 30 | 31 | plan: init 32 | @echo $(COMPONENTS) 33 | cd $(COMPONENT); terraform plan -detailed-exitcode 34 | 35 | apply: init 36 | @echo $(COMPONENTS) 37 | cd $(COMPONENT); terraform apply 38 | 39 | destroy: init 40 | cd $(COMPONENT); terraform destroy 41 | 42 | init: 43 | rm -rf $(COMPONENT)/.terraform/*.tfstate 44 | ./build_deploy_config.py $(COMPONENT) 45 | cd $(COMPONENT); terraform init; 46 | 47 | clean: 48 | cd $(COMPONENT); rm -rf .terraform 49 | 50 | .PHONY: init-all plan-all apply-all clean-all plan apply destroy init clean 51 | -------------------------------------------------------------------------------- /infra/async_state_db/main.tf: -------------------------------------------------------------------------------- 1 | data "aws_caller_identity" "current" {} 2 | 3 | locals { 4 | common_tags = "${map( 5 | "managedBy" , "terraform", 6 | "Name" , "${var.DSS_INFRA_TAG_SERVICE}-asyncdynamodb", 7 | "project" , var.DSS_INFRA_TAG_PROJECT, 8 | "env" , var.DSS_DEPLOYMENT_STAGE, 9 | "service" , var.DSS_INFRA_TAG_SERVICE, 10 | "owner" , var.DSS_INFRA_TAG_OWNER 11 | )}" 12 | } 13 | 14 | resource "aws_dynamodb_table" "sfn_state" { 15 | name = "dss-async-state-${var.DSS_DEPLOYMENT_STAGE}" 16 | billing_mode = "PAY_PER_REQUEST" 17 | hash_key = "hash_key" 18 | 19 | ttl { 20 | attribute_name = "ttl" 21 | enabled = true 22 | } 23 | 24 | attribute { 25 | name = "hash_key" 26 | type = "S" 27 | } 28 | 29 | tags = local.common_tags 30 | } 31 | -------------------------------------------------------------------------------- /infra/collections_db/main.tf: -------------------------------------------------------------------------------- 1 | data "aws_caller_identity" "current" {} 2 | locals { 3 | common_tags = "${map( 4 | "managedBy" , "terraform", 5 | "Name" , "${var.DSS_INFRA_TAG_SERVICE}-collectionsdynamodb", 6 | "project" , "${var.DSS_INFRA_TAG_PROJECT}", 7 | "env" , "${var.DSS_DEPLOYMENT_STAGE}", 8 | "service" , "${var.DSS_INFRA_TAG_SERVICE}", 9 | "owner" , "${var.DSS_INFRA_TAG_OWNER}" 10 | )}" 11 | replicas = ["aws", "gcp"] 12 | } 13 | 14 | resource "aws_dynamodb_table" "collections-db-aws" { 15 | name = "dss-collections-db-${var.DSS_DEPLOYMENT_STAGE}" 16 | billing_mode = "PAY_PER_REQUEST" 17 | hash_key = "hash_key" 18 | range_key = "sort_key" 19 | 20 | point_in_time_recovery { 21 | enabled = true 22 | } 23 | 24 | attribute { 25 | name = "hash_key" 26 | type = "S" 27 | } 28 | 29 | attribute { 30 | name = "sort_key" 31 | type = "S" 32 | } 33 | 34 | tags = local.common_tags 35 | } 36 | -------------------------------------------------------------------------------- /infra/domain/main.tf: -------------------------------------------------------------------------------- 1 | data aws_caller_identity current {} 2 | locals {account_id = data.aws_caller_identity.current.account_id} 3 | 4 | data aws_route53_zone selected { 5 | name = 
var.DSS_ZONE_NAME 6 | } 7 | 8 | resource "aws_api_gateway_domain_name" "dss" { 9 | domain_name = var.API_DOMAIN_NAME 10 | regional_certificate_arn = "arn:aws:acm:${var.AWS_DEFAULT_REGION}:${local.account_id}:certificate/${var.ACM_CERTIFICATE_IDENTIFIER}" 11 | 12 | endpoint_configuration { 13 | types = ["REGIONAL"] 14 | } 15 | } 16 | 17 | resource "aws_route53_record" "dss" { 18 | zone_id = data.aws_route53_zone.selected.zone_id 19 | name = var.API_DOMAIN_NAME 20 | type = "CNAME" 21 | ttl = "300" 22 | records = [aws_api_gateway_domain_name.dss.regional_domain_name] 23 | } 24 | -------------------------------------------------------------------------------- /infra/dss-events-scribe/main.tf: -------------------------------------------------------------------------------- 1 | data aws_caller_identity current {} 2 | data aws_region current {} 3 | 4 | locals { 5 | region = "${data.aws_region.current.name}" 6 | account_id = "${data.aws_caller_identity.current.account_id}" 7 | common_tags = "${map( 8 | "managedBy" , "terraform", 9 | "Name" , "${var.DSS_INFRA_TAG_SERVICE}-dss-events-scribe", 10 | "project" , "${var.DSS_INFRA_TAG_PROJECT}", 11 | "env" , "${var.DSS_DEPLOYMENT_STAGE}", 12 | "service" , "${var.DSS_INFRA_TAG_SERVICE}", 13 | "owner" , "${var.DSS_INFRA_TAG_OWNER}" 14 | )}" 15 | } 16 | 17 | locals { 18 | replicas = ["aws", "gcp"] 19 | } 20 | 21 | data "aws_iam_policy_document" "sqs" { 22 | statement { 23 | principals { 24 | type = "AWS" 25 | identifiers = ["*"] 26 | } 27 | actions = ["sqs:SendMessage"] 28 | resources = ["arn:aws:sqs:${local.region}:${local.account_id}:dss-events-scribe-${var.DSS_DEPLOYMENT_STAGE}"] 29 | condition { 30 | test = "StringEquals" 31 | variable = "aws:SourceArn" 32 | values = [aws_cloudwatch_event_rule.events-scribe.arn] 33 | } 34 | } 35 | statement { 36 | principals { 37 | type = "AWS" 38 | identifiers = ["*"] 39 | } 40 | actions = [ 41 | "sqs:*", 42 | ] 43 | resources = ["arn:aws:sqs:${local.region}:${local.account_id}:dss-events-scribe-${var.DSS_DEPLOYMENT_STAGE}"] 44 | condition { 45 | test = "StringEquals" 46 | variable = "aws:SourceArn" 47 | values = ["arn:aws:lambda:${local.region}:${local.account_id}:function:dss-events-scribe-${var.DSS_DEPLOYMENT_STAGE}"] 48 | } 49 | } 50 | } 51 | 52 | resource "aws_sqs_queue" "dss-events-scribe-queue" { 53 | name = "dss-events-scribe-${var.DSS_DEPLOYMENT_STAGE}" 54 | tags = local.common_tags 55 | message_retention_seconds = "3600" 56 | visibility_timeout_seconds = "600" 57 | policy = data.aws_iam_policy_document.sqs.json 58 | } 59 | 60 | resource "aws_cloudwatch_event_rule" "events-scribe" { 61 | name = "dss-events-scribe-${var.DSS_DEPLOYMENT_STAGE}" 62 | description = "Queue event journal/update" 63 | schedule_expression = "rate(10 minutes)" 64 | tags = local.common_tags 65 | } 66 | 67 | resource "aws_cloudwatch_event_target" "send-journal-and-update-message" { 68 | count = length(local.replicas) 69 | rule = aws_cloudwatch_event_rule.events-scribe.name 70 | arn = aws_sqs_queue.dss-events-scribe-queue.arn 71 | input = <<-DOC 72 | { 73 | "replica":"${local.replicas[count.index]}" 74 | } 75 | DOC 76 | } 77 | -------------------------------------------------------------------------------- /infra/elasticsearch/access_ips.tf: -------------------------------------------------------------------------------- 1 | data "aws_secretsmanager_secret_version" "source_ips" { 2 | secret_id = "${var.DSS_SECRETS_STORE}/${var.DSS_DEPLOYMENT_STAGE}/${var.ES_ALLOWED_SOURCE_IP_SECRETS_NAME}" 3 | } 4 | 5 | locals { 6 | ips_str = 
data.aws_secretsmanager_secret_version.source_ips.secret_string 7 | access_ips = compact(split(",", local.ips_str)) 8 | } 9 | -------------------------------------------------------------------------------- /infra/gcp_service_account/main.tf: -------------------------------------------------------------------------------- 1 | data "google_project" "project" {} 2 | 3 | resource "google_service_account" "dss" { 4 | display_name = var.DSS_GCP_SERVICE_ACCOUNT_NAME 5 | account_id = var.DSS_GCP_SERVICE_ACCOUNT_NAME 6 | } 7 | 8 | # Useful command to discover role names (Guessing based on console titles is difficult): 9 | # `gcloud iam list-grantable-roles //cloudresourcemanager.googleapis.com/projects/{project-id}` 10 | 11 | resource "google_project_iam_member" "serviceaccountactor" { 12 | project = data.google_project.project.project_id 13 | role = "roles/iam.serviceAccountActor" 14 | member = "serviceAccount:${google_service_account.dss.email}" 15 | } 16 | 17 | resource "google_project_iam_member" "cloudruntimeconfiguratoradmin" { 18 | project = data.google_project.project.project_id 19 | role = "roles/runtimeconfig.admin" 20 | member = "serviceAccount:${google_service_account.dss.email}" 21 | } 22 | 23 | resource "google_project_iam_member" "storageadmin" { 24 | project = data.google_project.project.project_id 25 | role = "roles/storage.admin" 26 | member = "serviceAccount:${google_service_account.dss.email}" 27 | } 28 | 29 | resource "google_project_iam_member" "storageobjectcreator" { 30 | project = data.google_project.project.project_id 31 | role = "roles/storage.objectCreator" 32 | member = "serviceAccount:${google_service_account.dss.email}" 33 | } 34 | 35 | resource "google_project_iam_member" "cloudfunctionsdeveloper" { 36 | project = data.google_project.project.project_id 37 | role = "roles/cloudfunctions.developer" 38 | member = "serviceAccount:${google_service_account.dss.email}" 39 | } 40 | 41 | resource "google_project_iam_member" "viewer" { 42 | project = data.google_project.project.project_id 43 | role = "roles/viewer" 44 | member = "serviceAccount:${google_service_account.dss.email}" 45 | } 46 | 47 | output "service_account" { 48 | value = var.DSS_GCP_SERVICE_ACCOUNT_NAME 49 | } 50 | -------------------------------------------------------------------------------- /infra/subscription_v2_db/main.tf: -------------------------------------------------------------------------------- 1 | data "aws_caller_identity" "current" {} 2 | locals { 3 | common_tags = "${map( 4 | "managedBy" , "terraform", 5 | "Name" , "${var.DSS_INFRA_TAG_SERVICE}-subscriptionsdynamodb", 6 | "project" , "${var.DSS_INFRA_TAG_PROJECT}", 7 | "env" , "${var.DSS_DEPLOYMENT_STAGE}", 8 | "service" , "${var.DSS_INFRA_TAG_SERVICE}", 9 | "owner" , "${var.DSS_INFRA_TAG_OWNER}" 10 | )}" 11 | } 12 | 13 | locals { 14 | replicas = ["aws", "gcp"] 15 | } 16 | 17 | resource "aws_dynamodb_table" "subscriptions-aws" { 18 | count = length(local.replicas) 19 | name = "dss-subscriptions-v2-${local.replicas[count.index]}-${var.DSS_DEPLOYMENT_STAGE}" 20 | billing_mode = "PAY_PER_REQUEST" 21 | hash_key = "hash_key" 22 | range_key = "sort_key" 23 | 24 | point_in_time_recovery { 25 | enabled = true 26 | } 27 | 28 | attribute { 29 | name = "hash_key" 30 | type = "S" 31 | } 32 | 33 | attribute { 34 | name = "sort_key" 35 | type = "S" 36 | } 37 | 38 | tags = local.common_tags 39 | } 40 | -------------------------------------------------------------------------------- /requirements-dev.txt: 
-------------------------------------------------------------------------------- 1 | # You should not edit this file directly. Instead, you should edit requirements-dev.txt.in. 2 | asn1crypto==0.24.0 3 | attrs==19.1.0 4 | aws-xray-sdk==2.4.2 5 | awscli==1.16.242 6 | azure-common==1.1.23 7 | azure-nspkg==3.0.2 8 | azure-storage==0.36.0 9 | boto3==1.9.232 10 | botocore==1.12.232 11 | cachetools==3.1.1 12 | certifi==2019.9.11 13 | cffi==1.12.3 14 | chalice==1.11.1 15 | chardet==3.0.4 16 | click==6.7 17 | clickclick==1.2.2 18 | cloud-blobstore==3.2.0 19 | colorama==0.3.9 20 | connexion==1.5.3 21 | coverage==4.5.4 22 | crc32c==1.7 23 | crcmod==1.7 24 | cryptography==2.7 25 | dcplib==3.7.0 26 | docutils==0.15.2 27 | domovoi==2.0.2 28 | elasticsearch==5.5.3 29 | elasticsearch-dsl==5.4.0 30 | entrypoints==0.3 31 | enum-compat==0.0.2 32 | Faker==2.0.2 33 | fasteners==0.15 34 | flake8==3.7.8 35 | flash-flood==0.4.3 36 | Flask==1.1.1 37 | furl==2.0.0 38 | future==0.17.1 39 | google-api-core==1.14.2 40 | google-api-python-client==1.7.11 41 | google-apitools==0.5.30 42 | google-auth==1.6.3 43 | google-auth-httplib2==0.0.3 44 | google-cloud-core==1.0.3 45 | google-cloud-storage==1.19.1 46 | google-resumable-media==0.4.1 47 | googleapis-common-protos==1.6.0 48 | httpie==1.0.3 49 | httplib2==0.18.0 50 | idna==2.8 51 | inflection==0.3.1 52 | iso8601==0.1.12 53 | itsdangerous==1.1.0 54 | Jinja2==2.10.1 55 | jmespath==0.9.4 56 | jsonpickle==1.2 57 | jsonpointer==2.0 58 | jsonschema==2.6.0 59 | MarkupSafe==1.1.1 60 | mccabe==0.6.1 61 | monotonic==1.5 62 | mypy==0.720 63 | mypy-extensions==0.4.1 64 | nestedcontext==0.0.4 65 | oauth2client==4.1.3 66 | orderedmultidict==1.0.1 67 | protobuf==3.9.1 68 | puremagic==1.4 69 | pyasn1==0.4.7 70 | pyasn1-modules==0.2.6 71 | pycodestyle==2.5.0 72 | pycparser==2.19 73 | pyflakes==2.1.1 74 | Pygments==2.4.2 75 | PyJWT==1.7.1 76 | python-dateutil==2.8.0 77 | pytz==2019.2 78 | PyYAML==5.1 79 | requests==2.22.0 80 | requests-aws4auth==0.9 81 | requests-http-signature==0.1.0 82 | rsa==3.4.2 83 | rstr==2.2.6 84 | s3transfer==0.2.1 85 | six==1.12.0 86 | swagger-spec-validator==2.4.3 87 | termcolor==1.1.0 88 | text-unidecode==1.3 89 | typed-ast==1.4.0 90 | typing==3.6.4 91 | typing-extensions==3.7.4 92 | uritemplate==3.0.0 93 | urllib3==1.25.5 94 | Werkzeug==0.16.0 95 | wrapt==1.11.2 96 | xmltodict==0.12.0 97 | yq==2.7.2 98 | -------------------------------------------------------------------------------- /requirements-dev.txt.in: -------------------------------------------------------------------------------- 1 | # Be sure to run "make requirements-dev.txt" after editing this file. 2 | flake8 >= 3.4.1 3 | mypy >= 0.700 4 | domovoi >= 1.8.2 5 | chalice >= 1.8.0 6 | coverage >= 4.4.1 7 | awscli >= 1.16.0 8 | crcmod >= 1.7 9 | httpie >= 1.0.3 10 | yq >= 2.3.3 11 | Faker >= 0.8.11 12 | rstr >=2.2.6 13 | click >= 6.6, < 7.0 14 | -r requirements.txt.in 15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # You should not edit this file directly. Instead, you should edit requirements.txt.in. 
2 | asn1crypto==0.24.0 3 | aws-xray-sdk==2.4.2 4 | azure-common==1.1.23 5 | azure-nspkg==3.0.2 6 | azure-storage==0.36.0 7 | boto3==1.9.232 8 | botocore==1.12.232 9 | cachetools==3.1.1 10 | certifi==2019.9.11 11 | cffi==1.12.3 12 | chardet==3.0.4 13 | Click==7.0 14 | clickclick==1.2.2 15 | cloud-blobstore==3.2.0 16 | connexion==1.5.3 17 | crc32c==1.7 18 | cryptography==2.7 19 | dcplib==3.7.0 20 | docutils==0.15.2 21 | elasticsearch==5.5.3 22 | elasticsearch-dsl==5.4.0 23 | fasteners==0.15 24 | flash-flood==0.4.3 25 | Flask==1.1.1 26 | furl==2.0.0 27 | future==0.17.1 28 | google-api-core==1.14.2 29 | google-api-python-client==1.7.11 30 | google-apitools==0.5.30 31 | google-auth==1.6.3 32 | google-auth-httplib2==0.0.3 33 | google-cloud-core==1.0.3 34 | google-cloud-storage==1.19.1 35 | google-resumable-media==0.4.1 36 | googleapis-common-protos==1.6.0 37 | httplib2==0.18.0 38 | idna==2.8 39 | inflection==0.3.1 40 | iso8601==0.1.12 41 | itsdangerous==1.1.0 42 | Jinja2==2.10.1 43 | jmespath==0.9.4 44 | jsonpickle==1.2 45 | jsonpointer==2.0 46 | jsonschema==2.6.0 47 | MarkupSafe==1.1.1 48 | monotonic==1.5 49 | nestedcontext==0.0.4 50 | oauth2client==4.1.3 51 | orderedmultidict==1.0.1 52 | protobuf==3.9.1 53 | puremagic==1.4 54 | pyasn1==0.4.7 55 | pyasn1-modules==0.2.6 56 | pycparser==2.19 57 | PyJWT==1.7.1 58 | python-dateutil==2.8.0 59 | pytz==2019.2 60 | PyYAML==5.1 61 | requests==2.22.0 62 | requests-aws4auth==0.9 63 | requests-http-signature==0.1.0 64 | rsa==4.0 65 | s3transfer==0.2.1 66 | six==1.12.0 67 | swagger-spec-validator==2.4.3 68 | termcolor==1.1.0 69 | uritemplate==3.0.0 70 | urllib3==1.25.5 71 | Werkzeug==0.16.0 72 | wrapt==1.11.2 73 | -------------------------------------------------------------------------------- /requirements.txt.in: -------------------------------------------------------------------------------- 1 | # Be sure to run "make requirements.txt requirements-dev.txt" after editing this file. 2 | azure-storage >= 0.36.0 3 | boto3 >= 1.6.0 4 | botocore >= 1.10.16 # 1.9.x does not support AWS secretsmanager support 5 | cloud-blobstore >= 3.2.0 6 | connexion >= 1.1.15, < 2.0.0 # TODO migrate to 2.0.0 7 | dcplib>=2.0.3 8 | elasticsearch >= 5.4.0, < 6.0.0 9 | elasticsearch-dsl >= 5.3.0, < 6.0.0 10 | google-apitools >= 0.5.0 11 | google-api-python-client >= 1.7.8 12 | google-cloud-storage >= 1.7.0 13 | iso8601 >= 0.1.12 14 | jmespath >= 0.9.3 15 | jsonpointer >= 2.0 16 | jsonschema < 3.0.0 17 | nestedcontext >= 0.0.4 18 | requests-aws4auth >= 0.9 19 | urllib3 >= 1.21.1 20 | requests-http-signature >= 0.0.3 21 | aws-xray-sdk >= 1.0 22 | pyjwt >= 1.6.4 23 | pyyaml >= 4.2b1, <= 5.1.0 24 | flash-flood >= 0.4.3 25 | -------------------------------------------------------------------------------- /scripts/authorize_aws_deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Grant an AWS IAM principal (user/group/role) the permissions necessary to 4 | # test and deploy the data store. Requires AWS credentials with IAM write access. 5 | 6 | source "$(dirname $0)/../environment" 7 | 8 | set -euo pipefail 9 | 10 | if [[ $# != 2 ]]; then 11 | echo "Given an IAM principal intended to be used by a test/CI/CD pipeline," 12 | echo "this script grants the principal the AWS IAM permissions necessary to" 13 | echo "test and deploy the DSS application. Run this script using privileged" 14 | echo "(IAM write access) IAM credentials." 
15 | echo "Usage: $(basename $0) iam-principal-type iam-principal-name" 16 | echo "Example: $(basename $0) user hca-test" 17 | exit 1 18 | fi 19 | 20 | export iam_principal_type=$1 iam_principal_name=$2 21 | export account_id=$(aws sts get-caller-identity | jq -r .Account) 22 | policy_json="$(dirname $0)/../iam/policy-templates/ci-cd.json" 23 | envsubst_vars='$DSS_DEPLOYMENT_STAGE 24 | $DSS_S3_BUCKET 25 | $DSS_S3_BUCKET_TEST 26 | $DSS_S3_BUCKET_TEST_FIXTURES 27 | $DSS_S3_BUCKET_INTEGRATION 28 | $DSS_S3_BUCKET_STAGING 29 | $DSS_S3_CHECKOUT_BUCKET 30 | $DSS_S3_CHECKOUT_BUCKET_TEST 31 | $DSS_S3_CHECKOUT_BUCKET_TEST_USER 32 | $DSS_S3_CHECKOUT_BUCKET_INTEGRATION 33 | $DSS_S3_CHECKOUT_BUCKET_STAGING 34 | $DSS_SECRETS_STORE 35 | $DSS_EVENT_RELAY_AWS_ACCESS_KEY_ID_SECRETS_NAME 36 | $DSS_EVENT_RELAY_AWS_SECRET_ACCESS_KEY_SECRETS_NAME 37 | $account_id' 38 | 39 | aws iam put-${iam_principal_type}-policy \ 40 | --${iam_principal_type}-name $iam_principal_name \ 41 | --policy-name hca-dss-ci-cd \ 42 | --policy-document file://<(cat "$policy_json" | \ 43 | envsubst "$envsubst_vars" | \ 44 | jq -c 'del(.Statement[].Sid)') 45 | -------------------------------------------------------------------------------- /scripts/create_config_aws_event_relay_user.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Create and configure an AWS user with permission to send event notifications to SNS, 4 | then create an access key for the new AWS user and add it to the DSS secrets store. 5 | """ 6 | import os 7 | import sys 8 | import json 9 | import boto3 10 | import subprocess 11 | 12 | IAM = boto3.client('iam') 13 | STS = boto3.client('sts') 14 | 15 | region = os.environ['AWS_DEFAULT_REGION'] 16 | username = os.environ['EVENT_RELAY_AWS_USERNAME'] 17 | secret_name = os.environ['EVENT_RELAY_AWS_ACCESS_KEY_SECRETS_NAME'] 18 | account_id = STS.get_caller_identity().get('Account') 19 | sns_arn = f'arn:aws:sns:{region}:{account_id}:*' 20 | sqs_arn = f'arn:aws:sqs:{region}:{account_id}:*' 21 | 22 | try: 23 | resp = IAM.create_user( 24 | Path='/', 25 | UserName=username 26 | ) 27 | except IAM.exceptions.EntityAlreadyExistsException: 28 | pass 29 | 30 | IAM.put_user_policy( 31 | UserName=username, 32 | PolicyName='sns_publisher', 33 | PolicyDocument=json.dumps({ 34 | 'Version': '2012-10-17', 35 | 'Statement': [ 36 | { 37 | 'Action': [ 38 | 'sns:Publish' 39 | ], 40 | 'Effect': 'Allow', 41 | 'Resource': sns_arn 42 | } 43 | ] 44 | }) 45 | ) 46 | 47 | IAM.put_user_policy( 48 | UserName=username, 49 | PolicyName='sqs_sender', 50 | PolicyDocument=json.dumps({ 51 | 'Version': '2012-10-17', 52 | 'Statement': [ 53 | { 54 | 'Action': [ 55 | 'sqs:SendMessage' 56 | ], 57 | 'Effect': 'Allow', 58 | 'Resource': sqs_arn 59 | } 60 | ] 61 | }) 62 | ) 63 | 64 | aws_relay_user_key_info = IAM.create_access_key(UserName=username) 65 | aws_relay_user_key_info['AccessKey']['CreateDate'] = aws_relay_user_key_info['AccessKey']['CreateDate'].isoformat() 66 | secret_info = { 67 | 'AccessKey': { 68 | 'AccessKeyId': aws_relay_user_key_info['AccessKey']['AccessKeyId'], 69 | 'SecretAccessKey': aws_relay_user_key_info['AccessKey']['SecretAccessKey'], 70 | } 71 | } 72 | subprocess.run( 73 | [ 74 | os.path.join(os.path.dirname(__file__), "dss-ops.py"), 75 | "secrets", 76 | "set", 77 | secret_name, 78 | ], 79 | input=json.dumps(secret_info).encode("utf-8"), 80 | ) 81 | -------------------------------------------------------------------------------- /scripts/deploy_scale_tables.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Assemble and deploy AWS DynamoDB tables (for keeping track of runs and run output) for scale tests 4 | """ 5 | import boto3 6 | 7 | dynamodb_client = boto3.client('dynamodb') 8 | 9 | SCALABILITY_TEST_TABLE = 'scalability_test' 10 | SCALABILITY_TEST_RUN_TABLE = 'scalability_test_result' 11 | 12 | existing_tables = dynamodb_client.list_tables()['TableNames'] 13 | if SCALABILITY_TEST_TABLE not in existing_tables: 14 | dynamodb_client.create_table( 15 | AttributeDefinitions=[ 16 | { 17 | 'AttributeName': 'execution_id', 18 | 'AttributeType': 'S', 19 | }, 20 | { 21 | 'AttributeName': 'run_id', 22 | 'AttributeType': 'S', 23 | } 24 | ], 25 | KeySchema=[ 26 | { 27 | 'AttributeName': 'execution_id', 28 | 'KeyType': 'HASH', 29 | }, 30 | { 31 | 'AttributeName': 'run_id', 32 | 'KeyType': 'RANGE', 33 | }, 34 | ], 35 | StreamSpecification={ 36 | 'StreamEnabled': True, 37 | 'StreamViewType': 'NEW_IMAGE' 38 | }, 39 | ProvisionedThroughput={ 40 | 'ReadCapacityUnits': 5, 41 | 'WriteCapacityUnits': 5, 42 | }, 43 | TableName=SCALABILITY_TEST_TABLE, 44 | ) 45 | 46 | waiter = dynamodb_client.get_waiter('table_exists') 47 | waiter.wait(TableName=SCALABILITY_TEST_TABLE) 48 | 49 | dynamodb_client.update_time_to_live( 50 | TableName=SCALABILITY_TEST_TABLE, 51 | TimeToLiveSpecification={ 52 | 'Enabled': True, 53 | 'AttributeName': 'expiration_ttl' 54 | } 55 | ) 56 | 57 | if SCALABILITY_TEST_RUN_TABLE not in existing_tables: 58 | dynamodb_client.create_table( 59 | AttributeDefinitions=[ 60 | { 61 | 'AttributeName': 'run_id', 62 | 'AttributeType': 'S', 63 | } 64 | ], 65 | KeySchema=[ 66 | { 67 | 'AttributeName': 'run_id', 68 | 'KeyType': 'HASH', 69 | } 70 | ], 71 | ProvisionedThroughput={ 72 | 'ReadCapacityUnits': 5, 73 | 'WriteCapacityUnits': 5, 74 | }, 75 | TableName=SCALABILITY_TEST_RUN_TABLE, 76 | ) 77 | 78 | waiter = dynamodb_client.get_waiter('table_exists') 79 | waiter.wait(TableName=SCALABILITY_TEST_RUN_TABLE) 80 | -------------------------------------------------------------------------------- /scripts/dss-ops.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Central entrypoint for DSS operational scripts 4 | """ 5 | import os 6 | import sys 7 | import logging 8 | 9 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # noqa 10 | sys.path.insert(0, pkg_root) # noqa 11 | 12 | import dss 13 | import dss.operations.checkout 14 | import dss.operations.storage 15 | import dss.operations.sync 16 | import dss.operations.iam 17 | import dss.operations.lambda_params 18 | import dss.operations.elasticsearch 19 | import dss.operations.events 20 | import dss.operations.secrets 21 | 22 | from dss.operations import dispatch 23 | 24 | logging.basicConfig(stream=sys.stdout) 25 | dss.Config.set_config(dss.BucketConfig.NORMAL) 26 | 27 | if __name__ == "__main__": 28 | dispatch(sys.argv[1:]) 29 | -------------------------------------------------------------------------------- /scripts/dss-start-stop--think-before-you-invoke-me.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Use the AWS Lambda concurrency property to disable/enable lambda functions: 4 | - set concurrency to 0 to disable execution 5 | - remove concurrency setting to enable execution 6 | 7 | As a consequence of this script, previously set Lambda concurrency limits 8 | will be 
lost. 9 | 10 | Asynchronously triggered lambda functions which are throttled are 11 | automatically added to a redrive queue, and may be retried when lambdas 12 | are restarted, depending on downtime. 13 | """ 14 | import os 15 | import sys 16 | import boto3 17 | import click 18 | import argparse 19 | 20 | 21 | stage = os.environ['DSS_DEPLOYMENT_STAGE'] 22 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # noqa 23 | 24 | 25 | parser = argparse.ArgumentParser(description=__doc__) 26 | parser.add_argument("action", choices=["start", "stop"]) 27 | args = parser.parse_args() 28 | 29 | 30 | if "start" == args.action: 31 | msg = f"(DSS {stage} START) Lambdas will be restarted with default concurrency limits. Continue?" 32 | elif "stop" == args.action: 33 | msg = f"(DSS {stage} STOP) Lambdas will be halted by setting concurrency=0. Continue?" 34 | else: 35 | raise Exception(f"Unknown action {args.action}") 36 | 37 | if not click.confirm(msg): 38 | sys.exit(0) 39 | 40 | 41 | lambda_client = boto3.client('lambda') 42 | 43 | 44 | def disable_lambda(name): 45 | lambda_client.put_function_concurrency(FunctionName=name, ReservedConcurrentExecutions=0) 46 | print(f"halted {name}") 47 | 48 | def enable_lambda(name): 49 | lambda_client.delete_function_concurrency(FunctionName=name) 50 | print(f"started {name}") 51 | 52 | 53 | root, dirs, files = next(os.walk(os.path.join(pkg_root, 'daemons'))) 54 | functions = [f'{name}-{stage}' for name in dirs] 55 | functions.append(f"dss-{stage}") 56 | 57 | 58 | for f in functions: 59 | try: 60 | resp = lambda_client.get_function(FunctionName=f) 61 | except lambda_client.exceptions.ResourceNotFoundException: 62 | print(f"{f} not deployed, or does not deploy a Lambda function") 63 | continue 64 | 65 | if "start" == args.action: 66 | enable_lambda(f) 67 | elif "stop" == args.action: 68 | disable_lambda(f) 69 | else: 70 | raise Exception(f"Unknown action {args.action}") 71 | -------------------------------------------------------------------------------- /scripts/set_apigateway_base_path_mapping.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This script idempotently maps the API Gateway custom domain name to the API Gateway 4 | stage. It should be executed for a first-time deployment after successfully 5 | running `make deploy-infra` and `make deploy`. 6 | """ 7 | 8 | import os 9 | import sys 10 | import json 11 | import boto3 12 | 13 | stage = os.environ['DSS_DEPLOYMENT_STAGE'] 14 | domain_name = os.environ['API_DOMAIN_NAME'] 15 | 16 | APIGATEWAY = boto3.client('apigateway') 17 | LAMBDA = boto3.client('lambda') 18 | lambda_name = f'dss-{stage}' 19 | 20 | lambda_arn = None 21 | paginator = LAMBDA.get_paginator('list_functions') 22 | for page in paginator.paginate(): 23 | for l in page['Functions']: 24 | if lambda_name == l['FunctionName']: 25 | lambda_arn = l['FunctionArn'] 26 | break 27 | 28 | if not lambda_arn: 29 | raise Exception(f'Lambda function {lambda_name} not found. 
Did you run `make deploy`?') 30 | 31 | policy = json.loads( 32 | LAMBDA.get_policy(FunctionName=lambda_name)['Policy'] 33 | ) 34 | 35 | source_arn = policy['Statement'][0]['Condition']['ArnLike']['AWS:SourceArn'] 36 | api_id = source_arn.split(':')[5].split('/')[0] 37 | 38 | try: 39 | APIGATEWAY.create_base_path_mapping( 40 | domainName=domain_name, 41 | restApiId=api_id, 42 | stage=stage 43 | ) 44 | except APIGATEWAY.exceptions.ConflictException: 45 | pass 46 | -------------------------------------------------------------------------------- /scripts/set_version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script sets the version variable DSS_VERSION into the SSM parameter containing 4 | # the environment variables used when deploying lambdas, and into all deployed lambdas 5 | 6 | set -euo pipefail 7 | 8 | if [[ -z $DSS_DEPLOYMENT_STAGE ]]; then 9 | echo 'Please run "source environment" in the data-store repo root directory before running this command' 10 | exit 1 11 | fi 12 | 13 | if [[ $DSS_DEPLOYMENT_STAGE == dev ]]; then 14 | version=$(git rev-parse HEAD) 15 | elif [[ "$(git tag --points-at HEAD)" != "" ]]; then 16 | version=$(git tag --points-at HEAD | tail -n 1) 17 | else 18 | version=$(git describe --tags --always) 19 | fi 20 | 21 | echo -n ${version} | scripts/dss-ops.py lambda set --quiet DSS_VERSION 22 | -------------------------------------------------------------------------------- /scripts/smugglers-box/README.md: -------------------------------------------------------------------------------- 1 | This folder is for stashing single-use scripts that we would like to keep a record of. 2 | 3 | No critical operation scripts shall be placed within this directory. 4 | 5 | Add any additional Python modules to the requirements.txt file in this directory. 6 | -------------------------------------------------------------------------------- /scripts/smugglers-box/requirements.txt: -------------------------------------------------------------------------------- 1 | deepdiff -------------------------------------------------------------------------------- /scripts/status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This script outputs the status of the most recent GitLab CI pipeline for a branch.
4 | The GitLab API hostname is expected to be stored in AWS Secrets Manager with secret id "dcp/dss/gitlab-api" 5 | The GitLab token is expected to be stored in AWS Secrets Manager with secret id "dcp/dss/gitlab-token" 6 | Usage: `scripts/status.py owner repo branch` 7 | Example: `scripts/status.py HumanCellAtlas data-store master` 8 | """ 9 | import json 10 | import boto3 11 | import requests 12 | import argparse 13 | import urllib.parse 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("owner", help="The group or owner of the repository") 17 | parser.add_argument("repo", help="The repository name") 18 | parser.add_argument("branch", help="Branch to return most recent CI pipeline status") 19 | args = parser.parse_args() 20 | 21 | sm = boto3.client("secretsmanager") 22 | 23 | gitlab_api = sm.get_secret_value(SecretId="dcp/dss/gitlab-api")['SecretString'] 24 | gitlab_token = sm.get_secret_value(SecretId="dcp/dss/gitlab-token")['SecretString'] 25 | slug = urllib.parse.quote_plus(f"{args.owner}/{args.repo}") 26 | r = requests.get( 27 | f"https://{gitlab_api}/projects/{slug}/pipelines", 28 | params={"ref": args.branch}, 29 | headers={"Private-Token": gitlab_token}, 30 | ) 31 | print(json.loads(r.text)[0]['status']) 32 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Test Convention 2 | 3 | All tests are decorated with either `testmode.standalone` or `testmode.integration` (see the example at the end of this README). The environment variable 4 | `DSS_TEST_MODE` selects which type of tests to run. If the word "integration" is in `DSS_TEST_MODE`, then `make test` 5 | will run integration tests. If the word "standalone" is in `DSS_TEST_MODE`, then `make test` will run standalone tests. 6 | `DSS_TEST_MODE` can contain both words, in which case `make test` will run both sets of tests. 7 | 8 | Standalone tests may only use the fixture and storage buckets in each replica. They may not use any other cloud 9 | resources such as Elasticsearch instances, API Gateway or Lambda functions. 10 | 11 | Integration tests require cloud resources to run. 12 | 13 | # How to Run 14 | 15 | * `make test` will run tests based on the value of the environment variable `DSS_TEST_MODE`. 16 | 17 | * `make integration_test` will run "integration" test cases. 18 | 19 | * `make all_tests` will run "standalone" and "integration" tests. 20 | 21 | * `make smoketest` will run the test_smoketest.py test suite.
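As a rough illustration of the convention above, here is a minimal sketch of a hypothetical test module (not part of this repository, and assuming the repository's `tests.infra.testmode` helpers are importable, e.g. when run from the repo root). Each test class is selected by its decorator together with the words present in `DSS_TEST_MODE`:

```python
import unittest

from tests.infra import testmode


@testmode.standalone
class TestStandaloneExample(unittest.TestCase):
    # Skipped unless "standalone" appears in DSS_TEST_MODE (the default value).
    def test_quick_check(self):
        self.assertEqual(1 + 1, 2)


@testmode.integration
class TestIntegrationExample(unittest.TestCase):
    # Skipped unless "integration" appears in DSS_TEST_MODE.
    def test_cloud_check(self):
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()
```

With `DSS_TEST_MODE="standalone integration"` set, both classes run; with the default value (`standalone`), the integration class is reported as skipped.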
-------------------------------------------------------------------------------- /tests/daemons/a47b90b2-0967-4fbf-87bc-c6c12db3fedf.2017-07-12T055120.037644Z: -------------------------------------------------------------------------------- 1 | {"format": "0.0.1", "version": "2017-07-12T055120.037644Z", "files": [], "creator_uid": 12345} -------------------------------------------------------------------------------- /tests/daemons/sample_s3_bundle_created_event.json.template: -------------------------------------------------------------------------------- 1 | { 2 | "Records":[ 3 | { 4 | "eventVersion":"2.0", 5 | "eventSource":"aws:s3", 6 | "awsRegion":"$DSS_S3_BUCKET_REGION", 7 | "eventTime":"1970-01-01T00:00:00.000Z", 8 | "eventName":"ObjectCreated:Put", 9 | "userIdentity":{ 10 | "principalId":"AIDAJDPLRKLG7UEXAMPLE" 11 | }, 12 | "requestParameters":{ 13 | "sourceIPAddress":"127.0.0.1" 14 | }, 15 | "responseElements":{ 16 | "x-amz-request-id":"C3D13FE58DE4C810", 17 | "x-amz-id-2":"FMyUVURIY8/IgAtTv8xRjskZQpcIZ9KG4V5Wp6S7S/JRWeUWerMUE5JgHvANOjpD" 18 | }, 19 | "s3":{ 20 | "s3SchemaVersion":"1.0", 21 | "configurationId":"testConfigRule", 22 | "bucket":{ 23 | "name":"$DSS_S3_BUCKET", 24 | "ownerIdentity":{ 25 | "principalId":"A3NL1KOZZKExample" 26 | }, 27 | "arn":"arn:aws:s3:::$DSS_S3_BUCKET" 28 | }, 29 | "object":{ 30 | "key":"$BUNDLE_KEY", 31 | "size":$BUNDLE_FILE_SIZE, 32 | "eTag":"$BUNDLE_FILE_ETAG", 33 | "versionId":"096fKKXTRTtl3on89fVO.nfljtsv6qko", 34 | "sequencer":"0055AED6DCD90281E5" 35 | } 36 | } 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /tests/fixtures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/tests/fixtures/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/datafiles/011c7340-9b3c-4d62-bf49-090d79daf198.2017-06-20T214506.766634Z: -------------------------------------------------------------------------------- 1 | {"format": "0.0.2", "version": "2017-06-20T214506.766634Z", "files": [{"name": "LICENSE", "uuid": "ce55fd51-7833-469b-be0b-5da88ebebfcd", "version": "2017-06-16T193604.240704Z", "content-type": "text/plain", "size": 11358, "indexed": false, "crc32c": "e16e07b9", "s3-etag": "3b83ef96387f14655fc854ddc3c6bd57", "sha1": "2b8b815229aa8a61e483fb4ba0588b8b6c491890", "sha256": "cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30"}], "creator_uid": 12345} 2 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/ce55fd51-7833-469b-be0b-5da88ebebfcd.2017-06-16T193604.240704Z: -------------------------------------------------------------------------------- 1 | {"format": "0.0.3", "bundle_uuid": "f325f679-9ac9-459c-97e3-92d79db18db8", "creator_uid": 4321, "version": "2017-06-16T193604.240704Z", "content-type": "text/plain", "size": 11358, "crc32c": "e16e07b9", "s3-etag": "3b83ef96387f14655fc854ddc3c6bd57", "sha1": "2b8b815229aa8a61e483fb4ba0588b8b6c491890", "sha256": "cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30"} 2 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/ce55fd51-7833-469b-be0b-5da88ebebfcd.2017-06-18T075702.020366Z: -------------------------------------------------------------------------------- 1 | {"format": "0.0.3", "bundle_uuid": 
"1321b3c7-c6b1-4c98-826b-f1ae44a13e3a", "creator_uid": 1234, "version": "2017-06-18T075702.020366Z", "content-type": "text/plain", "size": 8685, "crc32c": "114dee2c", "s3-etag": "7f54939b30ae7b6d45d473a4c82a41b0", "sha1": "15684690e8132044f378b4d4af8a7331c8da17b1", "sha256": "9cdc9050cecf59381fed55a2433140b69596fc861bee55abeafd1f9150f3e2da"} 2 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/tests/fixtures/datafiles/empty -------------------------------------------------------------------------------- /tests/fixtures/datafiles/example_bundle/cell_suspension_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "describedBy": "https://schema.humancellatlas.org/type/biomaterial/6.1.1/cell_suspension", 3 | "schema_type": "biomaterial", 4 | "biomaterial_core": { 5 | "biomaterial_id": "Q4_DEMO-cellsus_SAMN02797092", 6 | "ncbi_taxon_id": [ 7 | 9606 8 | ] 9 | }, 10 | "provenance": { 11 | "document_id": "cb5b6f2b-8561-4cfa-96f0-ca1dda3b661c", 12 | "submission_date": "2018-07-22T23:38:35.874Z", 13 | "update_date": "2018-07-22T23:38:43.913Z" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/example_bundle/dissociation_protocol_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "describedBy": "https://schema.humancellatlas.org/type/protocol/biomaterial_collection/2.0.0/dissociation_protocol", 3 | "schema_type": "protocol", 4 | "protocol_core": { 5 | "protocol_id": "dissociation_1", 6 | "protocol_name": "a FACS method to separate cells" 7 | }, 8 | "dissociation_method": { 9 | "text": "fluorescence-activated cell sorting", 10 | "ontology": "EFO:0009108" 11 | }, 12 | "provenance": { 13 | "document_id": "500215c8-7877-4e20-8b30-e3820bd098e4", 14 | "submission_date": "2018-07-22T23:38:36.133Z", 15 | "update_date": "2018-07-22T23:38:43.914Z" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/example_bundle/donor_organism_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "describedBy": "https://schema.humancellatlas.org/type/biomaterial/5.2.1/donor_organism", 3 | "schema_type": "biomaterial", 4 | "biomaterial_core": { 5 | "biomaterial_id": "Q4_DEMO-donor_MGH30", 6 | "biomaterial_name": "Q4 DEMO donor MGH30", 7 | "ncbi_taxon_id": [ 8 | 9606 9 | ] 10 | }, 11 | "medical_history": { 12 | "smoking_history": "yes" 13 | }, 14 | "genus_species": [ 15 | { 16 | "text": "Homo sapiens" 17 | } 18 | ], 19 | "is_living": false, 20 | "biological_sex": "unknown", 21 | "provenance": { 22 | "document_id": "dcbe0115-7871-41de-8502-b68f6ca024ab", 23 | "submission_date": "2018-07-22T23:38:35.835Z", 24 | "update_date": "2018-07-22T23:38:44.151Z" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/example_bundle/enrichment_protocol_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "describedBy": "https://schema.humancellatlas.org/type/protocol/biomaterial_collection/2.0.0/enrichment_protocol", 3 | "schema_type": "protocol", 4 | "protocol_core": { 5 | "protocol_id": "enrichment1" 
6 | }, 7 | "enrichment_method": { 8 | "text": "fluorescence-activated cell sorting", 9 | "ontology": "EFO:0009108" 10 | }, 11 | "provenance": { 12 | "document_id": "d2d57618-fa80-4b98-8c76-3c96b76c44bf", 13 | "submission_date": "2018-07-22T23:38:36.163Z", 14 | "update_date": "2018-07-22T23:38:44.095Z" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/example_bundle/library_preparation_protocol_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "describedBy": "https://schema.humancellatlas.org/type/protocol/sequencing/3.0.0/library_preparation_protocol", 3 | "schema_type": "protocol", 4 | "protocol_core": { 5 | "protocol_id": "preparation1" 6 | }, 7 | "input_nucleic_acid_molecule": { 8 | "text": "polyA RNA" 9 | }, 10 | "library_construction_approach": { 11 | "text": "Smart-seq2", 12 | "ontology": "EFO:0008931" 13 | }, 14 | "end_bias": "5 prime end bias", 15 | "strand": "unstranded", 16 | "provenance": { 17 | "document_id": "6603e237-9f5d-4a40-88b8-8df65474a794", 18 | "submission_date": "2018-07-22T23:38:36.171Z", 19 | "update_date": "2018-07-22T23:38:44.032Z" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/example_bundle/links.json: -------------------------------------------------------------------------------- 1 | { 2 | "describedBy": "http://schema.dev.data.humancellatlas.org/system/1.1.1/links", 3 | "schema_type": "link_bundle", 4 | "schema_version": "1.1.1", 5 | "links": [ 6 | { 7 | "process": "fae5415a-7a16-433b-882b-399c0f5efe34", 8 | "inputs": [ 9 | "cb5b6f2b-8561-4cfa-96f0-ca1dda3b661c" 10 | ], 11 | "input_type": "biomaterial", 12 | "outputs": [ 13 | "c2c44dd4-7fb3-410e-a765-37c6331e97b4", 14 | "78971d24-b317-4f5b-9c95-e606905414ab" 15 | ], 16 | "output_type": "file", 17 | "protocols": [ 18 | { 19 | "protocol_type": "library_preparation_protocol", 20 | "protocol_id": "6603e237-9f5d-4a40-88b8-8df65474a794" 21 | }, 22 | { 23 | "protocol_type": "sequencing_protocol", 24 | "protocol_id": "67b7bddf-c0bf-42da-b3ee-804073b26269" 25 | } 26 | ] 27 | }, 28 | { 29 | "process": "7ead592c-32d6-4a89-8e2c-471a46436ee0", 30 | "inputs": [ 31 | "bc707c41-cd66-4540-9f69-e5fed3ba6459" 32 | ], 33 | "input_type": "biomaterial", 34 | "outputs": [ 35 | "cb5b6f2b-8561-4cfa-96f0-ca1dda3b661c" 36 | ], 37 | "output_type": "biomaterial", 38 | "protocols": [ 39 | { 40 | "protocol_type": "dissociation_protocol", 41 | "protocol_id": "500215c8-7877-4e20-8b30-e3820bd098e4" 42 | }, 43 | { 44 | "protocol_type": "enrichment_protocol", 45 | "protocol_id": "d2d57618-fa80-4b98-8c76-3c96b76c44bf" 46 | } 47 | ] 48 | }, 49 | { 50 | "process": "a3400037-be42-41c3-9605-d094472646d6", 51 | "inputs": [ 52 | "dcbe0115-7871-41de-8502-b68f6ca024ab" 53 | ], 54 | "input_type": "biomaterial", 55 | "outputs": [ 56 | "bc707c41-cd66-4540-9f69-e5fed3ba6459" 57 | ], 58 | "output_type": "biomaterial", 59 | "protocols": [] 60 | } 61 | ] 62 | } 63 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/example_bundle/process_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "process_core": { 3 | "process_id": "sequence_process_file_1" 4 | }, 5 | "schema_type": "process", 6 | "describedBy": "https://schema.humancellatlas.org/type/process/2.1.1/process", 7 | "provenance": { 8 | "document_id": "fae5415a-7a16-433b-882b-399c0f5efe34", 9 | 
"submission_date": "2018-07-22T23:38:36.243Z", 10 | "update_date": "2018-07-22T23:38:40.219Z" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/example_bundle/process_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "process_core": { 3 | "process_id": "process_id_2" 4 | }, 5 | "schema_type": "process", 6 | "describedBy": "https://schema.humancellatlas.org/type/process/2.1.1/process", 7 | "provenance": { 8 | "document_id": "7ead592c-32d6-4a89-8e2c-471a46436ee0", 9 | "submission_date": "2018-07-22T23:38:36.213Z", 10 | "update_date": "2018-07-22T23:38:40.065Z" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/example_bundle/process_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "process_core": { 3 | "process_id": "process_id_1" 4 | }, 5 | "schema_type": "process", 6 | "describedBy": "https://schema.humancellatlas.org/type/process/2.1.1/process", 7 | "provenance": { 8 | "document_id": "a3400037-be42-41c3-9605-d094472646d6", 9 | "submission_date": "2018-07-22T23:38:36.205Z", 10 | "update_date": "2018-07-22T23:38:40.025Z" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/example_bundle/sequence_file_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "describedBy": "https://schema.humancellatlas.org/type/file/6.1.1/sequence_file", 3 | "schema_type": "file", 4 | "file_core": { 5 | "file_name": "R1.fastq.gz", 6 | "file_format": "fastq.gz" 7 | }, 8 | "read_index": "read1", 9 | "lane_index": 1, 10 | "provenance": { 11 | "document_id": "c2c44dd4-7fb3-410e-a765-37c6331e97b4", 12 | "submission_date": "2018-07-22T23:38:35.990Z", 13 | "update_date": "2018-07-22T23:39:37.628Z" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/example_bundle/sequence_file_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "describedBy": "https://schema.humancellatlas.org/type/file/6.1.1/sequence_file", 3 | "schema_type": "file", 4 | "file_core": { 5 | "file_name": "R2.fastq.gz", 6 | "file_format": "fastq.gz" 7 | }, 8 | "read_index": "read2", 9 | "lane_index": 1, 10 | "provenance": { 11 | "document_id": "78971d24-b317-4f5b-9c95-e606905414ab", 12 | "submission_date": "2018-07-22T23:38:36.122Z", 13 | "update_date": "2018-07-22T23:39:37.631Z" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/example_bundle/sequencing_protocol_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "describedBy": "https://schema.humancellatlas.org/type/protocol/sequencing/7.0.0/sequencing_protocol", 3 | "schema_type": "protocol", 4 | "protocol_core": { 5 | "protocol_id": "assay_1" 6 | }, 7 | "instrument_manufacturer_model": { 8 | "text": "Illumina HiSeq 2500" 9 | }, 10 | "paired_ends": true, 11 | "sequencing_approach": { 12 | "text": "full length single cell RNA sequencing", 13 | "ontology": "EFO:0008441" 14 | }, 15 | "provenance": { 16 | "document_id": "67b7bddf-c0bf-42da-b3ee-804073b26269", 17 | "submission_date": "2018-07-22T23:38:36.185Z", 18 | "update_date": "2018-07-22T23:38:44.016Z" 19 | } 20 | } 21 | 
-------------------------------------------------------------------------------- /tests/fixtures/datafiles/example_bundle/specimen_from_organism_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "describedBy": "https://schema.humancellatlas.org/type/biomaterial/5.2.2/specimen_from_organism", 3 | "schema_type": "biomaterial", 4 | "biomaterial_core": { 5 | "biomaterial_id": "Q4_DEMO-sample_SAMN02797092", 6 | "biomaterial_name": "Q4_DEMO-Single cell mRNA-seq_MGH30_A01", 7 | "ncbi_taxon_id": [ 8 | 9606 9 | ], 10 | "supplementary_files": [ 11 | "Q4_DEMO-protocol" 12 | ] 13 | }, 14 | "genus_species": [ 15 | { 16 | "text": "Homo sapiens" 17 | } 18 | ], 19 | "organ": { 20 | "text": "brain" 21 | }, 22 | "organ_part": { 23 | "text": "astrocyte" 24 | }, 25 | "disease": [ 26 | { 27 | "text": "glioblastoma" 28 | } 29 | ], 30 | "collection_time": "2018-07-22", 31 | "provenance": { 32 | "document_id": "bc707c41-cd66-4540-9f69-e5fed3ba6459", 33 | "submission_date": "2018-07-22T23:38:35.863Z", 34 | "update_date": "2018-07-22T23:38:40.442Z" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/indexing/text_data_file1.txt: -------------------------------------------------------------------------------- 1 | This file represents data that is not intended to be indexed. -------------------------------------------------------------------------------- /tests/fixtures/datafiles/indexing/text_data_file2.txt: -------------------------------------------------------------------------------- 1 | This file represents data that is not intended to be indexed. -------------------------------------------------------------------------------- /tests/fixtures/datafiles/indexing/unparseable_json.json: -------------------------------------------------------------------------------- 1 | This file represents data that is has content type 'application/json' yet cannot be parsed as json. 
-------------------------------------------------------------------------------- /tests/fixtures/datafiles/tombstones/deadbeef-0000-4a6b-8f0d-a7d2105c23be.2017-12-05T235728.441373Z: -------------------------------------------------------------------------------- 1 | {"format": "0.0.2", "version": "2017-12-05T235728.441373Z", "files": [{"name": "LICENSE", "uuid": "ce55fd51-7833-469b-be0b-5da88ebebfcd", "version": "2017-06-16T193604.240704Z", "content-type": "text/plain", "size": 11358, "indexed": false, "crc32c": "e16e07b9", "s3-etag": "3b83ef96387f14655fc854ddc3c6bd57", "sha1": "2b8b815229aa8a61e483fb4ba0588b8b6c491890", "sha256": "cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30"}], "creator_uid": 12345} 2 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/tombstones/deadbeef-0000-4a6b-8f0d-a7d2105c23be.2017-12-05T235850.950361Z: -------------------------------------------------------------------------------- 1 | {"format": "0.0.2", "version": "2017-12-05T235850.950361Z", "files": [{"name": "LICENSE", "uuid": "ce55fd51-7833-469b-be0b-5da88ebebfcd", "version": "2017-06-16T193604.240704Z", "content-type": "text/plain", "size": 11358, "indexed": false, "crc32c": "e16e07b9", "s3-etag": "3b83ef96387f14655fc854ddc3c6bd57", "sha1": "2b8b815229aa8a61e483fb4ba0588b8b6c491890", "sha256": "cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30"}], "creator_uid": 12345} 2 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/tombstones/deadbeef-0000-4a6b-8f0d-a7d2105c23be.dead: -------------------------------------------------------------------------------- 1 | { 2 | "creator_uid": 12345, 3 | "reason": "test" 4 | } 5 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/tombstones/deadbeef-0001-4a6b-8f0d-a7d2105c23be.2017-12-05T235528.321679Z: -------------------------------------------------------------------------------- 1 | {"format": "0.0.2", "version": "2017-12-05T235528.321679Z", "files": [{"name": "LICENSE", "uuid": "ce55fd51-7833-469b-be0b-5da88ebebfcd", "version": "2017-06-16T193604.240704Z", "content-type": "text/plain", "size": 11358, "indexed": false, "crc32c": "e16e07b9", "s3-etag": "3b83ef96387f14655fc854ddc3c6bd57", "sha1": "2b8b815229aa8a61e483fb4ba0588b8b6c491890", "sha256": "cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30"}], "creator_uid": 12345} 2 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/tombstones/deadbeef-0001-4a6b-8f0d-a7d2105c23be.2017-12-05T235728.441373Z: -------------------------------------------------------------------------------- 1 | {"format": "0.0.2", "version": "2017-12-05T235728.441373Z", "files": [{"name": "LICENSE", "uuid": "ce55fd51-7833-469b-be0b-5da88ebebfcd", "version": "2017-06-16T193604.240704Z", "content-type": "text/plain", "size": 11358, "indexed": false, "crc32c": "e16e07b9", "s3-etag": "3b83ef96387f14655fc854ddc3c6bd57", "sha1": "2b8b815229aa8a61e483fb4ba0588b8b6c491890", "sha256": "cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30"}], "creator_uid": 12345} 2 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/tombstones/deadbeef-0001-4a6b-8f0d-a7d2105c23be.2017-12-05T235850.950361Z: -------------------------------------------------------------------------------- 1 | {"format": "0.0.2", "version": 
"2017-12-05T235850.950361Z", "files": [{"name": "LICENSE", "uuid": "ce55fd51-7833-469b-be0b-5da88ebebfcd", "version": "2017-06-16T193604.240704Z", "content-type": "text/plain", "size": 11358, "indexed": false, "crc32c": "e16e07b9", "s3-etag": "3b83ef96387f14655fc854ddc3c6bd57", "sha1": "2b8b815229aa8a61e483fb4ba0588b8b6c491890", "sha256": "cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30"}], "creator_uid": 12345} 2 | -------------------------------------------------------------------------------- /tests/fixtures/datafiles/tombstones/deadbeef-0001-4a6b-8f0d-a7d2105c23be.2017-12-05T235850.950361Z.dead: -------------------------------------------------------------------------------- 1 | { 2 | "creator_uid": 12345, 3 | "reason": "test" 4 | } 5 | -------------------------------------------------------------------------------- /tests/fixtures/populate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | import sys 6 | 7 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', "..")) # noqa 8 | sys.path.insert(0, pkg_root) # noqa 9 | 10 | from tests.fixtures.populate_lib import populate 11 | 12 | if __name__ == '__main__': 13 | 14 | parser = argparse.ArgumentParser(description="Set up test fixtures in cloud storage buckets") 15 | parser.add_argument("--s3-bucket", type=str) 16 | parser.add_argument("--gs-bucket", type=str) 17 | 18 | args = parser.parse_args() 19 | 20 | populate(args.s3_bucket, args.gs_bucket) 21 | 22 | print("Fixtures populated. Run tests to ensure fixture integrity!") 23 | -------------------------------------------------------------------------------- /tests/hcablobstore_base.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from cloud_blobstore import BlobNotFoundError 4 | 5 | from tests.infra import testmode 6 | 7 | 8 | @testmode.integration 9 | class HCABlobStoreTests: 10 | """ 11 | Common HCA blobstore tests. We want to avoid repeating ourselves, so if we 12 | built the abstractions correctly, common operations can all be tested here. 13 | """ 14 | 15 | def test_verify_blob_checksum_from_staging_metadata(self): 16 | bucket = self.test_fixtures_bucket 17 | key = "test_good_source_data/0" 18 | self.assertTrue( 19 | self.hcahandle.verify_blob_checksum_from_staging_metadata( 20 | bucket, key, 21 | self.blobhandle.get_user_metadata(bucket, key))) 22 | 23 | key = "test_bad_source_data/incorrect_checksum" 24 | self.assertFalse( 25 | self.hcahandle.verify_blob_checksum_from_staging_metadata( 26 | bucket, key, 27 | self.blobhandle.get_user_metadata(bucket, key))) 28 | 29 | key = "DOES_NOT_EXIST" 30 | with self.assertRaises(BlobNotFoundError): 31 | self.hcahandle.verify_blob_checksum_from_staging_metadata( 32 | bucket, key, {}) 33 | 34 | def test_verify_blob_checksum_from_dss_metadata(self): 35 | bucket = self.test_fixtures_bucket 36 | key = ("blobs/cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30." 37 | "2b8b815229aa8a61e483fb4ba0588b8b6c491890.3b83ef96387f14655fc854ddc3c6bd57.e16e07b9") 38 | bundle_key = "bundles/011c7340-9b3c-4d62-bf49-090d79daf198.2017-06-20T214506.766634Z" 39 | bundle = json.loads(self.blobhandle.get(bucket, bundle_key)) 40 | self.assertTrue( 41 | self.hcahandle.verify_blob_checksum_from_dss_metadata( 42 | bucket, key, bundle['files'][0])) 43 | 44 | key = ("blobs/9cdc9050cecf59381fed55a2433140b69596fc861bee55abeafd1f9150f3e2da." 
45 | "15684690e8132044f378b4d4af8a7331c8da17b1.7f54939b30ae7b6d45d473a4c82a41b0.114dee2c") 46 | self.assertFalse( 47 | self.hcahandle.verify_blob_checksum_from_dss_metadata( 48 | bucket, key, bundle['files'][0])) 49 | 50 | key = "DOES_NOT_EXIST" 51 | with self.assertRaises(BlobNotFoundError): 52 | self.hcahandle.verify_blob_checksum_from_dss_metadata( 53 | bucket, key, bundle['files'][0]) 54 | -------------------------------------------------------------------------------- /tests/infra/__init__.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | import uuid 4 | import time 5 | 6 | from dss.util.types import LambdaContext 7 | from .assert_mixin import DSSAssertResponse, DSSAssertMixin, ExpectedErrorFields 8 | from .storage_mixin import DSSStorageMixin, TestBundle 9 | from .testmode import integration, standalone 10 | from .upload_mixin import DSSUploadMixin 11 | from .auth_tests_mixin import TestAuthMixin 12 | from .mock_storage_handler import MockStorageHandler 13 | 14 | 15 | def get_env(varname): 16 | if varname not in os.environ: 17 | raise RuntimeError( 18 | "Please set the {} environment variable".format(varname)) 19 | return os.environ[varname] 20 | 21 | 22 | def generate_test_key() -> str: 23 | callerframerecord = inspect.stack()[1] # 0 represents this line, 1 represents line at caller. 24 | frame = callerframerecord[0] 25 | info = inspect.getframeinfo(frame) 26 | filename = os.path.basename(info.filename) 27 | unique_key = str(uuid.uuid4()) 28 | 29 | return f"{filename}/{info.function}/{unique_key}" 30 | 31 | 32 | # noinspection PyAbstractClass 33 | class MockLambdaContext(LambdaContext): 34 | """ 35 | A mock of the class an instance of which the AWS Lambda Python runtime injects into each invocation. 
36 | """ 37 | 38 | def __init__(self, timeout: float = 300.0) -> None: 39 | self.deadline = time.time() + timeout 40 | 41 | def get_remaining_time_in_millis(self): 42 | return int(max(0.0, self.deadline - time.time()) * 1000) 43 | -------------------------------------------------------------------------------- /tests/infra/auth_tests_mixin.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from tests import get_auth_header 4 | from tests.infra import DSSAssertMixin 5 | 6 | 7 | class TestAuthMixin(DSSAssertMixin): 8 | def _test_auth_errors(self, method: str, url: str, skip_group_test=False, **kwargs): 9 | with self.subTest("Gibberish auth header"): # type: ignore 10 | resp = self.assertResponse(method, url, requests.codes.unauthorized, headers=get_auth_header(False), 11 | **kwargs) 12 | self.assertEqual(resp.response.headers['Content-Type'], "application/problem+json") 13 | self.assertEqual(resp.json['title'], 'Failed to decode token.') 14 | 15 | with self.subTest("No auth header"): # type: ignore 16 | resp = self.assertResponse(method, url, requests.codes.unauthorized, **kwargs) # type: ignore 17 | self.assertEqual(resp.response.headers['Content-Type'], "application/problem+json") 18 | self.assertEqual(resp.json['title'], 'No authorization token provided') 19 | 20 | if not skip_group_test: 21 | with self.subTest("unauthorized group"): # type: ignore 22 | resp = self.assertResponse(method, url, requests.codes.forbidden, 23 | headers=get_auth_header(group='someone'), 24 | **kwargs) 25 | self.assertEqual(resp.response.headers['Content-Type'], "application/problem+json") 26 | self.assertEqual(resp.json['title'], 'User is not authorized to access this resource') 27 | 28 | # Don't run this test for test_bundle and test_file because they don't need email 29 | if not url.split('/')[2] in ('files', 'bundles'): 30 | with self.subTest("no email claims"): # type: ignore 31 | resp = self.assertResponse(method, url, 32 | requests.codes.unauthorized, 33 | headers=get_auth_header(email=False, email_claim=False), 34 | **kwargs) 35 | self.assertEqual(resp.response.headers['Content-Type'], "application/problem+json") 36 | self.assertEqual(resp.json['title'], 'Authorization token is missing email claims.') 37 | -------------------------------------------------------------------------------- /tests/infra/elasticsearch_test_case.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import os 4 | 5 | from dss.config import Config 6 | from tests.es import elasticsearch_delete_index, ElasticsearchServer 7 | 8 | 9 | class ElasticsearchTestCase(unittest.TestCase): 10 | 11 | server = None 12 | 13 | @classmethod 14 | def setUpClass(cls): 15 | super().setUpClass() 16 | Config.test_index_suffix.prepend(cls.__name__.lower()) 17 | cls.server = ElasticsearchServer() 18 | os.environ['DSS_ES_PORT'] = str(cls.server.port) 19 | 20 | @classmethod 21 | def tearDownClass(cls): 22 | elasticsearch_delete_index('*' + Config.test_index_suffix.value) 23 | Config.test_index_suffix.restore() 24 | cls.server.shutdown() 25 | os.unsetenv('DSS_ES_PORT') 26 | super().tearDownClass() 27 | 28 | def setUp(self): 29 | super().setUp() 30 | Config.test_index_suffix.prepend(self._testMethodName.lower()) 31 | 32 | def tearDown(self): 33 | Config.test_index_suffix.restore() 34 | super().tearDown() 35 | -------------------------------------------------------------------------------- /tests/infra/testmode.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | 5 | def standalone(f): 6 | return unittest.skipUnless(is_standalone(), "Skipping standalone test")(f) 7 | 8 | 9 | def is_standalone(): 10 | return "standalone" in _test_mode() 11 | 12 | 13 | def integration(f): 14 | return unittest.skipUnless(is_integration(), "Skipping integration test")(f) 15 | 16 | 17 | def is_integration(): 18 | return "integration" in _test_mode() 19 | 20 | 21 | def always(f): 22 | return f 23 | 24 | 25 | def _test_mode(): 26 | return os.environ.get('DSS_TEST_MODE', "standalone") 27 | -------------------------------------------------------------------------------- /tests/sample_doc_tombstone.json: -------------------------------------------------------------------------------- 1 | { 2 | "email": "sample_doc_tombstone@fixtures.data.humancellatlas.org", 3 | "reason": "disappeared", 4 | "admin_deleted": true 5 | } 6 | -------------------------------------------------------------------------------- /tests/sample_search_queries.py: -------------------------------------------------------------------------------- 1 | """Sample queries for testing elastic search""" 2 | 3 | smartseq2_paired_ends_vx_query = \ 4 | { 5 | 'query': { 6 | 'bool': { 7 | 'must': [{ 8 | 'match': { 9 | "files.donor_organism_json.medical_history.smoking_history": "yes" 10 | } 11 | }, { 12 | 'match': { 13 | "files.specimen_from_organism_json.genus_species.text": "Homo sapiens" 14 | } 15 | }, { 16 | 'match': { 17 | "files.specimen_from_organism_json.organ.text": "brain" 18 | } 19 | }] 20 | } 21 | } 22 | } 23 | 24 | tombstone_query = { 25 | "query": { 26 | 'bool': { 27 | 'must': [ 28 | { 29 | 'term': { 30 | "admin_deleted": True 31 | } 32 | } 33 | ] 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/sample_v0_index_doc_tombstone.json: -------------------------------------------------------------------------------- 1 | { 2 | "email": "sample_v0_index_doc_tombstone@fixtures.data.humancellatlas.org", 3 | "reason": "disappeared", 4 | "admin_deleted": true, 5 | "uuid": "2712e1e7-d276-4c56-b33f-ead49e50b19e", 6 | "version": "2019-01-18T190257.602413Z" 7 | } 8 | -------------------------------------------------------------------------------- /tests/sample_vx_index_doc_tombstone.json: -------------------------------------------------------------------------------- 1 | { 2 | "email": "sample_vx_index_doc_tombstone@fixtures.data.humancellatlas.org", 3 | "reason": "disappeared", 4 | "admin_deleted": true, 5 | "uuid": "2712e1e7-d276-4c56-b33f-ead49e50b19e", 6 | "manifest": { 7 | "version": "2019-01-18T190257.602413Z" 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /tests/scalability/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/tests/scalability/__init__.py -------------------------------------------------------------------------------- /tests/scalability/scale_test_runner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import datetime 5 | import os 6 | import sys 7 | import time 8 | import uuid 9 | 10 | import argparse 11 | 12 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'sns'))) # noqa 13 | sys.path.insert(0, 
os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) # noqa 14 | 15 | from sns import SnsClient 16 | 17 | ONE_SECOND_IN_MICROSECONDS = 1 * 1000 * 1000 # 1 sec expressed in microseconds 18 | 19 | class ScaleTestRunner: 20 | def __init__(self, rps: int, duration_seconds: int) -> None: 21 | self.rps = rps 22 | self.duration_seconds = duration_seconds 23 | self.run_id = str(uuid.uuid4()) 24 | self.sns_client = SnsClient() 25 | self.loading = '.' * duration_seconds 26 | 27 | def run(self): 28 | print(f"Starting runId: {self.run_id}") 29 | self.end_time = datetime.datetime.now() + datetime.timedelta(seconds=self.duration_seconds) 30 | self.sns_client.start_test_run(self.run_id) 31 | for cnt in range(self.duration_seconds): 32 | self.generate_load_rps() 33 | self.loading = self.loading[:cnt] + '#' + self.loading[cnt + 1:] 34 | print('\r%s Sending at %3d percent!' % (self.loading, (cnt + 1) * 100 / self.duration_seconds), end='') 35 | print("\nDone") 36 | 37 | def generate_load_rps(self): 38 | start = datetime.datetime.now() 39 | for cnt in range(self.rps): 40 | self.sns_client.start_test_execution(self.run_id, str(uuid.uuid4())) 41 | duration = datetime.datetime.now() - start 42 | elapsed_time = duration.microseconds / ONE_SECOND_IN_MICROSECONDS 43 | 44 | if elapsed_time < 1.0: 45 | time.sleep(1.0 - elapsed_time) 46 | 47 | 48 | if __name__ == "__main__": 49 | parser = argparse.ArgumentParser(description='DSS scalability test runner') 50 | parser.add_argument('-r', '--rps', 51 | help='requests generated per second', 52 | default='10') 53 | parser.add_argument('-d', '--duration', 54 | help='duration of the test', 55 | default='20') 56 | results = parser.parse_args(sys.argv[1:]) 57 | 58 | rps = int(results.rps) 59 | duration = int(results.duration) 60 | 61 | print(f"Test configuration rps: {rps} duration: {duration}") 62 | 63 | runner = ScaleTestRunner(rps, duration) 64 | runner.run() 65 | -------------------------------------------------------------------------------- /tests/scalability/sns/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import os 4 | from concurrent.futures import ThreadPoolExecutor 5 | 6 | from dss.util.aws import ARN 7 | 8 | sns = boto3.client('sns') 9 | sns.meta.config.max_pool_connections = 100 10 | sns_topic_run = "dss-scalability-test-run" 11 | sns_topic_exec = "dss-scalability-test" 12 | stage = os.environ["DSS_DEPLOYMENT_STAGE"] 13 | 14 | 15 | class SnsClient(): 16 | def __init__(self): 17 | self.executor = ThreadPoolExecutor(max_workers=4) 18 | 19 | def get_sns_topic_arn(self, sns_topic): 20 | return f"arn:aws:sns:{ARN.get_region()}:{ARN.get_account_id()}:{sns_topic}-{stage}" 21 | 22 | def start_test_run(self, run_id: str): 23 | self.executor.submit(self.send_start_run, run_id)  # pass the callable and its argument so the publish runs on a worker thread 24 | 25 | def start_test_execution(self, run_id: str, execution_id: str): 26 | self.execution_id = execution_id 27 | self.executor.submit(self.send_start_exec, run_id)  # likewise, submit rather than call 28 | 29 | def _send(self, sns_topic, msg): 30 | publish_args = { 31 | 'Message': json.dumps(msg), 32 | 'TopicArn': self.get_sns_topic_arn(sns_topic) 33 | } 34 | sns.publish(**publish_args) 35 | 36 | def send_start_run(self, run_id: str): 37 | msg = {'run_id': run_id} 38 | self._send(sns_topic_run, msg) 39 | 40 | def send_start_exec(self, run_id): 41 | msg = {"run_id": run_id, "execution_id": self.execution_id} 42 | self._send(sns_topic_exec, msg) 43 | -------------------------------------------------------------------------------- /tests/test_doctests.py:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Run selected doctests as a regular unit test. 6 | """ 7 | 8 | import doctest 9 | import os 10 | import sys 11 | import unittest 12 | 13 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # noqa 14 | sys.path.insert(0, pkg_root) # noqa 15 | 16 | from tests.infra import testmode 17 | 18 | 19 | @testmode.standalone 20 | def load_tests(loader, tests, ignore): 21 | tests.addTests(doctest.DocTestSuite('dss.util.retry')) 22 | tests.addTests(doctest.DocTestSuite('dss.util.iterators')) 23 | tests.addTests(doctest.DocTestSuite('dss.index.es')) 24 | tests.addTests(doctest.DocTestSuite('dss.vendored.frozendict')) 25 | return tests 26 | 27 | 28 | if __name__ == '__main__': 29 | unittest.main() 30 | -------------------------------------------------------------------------------- /tests/test_email.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from unittest import mock 4 | 5 | import os 6 | 7 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # noqa 8 | sys.path.insert(0, pkg_root) # noqa 9 | 10 | import dss.util.email 11 | from tests.infra import testmode 12 | from dss import Replica 13 | 14 | 15 | @testmode.standalone 16 | class TestEmail(unittest.TestCase): 17 | @classmethod 18 | def tearDownClass(cls): 19 | pass 20 | 21 | @classmethod 22 | def setUp(self): 23 | pass 24 | 25 | @mock.patch('dss.util.email.send_email') 26 | def test_send_email(self, send_email_func): 27 | send_email_func.return_value = 'Success' 28 | dss.util.email.send_email('sender', 'receiver', 'subject', 'html', 'txt') 29 | self.assertEquals(send_email_func('sender', 'receiver', 'subject', 'html', 'txt'), 'Success') 30 | 31 | @mock.patch('dss.util.email.send_email') 32 | def test_send_checkout_success_email(self, send_email_func): 33 | send_email_func.return_value = 'Success' 34 | dss.util.email.send_checkout_success_email('sender', 'to', 'bucket', 'location', Replica.aws) 35 | 36 | args, kwargs = send_email_func.call_args 37 | self.assertEqual(args[0], 'sender') 38 | self.assertEqual(args[1], 'to') 39 | self.assertEqual(args[2], dss.util.email.SUCCESS_SUBJECT) 40 | self.assertIn('', args[3]) 41 | self.assertNotIn('', args[4]) 42 | 43 | @mock.patch('dss.util.email.send_email') 44 | def test_send_checkout_failure_email(self, send_email_func): 45 | send_email_func.return_value = 'Success' 46 | dss.util.email.send_checkout_failure_email('sender', 'to', 'cause') 47 | 48 | args, kwargs = send_email_func.call_args 49 | self.assertEqual(args[0], 'sender') 50 | self.assertEqual(args[1], 'to') 51 | self.assertEqual(args[2], dss.util.email.FAILURE_SUBJECT) 52 | self.assertIn('', args[3]) 53 | self.assertNotIn('', args[4]) 54 | 55 | 56 | if __name__ == '__main__': 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /tests/test_exptime.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | import sys 6 | import unittest 7 | import uuid 8 | 9 | import requests 10 | 11 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # noqa 12 | sys.path.insert(0, pkg_root) # noqa 13 | 14 | from dss.config import DeploymentStage 15 | from tests.infra import DSSAssertMixin, DSSUploadMixin, ExpectedErrorFields, testmode 16 
| from tests.infra.server import ThreadedLocalServer 17 | 18 | 19 | @testmode.standalone 20 | class TestExptime(unittest.TestCase, DSSAssertMixin, DSSUploadMixin): 21 | @classmethod 22 | def setUpClass(cls): 23 | cls.app = ThreadedLocalServer() 24 | cls.app.start() 25 | cls.app._chalice_app._override_exptime_seconds = 15.0 26 | 27 | @classmethod 28 | def tearDownClass(cls): 29 | cls.app.shutdown() 30 | 31 | def test_exptime(self): 32 | self.assertGetResponse( 33 | "/internal/slow_request", 34 | requests.codes.gateway_timeout, 35 | expected_error=ExpectedErrorFields( 36 | code="timed_out", 37 | status=requests.codes.gateway_timeout, 38 | ) 39 | ) 40 | 41 | @unittest.skipIf(DeploymentStage.IS_PROD(), "Skipping synthetic 504 test for PROD.") 42 | def test_synthetic_504(self): 43 | file_uuid = str(uuid.uuid4()) 44 | r = self.assertGetResponse( 45 | f"/v1/files/{file_uuid}?replica=aws", 46 | requests.codes.gateway_timeout, 47 | expected_error=ExpectedErrorFields( 48 | code="timed_out", 49 | status=requests.codes.gateway_timeout, 50 | ), 51 | headers={ 52 | "DSS_FAKE_504_PROBABILITY": "1.0", 53 | } 54 | ) 55 | with self.subTest('Retry-After headers are included in a GET /v1/bundles/{uuid} 504 response.'): 56 | self.assertEqual(int(r.response.headers['Retry-After']), 10) 57 | 58 | 59 | if __name__ == '__main__': 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /tests/test_gshcablobstore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | import sys 6 | import unittest 7 | 8 | from cloud_blobstore.gs import GSBlobStore 9 | 10 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # noqa 11 | sys.path.insert(0, pkg_root) # noqa 12 | 13 | from dss.storage.hcablobstore.gs import GSHCABlobStore 14 | from tests import infra 15 | from tests.hcablobstore_base import HCABlobStoreTests 16 | 17 | 18 | class TestGSHCABlobStore(unittest.TestCase, HCABlobStoreTests): 19 | def setUp(self): 20 | self.credentials = infra.get_env("GOOGLE_APPLICATION_CREDENTIALS") 21 | self.test_bucket = infra.get_env("DSS_GS_BUCKET_TEST") 22 | self.test_fixtures_bucket = infra.get_env("DSS_GS_BUCKET_TEST_FIXTURES") 23 | self.blobhandle = GSBlobStore.from_auth_credentials(self.credentials) 24 | self.hcahandle = GSHCABlobStore(self.blobhandle) 25 | 26 | def tearDown(self): 27 | pass 28 | 29 | 30 | if __name__ == '__main__': 31 | unittest.main() 32 | -------------------------------------------------------------------------------- /tests/test_hcagenerator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import os 5 | import sys 6 | import unittest 7 | 8 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # noqa 9 | sys.path.insert(0, pkg_root) # noqa 10 | 11 | from dss import Config, BucketConfig 12 | from dss.util.json_gen.hca_generator import HCAJsonGenerator 13 | from tests.infra import testmode 14 | 15 | schema_urls = [ 16 | "https://raw.githubusercontent.com/HumanCellAtlas/metadata-schema/4.6.0/json_schema/analysis_bundle.json", 17 | "https://raw.githubusercontent.com/HumanCellAtlas/metadata-schema/4.6.0/json_schema/assay_bundle.json", 18 | "https://raw.githubusercontent.com/HumanCellAtlas/metadata-schema/4.6.0/json_schema/project_bundle.json", 19 | "https://schema.humancellatlas.org/bundle/5.1.0/project", 20 | 
"https://schema.humancellatlas.org/bundle/5.1.0/submission", 21 | "https://schema.humancellatlas.org/bundle/5.1.0/ingest_audit", 22 | ] 23 | 24 | 25 | @testmode.standalone 26 | class TestHCAGenerator(unittest.TestCase): 27 | repeat = 25 28 | 29 | @classmethod 30 | def setUpClass(cls): 31 | Config.set_config(BucketConfig.TEST) 32 | 33 | def setUp(self): 34 | self.faker = HCAJsonGenerator(schema_urls) 35 | 36 | def test_locals(self): 37 | for url in schema_urls: 38 | name = url.split('/')[-1] 39 | self.assertEqual(self.faker.schemas[name], {'$ref': url, 'id': url}) 40 | 41 | def test_generation(self): 42 | for i in range(self.repeat): 43 | for name in self.faker.schemas.keys(): 44 | with self.subTest(name): 45 | fake_json = self.faker.generate(name) 46 | self.assertIsInstance(json.loads(fake_json), dict) 47 | 48 | 49 | if __name__ == "__main__": 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /tests/test_health.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | from unittest import mock 5 | import logging 6 | 7 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # noqa 8 | sys.path.insert(0, pkg_root) # noqa 9 | 10 | from dss.api import health 11 | from tests.infra import testmode 12 | from tests.es import ElasticsearchServer 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | @testmode.integration 18 | class TestHealth(unittest.TestCase): 19 | server = None 20 | 21 | @classmethod 22 | def setUpClass(cls): 23 | cls.server = ElasticsearchServer() 24 | os.environ['DSS_ES_PORT'] = str(cls.server.port) 25 | pass 26 | 27 | @classmethod 28 | def tearDownClass(cls): 29 | cls.server.shutdown() 30 | os.unsetenv('DSS_ES_PORT') 31 | pass 32 | 33 | def test_elastic_search(self): 34 | test_es = health._get_es_status(port=os.getenv("DSS_ES_PORT")) 35 | self.assertIn(True, test_es) 36 | 37 | def test_dynamodb(self): 38 | test_ddb = health._get_dynamodb_status() 39 | self.assertIn(True, test_ddb) 40 | 41 | def test_event_relay(self): 42 | test_er = health._get_event_relay_status() 43 | self.assertIn(True, test_er) 44 | 45 | @mock.patch("dss.api.health._get_es_status") 46 | @mock.patch("dss.api.health._get_dynamodb_status") 47 | @mock.patch("dss.api.health._get_event_relay_status") 48 | def test_full_health_check(self, mock_er, mock_ddb, mock_es): 49 | healthy_res = {"Healthy": True} 50 | mock_es.return_value = (True, None) 51 | mock_ddb.return_value = (True, None) 52 | mock_er.return_value = (True, None) 53 | mock_res = health.l2_health_checks() 54 | self.assertDictEqual(healthy_res, mock_res) 55 | 56 | @testmode.standalone 57 | def test_resource_fetch(self): 58 | 59 | service_tags = {"Key": "service", "Values": ["dss"]} 60 | resource_list = health.get_resource_by_tag(resource_string='dynamodb:table', tag_filter=service_tags) 61 | ddb_tables = [x['ResourceARN'].split('/')[1] for x in resource_list['ResourceTagMappingList'] if 62 | os.environ.get('DSS_DEPLOYMENT_STAGE') in x['ResourceARN']] 63 | print(ddb_tables) 64 | self.assertGreater(len(ddb_tables), 0) 65 | 66 | 67 | if __name__ == "__main__": 68 | unittest.main() 69 | -------------------------------------------------------------------------------- /tests/test_s3hcablobstore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | import sys 6 | import unittest 7 | 8 | from 
cloud_blobstore.s3 import S3BlobStore 9 | 10 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # noqa 11 | sys.path.insert(0, pkg_root) # noqa 12 | 13 | from dss.storage.hcablobstore.s3 import S3HCABlobStore 14 | from tests import infra 15 | from tests.hcablobstore_base import HCABlobStoreTests 16 | 17 | 18 | class TestS3HCABlobStore(unittest.TestCase, HCABlobStoreTests): 19 | def setUp(self): 20 | self.test_bucket = infra.get_env("DSS_S3_BUCKET_TEST") 21 | self.test_fixtures_bucket = infra.get_env("DSS_S3_BUCKET_TEST_FIXTURES") 22 | self.blobhandle = S3BlobStore.from_environment() 23 | self.hcahandle = S3HCABlobStore(self.blobhandle) 24 | 25 | def tearDown(self): 26 | pass 27 | 28 | 29 | if __name__ == '__main__': 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /tests/test_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | import requests 5 | 6 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) # noqa 7 | sys.path.insert(0, pkg_root) # noqa 8 | 9 | from tests.infra import testmode 10 | from tests.infra.server import ThreadedLocalServer, MockFusilladeHandler 11 | import dss.error 12 | from dss.util import security 13 | from dss import BucketConfig, Config, DeploymentStage 14 | 15 | 16 | def setUpModule(): 17 | MockFusilladeHandler.start_serving() 18 | 19 | 20 | def tearDownModule(): 21 | MockFusilladeHandler.stop_serving() 22 | 23 | 24 | class TestMockFusilladeServer(unittest.TestCase): 25 | """Test that the mock Fusillade server in dss/tests/infra/server.py is functioning properly""" 26 | 27 | def test_get_policy(self): 28 | actions = ["dss:PutBundle"] 29 | resources = ["arn:hca:dss:dev:*:bundle/123456/0"] 30 | 31 | # Ensure whitelisted principals are granted access 32 | for principal in MockFusilladeHandler._whitelist: 33 | security.assert_authorized(principal, actions, resources) 34 | 35 | # Ensure non-whitelisted principals are denied access 36 | for principal in ['invalid-email@test-server.data.humancellatlas.org']: 37 | with self.assertRaises(dss.error.DSSForbiddenException): 38 | security.assert_authorized(principal, actions, resources) 39 | 40 | 41 | if __name__ == "__main__": 42 | unittest.main() 43 | -------------------------------------------------------------------------------- /tests/test_standalone_script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Test that the standalone can start up and answer a request. 
6 | """ 7 | 8 | import contextlib 9 | import os 10 | import socket 11 | import subprocess 12 | import sys 13 | import time 14 | import unittest 15 | import uuid 16 | import requests 17 | 18 | pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # noqa 19 | sys.path.insert(0, pkg_root) # noqa 20 | 21 | from tests import get_auth_header 22 | from dss.util import networking 23 | from tests.infra import testmode 24 | 25 | 26 | class TestStandaloneScript(unittest.TestCase): 27 | @classmethod 28 | def setUpClass(cls, timeout_seconds=10): 29 | dss_root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 30 | cls.port = networking.unused_tcp_port() 31 | cls.subprocess = subprocess.Popen( 32 | [ 33 | os.path.join(dss_root_path, "dss-api"), 34 | "--port", 35 | str(cls.port), 36 | "--log-level", 37 | "CRITICAL", 38 | ], 39 | cwd=dss_root_path 40 | ) 41 | 42 | end_time = time.time() + timeout_seconds 43 | delay = 0.05 44 | while time.time() < end_time: 45 | try: 46 | with contextlib.closing(socket.create_connection(("127.0.0.1", cls.port))): 47 | pass 48 | break 49 | except ConnectionError: 50 | delay = max(1.0, delay * 2) 51 | time.sleep(delay) 52 | continue 53 | 54 | @classmethod 55 | def tearDownClass(cls): 56 | cls.subprocess.terminate() 57 | cls.subprocess.wait() 58 | 59 | @testmode.standalone 60 | def test_simple_request(self): 61 | file_uuid = str(uuid.uuid4()) 62 | response = requests.api.get(f"http://127.0.0.1:{self.port}/v1/files/{file_uuid}?replica=aws", 63 | headers=get_auth_header()) 64 | self.assertEqual(response.status_code, requests.codes.not_found) 65 | 66 | 67 | if __name__ == '__main__': 68 | unittest.main() 69 | -------------------------------------------------------------------------------- /vendor.in/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/data-store/6b27d0f7e0110c62b3079151708689ab5145f15b/vendor.in/.keep --------------------------------------------------------------------------------