├── LaunchPortal.jpg ├── Authorized URL.jpg ├── SandboxContainers.jpg ├── LoggedInScreenshot.jpg ├── Authorization_URL_2020.jpg ├── scripts ├── fence_key_helper.py ├── wait_for_esproxy.sh ├── arborist_setup.sh ├── peregrine_setup.sh ├── indexd_setup.sh ├── sheepdog_setup.sh ├── fence_setup.sh ├── waitForContainers.sh ├── postgres_always.sh ├── postgres_run.sh └── postgres_init.sql ├── datadictionary ├── gdcdictionary │ ├── examples │ │ ├── valid │ │ │ ├── program.json │ │ │ ├── publication.json │ │ │ ├── case.json │ │ │ ├── keyword.json │ │ │ ├── acknowledgement.json │ │ │ ├── project.json │ │ │ ├── aliquot.json │ │ │ ├── slide_count.json │ │ │ ├── demographic.json │ │ │ ├── exposure.json │ │ │ ├── family_history.json │ │ │ ├── slide_image.json │ │ │ ├── experimental_metadata.json │ │ │ ├── submitted_somatic_mutation.json │ │ │ ├── submitted_copy_number.json │ │ │ ├── aligned_reads_index.json │ │ │ ├── experiment.json │ │ │ ├── treatment.json │ │ │ ├── slide.json │ │ │ ├── submitted_unaligned_reads.json │ │ │ ├── submitted_aligned_reads.json │ │ │ ├── submitted_methylation.json │ │ │ ├── read_group.json │ │ │ ├── sample.json │ │ │ ├── read_group_qc.json │ │ │ ├── clinical_test.json │ │ │ └── diagnosis.json │ │ └── invalid │ │ │ ├── case_invalid_1.json │ │ │ ├── aliquot_invalid_2.json │ │ │ ├── aliquot_invalid_1.json │ │ │ ├── aliquot_invalid_3.json │ │ │ └── case_invalid_2.json │ ├── schemas │ │ ├── _settings.yaml │ │ ├── projects │ │ │ └── project1.yaml │ │ ├── README.md │ │ ├── program.yaml │ │ ├── keyword.yaml │ │ ├── publication.yaml │ │ ├── acknowledgement.yaml │ │ ├── case.yaml │ │ ├── experimental_metadata.yaml │ │ ├── aligned_reads_index.yaml │ │ ├── submitted_somatic_mutation.yaml │ │ ├── family_history.yaml │ │ ├── submitted_unaligned_reads.yaml │ │ ├── submitted_aligned_reads.yaml │ │ ├── submitted_copy_number.yaml │ │ ├── submitted_methylation.yaml │ │ ├── aliquot.yaml │ │ ├── demographic.yaml │ │ ├── exposure.yaml │ │ ├── slide_image.yaml │ │ ├── 
slide_count.yaml │ │ ├── experiment.yaml │ │ ├── read_group_qc.yaml │ │ ├── slide.yaml │ │ ├── project.yaml │ │ ├── core_metadata_collection.yaml │ │ ├── treatment.yaml │ │ ├── clinical_test.yaml │ │ ├── read_group.yaml │ │ └── _definitions.yaml │ ├── __init__.py │ └── schema_test.py ├── setup.py ├── NOTICE ├── design_notes.md └── README.md ├── templates ├── etl_creds.json ├── guppy_config.json ├── indexd_creds.json ├── peregrine_creds.json ├── sheepdog_creds.json ├── test_config_helper.py ├── indexd_settings.py ├── etlMapping.yaml ├── gitops.json ├── sheepdog_settings.py ├── peregrine_settings.py └── user.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── docker-compose.override.sample.yml ├── NOTICE ├── guppy_setup.sh ├── docs ├── release_history.md ├── database_information.md ├── cheat_sheet.md ├── useful_links.md ├── dev_tips.md └── setup.md ├── smoke_test.sh ├── README.md ├── dump.sh ├── Jenkinsfile ├── creds_setup.sh ├── nginx.conf └── .secrets.baseline /LaunchPortal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-cdis/compose-services/HEAD/LaunchPortal.jpg -------------------------------------------------------------------------------- /Authorized URL.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-cdis/compose-services/HEAD/Authorized URL.jpg -------------------------------------------------------------------------------- /SandboxContainers.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-cdis/compose-services/HEAD/SandboxContainers.jpg -------------------------------------------------------------------------------- /LoggedInScreenshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-cdis/compose-services/HEAD/LoggedInScreenshot.jpg 
-------------------------------------------------------------------------------- /Authorization_URL_2020.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-cdis/compose-services/HEAD/Authorization_URL_2020.jpg -------------------------------------------------------------------------------- /scripts/fence_key_helper.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import os 3 | key = base64.urlsafe_b64encode(os.urandom(32)) 4 | print(key.decode('UTF-8')) 5 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/program.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "program", 3 | "name": "CGCI", 4 | "dbgap_accession_number": "phs000235" 5 | } 6 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/invalid/case_invalid_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "case", 3 | "alias": "case_1", 4 | "gender": "female", 5 | "race": "Unknown" 6 | } 7 | -------------------------------------------------------------------------------- /templates/etl_creds.json: -------------------------------------------------------------------------------- 1 | { 2 | "db_host": "postgres", 3 | "db_username": "sheepdog_user", 4 | "db_password": "sheepdog_pass", 5 | "db_database": "metadata_db" 6 | } 7 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/invalid/aliquot_invalid_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "aliquot", 3 | "alias": "abc", 4 | "derived_from": ["e58e1f64-d733-405f-95f1-ede1628c81e7"] 5 | } 6 | 
-------------------------------------------------------------------------------- /templates/guppy_config.json: -------------------------------------------------------------------------------- 1 | { "indices": [ { "index": "etl", "type": "case" }, { "index": "file", "type": "file" } ], "config_index": "etl_array-config", "auth_filter_field": "auth_resource_path" } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Secrets/ 2 | *.lock* 3 | *.env 4 | *.bak 5 | *.old 6 | *~ 7 | *.swp 8 | .DS_Store 9 | *__pycache__ 10 | *.pytest_cache 11 | *.cache 12 | *pyc 13 | docker-compose.override.yml 14 | -------------------------------------------------------------------------------- /templates/indexd_creds.json: -------------------------------------------------------------------------------- 1 | { 2 | "db_host": "postgres", 3 | "db_username": "indexd_user", 4 | "db_password": "indexd_pass", 5 | "db_database": "indexd_db", 6 | "fence_database": "fence_db" 7 | } 8 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/publication.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "publication", 3 | "submitter_id": "publication_1", 4 | "projects": { 5 | "submitter_id": "project_1" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/invalid/aliquot_invalid_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "aliquot", 3 | "alias": "abc", 4 | "derived_from": 5 | { 6 | "id": ["e58e1f64-d733-405f-95f1-ede1628c81e7"] 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/case.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "type": "case", 3 | "submitter_id": "BLGSP-71-06-00019", 4 | "experiments": { 5 | "id": "daa208a7-f57a-562c-a04a-7a7c77542c98" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/invalid/aliquot_invalid_3.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "aliquot", 3 | "alias": "abc", 4 | "derived_from": [ 5 | { 6 | "id": "e58e1f64-d733-405f-95f1-ede1628c81e7" 7 | } 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/keyword.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "keyword", 3 | "submitter_id": "keyword_1", 4 | "keyword_name": "Blood Profiling Atlas", 5 | "projects": { 6 | "submitter_id": "project_1" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/_settings.yaml: -------------------------------------------------------------------------------- 1 | # Global settings for the graph 2 | 3 | # Is the graph case centric, that we want 4 | # to create a link between all children to case 5 | # to expedite case filter on nodes 6 | enable_case_cache: false 7 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dictionaryutils import DataDictionary as GDCDictionary 3 | 4 | SCHEMA_DIR = os.path.join( 5 | os.path.abspath(os.path.dirname(__file__)), 'schemas') 6 | gdcdictionary = GDCDictionary(root_dir=SCHEMA_DIR) 7 | -------------------------------------------------------------------------------- 
/datadictionary/gdcdictionary/examples/valid/acknowledgement.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "acknowledgement", 3 | "submitter_id": "acknowledgement_1", 4 | "acknowledgee": "Joe Biden", 5 | "projects": { 6 | "submitter_id": "Cancer Moonshot" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/project.json: -------------------------------------------------------------------------------- 1 | { 2 | "code": "BLGSP", 3 | "name": "Burkitt Lymphoma Genome Sequencing Project", 4 | "state": "open", 5 | "type": "project", 6 | "dbgap_accession_number": "phs000235.v4.p1", 7 | "programs": [ 8 | {"name": "CGCI"} 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /scripts/wait_for_esproxy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | until curl -f -s http://esproxy-service:9200/_cluster/health | python3 -c "import sys, json; sys.exit(0 if json.load(sys.stdin)['status'] != 'red' else 1)" 2>/dev/null; 4 | do 5 | echo "esproxy not ready, waiting..."
6 | sleep 5 7 | done 8 | 9 | echo "esproxy status is green" 10 | 11 | exec "$@" -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/invalid/case_invalid_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "case", 3 | "alias": "case_1", 4 | "member_of": [ 5 | { 6 | "id": "e58e1f64-d733-405f-95f1-ede1628c81e7" 7 | }, 8 | { 9 | "id": "511bf8e8-ae71-4cb9-bb1b-4e58d04d12c1" 10 | } 11 | ], 12 | "gender": "female", 13 | "race": "Unknown" 14 | } 15 | -------------------------------------------------------------------------------- /scripts/arborist_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # entrypoint script for arborist to setup db 3 | 4 | sleep 2 5 | until (echo > /dev/tcp/postgres/5432) >/dev/null 2>&1; do 6 | echo "Postgres is unavailable - sleeping" 7 | sleep 2 8 | done 9 | 10 | echo "postgres is ready" 11 | 12 | update-ca-certificates 13 | 14 | ./migrations/latest 15 | ./bin/arborist 16 | -------------------------------------------------------------------------------- /scripts/peregrine_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # entrypoint script for peregrine to update CA certificates before running 3 | 4 | sleep 2 5 | until (echo > /dev/tcp/postgres/5432) >/dev/null 2>&1; do 6 | echo "Postgres is unavailable - sleeping" 7 | sleep 2 8 | done 9 | 10 | echo "postgres is ready" 11 | 12 | update-ca-certificates 13 | 14 | /dockerrun.sh 15 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/aliquot.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "aliquot", 3 | "submitter_id": "BLGSP-71-06-00019-01A-11D", 4 | "aliquot_quantity": 0.4, 5 | "aliquot_volume": 5, 6 | "amount": 10, 7 | 
"source_center": "23", 8 | "concentration": 0.07, 9 | "samples": { 10 | "submitter_id": "BLGSP-71-06-00019-99A-01D" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/slide_count.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "slide_count", 3 | "submitter_id": "CD45_slide_count", 4 | "cell_type": "CD45", 5 | "cell_identifier": "1233423", 6 | "cell_count": 100, 7 | "ck_signal": 0.12, 8 | "run_name": "Run1", 9 | "biomarker_signal": 3.45, 10 | "slides": { 11 | "submitter_id": "slide_1" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/demographic.json: -------------------------------------------------------------------------------- 1 | { 2 | "gender": "male", 3 | "submitter_id": "E9EDB78B-6897-4205-B9AA-0CEF8AAB5A1F_demographic", 4 | "year_of_birth": 1652, 5 | "race": "white", 6 | "cases": { 7 | "submitter_id": "BLGSP-71-06-00019" 8 | }, 9 | "type": "demographic", 10 | "ethnicity": "not hispanic or latino", 11 | "year_of_death": 2009 12 | } 13 | -------------------------------------------------------------------------------- /templates/peregrine_creds.json: -------------------------------------------------------------------------------- 1 | { 2 | "fence_host": "postgres", 3 | "fence_username": "fence_user", 4 | "fence_password": "fence_pass", 5 | "fence_database": "fence_db", 6 | "db_host": "postgres", 7 | "db_username": "peregrine_user", 8 | "db_password": "peregrine_pass", 9 | "db_database": "metadata_db", 10 | "gdcapi_secret_key": "1JMWnHdApSGMJ8OIqA0IwWUEo8nJ1NJqwDQbjrz5L5v1QtW2ke", 11 | "hostname": "localhost" 12 | } 13 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/exposure.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "cigarettes_per_day": -1.0, 3 | "weight": -1.0, 4 | "alcohol_history": "", 5 | "alcohol_intensity": "", 6 | "bmi": -1.0, 7 | "years_smoked": -1.0, 8 | "submitter_id": "E9EDB78B-6897-4205-B9AA-0CEF8AAB5A1F_exposure", 9 | "cases": { 10 | "submitter_id": "BLGSP-71-06-00019" 11 | }, 12 | "height": -1.0, 13 | "type": "exposure" 14 | } 15 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: git@github.com:Yelp/detect-secrets 3 | rev: v0.13.1 4 | hooks: 5 | - id: detect-secrets 6 | args: ['--baseline', '.secrets.baseline'] 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v2.5.0 9 | hooks: 10 | - id: no-commit-to-branch 11 | args: [--branch, develop, --branch, master, --pattern, release/.*] 12 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/family_history.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "family_history", 3 | "submitter_id": "family_history_of_patient_X", 4 | "relative_with_cancer_history": "yes", 5 | "relationship_type": "cousin", 6 | "relationship_gender": "unspecified", 7 | "relationship_age_at_diagnosis": 12345, 8 | "relationship_primary_diagnosis": "cancer", 9 | "cases": { 10 | "submitter_id": "patient_z" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /docker-compose.override.sample.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | postgres: 4 | environment: 5 | # you may override postgres password here: 6 | - POSTGRES_PASSWORD=postgres 7 | # this makes the postgres container available from the host - ex: 8 | # psql -h localhost -d 
fence -U fence_user 9 | ports: 10 | - 5432:5432 11 | jupyter-service: 12 | environment: 13 | - FRAME_ANCESTORS=http://localhost http://*.example.com 14 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/slide_image.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "slide_image", 3 | "submitter_id": "slide_image_X", 4 | "file_name": "slide.svs", 5 | "file_size": 21234, 6 | "md5sum": "84a72e8aaad3017348cb3f8459c5d5d9", 7 | "data_category": "Biospecimen", 8 | "data_type": "Single Cell Image", 9 | "data_format": "SVS", 10 | "experimental_strategy": "Diagnostic Slide", 11 | "slides": { 12 | "submitter_id": "slide_X" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/experimental_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "experimental_metadata", 3 | "submitter_id": "experiment XML from SRA XML", 4 | "file_name": "experiment.xml", 5 | "file_size": 21234, 6 | "md5sum": "84a72e8aaad3017348cb3f8459c5d5d9", 7 | "data_category": "Sequencing Data", 8 | "data_type": "Experimental Metadata", 9 | "data_format": "SRA XML", 10 | "experiments": { 11 | "submitter_id": "read_group_X" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /scripts/indexd_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # entrypoint bash script for indexd to healthcheck postgres to make sure that 3 | # postgres is ready before indexd tries to access its database 4 | 5 | sleep 2 6 | until (echo > /dev/tcp/postgres/5432) >/dev/null 2>&1; do 7 | echo "Postgres is unavailable - sleeping" 8 | sleep 2 9 | done 10 | 11 | echo "postgres is ready" 12 | 13 | python /indexd/bin/index_admin.py create --username 
indexd_client --password indexd_client_pass 14 | /dockerrun.sh 15 | -------------------------------------------------------------------------------- /scripts/sheepdog_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # entrypoint script for sheepdog to run setup_transactionlogs.py before running 3 | 4 | sleep 2 5 | until (echo > /dev/tcp/postgres/5432) >/dev/null 2>&1; do 6 | echo "Postgres is unavailable - sleeping" 7 | sleep 2 8 | done 9 | 10 | echo "postgres is ready" 11 | 12 | update-ca-certificates 13 | 14 | python /sheepdog/bin/setup_transactionlogs.py --host postgres --user sheepdog_user --password sheepdog_pass --database metadata_db 15 | bash /dockerrun.sh 16 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/submitted_somatic_mutation.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "submitted_somatic_mutation", 3 | "submitter_id": "somatic_mutations_from_x", 4 | "data_category": "Sequencing Data", 5 | "data_type": "Somatic Mutations", 6 | "data_format": "VCF", 7 | "file_name": "test.vcf", 8 | "file_size": 100, 9 | "md5sum": "6fd84891e7a53725d1cf6109c5f2400f", 10 | "experimental_strategy": "Targeted Sequencing", 11 | "read_groups": { 12 | "submitter_id": "read_group_x" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/submitted_copy_number.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "submitted_copy_number", 3 | "submitter_id": "copy_number_information", 4 | "file_name": "cn.copynumber.data.txt", 5 | "file_size": 281653, 6 | "md5sum": "d3266f2577584713ea17f94d331f30c4", 7 | "data_category": "Copy Number Variation", 8 | "data_type": "Copy Number Estimate", 9 | "data_format": "TXT", 10 | "experimental_strategy": 
"Genotyping Array", 11 | "aliquots": { 12 | "submitter_id": "aliquot_m" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/aligned_reads_index.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "aligned_reads_index", 3 | "submitter_id": "TCGA-AB-2837-03B-01W-0728-08", 4 | "file_name": "C317.TCGA-AB-2837-03B-01W-0728-08.3.bam.bai", 5 | "file_size": 5990568, 6 | "md5sum": "6fd84891e7a53725d1cf6109c5f2400f", 7 | "data_category": "Sequencing Data", 8 | "data_type": "Aligned Reads Index", 9 | "data_format": "BAI", 10 | "submitted_aligned_reads_files": { 11 | "id": "0cb66276-c29b-4811-a704-38502173c0f8" 12 | } 13 | } 14 | 15 | 16 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/experiment.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "experiment", 3 | "submitter_id": "experiment_1", 4 | "number_experimental_group": 1, 5 | "number_samples_per_experimental_group": 12, 6 | "experimental_description": "Case/Control, Time Course, Responder/Non-Responder", 7 | "experimental_intent": "Temperature, Storage Duration, and Tube Type effects on ctDNA stability", 8 | "type_of_sample": "Clinical", 9 | "type_of_specimen": "Plasma", 10 | "projects": { 11 | "submitter_id": "P0001" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/treatment.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "treatment", 3 | "submitter_id": "E9EDB78B-6897-4205-B9AA-0CEF8AAB5A1F_treatment", 4 | "days_to_treatment": -1.0, 5 | "days_to_treatment_end": 14, 6 | "days_to_treatment_start": 25, 7 | "therapeutic_agents": "", 8 | "treatment_anatomic_site": "Arm", 9 | 
"treatment_intent_type": "", 10 | "treatment_or_therapy": "unknown", 11 | "treatment_type": "Other", 12 | "diagnoses": { 13 | "submitter_id": "E9EDB78B-6897-4205-B9AA-0CEF8AAB5A1F_diagnosis" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /datadictionary/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='gdcdictionary', 5 | version='0.0.0', 6 | packages=find_packages(), 7 | install_requires=[ 8 | 'dictionaryutils', 9 | ], 10 | dependency_links=[ 11 | "git+https://github.com/uc-cdis/dictionaryutils.git@2.0.4#egg=dictionaryutils", 12 | ], 13 | package_data={ 14 | "gdcdictionary": [ 15 | "schemas/*.yaml", 16 | "schemas/projects/*.yaml", 17 | "schemas/projects/*/*.yaml", 18 | ] 19 | }, 20 | ) 21 | -------------------------------------------------------------------------------- /scripts/fence_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # entrypoint script for fence to sync user.yaml before running 3 | 4 | sleep 2 5 | until (echo > /dev/tcp/postgres/5432) >/dev/null 2>&1; do 6 | echo "Postgres is unavailable - sleeping" 7 | sleep 2 8 | done 9 | 10 | echo "postgres is ready" 11 | 12 | update-ca-certificates 13 | 14 | until curl -f -s -o /dev/null http://arborist-service/policy; do 15 | echo "arborist not ready, waiting..." 
16 | sleep 10 17 | done 18 | 19 | fence-create sync --yaml user.yaml --arborist http://arborist-service 20 | 21 | cd /fence 22 | /dockerrun.sh -------------------------------------------------------------------------------- /templates/sheepdog_creds.json: -------------------------------------------------------------------------------- 1 | { 2 | "fence_host": "postgres", 3 | "fence_username": "fence_user", 4 | "fence_password": "fence_pass", 5 | "fence_database": "fence_db", 6 | "db_host": "postgres", 7 | "db_username": "sheepdog_user", 8 | "db_password": "sheepdog_pass", 9 | "db_database": "metadata_db", 10 | "gdcapi_secret_key": "1JMWnHdApSGMJ8OIqA0IwWUEo8nJ1NJqwDQbjrz5L5v1QtW2ke", 11 | "indexd_client": "indexd_client", 12 | "indexd_password": "indexd_client_pass", 13 | "hostname": "localhost", 14 | "oauth2_client_id": "n/a", 15 | "oauth2_client_secret": "n/a" 16 | } 17 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2015 University of Chicago 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at 4 | 5 | http://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
8 | 9 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/slide.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "slide", 3 | "submitter_id": "TCGA-FW-A3R5-06A-01-TS1", 4 | "section_location": "TOP", 5 | "percent_tumor_cells": 80.0, 6 | "percent_tumor_nuclei": 80.0, 7 | "percent_normal_cells": 0.0, 8 | "percent_necrosis": 0.0, 9 | "percent_stromal_cells": 20.0, 10 | "percent_lymphocyte_infiltration": 0.0, 11 | "percent_monocyte_infiltration": 0.0, 12 | "percent_neutrophil_infiltration": 0.0, 13 | "samples": [ 14 | { 15 | "submitter_id": "TCGA-FW-A3R5-06A-11" 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/submitted_unaligned_reads.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "submitted_unaligned_reads", 3 | "id": "5557a728-1827-4aff-b28b-f004d835f9d6", 4 | "submitter_id": "TCGA-DQ-5630-01A-01R-1873-07", 5 | "file_name": "UNCID_2741452.a38e0f12-7b18-4856-9cc8-314d8f0b63d6.1.fastq", 6 | "file_size": 5747943025, 7 | "md5sum": "d81203da215be180128f260b788900b5", 8 | "data_category": "Sequencing Data", 9 | "data_type": "Unaligned Reads", 10 | "data_format": "FASTQ", 11 | "experimental_strategy": "WGS", 12 | "read_groups": { 13 | "submitter_id": "read_group_1" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /scripts/waitForContainers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # entrypoint script for data-portal to healthcheck sheepdog and peregrine to 3 | # make sure they are ready before dataportal attempts to get information from 4 | # them 5 | 6 | sleep 10 7 | 8 | until curl -f -s -o /dev/null http://sheepdog-service/v0/submission/_dictionary/_all; do 9 | echo 
"sheepdog not ready, waiting..." 10 | sleep 10 11 | done 12 | 13 | until curl -f -s -o /dev/null http://peregrine-service/v0/submission/getschema ; do 14 | echo "peregrine not ready, waiting..." 15 | sleep 10 16 | done 17 | 18 | echo "both services are ready" 19 | bash ./dockerStart.sh 20 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/submitted_aligned_reads.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "submitted_aligned_reads", 3 | "id": "0cb66276-c29b-4811-a704-38502173c0f8", 4 | "submitter_id": "TCGA-AB-2837-03B-01W-0728-08", 5 | "file_name": "C317.TCGA-AB-2837-03B-01W-0728-08.3.bam", 6 | "file_size": 28165335141, 7 | "md5sum": "d3266f2577584713ea17f94d331f30c4", 8 | "data_category": "Sequencing Data", 9 | "data_type": "Aligned Reads", 10 | "data_format": "BAM", 11 | "experimental_strategy": "WXS", 12 | "read_groups": { 13 | "submitter_id": "205CTABXX100806.5.C317.TCGA-AB-2837-03B-01W-0728-08.3.bam" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /guppy_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to create and re-create es indices and setup guppy 3 | 4 | sleep 2 5 | docker exec esproxy-service curl -X DELETE http://localhost:9200/etl_0 6 | sleep 2 7 | docker exec esproxy-service curl -X DELETE http://localhost:9200/file_0 8 | sleep 2 9 | docker exec esproxy-service curl -X DELETE http://localhost:9200/file-array-config_0 10 | sleep 2 11 | docker exec esproxy-service curl -X DELETE http://localhost:9200/etl-array-config_0 12 | sleep 2 13 | docker exec tube-service bash -c "python run_config.py && python run_etl.py" 14 | 15 | docker container stop guppy-service 16 | docker container start guppy-service 17 | -------------------------------------------------------------------------------- 
/datadictionary/gdcdictionary/schemas/projects/project1.yaml: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # Project 1 specific overrides 3 | ##################################################################### 4 | 5 | $schema: "http://json-schema.org/draft-04/schema#" 6 | 7 | ##################################################################### 8 | # Aliquot 9 | ##################################################################### 10 | 11 | id: "aliquot" 12 | program: 'program1' 13 | project: 'project1' 14 | required: 15 | - submitter_aliquot_id 16 | - parents 17 | - project_1_specific_thing 18 | 19 | properties: 20 | project_1_specific_thing: 21 | type: string 22 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/submitted_methylation.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "submitted_methylation", 3 | "id": "0cb66276-c29b-4811-a704-38502173c0f8", 4 | "submitter_id": "TCGA-AB-2837-03B-01W-0728-08", 5 | "file_name": "C317.TCGA-AB-2837-03B-01W-0728-08.3.bam", 6 | "file_size": 28165335141, 7 | "md5sum": "d3266f2577584713ea17f94d331f30c4", 8 | "data_category": "Methylation Data", 9 | "data_type": "Methylation Intensity Values", 10 | "data_format": "IDAT", 11 | "assay_method": "Methylation Array", 12 | "assay_instrument": "Illumina", 13 | "assay_instrument_model": "Illumina Infinium HumanMethylation450", 14 | "aliquots": { 15 | "submitter_id": "205CTABXX100806.5.C317.TCGA-AB-2837-03B-01W-0728-08.3.bam" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/README.md: -------------------------------------------------------------------------------- 1 | Proposed additional keywords 2 | ============================ 3 | 4 | The schemas 
defined here follow jsonschema as closely as possible, 5 | introducing new keywords as needed. 6 | 7 | systemAlias 8 | ----------- 9 | 10 | For implementation. Allows properties to be stored as different 11 | keywords. The property listed in the properties section is what the 12 | user will refer to it as, and the systemAlias value is what it will be 13 | stored in the database as. 14 | 15 | systemProperties 16 | --------------- 17 | 18 | The property keys listed under systemProperties are properties that 19 | the submitter is not allowed to update. 20 | 21 | parentType 22 | --------------- 23 | 24 | The type of object that the parent relationship points to. 25 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/read_group.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "b4918128-f3e5-496a-bcd7-a4ec8d7dd1d9", 3 | "submitter_id": "205DDABXX100804.3.C317.TCGA-AB-2899-03A-01W-0733-08.4.bam", 4 | "type": "read_group", 5 | "experiment_name": "Resequencing", 6 | "sequencing_center": "BI", 7 | "sequencing_date": "2010-08-04", 8 | "platform": "Illumina", 9 | "instrument_model": "Illumina HiSeq 2000", 10 | "library_strategy": "WXS", 11 | "flow_cell_barcode": "205DDABXX", 12 | "library_selection": "Hybrid_Selection", 13 | "library_name": "Solexa-34688", 14 | "is_paired_end": true, 15 | "read_length": 75, 16 | "read_group_name": "205DD.3", 17 | "aliquots": [ 18 | { 19 | "submitter_id": "2c9108eb-c59a-4227-b650-56e61b3aa0ea" 20 | } 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /datadictionary/NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2015 University of Chicago, Ontario Institute for Cancer Research Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Portions of this work, authored by University of Chicago and Ontario Institute for Cancer Research employees, was funded in whole or in part by National Cancer Institute, National Institutes of Health under U.S. Government contract HHSN261200800001E. -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/program.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "program" 4 | title: Program 5 | type: object 6 | category: administrative 7 | program: '*' 8 | project: '*' 9 | description: > 10 | A broad framework of goals to be achieved. (NCIt C52647) 11 | additionalProperties: false 12 | submittable: false 13 | validators: null 14 | 15 | systemProperties: 16 | - id 17 | 18 | required: 19 | - name 20 | - dbgap_accession_number 21 | 22 | uniqueKeys: 23 | - [id] 24 | - [name] 25 | 26 | links: [] 27 | 28 | # Program is the root entity and so it is the only entity 29 | # without a project or parents. 30 | properties: 31 | type: 32 | type: string 33 | id: 34 | $ref: "_definitions.yaml#/UUID" 35 | systemAlias: node_id 36 | name: 37 | type: string 38 | description: "Full name/title of the program." 39 | dbgap_accession_number: 40 | type: string 41 | description: "The dbgap accession number provided for the program." 
42 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "sample", 3 | "submitter_id": "BLGSP-71-06-00019s", 4 | "biospecimen_anatomic_site": "Adipose", 5 | "composition": "Cell", 6 | "current_weight": 1, 7 | "days_to_collection": 25, 8 | "days_to_sample_procurement": 123, 9 | "diagnosis_pathologically_confirmed": "Yes", 10 | "freezing_method": "OCT", 11 | "initial_weight": 0.5, 12 | "intermediate_dimension": "1.2", 13 | "is_ffpe": true, 14 | "longest_dimension": "1.5", 15 | "method_of_sample_procurement": "Indeterminant", 16 | "oct_embedded": "false", 17 | "sample_type": "Blood Derived Normal", 18 | "sample_type_id": "10", 19 | "shortest_dimension": "0.5", 20 | "time_between_clamping_and_freezing": "30", 21 | "tissue_type": "Normal", 22 | "tumor_code": "Osteosarcoma (OS)", 23 | "tumor_code_id": "00", 24 | "tumor_descriptor": "NOS", 25 | "cases": { 26 | "submitter_id": "BLGSP-71-06-00019" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /scripts/postgres_always.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | create_db_idempotent() { 4 | # Creating a DB similar to the "IF NOT EXISTS" syntax is a bit challenging in 5 | # Postgres. 6 | psql -U postgres -tc "SELECT 1 FROM pg_database WHERE datname = '${1}'" | grep -q 1 || \ 7 | psql -U postgres -c "CREATE DATABASE ${1}" 8 | } 9 | 10 | create_user_idempotent() { 11 | psql -U postgres << EOF 12 | DO \$\$ 13 | BEGIN 14 | IF NOT EXISTS(SELECT 1 FROM pg_roles WHERE rolname='${1}') THEN 15 | CREATE USER ${1}; 16 | END IF; 17 | END 18 | \$\$; 19 | EOF 20 | } 21 | 22 | # The metadata DB and user are here to backfill for installations that did not 23 | # have them originally. 
These entities did not always exist in compose-services. 24 | # New compose-services users would get them via the standard postgres init 25 | # script, but existing users would need to get them through this mechanism. 26 | create_db_idempotent "metadata" 27 | create_user_idempotent "metadata_user" 28 | 29 | psql -U postgres <//commit/", 23 | "submitted_aligned_reads_files": 24 | {"submitter_id": "bam_file_a"}, 25 | "read_groups": 26 | {"submitter_id": "read_group_a"} 27 | } 28 | -------------------------------------------------------------------------------- /templates/test_config_helper.py: -------------------------------------------------------------------------------- 1 | import config_helper 2 | import os 3 | import time 4 | 5 | # WORKSPACE == Jenkins workspace 6 | TEST_ROOT=os.getenv('WORKSPACE',os.getenv('XDG_RUNTIME_DIR', '/tmp')) + '/test_config_helper/' + str(int(time.time())) 7 | APP_NAME='test_config_helper' 8 | TEST_JSON = ''' 9 | { 10 | "a": "A", 11 | "b": "B", 12 | "c": "C" 13 | } 14 | ''' 15 | TEST_FILENAME='bla.json' 16 | 17 | config_helper.XDG_DATA_HOME=TEST_ROOT 18 | 19 | def setup(): 20 | test_folder = TEST_ROOT + '/cdis/' + APP_NAME 21 | if not os.path.exists(test_folder): 22 | os.makedirs(test_folder) 23 | with open(test_folder + '/' + TEST_FILENAME, 'w') as writer: 24 | writer.write(TEST_JSON) 25 | 26 | def test_find_paths(): 27 | setup() 28 | path_list = config_helper.find_paths(TEST_FILENAME, APP_NAME) 29 | assert len(path_list) == 1 30 | bla_path = TEST_ROOT + '/cdis/' + APP_NAME + '/' + TEST_FILENAME 31 | assert os.path.exists(bla_path) 32 | assert path_list[0] == bla_path 33 | 34 | def test_load_json(): 35 | setup() 36 | data = config_helper.load_json(TEST_FILENAME, APP_NAME) 37 | for key in ['a','b','c']: 38 | assert data[key] == key.upper() 39 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/clinical_test.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "type": "clinical_test", 3 | "submitter_id": "clinical_test_date_individual_name", 4 | "biomarker_name": "ERBB4", 5 | "biomarker_result": "Normal", 6 | "biomarker_test_method": "Cytogenetics", 7 | "cea_level_preoperative": 1, 8 | "dlco_ref_predictive_percent": 1, 9 | "estrogen_receptor_percent_positive_ihc": "<1%", 10 | "estrogen_receptor_result_ihc": "Negative", 11 | "fev1_ref_post_bronch_percent": 1, 12 | "fev1_ref_pre_bronch_percent": 2, 13 | "fev1_fvc_post_bronch_percent": 10, 14 | "fev1_fvc_pre_bronch_percent": 15, 15 | "her2_erbb2_percent_positive_ihc": "1-10%", 16 | "her2_erbb2_result_fish": "Negative", 17 | "her2_erbb2_result_ihc": "Not Performed", 18 | "ldh_level_at_diagnosis": 3432, 19 | "ldh_normal_range_upper": 13241, 20 | "microsatellite_instability_abnormal": "Yes", 21 | "progesterone_receptor_percent_positive_ihc": "<1%", 22 | "progesterone_receptor_result_ihc": "Positive", 23 | "cases": { 24 | "submitter_id": "Extra large case" 25 | }, 26 | "diagnoses": { 27 | "submitter_id": "BL Diagnosis" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /scripts/postgres_run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Thin shim around the normal Docker postgres entrypoint that allows us to run 4 | # non-application migrations. Things like DB and user creations that would be 5 | # done by cloud-automation tasks in a normal env. 6 | 7 | set -e 8 | 9 | # Initialize the DB, but don't allow outside connections yet. 10 | docker-entrypoint.sh postgres -c listen_addresses='127.0.0.1' & 11 | # Wait until the server is out of initialization mode and online. 12 | while ! psql -U postgres -h localhost -c 'SELECT 1;' 2>/dev/null; do echo "waiting for postgres init..."; sleep 1; done 13 | # Stop the server.
14 | gosu postgres pg_ctl stop 15 | 16 | echo "[postgres] run migrations" 17 | 18 | # Run migrations/scripts that should run on every start. This is handy for data 19 | # we want to backfill or otherwise migrate for users. 20 | gosu postgres bash -c "( 21 | source /usr/local/bin/docker-entrypoint.sh 22 | docker_setup_env 23 | docker_temp_server_start 24 | 25 | bash /postgres_always.sh 26 | 27 | docker_temp_server_stop 28 | )" 29 | 30 | echo "[postgres] migrations complete" 31 | 32 | # Start postgres "normally" allowing all network clients to connect. 33 | docker-entrypoint.sh postgres 34 | -------------------------------------------------------------------------------- /scripts/postgres_init.sql: -------------------------------------------------------------------------------- 1 | /* Entrypoint script for postgres container to set up databases and users for 2 | docker-compose setup */ 3 | 4 | CREATE DATABASE metadata; -- Used by metadata-service (called "metadata" in cloud-automation) 5 | CREATE DATABASE metadata_db; -- Used by sheepdog and peregrine (called "sheepdog" in cloud-automation) 6 | CREATE DATABASE fence_db; 7 | CREATE DATABASE indexd_db; 8 | CREATE DATABASE arborist_db; 9 | 10 | CREATE USER metadata_user; 11 | ALTER USER metadata_user WITH PASSWORD 'metadata_pass'; 12 | ALTER USER metadata_user WITH SUPERUSER; 13 | 14 | CREATE USER fence_user; 15 | ALTER USER fence_user WITH PASSWORD 'fence_pass'; 16 | ALTER USER fence_user WITH SUPERUSER; 17 | 18 | CREATE USER peregrine_user; 19 | ALTER USER peregrine_user WITH PASSWORD 'peregrine_pass'; 20 | ALTER USER peregrine_user WITH SUPERUSER; 21 | 22 | CREATE USER sheepdog_user; 23 | ALTER USER sheepdog_user WITH PASSWORD 'sheepdog_pass'; 24 | ALTER USER sheepdog_user WITH SUPERUSER; 25 | 26 | CREATE USER indexd_user; 27 | ALTER USER indexd_user WITH PASSWORD 'indexd_pass'; 28 | ALTER USER indexd_user WITH SUPERUSER; 29 | 30 | CREATE USER arborist_user; 31 | ALTER USER arborist_user WITH PASSWORD 
'arborist_pass'; 32 | ALTER USER arborist_user WITH SUPERUSER; 33 | -------------------------------------------------------------------------------- /docs/release_history.md: -------------------------------------------------------------------------------- 1 | # Release History and Migration Instructions 2 | 3 | # 2019/03 release 4 | 5 | The `2019/03` release includes changes necessary for running the latest versions of the `gen3` services as of March 2019. 6 | This release may fail to run earlier versions of `gen3`. 7 | 8 | * Changes 9 | - add `arborist` and `pidgin` services 10 | - move secrets to `Secrets/` folder which git ignores (via the `.gitignore` file), `apis_configs/` is renamed to a `templates/` folder 11 | - bump to Postgres `9.6` 12 | - do not publish Postgres port to host by default - to avoid port conflicts on the host 13 | 14 | * Migrate an existing commons to the new setup 15 | - move the current secrets to `./Secrets`: `mv ./apis_configs Secrets` 16 | - `git pull` 17 | - `docker-compose pull` - pull the latest `gen3` Docker images 18 | - `bash ./creds_setup.sh` 19 | - edit the `postgres` service in `docker-compose.yaml` to stay on version `9.5` - a `9.6` server cannot read data saved by a `9.5` server. If you want to erase the data currently in the commons, and proceed with Postgres `9.6`, then `docker-compose down -v` clears the old data. 20 | - Set the settings in `Secrets/fence-config.yaml` - be sure to set the `client_secret` and `client_id` fields under `OPENID_CONNECT`. 
21 | - ready to go: `docker-compose up -d` 22 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/diagnosis.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "diagnosis", 3 | "submitter_id": "E9EDB78B-6897-4205-B9AA-0CEF8AAB5A1F_diagnosis", 4 | "age_at_diagnosis": 47, 5 | "ann_arbor_b_symptoms": "Yes", 6 | "ann_arbor_extranodal_involvement": "No", 7 | "burkitt_lymphoma_clinical_variant": "Endemic", 8 | "cause_of_death": "Unknown", 9 | "classification_of_tumor": "other", 10 | "days_to_birth": -17238.0, 11 | "days_to_death": 1241.0, 12 | "days_to_hiv_diagnosis": null, 13 | "days_to_last_follow_up": -1.0, 14 | "days_to_last_known_disease_status": -1, 15 | "days_to_recurrence": -1, 16 | "hiv_positive": "No", 17 | "last_known_disease_status": "Unknown tumor status", 18 | "ldh_level_at_diagnosis": 1, 19 | "ldh_normal_range_upper": 1.5, 20 | "method_of_diagnosis": "Cytology", 21 | "morphology": "8255/3", 22 | "new_event_anatomic_site": "Bone", 23 | "new_event_type": "Distant Metastasis", 24 | "primary_diagnosis": "c34.3", 25 | "prior_malignancy": "no", 26 | "progression_or_recurrence": "unknown", 27 | "site_of_resection_or_biopsy": "c34.3", 28 | "tissue_or_organ_of_origin": "c34.3", 29 | "tumor_grade": "", 30 | "tumor_stage": "stage iiia", 31 | "vital_status": "dead", 32 | "year_of_diagnosis": 2077, 33 | "cases": { 34 | "submitter_id": "BLGSP-71-06-00019" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/keyword.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "keyword" 4 | title: Keyword 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: administrative 8 | program: '*' 9 | project: '*' 10 | description: "A keyword for a project." 
11 | additionalProperties: false 12 | submittable: true 13 | validators: null 14 | 15 | systemProperties: 16 | - id 17 | - project_id 18 | - state 19 | - created_datetime 20 | - updated_datetime 21 | 22 | links: 23 | - name: projects 24 | backref: keywords 25 | label: describe 26 | target_type: project 27 | multiplicity: many_to_many 28 | required: true 29 | 30 | required: 31 | - submitter_id 32 | - type 33 | - projects 34 | 35 | uniqueKeys: 36 | - [ id ] 37 | - [ project_id, submitter_id ] 38 | 39 | properties: 40 | type: 41 | enum: [ "keyword" ] 42 | id: 43 | $ref: "_definitions.yaml#/UUID" 44 | systemAlias: node_id 45 | state: 46 | $ref: "_definitions.yaml#/state" 47 | submitter_id: 48 | type: 49 | - string 50 | - "null" 51 | keyword_name: 52 | description: "The name of the keyword." 53 | type: string 54 | projects: 55 | $ref: "_definitions.yaml#/to_many_project" 56 | project_id: 57 | type: string 58 | created_datetime: 59 | $ref: "_definitions.yaml#/datetime" 60 | updated_datetime: 61 | $ref: "_definitions.yaml#/datetime" 62 | -------------------------------------------------------------------------------- /smoke_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | help() { 4 | cat - < **NOTE**: You can use docker compose override to configure the Postgres database container and publish the db service port to the host machine by changing the `ports` block under the `postgres` service in `docker-compose.override.yml`, then run `docker-compose up -d postgres`: 19 | ``` 20 | cp docker-compose.override.sample.yml docker-compose.override.yml 21 | ``` 22 | The container host can connect to the database after the port is published - ex: 23 | ``` 24 | psql -h localhost -U fence_user -d fence_db 25 | ``` 26 | 27 | > **Heads-up**: Similarly, you can add/override your custom docker compose config parameters/values in `docker-compose.override.yml` and keep the base config clean. 
See [docker compose documentation](https://docs.docker.com/compose/extends/) for more. 28 | 29 | -------------------------------------------------------------------------------- /docs/cheat_sheet.md: -------------------------------------------------------------------------------- 1 | # Docker compose services cheat sheet 2 | 3 | **Quick start** 4 | 5 | * bash ./creds_setup.sh (setup secrets) 6 | * docker-compose up (start with logs) 7 | * docker-compose up -d (start without logs) 8 | * docker-compose down (stop) 9 | * docker-compose down -v (stop and wipe existing data) 10 | 11 | **Useful commands** 12 | 13 | * docker ps 14 | * docker logs [-f] xxx-service 15 | * docker-compose restart xxx-service 16 | * docker exec -it fence-service fence-create xxx 17 | 18 | **Update images** 19 | 20 | * docker-compose pull 21 | * docker image prune -f (optional - to free up some space…) 22 | 23 | **Access DB** 24 | 25 | * docker exec -it compose-services_postgres_1 psql -U postgres 26 | * \c DB_name 27 | 28 | **Sync users** 29 | 30 | * docker exec -it fence-service fence-create sync --arborist http://arborist-service --yaml user.yaml 31 | 32 | **Change dictionary** 33 | 34 | Update in docker-compose.yml: 35 | * DICTIONARY_URL 36 | * APP (to get the [corresponding portal setup](https://github.com/uc-cdis/data-portal/tree/master/data/config)), for example: 37 | * dev (goes to "default" config -> Dev data commons) 38 | * edc (Environmental data commons) 39 | 40 | **Use local code (example with fence)** 41 | 42 | Update in docker-compose.yml: 43 | ``` 44 | fence-service: 45 | image: "my-fence:latest" 46 | ``` 47 | Rerun the following commands after changing the code: 48 | * cd fence; docker build . 
-t my-fence -f Dockerfile 49 | * docker stop fence-service 50 | * docker-compose up -d fence-service 51 | 52 | **Dump logs and config in a zip file** 53 | 54 | * bash dump.sh 55 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/case.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "case" 4 | title: Case 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: administrative 8 | program: '*' 9 | project: '*' 10 | description: > 11 | The collection of all data related to a specific subject in the 12 | context of a specific experiment. 13 | additionalProperties: false 14 | submittable: true 15 | validators: null 16 | 17 | systemProperties: 18 | - id 19 | - project_id 20 | - created_datetime 21 | - updated_datetime 22 | - state 23 | 24 | links: 25 | - name: experiments 26 | backref: cases 27 | label: member_of 28 | target_type: experiment 29 | multiplicity: many_to_one 30 | required: true 31 | 32 | required: 33 | - submitter_id 34 | - type 35 | - experiments 36 | 37 | uniqueKeys: 38 | - [id] 39 | - [project_id, submitter_id] 40 | 41 | # Case properties 42 | properties: 43 | type: 44 | type: string 45 | id: 46 | $ref: "_definitions.yaml#/UUID" 47 | systemAlias: node_id 48 | state: 49 | $ref: "_definitions.yaml#/state" 50 | submitter_id: 51 | type: 52 | - string 53 | - "null" 54 | consent_codes: 55 | type: array 56 | items: 57 | type: string 58 | primary_site: 59 | description: "Primary site for the case." 60 | type: string 61 | disease_type: 62 | description: "Name of the disease for the case." 
63 | type: string 64 | experiments: 65 | $ref: "_definitions.yaml#/to_one" 66 | project_id: 67 | $ref: "_definitions.yaml#/project_id" 68 | created_datetime: 69 | $ref: "_definitions.yaml#/datetime" 70 | updated_datetime: 71 | $ref: "_definitions.yaml#/datetime" 72 | -------------------------------------------------------------------------------- /templates/indexd_settings.py: -------------------------------------------------------------------------------- 1 | from indexd.index.drivers.alchemy import SQLAlchemyIndexDriver 2 | from indexd.alias.drivers.alchemy import SQLAlchemyAliasDriver 3 | from indexd.auth.drivers.alchemy import SQLAlchemyAuthDriver 4 | import config_helper 5 | from os import environ 6 | import json 7 | 8 | APP_NAME='indexd' 9 | def load_json(file_name): 10 | return config_helper.load_json(file_name, APP_NAME) 11 | 12 | conf_data = load_json('creds.json') 13 | 14 | usr = conf_data.get('db_username', '{{db_username}}') 15 | db = conf_data.get('db_database', '{{db_database}}') 16 | psw = conf_data.get('db_password', '{{db_password}}') 17 | pghost = conf_data.get('db_host', '{{db_host}}') 18 | pgport = 5432 19 | index_config = conf_data.get('index_config') 20 | CONFIG = {} 21 | 22 | CONFIG['JSONIFY_PRETTYPRINT_REGULAR'] = False 23 | 24 | dist = environ.get('DIST', None) 25 | if dist: 26 | CONFIG['DIST'] = json.loads(dist) 27 | 28 | CONFIG['INDEX'] = { 29 | 'driver': SQLAlchemyIndexDriver('postgresql+psycopg2://{usr}:{psw}@{pghost}:{pgport}/{db}'.format( 30 | usr=usr, 31 | psw=psw, 32 | pghost=pghost, 33 | pgport=pgport, 34 | db=db, 35 | ), index_config=index_config), 36 | } 37 | 38 | CONFIG['ALIAS'] = { 39 | 'driver': SQLAlchemyAliasDriver('postgresql+psycopg2://{usr}:{psw}@{pghost}:{pgport}/{db}'.format( 40 | usr=usr, 41 | psw=psw, 42 | pghost=pghost, 43 | pgport=pgport, 44 | db=db, 45 | )), 46 | } 47 | 48 | AUTH = SQLAlchemyAuthDriver( 49 | 'postgresql+psycopg2://{usr}:{psw}@{pghost}:{pgport}/{db}'.format( 50 | usr=usr, 51 | psw=psw, 52 | 
pghost=pghost, 53 | pgport=pgport, 54 | db=db, 55 | ), 56 | arborist="http://arborist-service/", 57 | ) 58 | 59 | settings = {'config': CONFIG, 'auth': AUTH} 60 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/experimental_metadata.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "experimental_metadata" 4 | title: Experimental Metadata 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: metadata_file 8 | project: '*' 9 | program: '*' 10 | description: > 11 | Data file containing the metadata for the experiment performed. 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | - file_state 23 | - error_type 24 | 25 | links: 26 | - exclusive: false 27 | required: true 28 | subgroup: 29 | - name: core_metadata_collections 30 | backref: experiment_metadata_files 31 | label: data_from 32 | target_type: core_metadata_collection 33 | multiplicity: many_to_many 34 | required: false 35 | - name: experiments 36 | backref: experiment_metadata_files 37 | label: derived_from 38 | target_type: experiment 39 | multiplicity: many_to_many 40 | required: false 41 | 42 | required: 43 | - submitter_id 44 | - type 45 | - file_name 46 | - file_size 47 | - md5sum 48 | - data_category 49 | - data_type 50 | - data_format 51 | 52 | uniqueKeys: 53 | - [ id ] 54 | - [ project_id, submitter_id ] 55 | 56 | properties: 57 | $ref: "_definitions.yaml#/data_file_properties" 58 | type: 59 | enum: [ "experimental_metadata" ] 60 | data_category: 61 | term: 62 | $ref: "_terms.yaml#/data_category" 63 | type: 64 | - string 65 | data_type: 66 | term: 67 | $ref: "_terms.yaml#/data_type" 68 | enum: [ "Experimental Metadata" ] 69 | data_format: 70 | term: 71 | $ref: 
"_terms.yaml#/data_format" 72 | type: 73 | - string 74 | experiments: 75 | $ref: "_definitions.yaml#/to_one" 76 | core_metadata_collections: 77 | $ref: "_definitions.yaml#/to_many" 78 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/aligned_reads_index.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "aligned_reads_index" 4 | title: Aligned Reads Index 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: index_file 8 | program: '*' 9 | project: '*' 10 | description: "Data file containing the index for a set of aligned reads." 11 | additionalProperties: false 12 | submittable: true 13 | validators: null 14 | 15 | systemProperties: 16 | - id 17 | - project_id 18 | - created_datetime 19 | - updated_datetime 20 | - state 21 | - file_state 22 | - error_type 23 | 24 | links: 25 | - exclusive: false 26 | required: true 27 | subgroup: 28 | - name: submitted_aligned_reads_files 29 | backref: aligned_reads_indexes 30 | label: derived_from 31 | target_type: submitted_aligned_reads 32 | multiplicity: one_to_one 33 | required: false 34 | - name: core_metadata_collections 35 | backref: aligned_reads_indexes 36 | label: data_from 37 | target_type: core_metadata_collection 38 | multiplicity: many_to_many 39 | required: false 40 | 41 | required: 42 | - submitter_id 43 | - type 44 | - file_name 45 | - file_size 46 | - md5sum 47 | - data_category 48 | - data_type 49 | - data_format 50 | 51 | uniqueKeys: 52 | - [ id ] 53 | - [ project_id, submitter_id ] 54 | 55 | properties: 56 | $ref: "_definitions.yaml#/data_file_properties" 57 | type: 58 | enum: [ "aligned_reads_index" ] 59 | data_category: 60 | term: 61 | $ref: "_terms.yaml#/data_category" 62 | enum: 63 | - Sequencing Data 64 | - Sequencing Reads 65 | - Raw Sequencing Data 66 | data_type: 67 | term: 68 | $ref: "_terms.yaml#/data_type" 69 | 
enum: [ "Aligned Reads Index" ] 70 | data_format: 71 | term: 72 | $ref: "_terms.yaml#/data_format" 73 | enum: [ "BAI" ] 74 | submitted_aligned_reads_files: 75 | $ref: "_definitions.yaml#/to_one" 76 | core_metadata_collections: 77 | $ref: "_definitions.yaml#/to_many" 78 | -------------------------------------------------------------------------------- /datadictionary/design_notes.md: -------------------------------------------------------------------------------- 1 | One important aspect worth mentioning is that it is purposely chosen to model the dictionary using Directed Acyclic Graph (DAG). The idea behind it is simplicity! From a practical point of view, the data dictionary is not meant to model every aspect of the real world GDC entities and their relations with fine grain semantics. Rather, it's important to be able to express and enforce data integrity rules. So far, DAG seems to be a good fit although we should look harder to see whether there are any show-stoppers, are there any relations/rules that cannot be expressed using DAG and there is no way around it? 2 | 3 | Choosing a simpler design will always bring benefits in software development at all levels and phases. In our case, compared to a general graph, DAG will be much easier to work with. Querying a DAG will be simply searching up to find parents or searching down to find children; finding siblings will require one step up and one step down, which should be performant. As any other graph search, querying a DAG with very wide or deep structure can be expensive. There should be tricks we can play to make the queries we care about performant. We can also keep it in mind while designing the model to avoid very wide or deep DAG whenever possible. 4 | 5 | Taking one step further, practically, type of relations between nodes may not be so important to us. Just like ER modeling in RDBMS, uniqueness/cardinality is the only thing that matters.
With this thinking, it's possible to entirely eliminate the need for an edge table while implement the DAG model. We will still need to support some not so straightforward relations such as conditional relations, such as when there is A, there should (or shouldn't) be B etc. However such business logic is not necessarily harder for DAG to handle. 6 | 7 | One last point is that converting data in a DAG to JSON should be easier comparing to a general graph. Data in a JSON document is essentially a tree. When converting a DAG to a tree, it is mainly to denormalize child nodes with multiple parents into multiple copies, each parent will have a materialized local child node copy. This should make the logic cleaner when we export data in graph db to JSON to build Elasticsearch indexes. 8 | -------------------------------------------------------------------------------- /dump.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copies config and logs into a zip file 3 | 4 | help="$(basename "$0") [help] [--logs-only] 5 | where: 6 | help show this help text 7 | --logs-only do not copy configuration files" 8 | 9 | if [[ "$OSTYPE" != "linux-gnu" && "$OSTYPE" != "darwin"* ]]; then 10 | echo "This script only works on MacOS/Linux" 11 | exit 1 12 | fi 13 | 14 | get_config=true 15 | while [ -n "$1" ]; do 16 | case "$1" in 17 | --logs-only) 18 | get_config=false 19 | ;; 20 | help) 21 | echo "$help" 22 | exit 0 23 | ;; 24 | *) 25 | echo "ignoring unknown option $1" 26 | ;; 27 | esac 28 | shift 29 | done 30 | 31 | dirname=compose-services_dump_`date '+%Y-%m-%d_%H:%M:%S'` 32 | mkdir -p $dirname 33 | mkdir -p $dirname/logs/ 34 | if $get_config; then 35 | mkdir -p $dirname/config/ 36 | fi 37 | 38 | if $get_config; then 39 | echo "Copying config files" 40 | cp docker-compose.yml $dirname/config/ 41 | cp Secrets/etlMapping.yaml $dirname/config/ 42 | cp Secrets/gitops.json $dirname/config/ 43 | cp Secrets/user.yaml $dirname/config/ 44 
| cp Secrets/*config.* $dirname/config/ 45 | cp Secrets/*settings.* $dirname/config/ 46 | 47 | # remove lines containing creds 48 | if [[ "$OSTYPE" == "linux-gnu" ]]; then 49 | sed -i "/key/Id" $dirname/config/* 50 | sed -i "/secret/Id" $dirname/config/* 51 | sed -i "/password/Id" $dirname/config/* 52 | elif [[ "$OSTYPE" == "darwin"* ]]; then # MacOS 53 | sed -i "" "/[Kk][Ee][Yy]/d" $dirname/config/* 54 | sed -i "" "/[Ss][Ee][Cc][Rr][Ee][Tt]/d" $dirname/config/* 55 | sed -i "" "/[Pp][Aa][Ss][Ss][Ww][Oo][Rr][Dd]/d" $dirname/config/* 56 | else 57 | echo "WARNING: did not remove lines with creds (unknown OS $OSTYPE)" 58 | fi 59 | fi 60 | 61 | echo "Dumping logs" 62 | cat docker-compose.yml | grep "container_name" | while read -r line ; do 63 | name=$(expr "$line" : ".* \([a-z]*-service\)") 64 | docker-compose logs $name > $dirname/logs/logs-$name.txt 65 | done 66 | 67 | echo "Getting environment details" 68 | # pip freeze > $dirname/pip-freeze.txt 69 | # env > $dirname/env-vars.txt 70 | git rev-parse HEAD > $dirname/latest-commit.txt 71 | 72 | echo "Saving as zip file $dirname.zip" 73 | zip -r $dirname.zip $dirname 74 | 75 | echo "Cleaning up" 76 | rm -r $dirname 77 | 78 | echo "Done" 79 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/submitted_somatic_mutation.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "submitted_somatic_mutation" 4 | title: Submitted Somatic Mutation 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: data_file 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Data file containing somatic mutation calls from a read group. 
12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | - file_state 23 | - error_type 24 | 25 | links: 26 | - exclusive: false 27 | required: true 28 | subgroup: 29 | - name: core_metadata_collections 30 | backref: submitted_somatic_mutations 31 | label: data_from 32 | target_type: core_metadata_collection 33 | multiplicity: many_to_many 34 | required: false 35 | - name: read_groups 36 | backref: submitted_somatic_mutations 37 | label: derived_from 38 | target_type: read_group 39 | multiplicity: many_to_many 40 | required: false 41 | 42 | required: 43 | - submitter_id 44 | - type 45 | - file_name 46 | - file_size 47 | - data_format 48 | - md5sum 49 | - data_category 50 | - data_type 51 | - experimental_strategy 52 | 53 | uniqueKeys: 54 | - [ id ] 55 | - [ project_id, submitter_id ] 56 | 57 | properties: 58 | $ref: "_definitions.yaml#/data_file_properties" 59 | type: 60 | enum: [ "submitted_somatic_mutation" ] 61 | data_category: 62 | term: 63 | $ref: "_terms.yaml#/data_category" 64 | type: string 65 | data_type: 66 | term: 67 | $ref: "_terms.yaml#/data_type" 68 | type: string 69 | data_format: 70 | term: 71 | $ref: "_terms.yaml#/data_format" 72 | type: string 73 | experimental_strategy: 74 | term: 75 | $ref: "_terms.yaml#/experimental_strategy" 76 | type: string 77 | total_variants: 78 | description: "The total number of variants detected carrying a base change difference from the reference genome." 
79 | type: integer 80 | read_groups: 81 | $ref: "_definitions.yaml#/to_many" 82 | core_metadata_collections: 83 | $ref: "_definitions.yaml#/to_many" 84 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/family_history.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "family_history" 4 | title: Family History 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: clinical 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Record of a patient's background regarding cancer events of blood relatives. 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - state 20 | - created_datetime 21 | - updated_datetime 22 | 23 | required: 24 | - submitter_id 25 | - type 26 | 27 | links: 28 | - name: cases 29 | backref: family_histories 30 | label: describes 31 | target_type: case 32 | multiplicity: many_to_one 33 | required: true 34 | 35 | 36 | uniqueKeys: 37 | #unclear if want submitter ID for clinical 38 | - [id] 39 | - [project_id, submitter_id] 40 | 41 | properties: 42 | type: 43 | enum: [ "family_history" ] 44 | 45 | id: 46 | $ref: "_definitions.yaml#/UUID" 47 | systemAlias: node_id 48 | 49 | state: 50 | $ref: "_definitions.yaml#/state" 51 | 52 | submitter_id: 53 | type: 54 | - string 55 | - "null" 56 | 57 | relative_with_cancer_history: 58 | term: 59 | $ref: "_terms.yaml#/relative_with_cancer_history" 60 | enum: 61 | - "yes" 62 | - "no" 63 | - unknown 64 | - not reported 65 | 66 | relationship_type: 67 | term: 68 | $ref: "_terms.yaml#/relationship_type" 69 | type: string 70 | 71 | relationship_gender: 72 | term: 73 | $ref: "_terms.yaml#/gender" 74 | enum: 75 | - female 76 | - male 77 | - unknown 78 | - unspecified 79 | - not reported 80 | 81 | relationship_age_at_diagnosis: 82 | term: 83 | $ref: 
"_terms.yaml#/relationship_age_at_diagnosis" 84 | type: number 85 | 86 | relationship_primary_diagnosis: 87 | term: 88 | $ref: "_terms.yaml#/primary_diagnosis" 89 | type: string 90 | 91 | cases: 92 | $ref: "_definitions.yaml#/to_one" 93 | project_id: 94 | $ref: "_definitions.yaml#/project_id" 95 | created_datetime: 96 | $ref: "_definitions.yaml#/datetime" 97 | updated_datetime: 98 | $ref: "_definitions.yaml#/datetime" 99 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/submitted_unaligned_reads.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "submitted_unaligned_reads" 4 | title: Submitted Unaligned Reads 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: data_file 8 | program: '*' 9 | project: '*' 10 | description: "Data file containing unaligned reads that have not been GDC Harmonized." 11 | additionalProperties: false 12 | submittable: true 13 | validators: null 14 | 15 | systemProperties: 16 | - id 17 | - project_id 18 | - created_datetime 19 | - updated_datetime 20 | - state 21 | - file_state 22 | - error_type 23 | 24 | links: 25 | - exclusive: false 26 | required: true 27 | subgroup: 28 | - name: read_groups 29 | backref: submitted_unaligned_reads_files # pretty ugly 30 | label: data_from 31 | target_type: read_group 32 | multiplicity: many_to_one 33 | required: false 34 | - name: core_metadata_collections 35 | backref: submitted_unaligned_reads_files 36 | label: data_from 37 | target_type: core_metadata_collection 38 | multiplicity: many_to_many 39 | required: false 40 | 41 | required: 42 | - submitter_id 43 | - type 44 | - file_name 45 | - file_size 46 | - md5sum 47 | - data_category 48 | - data_type 49 | - data_format 50 | - experimental_strategy 51 | 52 | uniqueKeys: 53 | - [ id ] 54 | - [ project_id, submitter_id ] 55 | 56 | properties: 57 | $ref: 
"_definitions.yaml#/data_file_properties" 58 | type: 59 | enum: [ "submitted_unaligned_reads" ] 60 | data_category: 61 | term: 62 | $ref: "_terms.yaml#/data_category" 63 | enum: 64 | - Sequencing Data 65 | - Sequencing Reads 66 | - Raw Sequencing Data 67 | data_type: 68 | term: 69 | $ref: "_terms.yaml#/data_type" 70 | enum: [ "Unaligned Reads" ] 71 | data_format: 72 | term: 73 | $ref: "_terms.yaml#/data_format" 74 | enum: 75 | - BAM 76 | - FASTQ 77 | experimental_strategy: 78 | term: 79 | $ref: "_terms.yaml#/experimental_strategy" 80 | enum: 81 | - WGS 82 | - WXS 83 | - Low Pass WGS 84 | - Validation 85 | - RNA-Seq 86 | - miRNA-Seq 87 | - Total RNA-Seq 88 | - DNA Panel 89 | read_groups: 90 | $ref: "_definitions.yaml#/to_one" 91 | core_metadata_collections: 92 | $ref: "_definitions.yaml#/to_many" 93 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/submitted_aligned_reads.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "submitted_aligned_reads" 4 | title: Submitted Aligned Reads 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: data_file 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Data file containing aligned reads that are used as input to GDC workflows. 
12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | - file_state 23 | - error_type 24 | 25 | links: 26 | - exclusive: false 27 | required: true 28 | subgroup: 29 | - name: read_groups 30 | backref: submitted_aligned_reads_files # pretty ugly 31 | label: data_from 32 | target_type: read_group 33 | multiplicity: one_to_many 34 | required: false 35 | - name: core_metadata_collections 36 | backref: submitted_aligned_reads_files 37 | label: data_from 38 | target_type: core_metadata_collection 39 | multiplicity: many_to_many 40 | required: false 41 | 42 | required: 43 | - submitter_id 44 | - type 45 | - file_name 46 | - file_size 47 | - data_format 48 | - md5sum 49 | - data_category 50 | - data_type 51 | - experimental_strategy 52 | 53 | uniqueKeys: 54 | - [ id ] 55 | - [ project_id, submitter_id ] 56 | 57 | properties: 58 | $ref: "_definitions.yaml#/data_file_properties" 59 | type: 60 | enum: [ "submitted_aligned_reads" ] 61 | data_category: 62 | term: 63 | $ref: "_terms.yaml#/data_category" 64 | enum: 65 | - Sequencing Data 66 | - Sequencing Reads 67 | - Raw Sequencing Data 68 | data_type: 69 | term: 70 | $ref: "_terms.yaml#/data_type" 71 | enum: 72 | - Aligned Reads 73 | - Alignment Coordinates 74 | data_format: 75 | term: 76 | $ref: "_terms.yaml#/data_format" 77 | enum: 78 | - BAM 79 | - BED 80 | experimental_strategy: 81 | term: 82 | $ref: "_terms.yaml#/experimental_strategy" 83 | enum: 84 | - WGS 85 | - WXS 86 | - Low Pass WGS 87 | - Validation 88 | - RNA-Seq 89 | - miRNA-Seq 90 | - Total RNA-Seq 91 | - DNA Panel 92 | read_groups: 93 | $ref: "_definitions.yaml#/to_many" 94 | core_metadata_collections: 95 | $ref: "_definitions.yaml#/to_many" 96 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/submitted_copy_number.yaml: 
-------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "submitted_copy_number" 4 | title: Submitted Copy Number 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: data_file 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Data file containing normalized copy number information from an aliquot. 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | - file_state 23 | - error_type 24 | 25 | links: 26 | - exclusive: false 27 | required: true 28 | subgroup: 29 | - name: core_metadata_collections 30 | backref: submitted_copy_number_files 31 | label: data_from 32 | target_type: core_metadata_collection 33 | multiplicity: many_to_many 34 | required: false 35 | - exclusive: true 36 | required: false 37 | subgroup: 38 | - name: aliquots 39 | backref: submitted_copy_number_files 40 | label: derived_from 41 | target_type: aliquot 42 | multiplicity: one_to_one 43 | required: false 44 | - name: read_groups 45 | backref: submitted_copy_number_files 46 | label: derived_from 47 | target_type: read_group 48 | multiplicity: many_to_many 49 | required: false 50 | 51 | required: 52 | - submitter_id 53 | - type 54 | - file_name 55 | - file_size 56 | - data_format 57 | - md5sum 58 | - data_category 59 | - data_type 60 | - experimental_strategy 61 | 62 | uniqueKeys: 63 | - [ id ] 64 | - [ project_id, submitter_id ] 65 | 66 | properties: 67 | $ref: "_definitions.yaml#/data_file_properties" 68 | type: 69 | enum: [ "submitted_copy_number" ] 70 | data_category: 71 | term: 72 | $ref: "_terms.yaml#/data_category" 73 | type: string 74 | data_type: 75 | term: 76 | $ref: "_terms.yaml#/data_type" 77 | type: string 78 | data_format: 79 | term: 80 | $ref: "_terms.yaml#/data_format" 81 | type: string 82 | experimental_strategy: 83 | term: 
84 | $ref: "_terms.yaml#/experimental_strategy" 85 | type: string 86 | aliquots: 87 | $ref: "_definitions.yaml#/to_one" 88 | read_groups: 89 | $ref: "_definitions.yaml#/to_many" 90 | core_metadata_collections: 91 | $ref: "_definitions.yaml#/to_many" 92 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/submitted_methylation.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "submitted_methylation" 4 | title: Submitted Methylation 5 | type: object 6 | namespace: https://www.bloodpac.org/ 7 | category: data_file 8 | program: '*' 9 | project: '*' 10 | description: "DNA methylation data files contain information on raw and normalized signal intensities, detection confidence and calculated beta values for methylated and unmethylated probes. DNA methylation is an epigenetic mark which can be associated with transcriptional inactivity when located in promoter regions." 
11 | additionalProperties: false 12 | submittable: true 13 | validators: null 14 | 15 | systemProperties: 16 | - id 17 | - project_id 18 | - created_datetime 19 | - updated_datetime 20 | - state 21 | - file_state 22 | - error_type 23 | 24 | links: 25 | - exclusive: false 26 | required: true 27 | subgroup: 28 | - name: core_metadata_collections 29 | backref: submitted_methylation_files 30 | label: data_from 31 | target_type: core_metadata_collection 32 | multiplicity: many_to_many 33 | required: false 34 | - name: aliquots 35 | backref: submitted_methylation_files 36 | label: data_from 37 | target_type: aliquot 38 | multiplicity: many_to_one 39 | required: false 40 | 41 | required: 42 | - submitter_id 43 | - type 44 | - file_name 45 | - file_size 46 | - md5sum 47 | - data_category 48 | - data_type 49 | - data_format 50 | 51 | uniqueKeys: 52 | - [ id ] 53 | - [ project_id, submitter_id ] 54 | 55 | properties: 56 | $ref: "_definitions.yaml#/data_file_properties" 57 | type: 58 | enum: [ "submitted_methylation" ] 59 | data_category: 60 | term: 61 | $ref: "_terms.yaml#/data_category" 62 | enum: 63 | - Methylation Data 64 | data_type: 65 | term: 66 | $ref: "_terms.yaml#/data_type" 67 | enum: [ "Methylation Intensity Values" ] 68 | data_format: 69 | term: 70 | $ref: "_terms.yaml#/data_format" 71 | enum: 72 | - IDAT 73 | assay_method: 74 | enum: 75 | - Methylation Array 76 | assay_instrument: 77 | enum: 78 | - Illumina 79 | assay_instrument_model: 80 | enum: 81 | - Illumina Infinium HumanMethylation450 82 | - Illumina Infinium HumanMethylation450K 83 | aliquots: 84 | $ref: "_definitions.yaml#/to_one" 85 | core_metadata_collections: 86 | $ref: "_definitions.yaml#/to_many" 87 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/aliquot.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "aliquot" 4 | title: 
Aliquot 5 | type: object 6 | category: biospecimen 7 | program: '*' 8 | project: '*' 9 | description: > 10 | Pertaining to a portion of the whole; any one of two or more samples of something, of the same 11 | volume or weight. 12 | additionalProperties: false 13 | submittable: true 14 | validators: [] 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - state 20 | - created_datetime 21 | - updated_datetime 22 | 23 | required: 24 | - submitter_id 25 | - type 26 | - samples 27 | 28 | uniqueKeys: 29 | - [id] 30 | - [project_id, submitter_id] 31 | 32 | links: 33 | - name: samples 34 | backref: aliquots 35 | label: derived_from 36 | multiplicity: many_to_many 37 | target_type: sample 38 | required: true 39 | 40 | constraints: null 41 | 42 | # Aliquot properties 43 | properties: 44 | type: 45 | type: string 46 | id: 47 | $ref: "_definitions.yaml#/UUID" 48 | systemAlias: node_id 49 | state: 50 | $ref: "_definitions.yaml#/state" 51 | submitter_id: 52 | type: 53 | - string 54 | - "null" 55 | description: > 56 | The legacy barcode used before prior to the use 57 | UUIDs. For TCGA this is bcraliquotbarcode. 
58 | aliquot_quantity: 59 | term: 60 | $ref: "_terms.yaml#/aliquot_quantity" 61 | type: number 62 | aliquot_volume: 63 | term: 64 | $ref: "_terms.yaml#/aliquot_volume" 65 | type: number 66 | amount: 67 | term: 68 | $ref: "_terms.yaml#/amount" 69 | type: number 70 | analyte_type: 71 | term: 72 | $ref: "_terms.yaml#/analyte_type" 73 | type: string 74 | analyte_type_id: 75 | term: 76 | $ref: "_terms.yaml#/analyte_type_id" 77 | enum: 78 | - D 79 | - E 80 | - G 81 | - H 82 | - R 83 | - S 84 | - T 85 | - W 86 | - X 87 | - Y 88 | concentration: 89 | term: 90 | $ref: "_terms.yaml#/concentration" 91 | type: number 92 | project_id: 93 | $ref: "_definitions.yaml#/project_id" 94 | source_center: 95 | term: 96 | $ref: "_terms.yaml#/source_center" 97 | type: string 98 | samples: 99 | $ref: "_definitions.yaml#/to_one" 100 | created_datetime: 101 | $ref: "_definitions.yaml#/datetime" 102 | updated_datetime: 103 | $ref: "_definitions.yaml#/datetime" 104 | -------------------------------------------------------------------------------- /docs/useful_links.md: -------------------------------------------------------------------------------- 1 | 2 | # Useful links 3 | 4 | Find below a list of links that show the capabilities of our Gen3 software stack tested by and further developed by users and initiatives in the research community all over the globe. These are fantastic resources to explore use cases of Gen3 and may be of help to new and experienced users/operators alike of a Gen3 Data Commons. 5 | 6 | > 🟢 Note: We emphasize that we are not responsible for the content and opinions on the third-party webpages listed below. 7 | 8 | 1. Working with on premises data and servers: 9 | The gen3 system is optimized to deploy on cloud systems and work with cloud buckets. The Oregon Health & Science University (OHSU) has developed [a collection of extensions](https://github.com/ohsu-comp-bio/compose-services/tree/onprem) to enable gen3 to work in a non aws environment. 
Read this [overview](https://github.com/ohsu-comp-bio/compose-services/blob/onprem/onprem/README.md) for more information. 10 | 2. A group of users shared their experiences with setting up their Gen3 Data Commons on a local desktop using Compose Services in August 2020 in form of three videos: [Gen3 Data Commons Setup Part 1](https://www.youtube.com/watch?v=xM54O4aMpWY), [Gen3 Data Commons Setup Part 2](https://www.youtube.com/watch?v=iMmCxnbHpGo), and [Data Upload](https://www.youtube.com/watch?v=F2EOtHPg6g8&feature=youtu.be). Please note, that the content in these videos might not reflect the current status of the Compose-Services repository. Referring to the video part 1, the following is outdated: the format of the `user.yaml` reflects the one shown in the Fence repository and the arborist DB setup is up to date. 11 | 3. A stand-alone data dictionary viewer for schema.json artifacts was published [here](https://github.com/bioteam/dictionary-visualizer). 12 | 4. The [Australian BioCommons group](https://www.biocommons.org.au/gen3-project) has implemented the Gen3 software stack for easier management and sharing of human genome data in Australia. Browse through the [detailed documentation](https://github.com/umccr/gen3-doc) of how they setup and deployed Gen3 including a [customized Data Dictionary](https://github.com/umccr/umccr-dictionary) and guides for users and admins on working with the [production environment](https://github.com/umccr/gen3-doc/tree/main/cloud). This group also wrote a [client "g3po"](https://github.com/umccr/g3po) to interact with Gen3 APIs and tested Gen3's capability to set [granular access to data files using authz and GA4GH Passport Visa consent codes](https://github.com/umccr/gen3-doc/tree/main/submit/agha-gdr-demo) in combination with CILogon as [a new authentication method](https://github.com/uc-cdis/fence/pull/896). 
13 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/demographic.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "demographic" 4 | title: Demographic 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: clinical 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Data for the characterization of the patient by means of segmenting the population (e.g., 12 | characterization by age, sex, or race). 13 | additionalProperties: false 14 | submittable: true 15 | validators: null 16 | 17 | systemProperties: 18 | - id 19 | - project_id 20 | - state 21 | - created_datetime 22 | - updated_datetime 23 | 24 | links: 25 | - name: cases 26 | backref: demographics 27 | label: describes 28 | target_type: case 29 | multiplicity: one_to_one 30 | required: true 31 | 32 | required: 33 | - submitter_id 34 | - type 35 | - cases 36 | 37 | preferred: 38 | - year_of_death 39 | 40 | uniqueKeys: 41 | #unclear if want submitter ID for clinical 42 | - [id] 43 | - [project_id, submitter_id] 44 | 45 | properties: 46 | type: 47 | type: string 48 | 49 | id: 50 | $ref: "_definitions.yaml#/UUID" 51 | systemAlias: node_id 52 | 53 | state: 54 | $ref: "_definitions.yaml#/state" 55 | 56 | submitter_id: 57 | type: 58 | - string 59 | - "null" 60 | 61 | gender: 62 | term: 63 | $ref: "_terms.yaml#/gender" 64 | enum: 65 | - female 66 | - male 67 | - unknown 68 | - unspecified 69 | - not reported 70 | 71 | race: 72 | term: 73 | $ref: "_terms.yaml#/race" 74 | enum: 75 | - white 76 | - american indian or alaska native 77 | - black or african american 78 | - asian 79 | - native hawaiian or other pacific islander 80 | - other 81 | - Unknown 82 | - not reported 83 | - not allowed to collect 84 | 85 | ethnicity: 86 | term: 87 | $ref: "_terms.yaml#/ethnicity" 88 | enum: 89 | - hispanic or latino 90 | - not hispanic or
latino 91 | - Unknown 92 | - not reported 93 | - not allowed to collect 94 | 95 | year_of_birth: 96 | term: 97 | $ref: "_terms.yaml#/year_of_birth" 98 | type: 99 | - number 100 | - "null" 101 | 102 | year_of_death: 103 | term: 104 | $ref: "_terms.yaml#/year_of_death" 105 | type: number 106 | 107 | cases: 108 | $ref: "_definitions.yaml#/to_one" 109 | project_id: 110 | $ref: "_definitions.yaml#/project_id" 111 | created_datetime: 112 | $ref: "_definitions.yaml#/datetime" 113 | updated_datetime: 114 | $ref: "_definitions.yaml#/datetime" 115 | -------------------------------------------------------------------------------- /templates/etlMapping.yaml: -------------------------------------------------------------------------------- 1 | mappings: 2 | - name: etl 3 | doc_type: case 4 | type: aggregator 5 | root: case 6 | props: 7 | - name: submitter_id 8 | - name: project_id 9 | - name: disease_type 10 | - name: primary_site 11 | flatten_props: 12 | - path: demographics 13 | props: 14 | - name: gender 15 | value_mappings: 16 | - female: F 17 | - male: M 18 | - name: race 19 | value_mappings: 20 | - american indian or alaska native: Indian 21 | - name: ethnicity 22 | - name: year_of_birth 23 | aggregated_props: 24 | - name: _samples_count 25 | path: samples 26 | fn: count 27 | - name: _aliquots_count 28 | path: samples.aliquots 29 | fn: count 30 | - name: _submitted_methylations_count 31 | path: samples.aliquots.submitted_methylation_files 32 | fn: count 33 | - name: _submitted_copy_number_files_on_aliquots_count 34 | path: samples.aliquots.submitted_copy_number_files 35 | fn: count 36 | - name: _read_groups_count 37 | path: samples.aliquots.read_groups 38 | fn: count 39 | - name: _submitted_aligned_reads_count 40 | path: samples.aliquots.read_groups.submitted_aligned_reads_files 41 | fn: count 42 | - name: _submitted_unaligned_reads_count 43 | path: samples.aliquots.read_groups.submitted_unaligned_reads_files 44 | fn: count 45 | - name:
_submitted_copy_number_files_on_read_groups_count 46 | path: samples.aliquots.read_groups.submitted_copy_number_files 47 | fn: count 48 | - name: _submitted_somatic_mutations_count 49 | path: samples.aliquots.read_groups.submitted_somatic_mutations 50 | fn: count 51 | joining_props: 52 | - index: file 53 | join_on: _case_id 54 | props: 55 | - name: data_format 56 | src: data_format 57 | fn: set 58 | - name: data_type 59 | src: data_type 60 | fn: set 61 | - name: _file_id 62 | src: file_id 63 | fn: set 64 | - name: file 65 | doc_type: file 66 | type: collector 67 | root: None 68 | category: data_file 69 | props: 70 | - name: object_id 71 | - name: md5sum 72 | - name: file_name 73 | - name: file_size 74 | - name: data_format 75 | - name: data_type 76 | - name: state 77 | injecting_props: 78 | case: 79 | props: 80 | - name: _case_id 81 | src: id 82 | fn: set 83 | - name: project_id 84 | target_nodes: 85 | - name: slide_image 86 | path: slides.samples.cases 87 | -------------------------------------------------------------------------------- /templates/gitops.json: -------------------------------------------------------------------------------- 1 | { 2 | "gaTrackingId": "UA-119127212-1", 3 | "dataExplorerConfig": { 4 | "charts": { 5 | "project_id": { 6 | "chartType": "count", 7 | "title": "Projects" 8 | }, 9 | "node_id": { 10 | "chartType": "count", 11 | "title": "Cases" 12 | }, 13 | "gender": { 14 | "chartType": "pie", 15 | "title": "Gender" 16 | }, 17 | "race": { 18 | "chartType": "bar", 19 | "title": "Race" 20 | } 21 | }, 22 | "filters": { 23 | "tabs": [ 24 | { 25 | "title": "Case", 26 | "fields":[ 27 | "project_id", 28 | "gender", 29 | "race", 30 | "ethnicity" 31 | ] 32 | } 33 | ] 34 | }, 35 | "table": { 36 | "enabled": false 37 | }, 38 | "dropdowns": {}, 39 | "buttons": [], 40 | "guppyConfig": { 41 | "dataType": "case", 42 | "nodeCountTitle": "Cases", 43 | "fieldMapping": [ 44 | { "field": "disease_type", "name": "Disease type" }, 45 | { "field": "primary_site", 
"name": "Site where samples were collected"} 46 | ], 47 | "manifestMapping": { 48 | "resourceIndexType": "file", 49 | "resourceIdField": "object_id", 50 | "referenceIdFieldInResourceIndex": "_case_id", 51 | "referenceIdFieldInDataIndex": "_case_id" 52 | }, 53 | "accessibleFieldCheckList": ["project_id"], 54 | "accessibleValidationField": "project_id" 55 | } 56 | }, 57 | "fileExplorerConfig": { 58 | "charts": { 59 | "data_type": { 60 | "chartType": "stackedBar", 61 | "title": "File Type" 62 | }, 63 | "data_format": { 64 | "chartType": "stackedBar", 65 | "title": "File Format" 66 | } 67 | }, 68 | "filters": { 69 | "tabs": [ 70 | { 71 | "title": "File", 72 | "fields": [ 73 | "project_id", 74 | "data_type", 75 | "data_format" 76 | ] 77 | } 78 | ] 79 | }, 80 | "table": { 81 | "enabled": true, 82 | "fields": [ 83 | "project_id", 84 | "file_name", 85 | "file_size", 86 | "object_id" 87 | ] 88 | }, 89 | "dropdowns": {}, 90 | "guppyConfig": { 91 | "dataType": "file", 92 | "fieldMapping": [ 93 | { "field": "object_id", "name": "GUID" } 94 | ], 95 | "nodeCountTitle": "Files", 96 | "manifestMapping": { 97 | "resourceIndexType": "case", 98 | "resourceIdField": "_case_id", 99 | "referenceIdFieldInResourceIndex": "object_id", 100 | "referenceIdFieldInDataIndex": "object_id" 101 | }, 102 | "accessibleFieldCheckList": ["project_id"], 103 | "accessibleValidationField": "project_id", 104 | "downloadAccessor": "object_id" 105 | } 106 | } 107 | } -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/exposure.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "exposure" 4 | title: Exposure 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: clinical 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Clinically relevant patient information not immediately resulting from genetic predispositions. 
12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - state 20 | - created_datetime 21 | - updated_datetime 22 | 23 | required: 24 | - submitter_id 25 | - type 26 | 27 | links: 28 | - name: cases 29 | backref: exposures 30 | label: describes 31 | target_type: case 32 | multiplicity: many_to_one 33 | required: true 34 | 35 | preferred: 36 | - cigarettes_per_day 37 | - years_smoked 38 | 39 | uniqueKeys: 40 | #unclear if want submitter ID for clinical 41 | - [id] 42 | - [project_id, submitter_id] 43 | 44 | properties: 45 | type: 46 | enum: [ "exposure" ] 47 | 48 | id: 49 | $ref: "_definitions.yaml#/UUID" 50 | systemAlias: node_id 51 | 52 | state: 53 | $ref: "_definitions.yaml#/state" 54 | 55 | submitter_id: 56 | type: 57 | - string 58 | - "null" 59 | 60 | alcohol_history: 61 | term: 62 | $ref: "_terms.yaml#/alcohol_history" 63 | type: string 64 | 65 | alcohol_intensity: 66 | term: 67 | $ref: "_terms.yaml#/alcohol_intensity" 68 | type: string 69 | 70 | bmi: 71 | term: 72 | $ref: "_terms.yaml#/bmi" 73 | type: number 74 | 75 | cigarettes_per_day: 76 | term: 77 | $ref: "_terms.yaml#/cigarettes_per_day" 78 | type: number 79 | 80 | height: 81 | term: 82 | $ref: "_terms.yaml#/height" 83 | type: number 84 | 85 | pack_years_smoked: 86 | term: 87 | $ref: "_terms.yaml#/pack_years_smoked" 88 | type: number 89 | 90 | tobacco_smoking_onset_year: 91 | term: 92 | $ref: "_terms.yaml#/tobacco_smoking_onset_year" 93 | type: integer 94 | 95 | tobacco_smoking_quit_year: 96 | term: 97 | $ref: "_terms.yaml#/tobacco_smoking_quit_year" 98 | type: integer 99 | 100 | tobacco_smoking_status: 101 | term: 102 | $ref: "_terms.yaml#/tobacco_smoking_status" 103 | enum: 104 | - "1" 105 | - "2" 106 | - "3" 107 | - "4" 108 | - "5" 109 | - "6" 110 | - "7" 111 | - Unknown 112 | - Not Reported 113 | - Not Allowed To Collect 114 | 115 | weight: 116 | term: 117 | $ref: "_terms.yaml#/weight" 118 | type: number 119 | 120 | 
years_smoked: 121 | term: 122 | $ref: "_terms.yaml#/years_smoked" 123 | type: number 124 | 125 | cases: 126 | $ref: "_definitions.yaml#/to_one" 127 | project_id: 128 | $ref: "_definitions.yaml#/project_id" 129 | created_datetime: 130 | $ref: "_definitions.yaml#/datetime" 131 | updated_datetime: 132 | $ref: "_definitions.yaml#/datetime" 133 | -------------------------------------------------------------------------------- /templates/sheepdog_settings.py: -------------------------------------------------------------------------------- 1 | from sheepdog.api import app, app_init 2 | from os import environ 3 | import config_helper 4 | 5 | APP_NAME='sheepdog' 6 | def load_json(file_name): 7 | return config_helper.load_json(file_name, APP_NAME) 8 | 9 | conf_data = load_json('creds.json') 10 | config = app.config 11 | 12 | config["AUTH"] = 'https://auth.service.consul:5000/v3/' 13 | config["AUTH_ADMIN_CREDS"] = None 14 | config["INTERNAL_AUTH"] = None 15 | 16 | # SIGNPOST is deprecated, replaced by INDEX_CLIENT (sheepdog>=1.1.12) 17 | config['SIGNPOST'] = { 18 | 'host': environ.get('SIGNPOST_HOST', 'http://indexd-service'), 19 | 'version': 'v0', 20 | 'auth': ('indexd_client', conf_data.get('indexd_password', '{{indexd_password}}')), 21 | } 22 | config["INDEX_CLIENT"] = { 23 | 'host': environ.get('INDEX_CLIENT_HOST', 'http://indexd-service'), 24 | 'version': 'v0', 25 | 'auth': ('indexd_client', conf_data.get('indexd_password', '{{indexd_password}}')), 26 | } 27 | config["FAKE_AUTH"] = False 28 | config["PSQLGRAPH"] = { 29 | 'host': conf_data['db_host'], 30 | 'user': conf_data['db_username'], 31 | 'password': conf_data['db_password'], 32 | 'database': conf_data['db_database'], 33 | } 34 | 35 | config['HMAC_ENCRYPTION_KEY'] = conf_data.get('hmac_key', '{{hmac_key}}') 36 | config['FLASK_SECRET_KEY'] = conf_data.get('gdcapi_secret_key', '{{gdcapi_secret_key}}') 37 | config['PSQL_USER_DB_CONNECTION'] = 'postgresql://%s:%s@%s:5432/%s' % tuple([ conf_data.get(key, key) for key in 
['fence_username', 'fence_password', 'fence_host', 'fence_database']]) 38 | config['OIDC_ISSUER'] = 'https://%s/user' % conf_data['hostname'] 39 | 40 | config['OAUTH2'] = { 41 | 'client_id': conf_data.get('oauth2_client_id', '{{oauth2_client_id}}'), 42 | 'client_secret': conf_data.get('oauth2_client_secret', '{{oauth2_client_secret}}'), 43 | 'api_base_url': 'https://%s/user/' % conf_data['hostname'], 44 | 'authorize_url': 'https://%s/user/oauth2/authorize' % conf_data['hostname'], 45 | 'access_token_url': 'https://%s/user/oauth2/token' % conf_data['hostname'], 46 | 'refresh_token_url': 'https://%s/user/oauth2/token' % conf_data['hostname'], 47 | 'client_kwargs': { 48 | 'redirect_uri': 'https://%s/api/v0/oauth2/authorize' % conf_data['hostname'], 49 | 'scope': 'openid data user', 50 | }, 51 | # deprecated key values, should be removed after all commons use new oidc 52 | 'internal_oauth_provider': 'http://fence-service/oauth2/', 53 | 'oauth_provider': 'https://%s/user/oauth2/' % conf_data['hostname'], 54 | 'redirect_uri': 'https://%s/api/v0/oauth2/authorize' % conf_data['hostname'] 55 | } 56 | config['USER_API'] = 'http://fence-service/' 57 | # option to force authutils to prioritize USER_API setting over the issuer from 58 | # token when redirecting, used during local docker compose setup when the 59 | # services are on different containers but the hostname is still localhost 60 | config['FORCE_ISSUER'] = True 61 | 62 | if environ.get('DICTIONARY_URL'): 63 | config['DICTIONARY_URL'] = environ.get('DICTIONARY_URL') 64 | else: 65 | config['PATH_TO_SCHEMA_DIR'] = environ.get('PATH_TO_SCHEMA_DIR') 66 | 67 | app_init(app) 68 | application = app 69 | application.debug = (environ.get('GEN3_DEBUG') == "True") 70 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/slide_image.yaml: -------------------------------------------------------------------------------- 1 | $schema: 
"http://json-schema.org/draft-04/schema#" 2 | 3 | id: "slide_image" 4 | title: Slide Image 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: data_file 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Data file containing image of a slide. 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | - file_state 23 | - error_type 24 | 25 | links: 26 | - exclusive: false 27 | required: true 28 | subgroup: 29 | - name: slides 30 | backref: slide_images 31 | label: data_from 32 | target_type: slide 33 | multiplicity: many_to_one 34 | required: false 35 | - name: core_metadata_collections 36 | backref: slide_images 37 | label: data_from 38 | target_type: core_metadata_collection 39 | multiplicity: many_to_many 40 | required: false 41 | 42 | required: 43 | - submitter_id 44 | - type 45 | - file_name 46 | - file_size 47 | - md5sum 48 | - data_category 49 | - data_type 50 | - data_format 51 | 52 | uniqueKeys: 53 | - [ id ] 54 | - [ project_id, submitter_id ] 55 | 56 | properties: 57 | $ref: "_definitions.yaml#/data_file_properties" 58 | type: 59 | enum: [ "slide_image" ] 60 | data_category: 61 | term: 62 | $ref: "_terms.yaml#/data_category" 63 | enum: 64 | - Biospecimen 65 | - Slide Image 66 | - Mass Cytometry 67 | data_type: 68 | term: 69 | $ref: "_terms.yaml#/data_type" 70 | enum: 71 | - image 72 | - Single Cell Image 73 | - Raw IMC Data 74 | - Single Channel IMC Image 75 | - Antibody Panel Added 76 | data_format: 77 | term: 78 | $ref: "_terms.yaml#/data_format" 79 | type: string 80 | experimental_strategy: 81 | description: "Classification of the slide type with respect to its experimental use." 82 | enum: 83 | - Diagnostic Slide 84 | - Tissue Slide 85 | cell_type: 86 | description: "The type of cell being imaged or otherwised analysed." 
87 | type: string 88 | cell_identifier: 89 | description: "An alternative identifier for a given cell type." 90 | type: string 91 | cell_count: 92 | description: "Count of the cell type being imaged or otherwise analysed." 93 | type: integer 94 | frame_identifier: 95 | description: "Name, number, or other identifier given to the frame of the slide from which this image was taken." 96 | type: string 97 | panel_used: 98 | description: "Name or other identifier given to the panel used during an IMC run." 99 | type: string 100 | protocol_used: 101 | description: "Name or other identifier given to the protocol used during an IMC run." 102 | type: string 103 | run_name: 104 | description: "Name, number, or other identifier given to the run that generated this slide image." 105 | type: string 106 | slides: 107 | $ref: "_definitions.yaml#/to_one" 108 | core_metadata_collections: 109 | $ref: "_definitions.yaml#/to_many" 110 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/slide_count.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "slide_count" 4 | title: Slide Count 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: notation 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Information pertaining to processed results obtained from slides; often in the form of counts. 
12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | 23 | links: 24 | - name: slides 25 | backref: slide_counts 26 | label: data_from 27 | target_type: slide 28 | multiplicity: many_to_many 29 | required: true 30 | 31 | required: 32 | - submitter_id 33 | - type 34 | - slides 35 | 36 | uniqueKeys: 37 | - [ id ] 38 | - [ project_id, submitter_id ] 39 | 40 | properties: 41 | type: 42 | enum: [ "slide_count" ] 43 | id: 44 | $ref: "_definitions.yaml#/UUID" 45 | systemAlias: node_id 46 | state: 47 | $ref: "_definitions.yaml#/state" 48 | submitter_id: 49 | type: 50 | - string 51 | - "null" 52 | cell_type: 53 | description: "The type of cell being counted or measured." 54 | type: string 55 | cell_identifier: 56 | description: "An alternative identifier for a given cell type." 57 | type: string 58 | cell_count: 59 | description: "Raw count of a particular cell type." 60 | type: integer 61 | ck_signal: 62 | description: "Numeric quantification of the CK signal." 63 | type: number 64 | biomarker_signal: 65 | description: "Numeric quantification of the biomarker signal." 66 | type: number 67 | er_localization: 68 | description: "Cellular localization of the endoplasmic reticulum as determined by staining." 69 | enum: 70 | - Nuclear 71 | - Cytoplasmic 72 | - Both 73 | - None 74 | - Not Determined 75 | frame_identifier: 76 | description: "Name, number, or other identifier given to the frame of the slide from which this image was taken." 77 | type: string 78 | relative_nuclear_size: 79 | description: "The ratio of the single cell's nucleus size to the average of the surrounding cells." 80 | type: number 81 | relative_nuclear_intensity: 82 | description: "The ratio of the single cell's nuclear staining intensity to the average of the surrounding cells." 
83 | type: number 84 | relative_cytokeratin_intensity: 85 | description: "The ratio of the single cell's cytokeratin staining intensity to the average of the surrounding cells." 86 | type: number 87 | relative_er_intensity: 88 | description: "The ratio of the single cell's endoplasmic reticulum staining intensity to the average of the surrounding cells." 89 | type: number 90 | run_name: 91 | description: "The name or identifier given to the run that was used to generate this slide count." 92 | type: string 93 | slides: 94 | $ref: "_definitions.yaml#/to_many" 95 | project_id: 96 | type: string 97 | created_datetime: 98 | $ref: "_definitions.yaml#/datetime" 99 | updated_datetime: 100 | $ref: "_definitions.yaml#/datetime" 101 | -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | #!groovy 2 | 3 | pipeline { 4 | agent any 5 | 6 | 7 | stages { 8 | stage('FetchCode') { 9 | steps { 10 | checkout scm 11 | dir('cloud-automation') { 12 | git( 13 | url: 'https://github.com/uc-cdis/cloud-automation.git', 14 | branch: 'master' 15 | ) 16 | } 17 | script { 18 | env.GEN3_HOME=env.WORKSPACE+'/cloud-automation' 19 | env.GEN3_NOPROXY='true' 20 | env.KLOCK_USER = "jenkins" + new Random().nextInt() 21 | } 22 | } 23 | } 24 | stage('docker pull') { 25 | steps { 26 | sh('sudo docker-compose pull') 27 | } 28 | } 29 | stage('AcquireLock') { 30 | steps { 31 | script { 32 | // acquire global lock to launch docker services on Jenkins host node 33 | def lockStatus = sh( script: "bash cloud-automation/gen3/bin/klock.sh lock dockerTest ${env.KLOCK_USER} 3600 -w 600", returnStatus: true) 34 | if (lockStatus != 0) { 35 | error("unable to acquire dockerTest lock") 36 | } 37 | } 38 | } 39 | } 40 | stage('docker up') { 41 | steps { 42 | sh 'sudo docker-compose down || true' 43 | sh 'sudo docker-compose config' 44 | //sh 'sudo docker-compose up -d' // see note below - this 
fails on k8s node 45 | } 46 | } 47 | stage('smoke test') { 48 | when { 49 | expression { 50 | return false // docker-compose -up above fails, because k8s owns the host node networking 51 | // + sudo docker-compose up -d 52 | // Creating network "ithub_org_compose-services_pr-20_devnet" with the default driver 53 | // Failed to program FILTER chain: iptables failed: iptables --wait -I FORWARD -o br-fa829e600aec -j DOCKER: iptables v1.4.21: Couldn't load target `DOCKER':No such file or directory 54 | } 55 | } 56 | steps { 57 | dir('testResults') { 58 | script { 59 | // get the IP address of the node Jenkins is running on 60 | def ipAddress = sh(script: "kubectl describe pod -l app=jenkins | grep Node: | sed 's@^.*/@@'", returnStdout: true) 61 | retry(10) { // retry smoke_test up to 10 times 62 | sleep(60) // give the services some time to start up 63 | sh(script: "bash ./smoke_test.sh ${ipAddress}") 64 | } 65 | } 66 | } 67 | } 68 | } 69 | } 70 | post { 71 | success { 72 | echo "https://jenkins.planx-pla.net/ $env.JOB_NAME pipeline succeeded" 73 | } 74 | failure { 75 | echo "Failure!" 76 | //archiveArtifacts artifacts: '**/output/*.png', fingerprint: true 77 | //slackSend color: 'bad', message: "https://jenkins.planx-pla.net $env.JOB_NAME pipeline failed" 78 | } 79 | unstable { 80 | echo "Unstable!" 
81 | //slackSend color: 'bad', message: "https://jenkins.planx-pla.net $env.JOB_NAME pipeline unstable" 82 | } 83 | always { 84 | script { 85 | uid = env.service+"-"+env.quaySuffix+"-"+env.BUILD_NUMBER 86 | withEnv(['GEN3_NOPROXY=true', "GEN3_HOME=$env.WORKSPACE/cloud-automation"]) { 87 | sh("bash cloud-automation/gen3/bin/klock.sh unlock dockerTest ${env.KLOCK_USER} || true") 88 | } 89 | } 90 | echo "done" 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/experiment.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "experiment" 4 | title: Experiment 5 | type: object 6 | namespace: http://bloodprofilingatlas.org/bpa/ 7 | category: administrative 8 | program: '*' 9 | project: '*' 10 | description: > 11 | A coordinated set of actions and observations designed to generate data, with the ultimate goal 12 | of discovery or hypothesis testing. 13 | additionalProperties: false 14 | submittable: true 15 | validators: null 16 | 17 | systemProperties: 18 | - id 19 | - project_id 20 | - created_datetime 21 | - updated_datetime 22 | - state 23 | 24 | links: 25 | - name: projects 26 | backref: experiments 27 | label: performed_for 28 | target_type: project 29 | multiplicity: many_to_one 30 | required: true 31 | 32 | required: 33 | - submitter_id 34 | - type 35 | - projects 36 | 37 | uniqueKeys: 38 | - [ id ] 39 | - [ project_id, submitter_id ] 40 | 41 | properties: 42 | type: 43 | enum: [ "experiment" ] 44 | id: 45 | $ref: "_definitions.yaml#/UUID" 46 | systemAlias: node_id 47 | state: 48 | $ref: "_definitions.yaml#/state" 49 | submitter_id: 50 | type: 51 | - string 52 | - "null" 53 | number_experimental_group: 54 | description: "The number denoting this experiment's place within the group within the whole." 
55 | type: 56 | - integer 57 | number_samples_per_experimental_group: 58 | description: "The number of samples contained within this experimental group." 59 | type: 60 | - integer 61 | experimental_description: 62 | description: "A brief description of the experiment being performed." 63 | type: 64 | - string 65 | experimental_intent: 66 | description: "Summary of the goals the experiment is designed to discover." 67 | type: 68 | - string 69 | associated_experiment: 70 | description: "The submitter_id for any experiment with which this experiment is associated, paired, or matched." 71 | type: 72 | - string 73 | type_of_sample: 74 | description: "String indicator identifying the types of samples as contrived or clinical." 75 | type: 76 | - string 77 | type_of_specimen: 78 | description: "Broad description of the specimens used in the experiment." 79 | type: 80 | - string 81 | marker_panel_description: 82 | description: "Brief description of the marker panel used in this experiment." 83 | type: string 84 | somatic_mutations_identified: 85 | description: "Are somatic mutations identified for this experiment?" 86 | type: boolean 87 | indels_identified: 88 | description: "Are indels identified in this experiment?" 89 | type: boolean 90 | copy_numbers_identified: 91 | description: "Are copy number variations identified in this experiment?" 92 | type: boolean 93 | type_of_data: 94 | description: "Is the data raw or processed?" 95 | enum: 96 | - Raw 97 | - Processed 98 | data_description: 99 | description: "Brief description of the data being provided for this experiment." 
100 | type: string 101 | projects: 102 | $ref: "_definitions.yaml#/to_one_project" 103 | project_id: 104 | $ref: "_definitions.yaml#/project_id" 105 | created_datetime: 106 | $ref: "_definitions.yaml#/datetime" 107 | updated_datetime: 108 | $ref: "_definitions.yaml#/datetime" 109 | -------------------------------------------------------------------------------- /datadictionary/README.md: -------------------------------------------------------------------------------- 1 | # Data Dictionary 2 | 3 | The data dictionary provides the first level of validation for all data 4 | stored in and generated by the BPA. Written in YAML, JSON schemas define all the individual entities 5 | (nodes) in the data model. Moreover, these schemas define all of the relationships (links) 6 | between the nodes. Finally, the schemas define the valid key-value pairs that can be used to 7 | describe the nodes. 8 | 9 | ## Data Dictionary Structure 10 | 11 | The Data Model covers all of the nodes within the as well as the relationships between 12 | the different types of nodes. All of the nodes in the data model are strongly typed and individually 13 | defined for a specific data type. For example, submitted files can come in different forms, such as 14 | aligned or unaligned reads; within the model we have two separately defined nodes for 15 | `Submitted Unaligned Reads` and `Submitted Aligned Reads`. Doing such allows for faster querying of 16 | the data model as well as providing a clear and concise representation of the data in the BPA. 17 | 18 | Beyond node type, there are also a number of extensions used to further define the nodes within 19 | the data model. Nodes are grouped up into categories that represent broad roles for the node such 20 | as `analysis` or `biospecimen`. Additionally, nodes are defined within their `Program` or `Project` 21 | and have descriptions of their use. 
All nodes also have a series of `systemProperties`; these 22 | properties are those that will be automatically filled by the system unless otherwise defined by 23 | the user. These basic properties define the node itself but still need to be placed into the model. 24 | 25 | The model itself is represented as a graph. Within the schema are defined `links`; these links 26 | point from child to parent with Program being the root of the graph. The links also contain a 27 | `backref` that allows for a parent to point back to a child. Other features of the link include a 28 | semantic `label` that describes the relationship between the two nodes, a `multiplicity` property 29 | that describes the numeric relationship from the child to the parent, and a requirement property 30 | to define whether a node must have that link. Taken all together the nodes and links create the 31 | directed graph of the Data Model. 32 | 33 | ## Node Properties and Examples 34 | 35 | Each node contains a series of potential key-value pairs (`properties`) that can be used to 36 | characterize the data they represent. Some properties are categorized as `required` or `preferred`. 37 | If a submission lacks a required property, it cannot be accepted. Preferred properties can denote 38 | two things: the property is being highlighted as it has become more desired by the community or 39 | the property is being promoted to required. All properties not designated either `required` or 40 | `preferred` are still sought by BPA, but submissions without them are allowed. 41 | 42 | The properties have further validation through their entries. Legal values are defined in each 43 | property. For the most part these are represented in the `enum` categories although some keys, 44 | such as `submitter_id`, will allow any string value as a valid entry. Other numeric properties 45 | can have maximum and minimum values to limit valid entries. 
For examples of what a valid entry 46 | would look like, each node has a mock submission located in the `examples/valid/` directory. 47 | 48 | ## Contributing 49 | 50 | Read how to contribute [here](https://github.com/NCI-GDC/portal-ui/blob/develop/CONTRIBUTING.md). 51 | -------------------------------------------------------------------------------- /templates/peregrine_settings.py: -------------------------------------------------------------------------------- 1 | from peregrine.api import app, app_init 2 | from os import environ 3 | import config_helper 4 | 5 | APP_NAME='peregrine' 6 | def load_json(file_name): 7 | return config_helper.load_json(file_name, APP_NAME) 8 | 9 | conf_data = load_json('creds.json') 10 | config = app.config 11 | 12 | config["AUTH"] = 'https://auth.service.consul:5000/v3/' 13 | config["AUTH_ADMIN_CREDS"] = None 14 | config["INTERNAL_AUTH"] = None 15 | 16 | # SIGNPOST is deprecated, replaced by INDEX_CLIENT (peregrine>=1.3.0) 17 | config['SIGNPOST'] = { 18 | 'host': environ.get('SIGNPOST_HOST', 'http://indexd-service'), 19 | 'version': 'v0', 20 | 'auth': ('indexd_client', conf_data.get('indexd_password', '{{indexd_password}}')), 21 | } 22 | config['INDEX_CLIENT'] = { 23 | 'host': environ.get('INDEX_CLIENT_HOST', 'http://indexd-service'), 24 | 'version': 'v0', 25 | 'auth': ('indexd_client', conf_data.get('indexd_password', '{{indexd_password}}')), 26 | } 27 | config["FAKE_AUTH"] = False 28 | config["PSQLGRAPH"] = { 29 | 'host': conf_data.get( 'db_host', '{{db_host}}' ), 30 | 'user': conf_data.get( 'db_username', '{{db_username}}' ), 31 | 'password': conf_data.get( 'db_password', '{{db_password}}' ), 32 | 'database': conf_data.get( 'db_database', '{{db_database}}' ), 33 | } 34 | 35 | config['HMAC_ENCRYPTION_KEY'] = conf_data.get( 'hmac_key', '{{hmac_key}}' ) 36 | config['FLASK_SECRET_KEY'] = conf_data.get( 'gdcapi_secret_key', '{{gdcapi_secret_key}}' ) 37 | config['PSQL_USER_DB_CONNECTION'] = 'postgresql://%s:%s@%s:5432/%s' % tuple([ 
conf_data.get(key, key) for key in ['fence_username', 'fence_password', 'fence_host', 'fence_database']]) 38 | 39 | if environ.get('DICTIONARY_URL'): 40 | config['DICTIONARY_URL'] = environ.get('DICTIONARY_URL') 41 | else: 42 | config['PATH_TO_SCHEMA_DIR'] = environ.get('PATH_TO_SCHEMA_DIR') 43 | 44 | config['SUBMISSION'] = { 45 | 'bucket': conf_data.get( 'bagit_bucket', '{{bagit_bucket}}' ) 46 | } 47 | 48 | config['STORAGE'] = { 49 | "s3": 50 | { 51 | "access_key": conf_data.get( 's3_access', '{{s3_access}}' ), 52 | 'secret_key': conf_data.get( 's3_secret', '{{s3_secret}}' ) 53 | } 54 | } 55 | 56 | config['OIDC_ISSUER'] = 'https://%s/user' % conf_data['hostname'] 57 | 58 | config['OAUTH2'] = { 59 | 'client_id': conf_data.get('oauth2_client_id', '{{oauth2_client_id}}'), 60 | 'client_secret': conf_data.get('oauth2_client_secret', '{{oauth2_client_secret}}'), 61 | 'api_base_url': 'https://%s/user/' % conf_data['hostname'], 62 | 'authorize_url': 'https://%s/user/oauth2/authorize' % conf_data['hostname'], 63 | 'access_token_url': 'https://%s/user/oauth2/token' % conf_data['hostname'], 64 | 'refresh_token_url': 'https://%s/user/oauth2/token' % conf_data['hostname'], 65 | 'client_kwargs': { 66 | 'redirect_uri': 'https://%s/api/v0/oauth2/authorize' % conf_data['hostname'], 67 | 'scope': 'openid data user', 68 | }, 69 | # deprecated key values, should be removed after all commons use new oidc 70 | 'internal_oauth_provider': 'http://fence-service/oauth2/', 71 | 'oauth_provider': 'https://%s/user/oauth2/' % conf_data['hostname'], 72 | 'redirect_uri': 'https://%s/api/v0/oauth2/authorize' % conf_data['hostname'] 73 | } 74 | 75 | config['USER_API'] = 'http://fence-service/' 76 | # option to force authutils to prioritize USER_API setting over the issuer from 77 | # token when redirecting, used during local docker compose setup when the 78 | # services are on different containers but the hostname is still localhost 79 | config['FORCE_ISSUER'] = True 80 | 81 | app_init(app) 82 | 
application = app 83 | application.debug = (environ.get('GEN3_DEBUG') == "True") 84 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/read_group_qc.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "read_group_qc" 4 | title: Read Group QC 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: notation 8 | project: '*' 9 | program: '*' 10 | description: "GDC QC run metadata." 11 | additionalProperties: false 12 | submittable: false 13 | validators: null 14 | 15 | systemProperties: 16 | - id 17 | - project_id 18 | - created_datetime 19 | - updated_datetime 20 | - state 21 | 22 | links: 23 | - exclusive: true 24 | required: true 25 | subgroup: 26 | - name: submitted_aligned_reads_files 27 | backref: read_group_qcs 28 | label: data_from 29 | target_type: submitted_aligned_reads 30 | multiplicity: one_to_one 31 | required: false 32 | - name: submitted_unaligned_reads_files 33 | backref: read_group_qcs 34 | label: data_from 35 | target_type: submitted_unaligned_reads 36 | multiplicity: one_to_many 37 | required: false 38 | - name: read_groups 39 | label: generated_from 40 | target_type: read_group 41 | multiplicity: many_to_one 42 | required: true 43 | backref: read_group_qcs 44 | 45 | required: 46 | - submitter_id 47 | - workflow_link 48 | - type 49 | - percent_gc_content 50 | - encoding 51 | - total_sequences 52 | - basic_statistics 53 | - per_base_sequence_quality 54 | - per_tile_sequence_quality 55 | - per_sequence_quality_score 56 | - per_base_sequence_content 57 | - per_sequence_gc_content 58 | - per_base_n_content 59 | - sequence_length_distribution 60 | - sequence_duplication_levels 61 | - overrepresented_sequences 62 | - adapter_content 63 | - kmer_content 64 | - read_groups 65 | 66 | uniqueKeys: 67 | - [ id ] 68 | - [ project_id, submitter_id ] 69 | 70 | properties: 71 | 
$ref: "_definitions.yaml#/workflow_properties" 72 | type: 73 | enum: [ "read_group_qc" ] 74 | workflow_type: 75 | term: 76 | $ref: "_terms.yaml#/workflow_type" 77 | enum: [ "Read Group Quality Control" ] 78 | fastq_name: 79 | term: 80 | $ref: "_terms.yaml#/file_name" 81 | type: string 82 | percent_aligned: 83 | description: "The percent of reads with at least one reported alignment." 84 | type: integer 85 | minimum: 0 86 | maximum: 100 87 | percent_gc_content: 88 | term: 89 | $ref: "_terms.yaml#/percent_gc_content" 90 | type: integer 91 | minimum: 0 92 | maximum: 100 93 | encoding: 94 | term: 95 | $ref: "_terms.yaml#/encoding" 96 | type: string 97 | total_aligned_reads: 98 | description: "The total number of reads with at least one reported alignment." 99 | type: integer 100 | total_sequences: 101 | term: 102 | $ref: "_terms.yaml#/total_sequences" 103 | type: integer 104 | basic_statistics: 105 | $ref: "_definitions.yaml#/qc_metrics_state" 106 | per_base_sequence_quality: 107 | $ref: "_definitions.yaml#/qc_metrics_state" 108 | per_tile_sequence_quality: 109 | $ref: "_definitions.yaml#/qc_metrics_state" 110 | per_sequence_quality_score: 111 | $ref: "_definitions.yaml#/qc_metrics_state" 112 | per_base_sequence_content: 113 | $ref: "_definitions.yaml#/qc_metrics_state" 114 | per_sequence_gc_content: 115 | $ref: "_definitions.yaml#/qc_metrics_state" 116 | per_base_n_content: 117 | $ref: "_definitions.yaml#/qc_metrics_state" 118 | sequence_length_distribution: 119 | $ref: "_definitions.yaml#/qc_metrics_state" 120 | sequence_duplication_levels: 121 | $ref: "_definitions.yaml#/qc_metrics_state" 122 | overrepresented_sequences: 123 | $ref: "_definitions.yaml#/qc_metrics_state" 124 | adapter_content: 125 | $ref: "_definitions.yaml#/qc_metrics_state" 126 | kmer_content: 127 | $ref: "_definitions.yaml#/qc_metrics_state" 128 | submitted_aligned_reads_files: 129 | $ref: "_definitions.yaml#/to_one" 130 | submitted_unaligned_reads_files: 131 | $ref: "_definitions.yaml#/to_many" 
132 | read_groups: 133 | $ref: "_definitions.yaml#/to_one" 134 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/slide.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "slide" 4 | title: Slide 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: biospecimen 8 | program: '*' 9 | project: '*' 10 | description: > 11 | A digital image, microscopic or otherwise, of any sample, portion, or sub-part thereof. (GDC) 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - state 20 | - created_datetime 21 | - updated_datetime 22 | 23 | links: 24 | - name: samples 25 | backref: slides 26 | label: derived_from 27 | target_type: sample 28 | multiplicity: many_to_many 29 | required: true 30 | 31 | required: 32 | - submitter_id 33 | - type 34 | - samples 35 | 36 | uniqueKeys: 37 | - [id] 38 | - [project_id, submitter_id] 39 | 40 | # slide properties 41 | properties: 42 | type: 43 | type: string 44 | id: 45 | $ref: "_definitions.yaml#/UUID" 46 | systemAlias: node_id 47 | state: 48 | $ref: "_definitions.yaml#/state" 49 | submitter_id: 50 | type: 51 | - string 52 | - "null" 53 | apoptotic_concentration: 54 | description: "The concentration, in cells/mL, of apoptotic cells in the slide blood." 55 | type: number 56 | ctc_concentration: 57 | description: "The concentration, in cells/mL, of traditional CTC cells (intact and enlarged cell and nucleus, cytokeratin positive, and CD45 negative) in the slide blood." 58 | type: number 59 | ctc_low_concentration: 60 | description: "The concentration, in cells/mL, of CTC-low cells (those with low cytokeratin levels compared to traditional CTCs) in the slide blood." 
61 | type: number 62 | ctc_small_concentration: 63 | description: "The concentration, in cells/mL, of CTC-small cells (those with a small nuclear and cellular size relative to traditional CTCs) in the slide blood." 64 | type: number 65 | section_location: 66 | term: 67 | $ref: "_terms.yaml#/section_location" 68 | type: string 69 | methanol_added: 70 | description: "True/False indicator for if methanol was used in the slide preparation process." 71 | type: boolean 72 | number_proliferating_cells: 73 | term: 74 | $ref: "_terms.yaml#/number_proliferating_cells" 75 | type: integer 76 | number_nucleated_cells: 77 | description: "The total number of nucleated cells identified on the slide." 78 | type: integer 79 | percent_tumor_cells: 80 | term: 81 | $ref: "_terms.yaml#/percent_tumor_cells" 82 | type: number 83 | percent_tumor_nuclei: 84 | term: 85 | $ref: "_terms.yaml#/percent_tumor_nuclei" 86 | type: number 87 | percent_normal_cells: 88 | term: 89 | $ref: "_terms.yaml#/percent_normal_cells" 90 | type: number 91 | percent_necrosis: 92 | term: 93 | $ref: "_terms.yaml#/percent_necrosis" 94 | type: number 95 | percent_stromal_cells: 96 | term: 97 | $ref: "_terms.yaml#/percent_stromal_cells" 98 | type: number 99 | percent_inflam_infiltration: 100 | term: 101 | $ref: "_terms.yaml#/percent_inflam_infiltration" 102 | type: number 103 | percent_lymphocyte_infiltration: 104 | term: 105 | $ref: "_terms.yaml#/percent_lymphocyte_infiltration" 106 | type: number 107 | percent_monocyte_infiltration: 108 | term: 109 | $ref: "_terms.yaml#/percent_monocyte_infiltration" 110 | type: number 111 | percent_granulocyte_infiltration: 112 | term: 113 | $ref: "_terms.yaml#/percent_granulocyte_infiltration" 114 | type: number 115 | percent_neutrophil_infiltration: 116 | term: 117 | $ref: "_terms.yaml#/percent_neutrophil_infiltration" 118 | type: number 119 | percent_eosinophil_infiltration: 120 | term: 121 | $ref: "_terms.yaml#/percent_eosinophil_infiltration" 122 | type: number 123 | 
run_datetime: 124 | $ref: "_definitions.yaml#/datetime" 125 | run_name: 126 | description: "Name, number, or other identifier given to this slide's run." 127 | type: string 128 | slide_identifier: 129 | description: "Unique identifier given to the this slide." 130 | type: string 131 | samples: 132 | $ref: "_definitions.yaml#/to_many" 133 | project_id: 134 | $ref: "_definitions.yaml#/project_id" 135 | created_datetime: 136 | $ref: "_definitions.yaml#/datetime" 137 | updated_datetime: 138 | $ref: "_definitions.yaml#/datetime" 139 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/project.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "project" 4 | title: Project 5 | type: object 6 | program: '*' 7 | project: '*' 8 | category: administrative 9 | description: > 10 | Any specifically defined piece of work that is undertaken or attempted to meet a single 11 | requirement. (NCIt C47885) 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - state 19 | - released 20 | - releasable 21 | - intended_release_date 22 | 23 | required: 24 | - code 25 | - name 26 | - dbgap_accession_number 27 | - programs 28 | 29 | uniqueKeys: 30 | - [ id ] 31 | - [ code ] 32 | 33 | links: 34 | - name: programs 35 | backref: projects 36 | label: member_of 37 | target_type: program 38 | multiplicity: many_to_one 39 | required: true 40 | 41 | constraints: null 42 | 43 | properties: 44 | type: 45 | type: string 46 | id: 47 | $ref: "_definitions.yaml#/UUID" 48 | systemAlias: node_id 49 | description: "UUID for the project." # TOREVIEW 50 | name: 51 | type: string 52 | description: "Display name/brief description for the project." # TOREVIEW 53 | code: 54 | type: string 55 | description: "Unique identifier for the project." 
56 | investigator_name: 57 | description: "Name of the principal investigator for the project." 58 | type: string 59 | investigator_affiliation: 60 | description: "The investigator's affiliation with respect to a research institution." 61 | type: string 62 | date_collected: 63 | description: "The date or date range in which the project data was collected." 64 | type: string 65 | availability_type: 66 | description: "Is the project open or restricted?" 67 | enum: 68 | - Open 69 | - Restricted 70 | availability_mechanism: 71 | description: "Mechanism by which the project will be made avilable." 72 | type: string 73 | support_source: 74 | description: "The name of source providing support/grant resources." 75 | type: string 76 | support_id: 77 | description: "The ID of the source providing support/grant resources." 78 | type: string 79 | programs: 80 | $ref: "_definitions.yaml#/to_one" 81 | description: > 82 | Indicates that the project is logically part of the indicated project. 83 | state: 84 | description: | 85 | The possible states a project can be in. All but `open` are 86 | equivalent to some type of locked state. 87 | default: open 88 | enum: 89 | # open: the only state users can perform 'upload' actions 90 | # possible actions in `open`: 91 | # - upload (no state change) 92 | # - review -> review 93 | # - release (project.released -> true) 94 | - open 95 | 96 | # locked: admin has locked project for review 97 | # possible actions in `locked`: 98 | # - open -> open 99 | # - submit -> submitted 100 | # - release (project.released -> true) 101 | - review 102 | 103 | # submitted: An admin has submitted project, it is locked against 104 | # upload. 
105 | # possible actions in `submitted`: 106 | # - process -> processing 107 | # - release (project.released -> true) 108 | - submitted 109 | 110 | # processing: The system is processing data in the project and 111 | # is locked against upload and submission 112 | # - (system transition to open) 113 | # - release (project.released -> true) 114 | - processing 115 | 116 | 117 | # closed: The closed state is introduced to replace the 118 | # ``legacy`` state and means that no further action 119 | # can be taken on the project 120 | # - (system transition to open) 121 | # - release (project.released -> true) 122 | - closed 123 | 124 | # DEPRECATED(2016-03-01): synonymous with closed. included for 125 | # backwards compatibility 126 | - legacy 127 | 128 | released: 129 | description: | 130 | To release a project is to tell the GDC to include all submitted 131 | entities in the next GDC index. 132 | default: false 133 | type: boolean 134 | 135 | releasable: 136 | description: | 137 | A project can only be released by the user when `releasable` is true. 138 | default: false 139 | type: boolean 140 | 141 | intended_release_date: 142 | description: Tracks a Project's intended release date. 143 | type: string 144 | format: date-time 145 | dbgap_accession_number: 146 | type: string 147 | description: "The dbgap accession number provided for the project." 
148 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/core_metadata_collection.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "core_metadata_collection" 4 | title: Core Metadata Collection 5 | type: object 6 | namespace: https://dcp.bionimbus.org/ 7 | category: administrative 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Structured description of a collection of several datasets 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - state 20 | - created_datetime 21 | - updated_datetime 22 | 23 | links: 24 | - name: projects 25 | backref: core_metadata_collections 26 | label: data_from 27 | target_type: project 28 | multiplicity: many_to_one 29 | required: true 30 | 31 | uniqueKeys: 32 | - [id] 33 | - [project_id, submitter_id] 34 | 35 | required: 36 | - submitter_id 37 | - type 38 | - projects 39 | 40 | properties: 41 | $ref: "_definitions.yaml#/ubiquitous_properties" 42 | 43 | contributor: 44 | description: > 45 | An entity responsible for making contributions to the resource. Examples of a Contributor include a person, an organization, or a service. Typically, the name of a Contributor should be used to indicate the entity. 46 | type: string 47 | 48 | coverage: 49 | description: > 50 | The spatial or temporal topic of the resource, the spatial applicability of the resource, or the jurisdiction under which the resource is relevant. Spatial topic and spatial applicability may be a named place or a location specified by its geographic coordinates. Temporal topic may be a named period, date, or date range. A jurisdiction may be a named administrative entity or a geographic place to which the resource applies. 
Recommended best practice is to use a controlled vocabulary such as the Thesaurus of Geographic Names [TGN] (http://www.getty.edu/research/tools/vocabulary/tgn/index.html). Where appropriate, named places or time periods can be used in preference to numeric identifiers such as sets of coordinates or date ranges. 51 | type: string 52 | 53 | creator: 54 | description: > 55 | An entity primarily responsible for making the resource. Examples of a Creator include a person, an organization, or a service. Typically, the name of a Creator should be used to indicate the entity. 56 | type: string 57 | 58 | date: 59 | $ref: "_definitions.yaml#/datetime" 60 | 61 | description: 62 | description: > 63 | An account of the resource. Description may include but is not limited to: an abstract, a table of contents, a graphical representation, or a free-text account of the resource. 64 | type: string 65 | 66 | format: 67 | description: > 68 | The file format, physical medium, or dimensions of the resource. Examples of dimensions include size and duration. Recommended best practice is to use a controlled vocabulary such as the list of Internet Media Types [MIME] (http://www.iana.org/assignments/media-types/). 69 | type: string 70 | 71 | language: 72 | description: > 73 | A language of the resource. Recommended best practice is to use a controlled vocabulary such as RFC 4646 (http://www.ietf.org/rfc/rfc4646.txt). 74 | type: string 75 | 76 | publisher: 77 | description: > 78 | An entity responsible for making the resource available. Examples of a Publisher include a person, an organization, or a service. Typically, the name of a Publisher should be used to indicate the entity. 79 | type: string 80 | 81 | relation: 82 | description: > 83 | A related resource. Recommended best practice is to identify the related resource by means of a string conforming to a formal identification system.  
84 | type: string 85 | 86 | rights: 87 | description: > 88 | Information about rights held in and over the resource. Typically, rights information includes a statement about various property rights associated with the resource, including intellectual property rights. 89 | type: string 90 | 91 | source: 92 | description: > 93 | A related resource from which the described resource is derived. The described resource may be derived from the related resource in whole or in part. Recommended best practice is to identify the related resource by means of a string conforming to a formal identification system. 94 | type: string 95 | 96 | subject: 97 | description: > 98 | The topic of the resource. Typically, the subject will be represented using keywords, key phrases, or classification codes. Recommended best practice is to use a controlled vocabulary. 99 | type: string 100 | 101 | title: 102 | description: > 103 | A name given to the resource. Typically, a Title will be a name by which the resource is formally known. 104 | type: string 105 | 106 | data_type: 107 | description: > 108 | The nature or genre of the resource. Recommended best practice is to use a controlled vocabulary such as the DCMI Type Vocabulary [DCMITYPE]. To describe the file format, physical medium, or dimensions of the resource, use the Format element. 109 | type: string 110 | 111 | projects: 112 | $ref: "_definitions.yaml#/to_one_project" 113 | 114 | -------------------------------------------------------------------------------- /docs/dev_tips.md: -------------------------------------------------------------------------------- 1 | # Dev Tips 2 | 3 | You can quickly find commonly used commands for compose services in our [cheat sheet](https://github.com/uc-cdis/compose-services/blob/master/docs/cheat_sheet.md). 
4 | 5 | When developing, you can have local repositories of the services you are working on and use volumes to mount your local repository files onto the containers to override the containers' code (which is built from GitHub using quay.io). Then, you can restart a single container with 6 | ``` 7 | docker-compose restart [CONTAINER_NAME] 8 | ``` 9 | after you update some code in order to see changes without having to rebuild all the microservices. Keep in mind that running `docker-compose restart` does not apply changes you make in the docker-compose file. Look up the Docker documentation for more information about [volumes](https://docs.docker.com/storage/). 10 | 11 | ## Spark service hdfs reformatting issue 12 | 13 | On startup, the `spark-service` runs `hdfs namenode -format`, which is a compute-intensive operation. If your `spark-service` fails to start due to being killed by docker daemon, e.g. the container status is `Exited (255)`, then tail the last lines of log as follows: 14 | 15 | ``` 16 | docker logs spark-service --tail=5 17 | /************************************************************ 18 | SHUTDOWN_MSG: Shutting down NameNode at 3b8d38960f74/172.20.0.2 19 | ************************************************************/ 20 | 2021-04-07 02:30:55,414 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 21 | safemode: Your endpoint configuration is wrong; For more details see: http://wiki.apache.org/hadoop/UnsetHostnameOrPort 22 | ``` 23 | 24 | Before attempting to (re)start the `spark-service`, make sure to delete the exited/failed container first. 25 | 26 | ``` 27 | docker rm spark-service 28 | docker-compose up -d 29 | ``` 30 | 31 | Otherwise, you may encounter the following looping in the container log: 32 | 33 | ``` 34 | docker logs spark-service --tail=5 35 | Re-format filesystem in Storage Directory root= /hadoop/hdfs/data/dfs/namenode; location= null ? 
(Y or N) Invalid input: 36 | Re-format filesystem in Storage Directory root= /hadoop/hdfs/data/dfs/namenode; location= null ? (Y or N) Invalid input: 37 | Re-format filesystem in Storage Directory root= /hadoop/hdfs/data/dfs/namenode; location= null ? (Y or N) Invalid input: 38 | Re-format filesystem in Storage Directory root= /hadoop/hdfs/data/dfs/namenode; location= null ? (Y or N) Invalid input: 39 | Re-format filesystem in Storage Directory root= /hadoop/hdfs/data/dfs/namenode; location= null ? (Y or N) Invalid input: 40 | ``` 41 | 42 | ## Running Docker Compose on a Remote Machine 43 | 44 | To run Docker Compose on a remote machine, modify the `BASE_URL` field in `fence-config.yaml`, and the `hostname` field in `peregrine_creds.json` and `sheepdog_creds.json` in the `Secrets` directory. 45 | 46 | ## Dumping config files and logs (MacOS/Linux) 47 | 48 | If you are encountering difficulties while setting up Docker Compose and need help from the Gen3 team, you can use the `dump.sh` script to create a zip file of your configuration and current logs, which you can share to get help. 49 | ``` 50 | bash dump.sh 51 | ``` 52 | Note that if docker-compose is not running, the logs will be empty. 53 | 54 | The following configuration files will be included: 55 | * docker-compose.yml 56 | * user.yaml 57 | * any file ending with "settings" or "config" 58 | 59 | Credentials files are NOT included and lines containing "password", "secret" or "key" are removed from other files. 60 | If your files contain other kinds of sensitive credentials, make sure to remove them before running the script. 61 | 62 | ## Environment Details 63 | 64 | The sandbox ecosystem deployed thus architecturally looks as shown below: 65 | ![Sandbox](https://github.com/uc-cdis/compose-services/blob/master/SandboxContainers.jpg) 66 | 67 | 68 | All the microservices communicate with the Postgres Container based on the configuration specified above. 
Once the services are up and running, the environment can be visualized using the windmill microservice running on port 80 by typing the URL of the machine on which the containers are deployed. Please see example screenshot below as an example: 69 | 70 | ![Launch Portal](https://github.com/uc-cdis/compose-services/blob/master/LaunchPortal.jpg) 71 | 72 | Upon clicking 'Login from Google' and providing Google Credentials (if the same Google Account is used where the developer credentials came from), the system redirects the user to their landing page as shown below: 73 | 74 | 75 | ![Logged Into Portal](https://github.com/uc-cdis/compose-services/blob/master/LoggedInScreenshot.jpg) 76 | 77 | 78 | ## Revproxy-service cannot start 79 | 80 | If revproxy-service cannot start an error will occur. It may be useful to 81 | ``` 82 | docker-compose down 83 | docker-compose up -d 84 | ``` 85 | If the error still occurs, make sure that apache2 and revproxy-service do not share the same port. You can change the port for revproxy-service and any other service in the `docker-compose.yaml` [file](https://github.com/uc-cdis/compose-services/blob/bf1dbc0f43519c1d6bc25d9cb331b78c3b35ecca/docker-compose.yml#L215). For revproxy you would also need to change the port in the `nginx.conf` [here](https://github.com/uc-cdis/compose-services/blob/bf1dbc0f43519c1d6bc25d9cb331b78c3b35ecca/nginx.conf#L29). 86 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/treatment.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "treatment" 4 | title: Treatment 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: clinical 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Record of the administration and intention of therapeutic agents provided to a patient to alter 12 | the course of a pathologic process. 
13 | additionalProperties: false 14 | submittable: true 15 | validators: null 16 | 17 | systemProperties: 18 | - id 19 | - project_id 20 | - state 21 | - created_datetime 22 | - updated_datetime 23 | 24 | required: 25 | - submitter_id 26 | - type 27 | 28 | links: 29 | - name: diagnoses 30 | backref: treatments 31 | label: describes #need better term here 32 | target_type: diagnosis 33 | multiplicity: many_to_one 34 | required: true 35 | 36 | 37 | 38 | uniqueKeys: 39 | #unclear if want submitter ID for clinical 40 | - [id] 41 | - [project_id, submitter_id] 42 | 43 | properties: 44 | type: 45 | enum: [ "treatment" ] 46 | 47 | id: 48 | $ref: "_definitions.yaml#/UUID" 49 | systemAlias: node_id 50 | 51 | state: 52 | $ref: "_definitions.yaml#/state" 53 | 54 | submitter_id: 55 | type: 56 | - string 57 | - "null" 58 | 59 | days_to_treatment: 60 | term: 61 | $ref: "_terms.yaml#/days_to_treatment" 62 | type: number 63 | 64 | days_to_treatment_end: 65 | term: 66 | $ref: "_terms.yaml#/days_to_treatment_end" 67 | type: number 68 | 69 | days_to_treatment_start: 70 | term: 71 | $ref: "_terms.yaml#/days_to_treatment_start" 72 | type: number 73 | 74 | therapeutic_agents: 75 | term: 76 | $ref: "_terms.yaml#/therapeutic_agents" 77 | type: string 78 | 79 | treatment_anatomic_site: 80 | term: 81 | $ref: "_terms.yaml#/treatment_anatomic_site" 82 | enum: 83 | - Abdomen, total 84 | - Arm 85 | - Ascites 86 | - Axillary 87 | - Body, total 88 | - Bone 89 | - Bone, non-spine 90 | - Brain, focal 91 | - Brain, whole 92 | - Brain-C2 93 | - Breast 94 | - Cervical 95 | - Chest Wall 96 | - Effusion 97 | - Epitrochlear 98 | - Eye 99 | - Femoral 100 | - Gastrointestinal, Colon 101 | - Gastrointestinal, Gallbladder 102 | - Gastrointestinal, Intestine 103 | - Gastrointestinal, Liver 104 | - Gastrointestinal, NOS 105 | - Gastrointestinal, Pancreas 106 | - Gastrointestinal, Rectum 107 | - Gastrointestinal, Stomach 108 | - Genitourinary, Bladder 109 | - Genitourinary, Kidney 110 | - Genitourinary, NOS 111 
| - Genitourinary, Prostate 112 | - Genitourinary, Prostate and Seminal Vesicles 113 | - Head 114 | - Head, Face, or Neck 115 | - Hilar 116 | - Iliac-common 117 | - Iliac-external 118 | - Inguinal 119 | - Internal Mammary Nodes 120 | - Leg 121 | - Lung 122 | - Lymph Nodes 123 | - Lymph node, distant (specify site) 124 | - Lymph node, locoregional (specify site) 125 | - Mantle 126 | - Mediastinal 127 | - Mediastinum 128 | - Mesenteric 129 | - Occipital 130 | - Other 131 | - Paraaortic 132 | - Parametrium 133 | - Parotid 134 | - Pelvis 135 | - Popliteal 136 | - Primary tumor site 137 | - Prostate 138 | - Prostate Bed 139 | - Prostate, Seminal Vesicles and Lymph Nodes 140 | - Rectum 141 | - Retroperitoneal 142 | - Sacrum 143 | - Seminal vesicles 144 | - Shoulder 145 | - Skin, lower extremity, local 146 | - Skin, total 147 | - Skin, trunk, local 148 | - Skin, upper extremity, local 149 | - Spine 150 | - Spine, whole 151 | - Splenic 152 | - Submandibular 153 | - Supraclavicular 154 | - Supraclavicular/Axillary Level 3 155 | - Thorax 156 | - Trunk 157 | - Unknown 158 | - Not Reported 159 | - Not Allowed To Collect 160 | 161 | treatment_intent_type: 162 | term: 163 | $ref: "_terms.yaml#/treatment_intent_type" 164 | type: string 165 | 166 | treatment_or_therapy: 167 | term: 168 | $ref: "_terms.yaml#/treatment_or_therapy" 169 | enum: 170 | - "yes" 171 | - "no" 172 | - unknown 173 | - not reported 174 | 175 | treatment_outcome: 176 | term: 177 | $ref: "_terms.yaml#/treatment_outcome" 178 | enum: 179 | - Complete Response 180 | - Partial Response 181 | - Treatment Ongoing 182 | - Treatment Stopped Due to Toxicity 183 | - Unknown 184 | 185 | treatment_type: 186 | term: 187 | $ref: "_terms.yaml#/treatment_type" 188 | enum: 189 | - Ablation 190 | - Chemotherapy 191 | - Concurrent Chemoradiation 192 | - Cryoablation 193 | - Embolization 194 | - Hormone Therapy 195 | - Internal Radiation 196 | - Immunotherapy (Including Vaccines) 197 | - Other 198 | - Pharmaceutical Therapy 199 | 
- Radiation Therapy 200 | - Stem Cell Treatment 201 | - Surgery 202 | - Targeted Molecular Therapy 203 | - Unknown 204 | - Not Reported 205 | - Not Allowed To Collect 206 | 207 | diagnoses: 208 | $ref: "_definitions.yaml#/to_one" 209 | 210 | project_id: 211 | $ref: "_definitions.yaml#/project_id" 212 | 213 | # ======== Timestamps ======== 214 | created_datetime: 215 | $ref: "_definitions.yaml#/datetime" 216 | updated_datetime: 217 | $ref: "_definitions.yaml#/datetime" 218 | -------------------------------------------------------------------------------- /templates/user.yaml: -------------------------------------------------------------------------------- 1 | authz: 2 | # policies automatically given to anyone, even if they are not authenticated 3 | anonymous_policies: 4 | - open_data_reader 5 | 6 | # policies automatically given to authenticated users (in addition to their other policies) 7 | all_users_policies: [] 8 | 9 | groups: 10 | # can CRUD programs and projects and upload data files 11 | - name: data_submitters 12 | policies: 13 | - services.sheepdog-admin 14 | - data_upload 15 | - MyFirstProject_submitter 16 | users: 17 | - username1@gmail.com 18 | 19 | # can create/update/delete indexd records 20 | - name: indexd_admins 21 | policies: 22 | - indexd_admin 23 | users: 24 | - username1@gmail.com 25 | 26 | resources: 27 | - name: workspace 28 | - name: data_file 29 | - name: services 30 | subresources: 31 | - name: sheepdog 32 | subresources: 33 | - name: submission 34 | subresources: 35 | - name: program 36 | - name: project 37 | - name: open 38 | - name: programs 39 | subresources: 40 | - name: MyFirstProgram 41 | subresources: 42 | - name: projects 43 | subresources: 44 | - name: MyFirstProject 45 | - name: jnkns 46 | subresources: 47 | - name: projects 48 | subresources: 49 | - name: jenkins 50 | - name: program1 51 | subresources: 52 | - name: projects 53 | subresources: 54 | - name: P1 55 | 56 | policies: 57 | - id: workspace 58 | description: be able to 
use workspace 59 | resource_paths: 60 | - /workspace 61 | role_ids: 62 | - workspace_user 63 | - id: data_upload 64 | description: upload raw data files to S3 65 | role_ids: 66 | - file_uploader 67 | resource_paths: 68 | - /data_file 69 | - id: services.sheepdog-admin 70 | description: CRUD access to programs and projects 71 | role_ids: 72 | - sheepdog_admin 73 | resource_paths: 74 | - /services/sheepdog/submission/program 75 | - /services/sheepdog/submission/project 76 | - id: indexd_admin 77 | description: full access to indexd API 78 | role_ids: 79 | - indexd_admin 80 | resource_paths: 81 | - /programs 82 | - id: open_data_reader 83 | role_ids: 84 | - reader 85 | - storage_reader 86 | resource_paths: 87 | - /open 88 | - id: all_programs_reader 89 | role_ids: 90 | - reader 91 | - storage_reader 92 | resource_paths: 93 | - /programs 94 | - id: MyFirstProject_submitter 95 | role_ids: 96 | - reader 97 | - creator 98 | - updater 99 | - deleter 100 | - storage_reader 101 | - storage_writer 102 | resource_paths: 103 | - /programs/MyFirstProgram/projects/MyFirstProject 104 | - id: jnkns 105 | role_ids: 106 | - reader 107 | - creator 108 | - updater 109 | - deleter 110 | - storage_reader 111 | - storage_writer 112 | resource_paths: 113 | - /programs/jnkns 114 | - /programs/jnkns/projects/jenkins 115 | - id: program1 116 | role_ids: 117 | - reader 118 | - creator 119 | - updater 120 | - deleter 121 | - storage_reader 122 | - storage_writer 123 | resource_paths: 124 | - /programs/program1 125 | - /programs/program1/projects/P1 126 | 127 | roles: 128 | - id: file_uploader 129 | permissions: 130 | - id: file_upload 131 | action: 132 | service: fence 133 | method: file_upload 134 | - id: workspace_user 135 | permissions: 136 | - id: workspace_access 137 | action: 138 | service: jupyterhub 139 | method: access 140 | - id: sheepdog_admin 141 | description: CRUD access to programs and projects 142 | permissions: 143 | - id: sheepdog_admin_action 144 | action: 145 | service: 
sheepdog 146 | method: '*' 147 | - id: indexd_admin 148 | description: full access to indexd API 149 | permissions: 150 | - id: indexd_admin 151 | action: 152 | service: indexd 153 | method: '*' 154 | - id: admin 155 | permissions: 156 | - id: admin 157 | action: 158 | service: '*' 159 | method: '*' 160 | - id: creator 161 | permissions: 162 | - id: creator 163 | action: 164 | service: '*' 165 | method: create 166 | - id: reader 167 | permissions: 168 | - id: reader 169 | action: 170 | service: '*' 171 | method: read 172 | - id: updater 173 | permissions: 174 | - id: updater 175 | action: 176 | service: '*' 177 | method: update 178 | - id: deleter 179 | permissions: 180 | - id: deleter 181 | action: 182 | service: '*' 183 | method: delete 184 | - id: storage_writer 185 | permissions: 186 | - id: storage_creator 187 | action: 188 | service: '*' 189 | method: write-storage 190 | - id: storage_reader 191 | permissions: 192 | - id: storage_reader 193 | action: 194 | service: '*' 195 | method: read-storage 196 | 197 | clients: 198 | wts: 199 | policies: 200 | - all_programs_reader 201 | - open_data_reader 202 | 203 | users: 204 | username1@gmail.com: 205 | tags: 206 | name: User One 207 | # email: mustbe@differentemail.com 208 | policies: 209 | - workspace 210 | - data_upload 211 | - MyFirstProject_submitter 212 | - jnkns 213 | - program1 214 | username2: 215 | tags: 216 | name: John Doe 217 | email: johndoe@gmail.com 218 | 219 | cloud_providers: {} 220 | groups: {} 221 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/clinical_test.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "clinical_test" 4 | title: Clinical Test 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: clinical 8 | project: '*' 9 | program: '*' 10 | description: > 11 | Metadata concerning any clinical tests 
used in relation to a case diagnosis. 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | 23 | links: 24 | - name: cases 25 | backref: clinical_tests 26 | label: performed_for 27 | target_type: case 28 | multiplicity: many_to_one 29 | required: true 30 | - name: diagnoses 31 | backref: clinical_tests 32 | label: relates_to 33 | target_type: diagnosis 34 | multiplicity: many_to_many 35 | required: false 36 | 37 | required: 38 | - submitter_id 39 | - type 40 | - biomarker_name 41 | - biomarker_result 42 | - biomarker_test_method 43 | - cases 44 | 45 | uniqueKeys: 46 | - [id] 47 | - [project_id, submitter_id] 48 | 49 | properties: 50 | type: 51 | enum: [ "clinical_test" ] 52 | 53 | id: 54 | $ref: "_definitions.yaml#/UUID" 55 | systemAlias: node_id 56 | 57 | state: 58 | $ref: "_definitions.yaml#/state" 59 | 60 | submitter_id: 61 | type: 62 | - string 63 | - "null" 64 | 65 | biomarker_name: 66 | term: 67 | $ref: "_terms.yaml#/biomarker_name" 68 | type: string 69 | 70 | biomarker_result: 71 | term: 72 | $ref: "_terms.yaml#/biomarker_result" 73 | enum: 74 | - Amplification 75 | - Gain 76 | - Loss 77 | - Normal 78 | - Other 79 | - Translocation 80 | - Not Reported 81 | - Not Allowed To Collect 82 | - Pending 83 | 84 | biomarker_test_method: 85 | term: 86 | $ref: "_terms.yaml#/biomarker_test_method" 87 | enum: 88 | - Cytogenetics 89 | - FISH 90 | - IHC 91 | - Karyotype 92 | - NGS 93 | - Nuclear Staining 94 | - Other 95 | - RT-PCR 96 | - Southern 97 | - Not Reported 98 | - Not Allowed To Collect 99 | - Pending 100 | 101 | cea_level_preoperative: 102 | term: 103 | $ref: "_terms.yaml#/cea_level_preoperative" 104 | type: number 105 | 106 | dlco_ref_predictive_percent: 107 | term: 108 | $ref: "_terms.yaml#/dlco_ref_predictive_percent" 109 | type: number 110 | 111 | estrogen_receptor_percent_positive_ihc: 112 | term: 113 | $ref: 
"_terms.yaml#/estrogen_receptor_percent_positive_ihc" 114 | enum: 115 | - <1% 116 | - 1-10% 117 | - 11-20% 118 | - 21-30% 119 | - 31-40% 120 | - 41-50% 121 | - 51-60% 122 | - 61-70% 123 | - 71-80% 124 | - 81-90% 125 | - 91-100% 126 | 127 | estrogen_receptor_result_ihc: 128 | term: 129 | $ref: "_terms.yaml#/estrogen_receptor_result_ihc" 130 | enum: 131 | - Negative 132 | - Not Performed 133 | - Positive 134 | - Unknown 135 | 136 | fev1_ref_post_bronch_percent: 137 | term: 138 | $ref: "_terms.yaml#/fev1_ref_post_bronch_percent" 139 | type: number 140 | 141 | fev1_ref_pre_bronch_percent: 142 | term: 143 | $ref: "_terms.yaml#/fev1_ref_pre_bronch_percent" 144 | type: number 145 | 146 | fev1_fvc_post_bronch_percent: 147 | term: 148 | $ref: "_terms.yaml#/fev1_fvc_post_bronch_percent" 149 | type: number 150 | 151 | fev1_fvc_pre_bronch_percent: 152 | term: 153 | $ref: "_terms.yaml#/fev1_fvc_pre_bronch_percent" 154 | type: number 155 | 156 | her2_erbb2_percent_positive_ihc: 157 | term: 158 | $ref: "_terms.yaml#/her2_erbb2_percent_positive_ihc" 159 | enum: 160 | - <1% 161 | - 1-10% 162 | - 11-20% 163 | - 21-30% 164 | - 31-40% 165 | - 41-50% 166 | - 51-60% 167 | - 61-70% 168 | - 71-80% 169 | - 81-90% 170 | - 91-100% 171 | 172 | her2_erbb2_result_fish: 173 | term: 174 | $ref: "_terms.yaml#/her2_erbb2_result_fish" 175 | enum: 176 | - Negative 177 | - Not Performed 178 | - Positive 179 | - Unknown 180 | 181 | her2_erbb2_result_ihc: 182 | term: 183 | $ref: "_terms.yaml#/her2_erbb2_result_ihc" 184 | enum: 185 | - Negative 186 | - Not Performed 187 | - Positive 188 | - Unknown 189 | 190 | ldh_level_at_diagnosis: 191 | term: 192 | $ref: "_terms.yaml#/ldh_level_at_diagnosis" 193 | type: number 194 | 195 | ldh_normal_range_upper: 196 | term: 197 | $ref: "_terms.yaml#/ldh_normal_range_upper" 198 | type: number 199 | 200 | microsatellite_instability_abnormal: 201 | term: 202 | $ref: "_terms.yaml#/microsatellite_instability_abnormal" 203 | enum: 204 | - "Yes" 205 | - "No" 206 | - Unknown 
207 | 208 | progesterone_receptor_percent_positive_ihc: 209 | term: 210 | $ref: "_terms.yaml#/progesterone_receptor_percent_positive_ihc" 211 | enum: 212 | - <1% 213 | - 1-10% 214 | - 11-20% 215 | - 21-30% 216 | - 31-40% 217 | - 41-50% 218 | - 51-60% 219 | - 61-70% 220 | - 71-80% 221 | - 81-90% 222 | - 91-100% 223 | 224 | progesterone_receptor_result_ihc: 225 | term: 226 | $ref: "_terms.yaml#/progesterone_receptor_result_ihc" 227 | enum: 228 | - Negative 229 | - Not Performed 230 | - Positive 231 | - Unknown 232 | 233 | cases: 234 | $ref: "_definitions.yaml#/to_one" 235 | diagnoses: 236 | $ref: "_definitions.yaml#/to_many" 237 | project_id: 238 | $ref: "_definitions.yaml#/project_id" 239 | created_datetime: 240 | $ref: "_definitions.yaml#/datetime" 241 | updated_datetime: 242 | $ref: "_definitions.yaml#/datetime" 243 | -------------------------------------------------------------------------------- /creds_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to setup keys for fence as well as ssl credentials 3 | 4 | if [[ ! -d ./templates ]]; then 5 | echo "ERROR: ./templates not found - run in compose-services folder" 6 | exit 1 7 | fi 8 | if [[ -d Secrets ]]; then 9 | # make a backup 10 | bak="Secrets$(date +%Y%m%d%H%M%S).bak" 11 | if [[ -e "$bak" ]]; then 12 | echo "ERROR: ./Secrets and $bak already exist" 13 | exit 1 14 | fi 15 | echo "Backing up ./Secrets/ to ./$bak/" 16 | cp -r ./Secrets "./$bak" 17 | fi 18 | 19 | mkdir -p Secrets 20 | 21 | for path in templates/*; do 22 | target="Secrets/$(basename "$path")" 23 | if [[ "$path" =~ \.py$ ]]; then # update python files 24 | echo "Copying $path to $target" 25 | cp "$path" "$target" 26 | elif [[ ! -e "$target" ]]; then 27 | echo "Copying $path to $target" 28 | cp -r "$path" "$target" 29 | else 30 | echo "$target already exists" 31 | fi 32 | done 33 | 34 | tempFile="gen3scratch.tmp" 35 | if [ ! 
-z $1 ]; then 36 | customHost="$1" 37 | shift 38 | # be careful with sed -i on Mac: https://stackoverflow.com/questions/19456518/invalid-command-code-despite-escaping-periods-using-sed 39 | for name in Secrets/fence-config.yaml Secrets/*_creds.json; do 40 | sed "s/localhost/$customHost/g" "$name" > "$tempFile" && \ 41 | cp "$tempFile" "$name" 42 | done 43 | fi 44 | 45 | configFile=./Secrets/fence-config.yaml 46 | if grep "^ENCRYPTION_KEY: ''" "$configFile" > /dev/null; then 47 | # be careful with sed on Mac: https://stackoverflow.com/questions/19456518/invalid-command-code-despite-escaping-periods-using-sed 48 | key="$(python ./scripts/fence_key_helper.py)" && \ 49 | sed "s/^ENCRYPTION_KEY: ''/ENCRYPTION_KEY: '$key'/" "$configFile" > "$tempFile" && \ 50 | cp "$tempFile" "$configFile" 51 | fi 52 | rm "$tempFile" 53 | 54 | cd Secrets 55 | 56 | # make directories for temporary credentials 57 | timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") 58 | 59 | # generate private and public key for fence 60 | yearMonth="$(date +%Y-%m)" 61 | if [[ ! -d ./fenceJwtKeys ]] || ! 
(ls ./fenceJwtKeys | grep "$yearMonth" > /dev/null 2>&1); then 62 | echo "Generating fence OAUTH key pairs under Secrets/fenceJwtKeys" 63 | mkdir -p fenceJwtKeys 64 | mkdir -p fenceJwtKeys/${timestamp} 65 | 66 | openssl genpkey -algorithm RSA -out fenceJwtKeys/${timestamp}/jwt_private_key.pem \ 67 | -pkeyopt rsa_keygen_bits:2048 68 | openssl rsa -pubout -in fenceJwtKeys/${timestamp}/jwt_private_key.pem \ 69 | -out fenceJwtKeys/${timestamp}/jwt_public_key.pem 70 | chmod -R a+rx fenceJwtKeys 71 | fi 72 | 73 | # generate certs for nginx ssl 74 | ( 75 | mkdir -p TLS 76 | cd TLS 77 | 78 | OS=$(uname) 79 | OPTS="" 80 | if [[ $OS == "Darwin" ]]; then 81 | cp /etc/ssl/openssl.cnf openssl-with-ca.cnf 82 | 83 | __v3_ca=" 84 | [ v3_ca ] 85 | basicConstraints = critical,CA:TRUE 86 | subjectKeyIdentifier = hash 87 | authorityKeyIdentifier = keyid:always,issuer:always 88 | " 89 | echo "$__v3_ca" >> openssl-with-ca.cnf 90 | OPTS=" -extensions v3_ca -config openssl-with-ca.cnf" 91 | fi 92 | 93 | if ! [[ -f openssl.cnf && -f ca.pem && -f ca-key.pem ]]; then 94 | echo "Generating a local certificate authority, and TLS certificates under Secrets/TLS/" 95 | # erase old certs if they exist 96 | /bin/rm -rf service.key service.crt 97 | commonName=${1:-localhost} 98 | SUBJ="/countryName=US/stateOrProvinceName=IL/localityName=Chicago/organizationName=CDIS/organizationalUnitName=PlanX/commonName=$commonName/emailAddress=cdis@uchicago.edu" 99 | openssl req -new -x509 -nodes -extensions v3_ca -keyout ca-key.pem \ 100 | -out ca.pem -days 365 -subj $SUBJ $OPTS 101 | if [[ $? -eq 1 ]]; then 102 | echo "problem with creds_setup.sh script, refer to compose-services wiki" 103 | rm -rf temp* 104 | exit 1 105 | fi 106 | 107 | mkdir -p CA/newcerts 108 | touch CA/index.txt 109 | touch CA/index.txt.attr 110 | echo 1000 > CA/serial 111 | cat > openssl.cnf < 222 | Optional comment about why the file is in the 223 | current state, mainly for invalid state. 
224 | project_id: 225 | $ref: "#/project_id" 226 | created_datetime: 227 | $ref: "#/datetime" 228 | updated_datetime: 229 | $ref: "#/datetime" 230 | 231 | workflow_properties: 232 | id: 233 | $ref: "#/UUID" 234 | systemAlias: node_id 235 | submitter_id: 236 | type: 237 | - string 238 | - "null" 239 | description: "The file ID assigned by the submitter." # TOREVIEW 240 | workflow_link: 241 | description: "Link to Github hash for the CWL workflow used." 242 | type: string 243 | workflow_version: 244 | description: "Major version for a GDC workflow." 245 | type: string 246 | workflow_start_datetime: 247 | $ref: "#/datetime" 248 | workflow_end_datetime: 249 | $ref: "#/datetime" 250 | state: 251 | $ref: "#/state" 252 | project_id: 253 | $ref: "#/project_id" 254 | created_datetime: 255 | $ref: "#/datetime" 256 | updated_datetime: 257 | $ref: "#/datetime" 258 | 259 | ubiquitous_properties: 260 | type: 261 | type: string 262 | id: 263 | $ref: "#/UUID" 264 | systemAlias: node_id 265 | submitter_id: 266 | type: 267 | - string 268 | description: > 269 | A project-specific identifier for a node. This property is the calling card/nickname/alias for 270 | a unit of submission. It can be used in place of the UUID for identifying or recalling a node. 271 | state: 272 | $ref: "#/state" 273 | project_id: 274 | $ref: "#/project_id" 275 | created_datetime: 276 | $ref: "#/datetime" 277 | updated_datetime: 278 | $ref: "#/datetime" 279 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schema_test.py: -------------------------------------------------------------------------------- 1 | """This is an example of json schema for the GDC using schemas defined 2 | in local yaml files. 3 | 4 | Included are a few functions to augment jsonschema and the python 5 | validator. 6 | 7 | Examples are at the end. 
8 | 9 | """ 10 | 11 | 12 | from jsonschema import validate, ValidationError 13 | import copy 14 | import yaml 15 | import glob 16 | import os 17 | import argparse 18 | import json 19 | import unittest 20 | from gdcdictionary import gdcdictionary 21 | 22 | 23 | 24 | def load_yaml_schema(path): 25 | with open(path, 'r') as f: 26 | return yaml.load(f) 27 | CUR_DIR = os.path.dirname(os.path.realpath(__file__)) 28 | DATA_DIR = os.path.join(CUR_DIR, 'examples') 29 | project1 = load_yaml_schema(os.path.join(CUR_DIR, 'schemas/projects/project1.yaml')) 30 | projects = {'project1': project1} 31 | 32 | def merge_schemas(a, b, path=None): 33 | """Recursively zip schemas together 34 | 35 | """ 36 | path = path if path is not None else [] 37 | for key in b: 38 | if key in a: 39 | if isinstance(a[key], dict) and isinstance(b[key], dict): 40 | merge_schemas(a[key], b[key], path + [str(key)]) 41 | elif a[key] == b[key]: 42 | pass 43 | else: 44 | print("Overriding '{}':\n\t- {}\n\t+ {}".format( 45 | '.'.join(path + [str(key)]), a[key], b[key])) 46 | a[key] = b[key] 47 | else: 48 | print("Adding '{}':\n\t+ {}".format( 49 | '.'.join(path + [str(key)]), b[key])) 50 | a[key] = b[key] 51 | return a 52 | 53 | 54 | def get_project_specific_schema(projects, project, schema, entity_type): 55 | """Look up the core schema for its type and override it with any 56 | project level overrides 57 | 58 | """ 59 | root = copy.deepcopy(schema) 60 | project_overrides = projects.get(project) 61 | if project_overrides: 62 | overrides = project_overrides.get(entity_type) 63 | if overrides: 64 | merge_schemas(root, overrides, [entity_type]) 65 | return root 66 | 67 | 68 | def validate_entity(entity, schemata, project=None, name=''): 69 | """Validate an entity by looking up the core schema for its type and 70 | overriding it with any project level overrides 71 | 72 | """ 73 | local_schema = get_project_specific_schema( 74 | projects, project, schemata[entity['type']], entity['type']) 75 | result = 
validate(entity, local_schema) 76 | return result 77 | 78 | 79 | def validate_schemata(schemata, metaschema): 80 | # validate schemata 81 | print('Validating schemas against metaschema... '), 82 | for s in schemata.values(): 83 | validate(s, metaschema) 84 | 85 | def assert_link_is_also_prop(link): 86 | assert link in s['properties'],\ 87 | "Entity '{}' has '{}' as a link but not property".format( 88 | s['id'], link) 89 | 90 | for link in [l['name'] for l in s['links'] if 'name' in l]: 91 | assert_link_is_also_prop(link) 92 | for subgroup in [l['subgroup'] for l in s['links'] if 'name' not in l]: 93 | for link in [l['name'] for l in subgroup if 'name' in l]: 94 | assert_link_is_also_prop(link) 95 | 96 | 97 | class SchemaTest(unittest.TestCase): 98 | def setUp(self): 99 | self.dictionary = gdcdictionary 100 | self.definitions = yaml.load(open(os.path.join(CUR_DIR, 'schemas','_definitions.yaml'),'r')) 101 | 102 | def test_schemas(self): 103 | validate_schemata(self.dictionary.schema, self.dictionary.metaschema) 104 | 105 | def test_valid_files(self): 106 | for path in glob.glob(os.path.join(DATA_DIR, 'valid', '*.json')): 107 | print("Validating {}".format(path)) 108 | doc = json.load(open(path, 'r')) 109 | print(doc) 110 | if type(doc) == dict: 111 | self.add_system_props(doc) 112 | validate_entity(doc, self.dictionary.schema) 113 | elif type(doc) == list: 114 | for entity in doc: 115 | self.add_system_props(entity) 116 | validate_entity(entity, self.dictionary.schema) 117 | else: 118 | raise Exception("Invalid json") 119 | 120 | def test_invalid_files(self): 121 | for path in glob.glob(os.path.join(DATA_DIR, 'invalid', '*.json')): 122 | print("Validating {}".format(path)) 123 | doc = json.load(open(path, 'r')) 124 | if type(doc) == dict: 125 | self.add_system_props(doc) 126 | with self.assertRaises(ValidationError): 127 | validate_entity(doc, self.dictionary.schema) 128 | elif type(doc) == list: 129 | for entity in doc: 130 | self.add_system_props(entity) 131 | with 
self.assertRaises(ValidationError): 132 | validate_entity(entity, self.dictionary.schema) 133 | else: 134 | raise Exception("Invalid json") 135 | 136 | def add_system_props(self, doc): 137 | schema = self.dictionary.schema[doc['type']] 138 | for key in schema['systemProperties']: 139 | use_def_default = ( 140 | '$ref' in schema['properties'][key] and 141 | key in self.definitions and 142 | 'default' in self.definitions[key] 143 | ) 144 | if use_def_default: 145 | doc[key] = self.definitions[key]['default'] 146 | 147 | if __name__ == '__main__': 148 | 149 | #################### 150 | # Setup 151 | #################### 152 | 153 | 154 | parser = argparse.ArgumentParser(description='Validate JSON') 155 | parser.add_argument('jsonfiles', metavar='file', 156 | type=argparse.FileType('r'), nargs='*', 157 | help='json files to test if (in)valid') 158 | 159 | parser.add_argument('--invalid', action='store_true', default=False, 160 | help='expect the files to be invalid instead of valid') 161 | 162 | args = parser.parse_args() 163 | 164 | #################### 165 | # Example validation 166 | #################### 167 | 168 | # Load schemata 169 | dictionary = gdcdictionary 170 | 171 | for f in args.jsonfiles: 172 | doc = json.load(f) 173 | if args.invalid: 174 | try: 175 | print("CHECK if {0} is invalid:".format(f.name)), 176 | print(type(doc)) 177 | if type(doc) == dict: 178 | validate_entity(doc, dictionary.schema) 179 | elif type(doc) == list: 180 | for entity in doc: 181 | validate_entity(entity, dictionary.schema) 182 | else: 183 | raise ValidationError("Invalid json") 184 | except ValidationError as e: 185 | print("Invalid as expected.") 186 | pass 187 | else: 188 | raise Exception("Expected invalid, but validated.") 189 | else: 190 | print ("CHECK if {0} is valid:".format(f.name)), 191 | if type(doc) == dict: 192 | validate_entity(doc, dictionary.schema) 193 | elif type(doc) == list: 194 | for entity in doc: 195 | validate_entity(entity, dictionary.schema) 196 | else: 
197 | print("Invalid json") 198 | 199 | print("Valid as expected") 200 | print('ok.') 201 | -------------------------------------------------------------------------------- /docs/setup.md: -------------------------------------------------------------------------------- 1 | # Setup 2 | 3 | ## Dependencies 4 | 5 | - OpenSSL 6 | - Docker and Docker Compose 7 | 8 | ## Docker and Docker Compose Setup 9 | 10 | If you've never used Docker before, it may be helpful to read some of the Docker documentation to familiarize yourself with containers. You can also read an overview of what Docker Compose is [here](https://docs.docker.com/compose/overview/) if you want some extra background information. 11 | 12 | The official *Docker* installation page can be found [here](https://docs.docker.com/install/#supported-platforms). The official *Docker Compose* installation page can be found [here](https://docs.docker.com/compose/install/#prerequisites). For Windows and Mac, Docker Compose is included into Docker Desktop. If you are using Linux, then the official Docker installation does not come with Docker Compose; you will need to install Docker Engine before installing Docker Compose. 13 | Go through the steps of installing Docker Compose for your platform, then proceed to set up credentials. Note, that Docker Desktop is set to use 2 GB runtime memory by default. 14 | 15 | > **NOTE:** 16 | > 17 | > 🛑 As a minimum, make sure to increase the size of the **memory to 6 GB** (or more) as described [here](https://docs.docker.com/docker-for-mac/#resources). 18 | 19 | > ElasticSearch and ETL/Spark jobs through tube/guppy/spark-service are particularly resource intensive. If you are running Compose-Services on your laptop, we recommend minimizing/stopping background jobs/services during running ETL jobs or hdfs formatting phase during `spark-service` startup, etc. Please do observe with `docker stats` and `top` / `htop`. 
20 | 21 | ## Docker ElasticSearch 22 | 23 | If you are running on an AWS EC2 instance (Amazon Linux), consider setting up the [Docker ElasticSearch prerequisites](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html#docker-prod-prerequisites). The following settings are known to be required on the Docker host: 24 | ``` 25 | grep vm.max_map_count /etc/sysctl.conf 26 | vm.max_map_count=262144 27 | ``` 28 | 29 | ## Setting up Credentials 30 | 31 | Setup credentials for Fence, a custom root CA and SSL certs with the provided script by running either: 32 | ``` 33 | bash ./creds_setup.sh 34 | OR 35 | bash ./creds_setup.sh YOUR-CUSTOM-DOMAIN 36 | ``` 37 | This script will create a `Secrets` folder that holds various secrets and configuration files. 38 | The script by default generates an SSL certificate to access the gen3 stack at `https://localhost`. 39 | If you are running this on a remote server with an actual domain, you can run `bash creds_setup.sh YOUR_DOMAIN`. This will create an SSL cert signed by the custom CA so that the microservices can talk to each other without bypassing SSL verification. If you are setting this up on AWS, ensure that you use an Elastic IP address BEFORE you set up and use that as your domain. On an EC2 instance, for example, this would be your ec2-YOUR-Elastic-IP-Addr.us-region-number.compute.amazonaws.com. This will save a lot of time and avoid [editing the individual files](https://github.com/uc-cdis/compose-services/blob/master/docs/dev_tips.md#Running-Docker-Compose-on-a-Remote-Machine) to set up the hostname (`fence-config.yaml`, `peregrine_creds.json`, and `sheepdog_creds.json`) when the machine is rebooted. This is because each of the microservices can be configured to run on separate machines and thus have their respective configuration files. You will still need to bypass SSL verification when you hit the services from the browser.
If you have real certs for your domain, you can copy them to `Secrets/TLS/service.key` and `Secrets/TLS/service.crt` to overwrite our dev certs. 40 | 41 | If you are using MacOS, you may run into an error with the default MacOS OpenSSL config not including the configuration for v3_ca certificate generation. OpenSSL should create the `jwt_private_key.pem` and `jwt_public_key.pem` in the `Secrets/fenceJwtKeys/{dateTtimeZ}` folder. If you do not see them, check whether your version of OpenSSL is correct. For a solution, you can refer to [this GitHub issue](https://github.com/jetstack/cert-manager/issues/279) about a related problem in Jetstack's cert-manager. 42 | 43 | Support for multi-tenant fence (configure another fence as an IDP for this fence) is available and can be edited in the `fence-config.yaml`. If this is not the case, we recommend removing the [relevant section](https://github.com/uc-cdis/compose-services/blob/fa3dcc95a4244805c7a02f315cd330447e189945/templates/fence-config.yaml#L81). 44 | 45 | ## Setting up Google OAuth Client-Id for Fence 46 | 47 | This Docker Compose setup requires Google API Credentials in order for the Fence microservice to complete its authentication. 48 | To set up Google API Credentials, go to [the Credentials page of the Google Developer Console](https://console.developers.google.com/apis/credentials) and click the 'Create Credentials' button. Follow the prompts to create a new OAuth Client ID for a Web Application. Add `https://localhost/user/login/google/login/` OR `https://YOUR_REMOTE_MACHINE_DOMAIN/user/login/google/login/` to your Authorized redirect URIs in the Credentials and click 'Create'. Then copy your client ID and client secret and use them to fill in the 'google.client_secret' and 'google.client_id' fields in the `Secrets/fence-config.yaml` file. 49 | See the image below for an example on a sample Google account.
50 | 51 | ![Redirection Set up](https://github.com/uc-cdis/compose-services/blob/master/Authorization_URL_2020.jpg) 52 | 53 | If you have Google API credentials set up already that you would like to use with the local gen3 Docker Compose setup, simply add `https://localhost/user/login/google/login/` OR `https://YOUR_REMOTE_MACHINE_DOMAIN/user/login/google/login/` to your Authorized redirect URIs in your credentials and copy your client ID and client secret from your credentials to the 'client_secret' and 'client_id' fields in the `Secrets/fence-config.yaml` under `OPENID_CONNECT` and `google`. 54 | 55 | ## Setting up Users 56 | 57 | To set up user privileges for the services, please edit the `Secrets/user.yaml` file, following [this guide](https://github.com/uc-cdis/fence/blob/master/docs/user.yaml_guide.md). In particular, you should change all occurrences of `username1@gmail.com` to the email you intend to log in with, so that you can create administrative nodes later on. 58 | 59 | Fence container will automatically sync this file to the `fence_db` database on startup. If you wish to update user privileges while the containers are running (without restarting the container), just edit the `Secrets/user.yaml` file and then run 60 | ``` 61 | docker exec -it fence-service fence-create sync --arborist http://arborist-service --yaml user.yaml 62 | ``` 63 | This command will enter Fence container to run the fence-create sync command, which will update your user privileges. If you are logged in to your commons on a browser, you may need to log out and log back in again or clear your cookies in order to see the changes. 
64 | 65 | 66 | ## Start running your local Gen3 Docker Compose environment 67 | 68 | > **NOTE**: 69 | > 70 | > 🛑 If your Gen3 Data Commons does not host any data, yet, we recommend commenting out the [kibana-service section](https://github.com/uc-cdis/compose-services/blob/454d06358a49b4455097e34ddc060e76903e1aa3/docker-compose.yml#L309-L320) in the `docker-compose.yaml` and the [guppy section](https://github.com/uc-cdis/compose-services/blob/454d06358a49b4455097e34ddc060e76903e1aa3/nginx.conf#L140-L142) in the `nginx.conf` file. After having setup the first program/project and uploaded the first data, we recommend enabling these sections. Precisely, re-enable both services after you completed the following two steps: 71 | > 1. [Generate Test Metadata](https://github.com/uc-cdis/compose-services/blob/master/docs/using_the_commons.md#generating-test-metadata) 72 | > 2. Upload the simulated test metadata to the Data Portal UI. Follow [gen3.org](https://gen3.org/resources/user/submit-data/) and [Useful links](https://github.com/uc-cdis/compose-services/blob/master/docs/useful_links.md) for how-to guides and tutorials. 73 | 74 | > 🟢 Finally, re-enable kibana and guppy services before continuing with the section [Configuring guppy for exploration page](https://github.com/uc-cdis/compose-services/blob/master/docs/using_the_commons.md#configuring-guppy-for-exploration-page). 75 | 76 | Now that you are done with the setup, all Docker Compose features should be available. If you are a non-root user you may need to add yourself to the 'docker' group: `sudo usermod -aG docker your-user`, and the log out and log back in. 77 | Here are some useful commands: 78 | 79 | The basic command of Docker Compose is 80 | ``` 81 | docker-compose up 82 | ``` 83 | which can be useful for debugging errors. 
To detach output from the containers, run 84 | ``` 85 | docker-compose up -d 86 | ``` 87 | When doing this, the logs for each service can be accessed using 88 | ``` 89 | docker logs 90 | ``` 91 | To stop the services use 92 | ``` 93 | docker-compose down 94 | ``` 95 | As the Docker images are pulled from quay.io, they do not update automatically. To update your Docker images, run 96 | ``` 97 | docker-compose pull 98 | docker image prune -f 99 | ``` 100 | These commands may take a while, and they also may fail. If they do fail, simply rerun them, or just update/remove images one at a time manually. 101 | Sheepdog and Peregrine services download the dictionary schema at startup, and the 102 | portal service runs a series of pre-launch compilations that depend on Sheepdog and Peregrine, 103 | so it may take several minutes for the portal to finally come up at https://localhost 104 | 105 | Following the portal logs is one way to monitor its startup progress: 106 | ``` 107 | docker logs -f portal-service 108 | ``` 109 | When you see that `bundle.js` and `index.html` were successfully built in the logs, you should be able to log into https://localhost and see the data commons. You are now ready to setup the [first program and project](https://github.com/uc-cdis/compose-services/blob/master/docs/using_the_commons.md#programs-and-projects). 110 | 111 | 112 | ## Update tips 113 | 114 | You should of course `git pull` compose-services if you have not done so for a while. You also need to `docker-compose pull` new images from Quay--this will not happen automatically. If your git pull pulled new commits, and you already have a `Secrets` folder, you may also need to delete your old `Secrets` and rerun `creds_setup.sh` (see [Setting up Credentials](https://github.com/uc-cdis/compose-services/blob/master/docs/setup.md#Setting-up-Credentials)) to recreate it. 
115 | -------------------------------------------------------------------------------- /nginx.conf: -------------------------------------------------------------------------------- 1 | 2 | user nginx; 3 | worker_processes 1; 4 | 5 | error_log /var/log/nginx/error.log warn; 6 | pid /var/run/nginx.pid; 7 | 8 | load_module modules/ngx_http_perl_module.so; 9 | load_module modules/ngx_http_js_module.so; 10 | load_module modules/ngx_http_headers_more_filter_module.so; 11 | 12 | events { 13 | worker_connections 1024; 14 | } 15 | 16 | http { 17 | include /etc/nginx/mime.types; 18 | default_type application/octet-stream; 19 | 20 | log_format main '$remote_addr - $remote_user [$time_local] "$request" ' 21 | '$status $body_bytes_sent "$http_referer" ' 22 | '"$http_user_agent" "$http_x_forwarded_for"'; 23 | 24 | access_log /var/log/nginx/access.log main; 25 | 26 | sendfile on; 27 | #tcp_nopush on; 28 | 29 | keepalive_timeout 65; 30 | 31 | server { 32 | listen 80; 33 | server_name revproxy-service; 34 | resolver 127.0.0.11; 35 | 36 | listen 443 ssl; 37 | 38 | ssl_certificate /etc/nginx/ssl/nginx.crt; 39 | ssl_certificate_key /etc/nginx/ssl/nginx.key; 40 | 41 | set $access_token ""; 42 | set $csrf_check "ok-tokenauth"; 43 | if ($cookie_access_token) { 44 | set $access_token "bearer $cookie_access_token"; 45 | # cookie auth requires csrf check 46 | set $csrf_check "fail"; 47 | } 48 | if ($http_authorization) { 49 | # Authorization header is present - prefer that token over cookie token 50 | set $access_token "$http_authorization"; 51 | } 52 | 53 | proxy_set_header Authorization "$access_token"; 54 | # proxy_set_header X-Forwarded-For "$realip"; 55 | # proxy_set_header X-UserId "$userid"; 56 | 57 | # 58 | # Accomodate large jwt token headers 59 | # * http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_buffer_size 60 | # * https://ma.ttias.be/nginx-proxy-upstream-sent-big-header-reading-response-header-upstream/ 61 | # 62 | proxy_buffer_size 16k; 63 | proxy_buffers 8 
16k; 64 | proxy_busy_buffers_size 32k; 65 | # 66 | # also incoming from client: 67 | # * https://fullvalence.com/2016/07/05/cookie-size-in-nginx/ 68 | # * https://nginx.org/en/docs/http/ngx_http_core_module.html#client_header_buffer_size 69 | large_client_header_buffers 4 8k; 70 | client_header_buffer_size 4k; 71 | 72 | # 73 | # CSRF check 74 | # This block requires a csrftoken for all POST requests. 75 | # 76 | if ($cookie_csrftoken = $http_x_csrf_token) { 77 | # this will fail further below if cookie_csrftoken is empty 78 | set $csrf_check "ok-$cookie_csrftoken"; 79 | } 80 | if ($request_method != "POST") { 81 | set $csrf_check "ok-$request_method"; 82 | } 83 | if ($cookie_access_token = "") { 84 | # do this again here b/c empty cookie_csrftoken == empty http_x_csrf_token - ugh 85 | set $csrf_check "ok-tokenauth"; 86 | } 87 | 88 | location / { 89 | proxy_pass http://portal-service/; 90 | } 91 | 92 | location /user/ { 93 | proxy_pass http://fence-service/; 94 | } 95 | 96 | location /api/ { 97 | proxy_pass http://sheepdog-service/; 98 | } 99 | 100 | location /mds/ { 101 | proxy_pass http://metadata-service/; 102 | } 103 | 104 | location /mds-admin/ { 105 | rewrite ^/mds-admin/(.*) /$1 break; 106 | proxy_pass http://metadata-service; 107 | proxy_redirect http://$host/ https://$host/mds-admin/; 108 | } 109 | 110 | location /coremetadata/ { 111 | # redirect to coremetadata landing page if header does not specify otherwise 112 | if ($http_accept !~ (application/json|x-bibtex|application/vnd\.schemaorg\.ld\+json)) { 113 | rewrite ^/coremetadata/(.*) /files/$1 redirect; 114 | } 115 | 116 | rewrite ^/coremetadata/(.*) /$1 break; 117 | proxy_pass http://pidgin-service; 118 | } 119 | 120 | location /index/ { 121 | proxy_pass http://indexd-service/; 122 | } 123 | 124 | location = /_status { 125 | default_type application/json; 126 | return 200 "{ \"message\": \"Feelin good!\" }\n"; 127 | } 128 | 129 | location /peregrine/_status { 130 | proxy_pass 
http://peregrine-service/_status; 131 | } 132 | location /pidgin/_status { 133 | proxy_pass http://pidgin-service/_status; 134 | } 135 | 136 | location /api/v0/submission/getschema { 137 | proxy_pass http://peregrine-service/v0/submission/getschema; 138 | } 139 | 140 | location /guppy/ { 141 | proxy_pass http://guppy-service/; 142 | } 143 | 144 | location /api/v0/submission/graphql { 145 | if ($cookie_csrftoken = "") { 146 | add_header Set-Cookie "csrftoken=$request_id$request_length$request_time$time_iso8601;Path=/"; 147 | } 148 | proxy_next_upstream off; 149 | # Forward the host and set Subdir header so api 150 | # knows the original request path for hmac signing 151 | proxy_set_header Host $host; 152 | proxy_set_header Subdir /api; 153 | proxy_set_header Authorization "$access_token"; 154 | proxy_connect_timeout 300; 155 | proxy_send_timeout 300; 156 | proxy_read_timeout 300; 157 | send_timeout 300; 158 | proxy_pass http://peregrine-service/v0/submission/graphql; 159 | } 160 | 161 | location /api/search { 162 | if ($csrf_check !~ ^ok-\S.+$) { 163 | return 403 "failed csrf check"; 164 | } 165 | 166 | gzip off; 167 | proxy_next_upstream off; 168 | proxy_set_header Host $host; 169 | proxy_set_header Authorization "$access_token"; 170 | 171 | proxy_connect_timeout 300; 172 | proxy_send_timeout 300; 173 | proxy_read_timeout 300; 174 | send_timeout 300; 175 | 176 | rewrite ^/api/search/(.*) /$1 break; 177 | proxy_pass http://peregrine-service; 178 | } 179 | 180 | location @errorworkspace { 181 | return 302 https://$host/no-workspace-access; 182 | } 183 | 184 | # 185 | # workspace AuthZ-proxy uses arborist to provide authorization to workpace services 186 | # that don't implement our authn or authz i.e. shiny, jupyter. 
187 | # 188 | location = /gen3-authz { 189 | internal; 190 | error_page 400 =403 @errorworkspace; 191 | error_page 500 =403 @errorworkspace; 192 | 193 | proxy_pass http://arborist-service/auth/proxy?resource=$authz_resource&method=$authz_method&service=$authz_service; 194 | 195 | proxy_pass_request_body off; 196 | proxy_set_header Authorization "$access_token"; 197 | proxy_set_header Content-Length ""; 198 | proxy_intercept_errors on; 199 | 200 | # nginx bug that it checks even if request_body off 201 | client_max_body_size 0; 202 | } 203 | 204 | # 205 | # authorization endpoint 206 | # https://hostname/authz?resource=programs/blah&method=acb&service=xyz 207 | # 208 | location ~ /authz/? { 209 | if ($csrf_check !~ ^ok-\S.+$) { 210 | return 403 "failed csrf check"; 211 | } 212 | set $proxy_service "arborist"; 213 | 214 | proxy_pass http://arborist-service/auth/proxy?resource=$arg_resource&method=$arg_method&service=$arg_service; 215 | } 216 | 217 | location = /authz/resources { 218 | if ($csrf_check !~ ^ok-\S.+$) { 219 | return 403 "failed csrf check"; 220 | } 221 | 222 | proxy_pass http://arborist-service/auth/resources; 223 | } 224 | 225 | location = /authz/mapping { 226 | if ($csrf_check !~ ^ok-\S.+$) { 227 | return 403 "failed csrf check"; 228 | } 229 | 230 | # Do not expose POST /auth/mapping 231 | limit_except GET { 232 | deny all; 233 | } 234 | 235 | # Do not pass the username arg here! Otherwise anyone can see anyone's access. 236 | # Arborist will fall back to parsing the jwt for username. 
237 | proxy_pass http://arborist-service/auth/mapping; 238 | } 239 | 240 | location = /lw-workspace/status { 241 | default_type application/json; 242 | return 200 "{ \"message\": \"Feelin good!\" }\n"; 243 | } 244 | 245 | 246 | location /lw-workspace/proxy { 247 | set $authz_resource "/workspace"; 248 | set $authz_method "access"; 249 | set $authz_service "jupyterhub"; 250 | # be careful - sub-request runs in same context as this request 251 | auth_request_set $remoteUser $upstream_http_REMOTE_USER; 252 | auth_request_set $saved_set_cookie $upstream_http_set_cookie; 253 | auth_request /gen3-authz; 254 | 255 | if ($saved_set_cookie != "") { 256 | add_header Set-Cookie $saved_set_cookie always; 257 | } 258 | 259 | proxy_set_header REMOTE_USER $remoteUser; 260 | error_page 403 = @errorworkspace; 261 | 262 | # 263 | # jupyter notebooks use websockets 264 | # See https://aptro.github.io/server/architecture/2016/06/21/Jupyter-Notebook-Nginx-Setup.html 265 | # 266 | proxy_pass http://jupyter-service:8888/lw-workspace/proxy; 267 | proxy_http_version 1.1; 268 | proxy_set_header Host $host; 269 | #proxy_set_header X-Real-IP $remote_addr; 270 | #proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 271 | proxy_set_header Upgrade $http_upgrade; 272 | proxy_set_header Connection $http_connection; 273 | #client_max_body_size 0; 274 | } 275 | 276 | location /lw-workspace/ { 277 | return 302 /lw-workspace/proxy; 278 | } 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /.secrets.baseline: -------------------------------------------------------------------------------- 1 | { 2 | "exclude": { 3 | "files": null, 4 | "lines": null 5 | }, 6 | "generated_at": "2021-06-25T20:50:57Z", 7 | "plugins_used": [ 8 | { 9 | "name": "AWSKeyDetector" 10 | }, 11 | { 12 | "name": "ArtifactoryDetector" 13 | }, 14 | { 15 | "base64_limit": 4.5, 16 | "name": "Base64HighEntropyString" 17 | }, 18 | { 19 | "name": "BasicAuthDetector" 20 | }, 21 | { 
22 | "name": "CloudantDetector" 23 | }, 24 | { 25 | "hex_limit": 3, 26 | "name": "HexHighEntropyString" 27 | }, 28 | { 29 | "name": "IbmCloudIamDetector" 30 | }, 31 | { 32 | "name": "IbmCosHmacDetector" 33 | }, 34 | { 35 | "name": "JwtTokenDetector" 36 | }, 37 | { 38 | "keyword_exclude": null, 39 | "name": "KeywordDetector" 40 | }, 41 | { 42 | "name": "MailchimpDetector" 43 | }, 44 | { 45 | "name": "PrivateKeyDetector" 46 | }, 47 | { 48 | "name": "SlackDetector" 49 | }, 50 | { 51 | "name": "SoftlayerDetector" 52 | }, 53 | { 54 | "name": "StripeDetector" 55 | }, 56 | { 57 | "name": "TwilioKeyDetector" 58 | } 59 | ], 60 | "results": { 61 | "creds_setup.sh": [ 62 | { 63 | "hashed_secret": "2e9ee120fd25e31048598693aca91d5473898a99", 64 | "is_verified": false, 65 | "line_number": 122, 66 | "type": "Secret Keyword" 67 | } 68 | ], 69 | "datadictionary/gdcdictionary/examples/valid/aligned_reads_index.json": [ 70 | { 71 | "hashed_secret": "a1ba33896d16eda8522e531edbaf3b625c1f4c31", 72 | "is_verified": false, 73 | "line_number": 6, 74 | "type": "Hex High Entropy String" 75 | } 76 | ], 77 | "datadictionary/gdcdictionary/examples/valid/experimental_metadata.json": [ 78 | { 79 | "hashed_secret": "daef34f66b6e909f3a22ffd063d48eb428067b6e", 80 | "is_verified": false, 81 | "line_number": 6, 82 | "type": "Hex High Entropy String" 83 | } 84 | ], 85 | "datadictionary/gdcdictionary/examples/valid/slide_image.json": [ 86 | { 87 | "hashed_secret": "daef34f66b6e909f3a22ffd063d48eb428067b6e", 88 | "is_verified": false, 89 | "line_number": 6, 90 | "type": "Hex High Entropy String" 91 | } 92 | ], 93 | "datadictionary/gdcdictionary/examples/valid/submitted_aligned_reads.json": [ 94 | { 95 | "hashed_secret": "e3f181b6b92d74e30d524d03029e785d0c7c7535", 96 | "is_verified": false, 97 | "line_number": 7, 98 | "type": "Hex High Entropy String" 99 | } 100 | ], 101 | "datadictionary/gdcdictionary/examples/valid/submitted_copy_number.json": [ 102 | { 103 | "hashed_secret": 
"e3f181b6b92d74e30d524d03029e785d0c7c7535", 104 | "is_verified": false, 105 | "line_number": 6, 106 | "type": "Hex High Entropy String" 107 | } 108 | ], 109 | "datadictionary/gdcdictionary/examples/valid/submitted_methylation.json": [ 110 | { 111 | "hashed_secret": "e3f181b6b92d74e30d524d03029e785d0c7c7535", 112 | "is_verified": false, 113 | "line_number": 7, 114 | "type": "Hex High Entropy String" 115 | } 116 | ], 117 | "datadictionary/gdcdictionary/examples/valid/submitted_somatic_mutation.json": [ 118 | { 119 | "hashed_secret": "a1ba33896d16eda8522e531edbaf3b625c1f4c31", 120 | "is_verified": false, 121 | "line_number": 9, 122 | "type": "Hex High Entropy String" 123 | } 124 | ], 125 | "datadictionary/gdcdictionary/examples/valid/submitted_unaligned_reads.json": [ 126 | { 127 | "hashed_secret": "88e3a7adc1779a311467797f00d2edc5e9697d9c", 128 | "is_verified": false, 129 | "line_number": 7, 130 | "type": "Hex High Entropy String" 131 | } 132 | ], 133 | "docker-compose.override.sample.yml": [ 134 | { 135 | "hashed_secret": "afc848c316af1a89d49826c5ae9d00ed769415f3", 136 | "is_verified": false, 137 | "line_number": 6, 138 | "type": "Secret Keyword" 139 | } 140 | ], 141 | "docker-compose.yml": [ 142 | { 143 | "hashed_secret": "afc848c316af1a89d49826c5ae9d00ed769415f3", 144 | "is_verified": false, 145 | "line_number": 21, 146 | "type": "Secret Keyword" 147 | }, 148 | { 149 | "hashed_secret": "cb93dace47db45078164ade928ba21cf27c1d8cf", 150 | "is_verified": false, 151 | "line_number": 75, 152 | "type": "Secret Keyword" 153 | }, 154 | { 155 | "hashed_secret": "f60aa0266ec9d2734d854b9dd3047b4b002d18aa", 156 | "is_verified": false, 157 | "line_number": 94, 158 | "type": "Secret Keyword" 159 | }, 160 | { 161 | "hashed_secret": "9b5925ea817163740dfb287a9894e8ab3aba2c18", 162 | "is_verified": false, 163 | "line_number": 242, 164 | "type": "Secret Keyword" 165 | } 166 | ], 167 | "docs/using_the_commons.md": [ 168 | { 169 | "hashed_secret": 
"6d9c68c603e465077bdd49c62347fe54717f83a3", 170 | "is_verified": false, 171 | "line_number": 88, 172 | "type": "Secret Keyword" 173 | } 174 | ], 175 | "scripts/postgres_always.sh": [ 176 | { 177 | "hashed_secret": "f60aa0266ec9d2734d854b9dd3047b4b002d18aa", 178 | "is_verified": false, 179 | "line_number": 30, 180 | "type": "Secret Keyword" 181 | } 182 | ], 183 | "scripts/postgres_init.sql": [ 184 | { 185 | "hashed_secret": "f60aa0266ec9d2734d854b9dd3047b4b002d18aa", 186 | "is_verified": false, 187 | "line_number": 11, 188 | "type": "Secret Keyword" 189 | }, 190 | { 191 | "hashed_secret": "8414234c06141597b7dc1b3410b69cc49773e042", 192 | "is_verified": false, 193 | "line_number": 15, 194 | "type": "Secret Keyword" 195 | }, 196 | { 197 | "hashed_secret": "c9ed73071942a54e7ec610d5a93d4a22e83e1da7", 198 | "is_verified": false, 199 | "line_number": 19, 200 | "type": "Secret Keyword" 201 | }, 202 | { 203 | "hashed_secret": "8aedff83e21726bb3591555105f3d2b0c9b83e18", 204 | "is_verified": false, 205 | "line_number": 23, 206 | "type": "Secret Keyword" 207 | }, 208 | { 209 | "hashed_secret": "bf41596f893a5f6ed0f66addb555cba581413c56", 210 | "is_verified": false, 211 | "line_number": 27, 212 | "type": "Secret Keyword" 213 | }, 214 | { 215 | "hashed_secret": "cb93dace47db45078164ade928ba21cf27c1d8cf", 216 | "is_verified": false, 217 | "line_number": 31, 218 | "type": "Secret Keyword" 219 | } 220 | ], 221 | "templates/config_helper.py": [ 222 | { 223 | "hashed_secret": "bf21a9e8fbc5a3846fb05b4fa0859e0917b2202f", 224 | "is_verified": false, 225 | "line_number": 66, 226 | "type": "Basic Auth Credentials" 227 | } 228 | ], 229 | "templates/etl_creds.json": [ 230 | { 231 | "hashed_secret": "8aedff83e21726bb3591555105f3d2b0c9b83e18", 232 | "is_verified": false, 233 | "line_number": 4, 234 | "type": "Secret Keyword" 235 | } 236 | ], 237 | "templates/fence-config.yaml": [ 238 | { 239 | "hashed_secret": "8414234c06141597b7dc1b3410b69cc49773e042", 240 | "is_verified": false, 241 | 
"line_number": 31, 242 | "type": "Basic Auth Credentials" 243 | }, 244 | { 245 | "hashed_secret": "5d07e1b80e448a213b392049888111e1779a52db", 246 | "is_verified": false, 247 | "line_number": 296, 248 | "type": "Secret Keyword" 249 | }, 250 | { 251 | "hashed_secret": "87942aadb396f068f7bc17acdf1c6ca4b93ae89b", 252 | "is_verified": false, 253 | "line_number": 355, 254 | "type": "Secret Keyword" 255 | } 256 | ], 257 | "templates/indexd_creds.json": [ 258 | { 259 | "hashed_secret": "bf41596f893a5f6ed0f66addb555cba581413c56", 260 | "is_verified": false, 261 | "line_number": 4, 262 | "type": "Secret Keyword" 263 | } 264 | ], 265 | "templates/indexd_settings.py": [ 266 | { 267 | "hashed_secret": "0a0d18c85e096611b5685b62bc60ec534d19bacc", 268 | "is_verified": false, 269 | "line_number": 49, 270 | "type": "Basic Auth Credentials" 271 | } 272 | ], 273 | "templates/peregrine_creds.json": [ 274 | { 275 | "hashed_secret": "8414234c06141597b7dc1b3410b69cc49773e042", 276 | "is_verified": false, 277 | "line_number": 4, 278 | "type": "Secret Keyword" 279 | }, 280 | { 281 | "hashed_secret": "c9ed73071942a54e7ec610d5a93d4a22e83e1da7", 282 | "is_verified": false, 283 | "line_number": 8, 284 | "type": "Secret Keyword" 285 | }, 286 | { 287 | "hashed_secret": "1b691ca20ade79740ab622b50690458c609018ce", 288 | "is_verified": false, 289 | "line_number": 10, 290 | "type": "Base64 High Entropy String" 291 | } 292 | ], 293 | "templates/peregrine_settings.py": [ 294 | { 295 | "hashed_secret": "347cd9c53ff77d41a7b22aa56c7b4efaf54658e3", 296 | "is_verified": false, 297 | "line_number": 37, 298 | "type": "Basic Auth Credentials" 299 | } 300 | ], 301 | "templates/sheepdog_creds.json": [ 302 | { 303 | "hashed_secret": "8414234c06141597b7dc1b3410b69cc49773e042", 304 | "is_verified": false, 305 | "line_number": 4, 306 | "type": "Secret Keyword" 307 | }, 308 | { 309 | "hashed_secret": "8aedff83e21726bb3591555105f3d2b0c9b83e18", 310 | "is_verified": false, 311 | "line_number": 8, 312 | "type": "Secret 
Keyword" 313 | }, 314 | { 315 | "hashed_secret": "1b691ca20ade79740ab622b50690458c609018ce", 316 | "is_verified": false, 317 | "line_number": 10, 318 | "type": "Base64 High Entropy String" 319 | }, 320 | { 321 | "hashed_secret": "87942aadb396f068f7bc17acdf1c6ca4b93ae89b", 322 | "is_verified": false, 323 | "line_number": 12, 324 | "type": "Secret Keyword" 325 | }, 326 | { 327 | "hashed_secret": "50f013532a9770a2c2cfdc38b7581dd01df69b70", 328 | "is_verified": false, 329 | "line_number": 15, 330 | "type": "Secret Keyword" 331 | } 332 | ], 333 | "templates/sheepdog_settings.py": [ 334 | { 335 | "hashed_secret": "347cd9c53ff77d41a7b22aa56c7b4efaf54658e3", 336 | "is_verified": false, 337 | "line_number": 37, 338 | "type": "Basic Auth Credentials" 339 | } 340 | ] 341 | }, 342 | "version": "0.13.1", 343 | "word_list": { 344 | "file": null, 345 | "hash": null 346 | } 347 | } 348 | --------------------------------------------------------------------------------