├── LaunchPortal.jpg ├── Authorized URL.jpg ├── SandboxContainers.jpg ├── LoggedInScreenshot.jpg ├── Authorization_URL_2020.jpg ├── scripts ├── fence_key_helper.py ├── wait_for_esproxy.sh ├── arborist_setup.sh ├── peregrine_setup.sh ├── indexd_setup.sh ├── sheepdog_setup.sh ├── fence_setup.sh ├── waitForContainers.sh ├── postgres_always.sh ├── postgres_run.sh └── postgres_init.sql ├── datadictionary ├── gdcdictionary │ ├── examples │ │ ├── valid │ │ │ ├── program.json │ │ │ ├── publication.json │ │ │ ├── case.json │ │ │ ├── keyword.json │ │ │ ├── acknowledgement.json │ │ │ ├── project.json │ │ │ ├── aliquot.json │ │ │ ├── slide_count.json │ │ │ ├── demographic.json │ │ │ ├── exposure.json │ │ │ ├── family_history.json │ │ │ ├── slide_image.json │ │ │ ├── experimental_metadata.json │ │ │ ├── submitted_somatic_mutation.json │ │ │ ├── submitted_copy_number.json │ │ │ ├── aligned_reads_index.json │ │ │ ├── experiment.json │ │ │ ├── treatment.json │ │ │ ├── slide.json │ │ │ ├── submitted_unaligned_reads.json │ │ │ ├── submitted_aligned_reads.json │ │ │ ├── submitted_methylation.json │ │ │ ├── read_group.json │ │ │ ├── sample.json │ │ │ ├── read_group_qc.json │ │ │ ├── clinical_test.json │ │ │ └── diagnosis.json │ │ └── invalid │ │ │ ├── case_invalid_1.json │ │ │ ├── aliquot_invalid_2.json │ │ │ ├── aliquot_invalid_1.json │ │ │ ├── aliquot_invalid_3.json │ │ │ └── case_invalid_2.json │ ├── schemas │ │ ├── _settings.yaml │ │ ├── projects │ │ │ └── project1.yaml │ │ ├── README.md │ │ ├── program.yaml │ │ ├── keyword.yaml │ │ ├── publication.yaml │ │ ├── acknowledgement.yaml │ │ ├── case.yaml │ │ ├── experimental_metadata.yaml │ │ ├── aligned_reads_index.yaml │ │ ├── submitted_somatic_mutation.yaml │ │ ├── family_history.yaml │ │ ├── submitted_unaligned_reads.yaml │ │ ├── submitted_aligned_reads.yaml │ │ ├── submitted_copy_number.yaml │ │ ├── submitted_methylation.yaml │ │ ├── aliquot.yaml │ │ ├── demographic.yaml │ │ ├── exposure.yaml │ │ ├── slide_image.yaml │ │ ├── 
slide_count.yaml │ │ ├── experiment.yaml │ │ ├── read_group_qc.yaml │ │ ├── slide.yaml │ │ ├── project.yaml │ │ ├── core_metadata_collection.yaml │ │ ├── treatment.yaml │ │ ├── clinical_test.yaml │ │ ├── read_group.yaml │ │ └── _definitions.yaml │ ├── __init__.py │ └── schema_test.py ├── setup.py ├── NOTICE ├── design_notes.md └── README.md ├── templates ├── etl_creds.json ├── guppy_config.json ├── indexd_creds.json ├── peregrine_creds.json ├── sheepdog_creds.json ├── test_config_helper.py ├── indexd_settings.py ├── etlMapping.yaml ├── gitops.json ├── sheepdog_settings.py ├── peregrine_settings.py └── user.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── docker-compose.override.sample.yml ├── NOTICE ├── guppy_setup.sh ├── docs ├── release_history.md ├── database_information.md ├── cheat_sheet.md ├── useful_links.md ├── dev_tips.md └── setup.md ├── smoke_test.sh ├── README.md ├── dump.sh ├── Jenkinsfile ├── creds_setup.sh ├── nginx.conf └── .secrets.baseline /LaunchPortal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-cdis/compose-services/HEAD/LaunchPortal.jpg -------------------------------------------------------------------------------- /Authorized URL.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-cdis/compose-services/HEAD/Authorized URL.jpg -------------------------------------------------------------------------------- /SandboxContainers.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-cdis/compose-services/HEAD/SandboxContainers.jpg -------------------------------------------------------------------------------- /LoggedInScreenshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-cdis/compose-services/HEAD/LoggedInScreenshot.jpg 
-------------------------------------------------------------------------------- /Authorization_URL_2020.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-cdis/compose-services/HEAD/Authorization_URL_2020.jpg -------------------------------------------------------------------------------- /scripts/fence_key_helper.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import os 3 | key = base64.urlsafe_b64encode(os.urandom(32)) 4 | print(key.decode('UTF-8')) 5 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/program.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "program", 3 | "name": "CGCI", 4 | "dbgap_accession_number": "phs000235" 5 | } 6 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/invalid/case_invalid_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "case", 3 | "alias": "case_1", 4 | "gender": "female", 5 | "race": "Unknown" 6 | } 7 | -------------------------------------------------------------------------------- /templates/etl_creds.json: -------------------------------------------------------------------------------- 1 | { 2 | "db_host": "postgres", 3 | "db_username": "sheepdog_user", 4 | "db_password": "sheepdog_pass", 5 | "db_database": "metadata_db" 6 | } 7 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/invalid/aliquot_invalid_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "aliquot", 3 | "alias": "abc", 4 | "derived_from": ["e58e1f64-d733-405f-95f1-ede1628c81e7"] 5 | } 6 | 
-------------------------------------------------------------------------------- /templates/guppy_config.json: -------------------------------------------------------------------------------- 1 | { "indices": [ { "index": "etl", "type": "case" }, { "index": "file", "type": "file" } ], "config_index": "etl_array-config", "auth_filter_field": "auth_resource_path" } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Secrets/ 2 | *.lock* 3 | *.env 4 | *.bak 5 | *.old 6 | *~ 7 | *.swp 8 | .DS_Store 9 | *__pycache__ 10 | *.pytest_cache 11 | *.cache 12 | *pyc 13 | docker-compose.override.yml 14 | -------------------------------------------------------------------------------- /templates/indexd_creds.json: -------------------------------------------------------------------------------- 1 | { 2 | "db_host": "postgres", 3 | "db_username": "indexd_user", 4 | "db_password": "indexd_pass", 5 | "db_database": "indexd_db", 6 | "fence_database": "fence_db" 7 | } 8 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/publication.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "publication", 3 | "submitter_id": "publication_1", 4 | "projects": { 5 | "submitter_id": "project_1" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/invalid/aliquot_invalid_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "aliquot", 3 | "alias": "abc", 4 | "derived_from": 5 | { 6 | "id": ["e58e1f64-d733-405f-95f1-ede1628c81e7"] 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/case.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "type": "case", 3 | "submitter_id": "BLGSP-71-06-00019", 4 | "experiments": { 5 | "id": "daa208a7-f57a-562c-a04a-7a7c77542c98" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/invalid/aliquot_invalid_3.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "aliquot", 3 | "alias": "abc", 4 | "derived_from": [ 5 | { 6 | "id": "e58e1f64-d733-405f-95f1-ede1628c81e7" 7 | } 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/keyword.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "keyword", 3 | "submitter_id": "keyword_1", 4 | "keyword_name": "Blood Profiling Atlas", 5 | "projects": { 6 | "submitter_id": "project_1" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/_settings.yaml: -------------------------------------------------------------------------------- 1 | # Global settings for the graph 2 | 3 | # Is the graph case centric, that we want 4 | # to create a link between all children to case 5 | # to expedite case filter on nodes 6 | enable_case_cache: false 7 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dictionaryutils import DataDictionary as GDCDictionary 3 | 4 | SCHEMA_DIR = os.path.join( 5 | os.path.abspath(os.path.dirname(__file__)), 'schemas') 6 | gdcdictionary = GDCDictionary(root_dir=SCHEMA_DIR) 7 | -------------------------------------------------------------------------------- 
/datadictionary/gdcdictionary/examples/valid/acknowledgement.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "acknowledgement", 3 | "submitter_id": "acknowledgement_1", 4 | "acknowledgee": "Joe Biden", 5 | "projects": { 6 | "submitter_id": "Cancer Moonshot" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/project.json: -------------------------------------------------------------------------------- 1 | { 2 | "code": "BLGSP", 3 | "name": "Burkitt Lymphoma Genome Sequencing Project", 4 | "state": "open", 5 | "type": "project", 6 | "dbgap_accession_number": "phs000235.v4.p1", 7 | "programs": [ 8 | {"name": "CGCI"} 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /scripts/wait_for_esproxy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | until curl -f -s http://esproxy-service:9200/_cluster/health | python3 -c "import sys, json; sys.exit(0 if json.load(sys.stdin)['status'] != 'red' else 1)" 2>/dev/null; 4 | do 5 | echo "esproxy not ready, waiting..."
6 | sleep 5 7 | done 8 | 9 | echo "esproxy status is green" 10 | 11 | exec "$@" -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/invalid/case_invalid_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "case", 3 | "alias": "case_1", 4 | "member_of": [ 5 | { 6 | "id": "e58e1f64-d733-405f-95f1-ede1628c81e7" 7 | }, 8 | { 9 | "id": "511bf8e8-ae71-4cb9-bb1b-4e58d04d12c1" 10 | } 11 | ], 12 | "gender": "female", 13 | "race": "Unknown" 14 | } 15 | -------------------------------------------------------------------------------- /scripts/arborist_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # entrypoint script for arborist to setup db 3 | 4 | sleep 2 5 | until (echo > /dev/tcp/postgres/5432) >/dev/null 2>&1; do 6 | echo "Postgres is unavailable - sleeping" 7 | sleep 2 8 | done 9 | 10 | echo "postgres is ready" 11 | 12 | update-ca-certificates 13 | 14 | ./migrations/latest 15 | ./bin/arborist 16 | -------------------------------------------------------------------------------- /scripts/peregrine_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # entrypoint script for peregrine to update CA certificates before running 3 | 4 | sleep 2 5 | until (echo > /dev/tcp/postgres/5432) >/dev/null 2>&1; do 6 | echo "Postgres is unavailable - sleeping" 7 | sleep 2 8 | done 9 | 10 | echo "postgres is ready" 11 | 12 | update-ca-certificates 13 | 14 | /dockerrun.sh 15 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/aliquot.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "aliquot", 3 | "submitter_id": "BLGSP-71-06-00019-01A-11D", 4 | "aliquot_quantity": 0.4, 5 | "aliquot_volume": 5, 6 | "amount": 10, 7 | 
"source_center": "23", 8 | "concentration": 0.07, 9 | "samples": { 10 | "submitter_id": "BLGSP-71-06-00019-99A-01D" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/slide_count.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "slide_count", 3 | "submitter_id": "CD45_slide_count", 4 | "cell_type": "CD45", 5 | "cell_identifier": "1233423", 6 | "cell_count": 100, 7 | "ck_signal": 0.12, 8 | "run_name": "Run1", 9 | "biomarker_signal": 3.45, 10 | "slides": { 11 | "submitter_id": "slide_1" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/demographic.json: -------------------------------------------------------------------------------- 1 | { 2 | "gender": "male", 3 | "submitter_id": "E9EDB78B-6897-4205-B9AA-0CEF8AAB5A1F_demographic", 4 | "year_of_birth": 1652, 5 | "race": "white", 6 | "cases": { 7 | "submitter_id": "BLGSP-71-06-00019" 8 | }, 9 | "type": "demographic", 10 | "ethnicity": "not hispanic or latino", 11 | "year_of_death": 2009 12 | } 13 | -------------------------------------------------------------------------------- /templates/peregrine_creds.json: -------------------------------------------------------------------------------- 1 | { 2 | "fence_host": "postgres", 3 | "fence_username": "fence_user", 4 | "fence_password": "fence_pass", 5 | "fence_database": "fence_db", 6 | "db_host": "postgres", 7 | "db_username": "peregrine_user", 8 | "db_password": "peregrine_pass", 9 | "db_database": "metadata_db", 10 | "gdcapi_secret_key": "1JMWnHdApSGMJ8OIqA0IwWUEo8nJ1NJqwDQbjrz5L5v1QtW2ke", 11 | "hostname": "localhost" 12 | } 13 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/exposure.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "cigarettes_per_day": -1.0, 3 | "weight": -1.0, 4 | "alcohol_history": "", 5 | "alcohol_intensity": "", 6 | "bmi": -1.0, 7 | "years_smoked": -1.0, 8 | "submitter_id": "E9EDB78B-6897-4205-B9AA-0CEF8AAB5A1F_exposure", 9 | "cases": { 10 | "submitter_id": "BLGSP-71-06-00019" 11 | }, 12 | "height": -1.0, 13 | "type": "exposure" 14 | } 15 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: git@github.com:Yelp/detect-secrets 3 | rev: v0.13.1 4 | hooks: 5 | - id: detect-secrets 6 | args: ['--baseline', '.secrets.baseline'] 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v2.5.0 9 | hooks: 10 | - id: no-commit-to-branch 11 | args: [--branch, develop, --branch, master, --pattern, release/.*] 12 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/family_history.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "family_history", 3 | "submitter_id": "family_history_of_patient_X", 4 | "relative_with_cancer_history": "yes", 5 | "relationship_type": "cousin", 6 | "relationship_gender": "unspecified", 7 | "relationship_age_at_diagnosis": 12345, 8 | "relationship_primary_diagnosis": "cancer", 9 | "cases": { 10 | "submitter_id": "patient_z" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /docker-compose.override.sample.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | postgres: 4 | environment: 5 | # you may override postgres password here: 6 | - POSTGRES_PASSWORD=postgres 7 | # this makes the postgres container available from the host - ex: 8 | # psql -h localhost -d 
fence -U fence_user 9 | ports: 10 | - 5432:5432 11 | jupyter-service: 12 | environment: 13 | - FRAME_ANCESTORS=http://localhost http://*.example.com 14 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/slide_image.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "slide_image", 3 | "submitter_id": "slide_image_X", 4 | "file_name": "slide.svs", 5 | "file_size": 21234, 6 | "md5sum": "84a72e8aaad3017348cb3f8459c5d5d9", 7 | "data_category": "Biospecimen", 8 | "data_type": "Single Cell Image", 9 | "data_format": "SVS", 10 | "experimental_strategy": "Diagnostic Slide", 11 | "slides": { 12 | "submitter_id": "slide_X" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/experimental_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "experimental_metadata", 3 | "submitter_id": "experiment XML from SRA XML", 4 | "file_name": "experiment.xml", 5 | "file_size": 21234, 6 | "md5sum": "84a72e8aaad3017348cb3f8459c5d5d9", 7 | "data_category": "Sequencing Data", 8 | "data_type": "Experimental Metadata", 9 | "data_format": "SRA XML", 10 | "experiments": { 11 | "submitter_id": "read_group_X" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /scripts/indexd_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # entrypoint bash script for indexd to healthcheck postgres to make sure that 3 | # postgres is ready before indexd tries to access its database 4 | 5 | sleep 2 6 | until (echo > /dev/tcp/postgres/5432) >/dev/null 2>&1; do 7 | echo "Postgres is unavailable - sleeping" 8 | sleep 2 9 | done 10 | 11 | echo "postgres is ready" 12 | 13 | python /indexd/bin/index_admin.py create --username 
indexd_client --password indexd_client_pass 14 | /dockerrun.sh 15 | -------------------------------------------------------------------------------- /scripts/sheepdog_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # entrypoint script for sheepdog to run setup_transactionlogs.py before running 3 | 4 | sleep 2 5 | until (echo > /dev/tcp/postgres/5432) >/dev/null 2>&1; do 6 | echo "Postgres is unavailable - sleeping" 7 | sleep 2 8 | done 9 | 10 | echo "postgres is ready" 11 | 12 | update-ca-certificates 13 | 14 | python /sheepdog/bin/setup_transactionlogs.py --host postgres --user sheepdog_user --password sheepdog_pass --database metadata_db 15 | bash /dockerrun.sh 16 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/submitted_somatic_mutation.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "submitted_somatic_mutation", 3 | "submitter_id": "somatic_mutations_from_x", 4 | "data_category": "Sequencing Data", 5 | "data_type": "Somatic Mutations", 6 | "data_format": "VCF", 7 | "file_name": "test.vcf", 8 | "file_size": 100, 9 | "md5sum": "6fd84891e7a53725d1cf6109c5f2400f", 10 | "experimental_strategy": "Targeted Sequencing", 11 | "read_groups": { 12 | "submitter_id": "read_group_x" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/submitted_copy_number.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "submitted_copy_number", 3 | "submitter_id": "copy_number_information", 4 | "file_name": "cn.copynumber.data.txt", 5 | "file_size": 281653, 6 | "md5sum": "d3266f2577584713ea17f94d331f30c4", 7 | "data_category": "Copy Number Variation", 8 | "data_type": "Copy Number Estimate", 9 | "data_format": "TXT", 10 | "experimental_strategy": 
"Genotyping Array", 11 | "aliquots": { 12 | "submitter_id": "aliquot_m" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/aligned_reads_index.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "aligned_reads_index", 3 | "submitter_id": "TCGA-AB-2837-03B-01W-0728-08", 4 | "file_name": "C317.TCGA-AB-2837-03B-01W-0728-08.3.bam.bai", 5 | "file_size": 5990568, 6 | "md5sum": "6fd84891e7a53725d1cf6109c5f2400f", 7 | "data_category": "Sequencing Data", 8 | "data_type": "Aligned Reads Index", 9 | "data_format": "BAI", 10 | "submitted_aligned_reads_files": { 11 | "id": "0cb66276-c29b-4811-a704-38502173c0f8" 12 | } 13 | } 14 | 15 | 16 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/experiment.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "experiment", 3 | "submitter_id": "experiment_1", 4 | "number_experimental_group": 1, 5 | "number_samples_per_experimental_group": 12, 6 | "experimental_description": "Case/Control, Time Course, Responder/Non-Responder", 7 | "experimental_intent": "Temperature, Storage Duration, and Tube Type effects on ctDNA stability", 8 | "type_of_sample": "Clinical", 9 | "type_of_specimen": "Plasma", 10 | "projects": { 11 | "submitter_id": "P0001" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/treatment.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "treatment", 3 | "submitter_id": "E9EDB78B-6897-4205-B9AA-0CEF8AAB5A1F_treatment", 4 | "days_to_treatment": -1.0, 5 | "days_to_treatment_end": 14, 6 | "days_to_treatment_start": 25, 7 | "therapeutic_agents": "", 8 | "treatment_anatomic_site": "Arm", 9 | 
"treatment_intent_type": "", 10 | "treatment_or_therapy": "unknown", 11 | "treatment_type": "Other", 12 | "diagnoses": { 13 | "submitter_id": "E9EDB78B-6897-4205-B9AA-0CEF8AAB5A1F_diagnosis" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /datadictionary/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='gdcdictionary', 5 | version='0.0.0', 6 | packages=find_packages(), 7 | install_requires=[ 8 | 'dictionaryutils', 9 | ], 10 | dependency_links=[ 11 | "git+https://github.com/uc-cdis/dictionaryutils.git@2.0.4#egg=dictionaryutils", 12 | ], 13 | package_data={ 14 | "gdcdictionary": [ 15 | "schemas/*.yaml", 16 | "schemas/projects/*.yaml", 17 | "schemas/projects/*/*.yaml", 18 | ] 19 | }, 20 | ) 21 | -------------------------------------------------------------------------------- /scripts/fence_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # entrypoint script for fence to sync user.yaml before running 3 | 4 | sleep 2 5 | until (echo > /dev/tcp/postgres/5432) >/dev/null 2>&1; do 6 | echo "Postgres is unavailable - sleeping" 7 | sleep 2 8 | done 9 | 10 | echo "postgres is ready" 11 | 12 | update-ca-certificates 13 | 14 | until curl -f -s -o /dev/null http://arborist-service/policy; do 15 | echo "arborist not ready, waiting..." 
16 | sleep 10 17 | done 18 | 19 | fence-create sync --yaml user.yaml --arborist http://arborist-service 20 | 21 | cd /fence 22 | /dockerrun.sh -------------------------------------------------------------------------------- /templates/sheepdog_creds.json: -------------------------------------------------------------------------------- 1 | { 2 | "fence_host": "postgres", 3 | "fence_username": "fence_user", 4 | "fence_password": "fence_pass", 5 | "fence_database": "fence_db", 6 | "db_host": "postgres", 7 | "db_username": "sheepdog_user", 8 | "db_password": "sheepdog_pass", 9 | "db_database": "metadata_db", 10 | "gdcapi_secret_key": "1JMWnHdApSGMJ8OIqA0IwWUEo8nJ1NJqwDQbjrz5L5v1QtW2ke", 11 | "indexd_client": "indexd_client", 12 | "indexd_password": "indexd_client_pass", 13 | "hostname": "localhost", 14 | "oauth2_client_id": "n/a", 15 | "oauth2_client_secret": "n/a" 16 | } 17 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2015 University of Chicago 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at 4 | 5 | http://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
8 | 9 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/slide.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "slide", 3 | "submitter_id": "TCGA-FW-A3R5-06A-01-TS1", 4 | "section_location": "TOP", 5 | "percent_tumor_cells": 80.0, 6 | "percent_tumor_nuclei": 80.0, 7 | "percent_normal_cells": 0.0, 8 | "percent_necrosis": 0.0, 9 | "percent_stromal_cells": 20.0, 10 | "percent_lymphocyte_infiltration": 0.0, 11 | "percent_monocyte_infiltration": 0.0, 12 | "percent_neutrophil_infiltration": 0.0, 13 | "samples": [ 14 | { 15 | "submitter_id": "TCGA-FW-A3R5-06A-11" 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/submitted_unaligned_reads.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "submitted_unaligned_reads", 3 | "id": "5557a728-1827-4aff-b28b-f004d835f9d6", 4 | "submitter_id": "TCGA-DQ-5630-01A-01R-1873-07", 5 | "file_name": "UNCID_2741452.a38e0f12-7b18-4856-9cc8-314d8f0b63d6.1.fastq", 6 | "file_size": 5747943025, 7 | "md5sum": "d81203da215be180128f260b788900b5", 8 | "data_category": "Sequencing Data", 9 | "data_type": "Unaligned Reads", 10 | "data_format": "FASTQ", 11 | "experimental_strategy": "WGS", 12 | "read_groups": { 13 | "submitter_id": "read_group_1" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /scripts/waitForContainers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # entrypoint script for data-portal to healthcheck sheepdog and peregrine to 3 | # make sure they are ready before dataportal attempts to get information from 4 | # them 5 | 6 | sleep 10 7 | 8 | until curl -f -s -o /dev/null http://sheepdog-service/v0/submission/_dictionary/_all; do 9 | echo 
"sheepdog not ready, waiting..." 10 | sleep 10 11 | done 12 | 13 | until curl -f -s -o /dev/null http://peregrine-service/v0/submission/getschema ; do 14 | echo "peregrine not ready, waiting..." 15 | sleep 10 16 | done 17 | 18 | echo "both services are ready" 19 | bash ./dockerStart.sh 20 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/submitted_aligned_reads.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "submitted_aligned_reads", 3 | "id": "0cb66276-c29b-4811-a704-38502173c0f8", 4 | "submitter_id": "TCGA-AB-2837-03B-01W-0728-08", 5 | "file_name": "C317.TCGA-AB-2837-03B-01W-0728-08.3.bam", 6 | "file_size": 28165335141, 7 | "md5sum": "d3266f2577584713ea17f94d331f30c4", 8 | "data_category": "Sequencing Data", 9 | "data_type": "Aligned Reads", 10 | "data_format": "BAM", 11 | "experimental_strategy": "WXS", 12 | "read_groups": { 13 | "submitter_id": "205CTABXX100806.5.C317.TCGA-AB-2837-03B-01W-0728-08.3.bam" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /guppy_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to create and re-create es indices and setup guppy 3 | 4 | sleep 2 5 | docker exec esproxy-service curl -X DELETE http://localhost:9200/etl_0 6 | sleep 2 7 | docker exec esproxy-service curl -X DELETE http://localhost:9200/file_0 8 | sleep 2 9 | docker exec esproxy-service curl -X DELETE http://localhost:9200/file-array-config_0 10 | sleep 2 11 | docker exec esproxy-service curl -X DELETE http://localhost:9200/etl-array-config_0 12 | sleep 2 13 | docker exec tube-service bash -c "python run_config.py && python run_etl.py" 14 | 15 | docker container stop guppy-service 16 | docker container start guppy-service 17 | -------------------------------------------------------------------------------- 
/datadictionary/gdcdictionary/schemas/projects/project1.yaml: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # Project 1 specific overrides 3 | ##################################################################### 4 | 5 | $schema: "http://json-schema.org/draft-04/schema#" 6 | 7 | ##################################################################### 8 | # Aliquot 9 | ##################################################################### 10 | 11 | id: "aliquot" 12 | program: 'program1' 13 | project: 'project1' 14 | required: 15 | - submitter_aliquot_id 16 | - parents 17 | - project_1_specific_thing 18 | 19 | properties: 20 | project_1_specific_thing: 21 | type: string 22 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/submitted_methylation.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "submitted_methylation", 3 | "id": "0cb66276-c29b-4811-a704-38502173c0f8", 4 | "submitter_id": "TCGA-AB-2837-03B-01W-0728-08", 5 | "file_name": "C317.TCGA-AB-2837-03B-01W-0728-08.3.bam", 6 | "file_size": 28165335141, 7 | "md5sum": "d3266f2577584713ea17f94d331f30c4", 8 | "data_category": "Methylation Data", 9 | "data_type": "Methylation Intensity Values", 10 | "data_format": "IDAT", 11 | "assay_method": "Methylation Array", 12 | "assay_instrument": "Illumina", 13 | "assay_instrument_model": "Illumina Infinium HumanMethylation450", 14 | "aliquots": { 15 | "submitter_id": "205CTABXX100806.5.C317.TCGA-AB-2837-03B-01W-0728-08.3.bam" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/README.md: -------------------------------------------------------------------------------- 1 | Proposed additional keywords 2 | ============================ 3 | 4 | The schemas 
defined here follow jsonschema as closely as possible, 5 | introducing new keywords as needed. 6 | 7 | systemAlias 8 | ----------- 9 | 10 | For implementation. Allows properties to be stored as different 11 | keywords. The property listed in the properties section is what the 12 | user will refer to it as, and the systemAlias value is what it will be 13 | stored in the database as. 14 | 15 | systemProperties 16 | --------------- 17 | 18 | The property keys listed under systemProperties are properties that 19 | the submitter is not allowed to update. 20 | 21 | parentType 22 | --------------- 23 | 24 | The type of object that the parent relationship points to. 25 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/read_group.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "b4918128-f3e5-496a-bcd7-a4ec8d7dd1d9", 3 | "submitter_id": "205DDABXX100804.3.C317.TCGA-AB-2899-03A-01W-0733-08.4.bam", 4 | "type": "read_group", 5 | "experiment_name": "Resequencing", 6 | "sequencing_center": "BI", 7 | "sequencing_date": "2010-08-04", 8 | "platform": "Illumina", 9 | "instrument_model": "Illumina HiSeq 2000", 10 | "library_strategy": "WXS", 11 | "flow_cell_barcode": "205DDABXX", 12 | "library_selection": "Hybrid_Selection", 13 | "library_name": "Solexa-34688", 14 | "is_paired_end": true, 15 | "read_length": 75, 16 | "read_group_name": "205DD.3", 17 | "aliquots": [ 18 | { 19 | "submitter_id": "2c9108eb-c59a-4227-b650-56e61b3aa0ea" 20 | } 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /datadictionary/NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2015 University of Chicago, Ontario Institute for Cancer Research Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Portions of this work, authored by University of Chicago and Ontario Institute for Cancer Research employees, was funded in whole or in part by National Cancer Institute, National Institutes of Health under U.S. Government contract HHSN261200800001E. -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/program.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "program" 4 | title: Program 5 | type: object 6 | category: administrative 7 | program: '*' 8 | project: '*' 9 | description: > 10 | A broad framework of goals to be achieved. (NCIt C52647) 11 | additionalProperties: false 12 | submittable: false 13 | validators: null 14 | 15 | systemProperties: 16 | - id 17 | 18 | required: 19 | - name 20 | - dbgap_accession_number 21 | 22 | uniqueKeys: 23 | - [id] 24 | - [name] 25 | 26 | links: [] 27 | 28 | # Program is the root entity and so it is the only entity 29 | # without a project or parents. 30 | properties: 31 | type: 32 | type: string 33 | id: 34 | $ref: "_definitions.yaml#/UUID" 35 | systemAlias: node_id 36 | name: 37 | type: string 38 | description: "Full name/title of the program." 39 | dbgap_accession_number: 40 | type: string 41 | description: "The dbgap accession number provided for the program." 
42 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "sample", 3 | "submitter_id": "BLGSP-71-06-00019s", 4 | "biospecimen_anatomic_site": "Adipose", 5 | "composition": "Cell", 6 | "current_weight": 1, 7 | "days_to_collection": 25, 8 | "days_to_sample_procurement": 123, 9 | "diagnosis_pathologically_confirmed": "Yes", 10 | "freezing_method": "OCT", 11 | "initial_weight": 0.5, 12 | "intermediate_dimension": "1.2", 13 | "is_ffpe": true, 14 | "longest_dimension": "1.5", 15 | "method_of_sample_procurement": "Indeterminant", 16 | "oct_embedded": "false", 17 | "sample_type": "Blood Derived Normal", 18 | "sample_type_id": "10", 19 | "shortest_dimension": "0.5", 20 | "time_between_clamping_and_freezing": "30", 21 | "tissue_type": "Normal", 22 | "tumor_code": "Osteosarcoma (OS)", 23 | "tumor_code_id": "00", 24 | "tumor_descriptor": "NOS", 25 | "cases": { 26 | "submitter_id": "BLGSP-71-06-00019" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /scripts/postgres_always.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | create_db_idempotent() { 4 | # Creating a DB similar to the "IF NOT EXISTS" syntax is a bit challenging in 5 | # Postgres. 6 | psql -U postgres -tc "SELECT 1 FROM pg_database WHERE datname = '${1}'" | grep -q 1 || \ 7 | psql -U postgres -c "CREATE DATABASE ${1}" 8 | } 9 | 10 | create_user_idempotent() { 11 | psql -U postgres << EOF 12 | DO \$\$ 13 | BEGIN 14 | IF NOT EXISTS(SELECT 1 FROM pg_roles WHERE rolname='${1}') THEN 15 | CREATE USER ${1}; 16 | END IF; 17 | END 18 | \$\$; 19 | EOF 20 | } 21 | 22 | # The metadata DB and user are here to backfill for installations that did not 23 | # have them originally. 
These entities did not always exist in compose-services. 24 | # New compose-services users would get them via the standard postgres init 25 | # script, but existing users would need to get them through this mechanism. 26 | create_db_idempotent "metadata" 27 | create_user_idempotent "metadata_user" 28 | 29 | psql -U postgres <//commit/", 23 | "submitted_aligned_reads_files": 24 | {"submitter_id": "bam_file_a"}, 25 | "read_groups": 26 | {"submitter_id": "read_group_a"} 27 | } 28 | -------------------------------------------------------------------------------- /templates/test_config_helper.py: -------------------------------------------------------------------------------- 1 | import config_helper 2 | import os 3 | import time 4 | 5 | # WORKSPACE == Jenkins workspace 6 | TEST_ROOT=os.getenv('WORKSPACE',os.getenv('XDG_RUNTIME_DIR', '/tmp')) + '/test_config_helper/' + str(int(time.time())) 7 | APP_NAME='test_config_helper' 8 | TEST_JSON = ''' 9 | { 10 | "a": "A", 11 | "b": "B", 12 | "c": "C" 13 | } 14 | ''' 15 | TEST_FILENAME='bla.json' 16 | 17 | config_helper.XDG_DATA_HOME=TEST_ROOT 18 | 19 | def setup(): 20 | test_folder = TEST_ROOT + '/cdis/' + APP_NAME 21 | if not os.path.exists(test_folder): 22 | os.makedirs(test_folder) 23 | with open(test_folder + '/' + TEST_FILENAME, 'w') as writer: 24 | writer.write(TEST_JSON) 25 | 26 | def test_find_paths(): 27 | setup() 28 | path_list = config_helper.find_paths(TEST_FILENAME, APP_NAME) 29 | assert len(path_list) == 1 30 | bla_path = TEST_ROOT + '/cdis/' + APP_NAME + '/' + TEST_FILENAME 31 | assert os.path.exists(bla_path) 32 | assert path_list[0] == bla_path 33 | 34 | def test_load_json(): 35 | setup() 36 | data = config_helper.load_json(TEST_FILENAME, APP_NAME) 37 | for key in ['a','b','c']: 38 | assert data[key] == key.upper() 39 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/clinical_test.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "type": "clinical_test", 3 | "submitter_id": "clinical_test_date_individual_name", 4 | "biomarker_name": "ERBB4", 5 | "biomarker_result": "Normal", 6 | "biomarker_test_method": "Cytogenetics", 7 | "cea_level_preoperative": 1, 8 | "dlco_ref_predictive_percent": 1, 9 | "estrogen_receptor_percent_positive_ihc": "<1%", 10 | "estrogen_receptor_result_ihc": "Negative", 11 | "fev1_ref_post_bronch_percent": 1, 12 | "fev1_ref_pre_bronch_percent": 2, 13 | "fev1_fvc_post_bronch_percent": 10, 14 | "fev1_fvc_pre_bronch_percent": 15, 15 | "her2_erbb2_percent_positive_ihc": "1-10%", 16 | "her2_erbb2_result_fish": "Negative", 17 | "her2_erbb2_result_ihc": "Not Performed", 18 | "ldh_level_at_diagnosis": 3432, 19 | "ldh_normal_range_upper": 13241, 20 | "microsatellite_instability_abnormal": "Yes", 21 | "progesterone_receptor_percent_positive_ihc": "<1%", 22 | "progesterone_receptor_result_ihc": "Positive", 23 | "cases": { 24 | "submitter_id": "Extra large case" 25 | }, 26 | "diagnoses": { 27 | "submitter_id": "BL Diagnosis" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /scripts/postgres_run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Thin shim around the normal Docker postgres entrypoint that allows us to run 4 | # non-application migrations. Things like DB and user creations that would be 5 | # done by cloud-automation tasks in a normal env. 6 | 7 | set -e 8 | 9 | # Initialize the DB, but don't allow outside connections yet. 10 | docker-entrypoint.sh postgres -c listen_addresses='127.0.0.1' & 11 | # Wait until the server is out of initialization mode and online. 12 | while ! psql -U postgres -h localhost -c 'SELECT 1;' 2>/dev/null; do echo "waiting for postgres init..."; sleep 1; done 13 | # Stop the server.
14 | gosu postgres pg_ctl stop 15 | 16 | echo "[postgres] run migrations" 17 | 18 | # Run migrations/scripts that should run on every start. This is handy for data 19 | # we want to backfill or otherwise migrate for users. 20 | gosu postgres bash -c "( 21 | source /usr/local/bin/docker-entrypoint.sh 22 | docker_setup_env 23 | docker_temp_server_start 24 | 25 | bash /postgres_always.sh 26 | 27 | docker_temp_server_stop 28 | )" 29 | 30 | echo "[postgres] migrations complete" 31 | 32 | # Start postgres "normally" allowing all network clients to connect. 33 | docker-entrypoint.sh postgres 34 | -------------------------------------------------------------------------------- /scripts/postgres_init.sql: -------------------------------------------------------------------------------- 1 | /* Entrypoint script for postgres container to set up databases and users for 2 | docker-compose setup */ 3 | 4 | CREATE DATABASE metadata; -- Used by metadata-service (called "metadata" in cloud-automation) 5 | CREATE DATABASE metadata_db; -- Used by sheepdog and peregrine (called "sheepdog" in cloud-automation) 6 | CREATE DATABASE fence_db; 7 | CREATE DATABASE indexd_db; 8 | CREATE DATABASE arborist_db; 9 | 10 | CREATE USER metadata_user; 11 | ALTER USER metadata_user WITH PASSWORD 'metadata_pass'; 12 | ALTER USER metadata_user WITH SUPERUSER; 13 | 14 | CREATE USER fence_user; 15 | ALTER USER fence_user WITH PASSWORD 'fence_pass'; 16 | ALTER USER fence_user WITH SUPERUSER; 17 | 18 | CREATE USER peregrine_user; 19 | ALTER USER peregrine_user WITH PASSWORD 'peregrine_pass'; 20 | ALTER USER peregrine_user WITH SUPERUSER; 21 | 22 | CREATE USER sheepdog_user; 23 | ALTER USER sheepdog_user WITH PASSWORD 'sheepdog_pass'; 24 | ALTER USER sheepdog_user WITH SUPERUSER; 25 | 26 | CREATE USER indexd_user; 27 | ALTER USER indexd_user WITH PASSWORD 'indexd_pass'; 28 | ALTER USER indexd_user WITH SUPERUSER; 29 | 30 | CREATE USER arborist_user; 31 | ALTER USER arborist_user WITH PASSWORD 
'arborist_pass'; 32 | ALTER USER arborist_user WITH SUPERUSER; 33 | -------------------------------------------------------------------------------- /docs/release_history.md: -------------------------------------------------------------------------------- 1 | # Release History and Migration Instructions 2 | 3 | # 2019/03 release 4 | 5 | The `2019/03` release includes changes necessary for running the latest versions of the `gen3` services as of March 2019. 6 | This release may fail to run earlier versions of `gen3`. 7 | 8 | * Changes 9 | - add `arborist` and `pidgin` services 10 | - move secrets to `Secrets/` folder which git ignores (via the `.gitignore` file), `apis_configs/` is renamed to a `templates/` folder 11 | - bump to Postgres `9.6` 12 | - do not publish Postgres port to host by default - to avoid port conflicts on the host 13 | 14 | * Migrate an existing commons to the new setup 15 | - move the current secrets to `./Secrets`: `mv ./apis_configs Secrets` 16 | - `git pull` 17 | - `docker-compose pull` - pull the latest `gen3` Docker images 18 | - `bash ./creds_setup.sh` 19 | - edit the `postgres` service in `docker-compose.yaml` to stay on version `9.5` - a `9.6` server cannot read data saved by a `9.5` server. If you want to erase the data currently in the commons, and proceed with Postgres `9.6`, then `docker-compose down -v` clears the old data. 20 | - Set the settings in `Secrets/fence-config.yaml` - be sure to set the `client_secret` and `client_id` fields under `OPENID_CONNECT`. 
21 | - ready to go: `docker-compose up -d` 22 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/examples/valid/diagnosis.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "diagnosis", 3 | "submitter_id": "E9EDB78B-6897-4205-B9AA-0CEF8AAB5A1F_diagnosis", 4 | "age_at_diagnosis": 47, 5 | "ann_arbor_b_symptoms": "Yes", 6 | "ann_arbor_extranodal_involvement": "No", 7 | "burkitt_lymphoma_clinical_variant": "Endemic", 8 | "cause_of_death": "Unknown", 9 | "classification_of_tumor": "other", 10 | "days_to_birth": -17238.0, 11 | "days_to_death": 1241.0, 12 | "days_to_hiv_diagnosis": null, 13 | "days_to_last_follow_up": -1.0, 14 | "days_to_last_known_disease_status": -1, 15 | "days_to_recurrence": -1, 16 | "hiv_positive": "No", 17 | "last_known_disease_status": "Unknown tumor status", 18 | "ldh_level_at_diagnosis": 1, 19 | "ldh_normal_range_upper": 1.5, 20 | "method_of_diagnosis": "Cytology", 21 | "morphology": "8255/3", 22 | "new_event_anatomic_site": "Bone", 23 | "new_event_type": "Distant Metastasis", 24 | "primary_diagnosis": "c34.3", 25 | "prior_malignancy": "no", 26 | "progression_or_recurrence": "unknown", 27 | "site_of_resection_or_biopsy": "c34.3", 28 | "tissue_or_organ_of_origin": "c34.3", 29 | "tumor_grade": "", 30 | "tumor_stage": "stage iiia", 31 | "vital_status": "dead", 32 | "year_of_diagnosis": 2077, 33 | "cases": { 34 | "submitter_id": "BLGSP-71-06-00019" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/keyword.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "keyword" 4 | title: Keyword 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: administrative 8 | program: '*' 9 | project: '*' 10 | description: "A keyword for a project." 
11 | additionalProperties: false 12 | submittable: true 13 | validators: null 14 | 15 | systemProperties: 16 | - id 17 | - project_id 18 | - state 19 | - created_datetime 20 | - updated_datetime 21 | 22 | links: 23 | - name: projects 24 | backref: keywords 25 | label: describe 26 | target_type: project 27 | multiplicity: many_to_many 28 | required: true 29 | 30 | required: 31 | - submitter_id 32 | - type 33 | - projects 34 | 35 | uniqueKeys: 36 | - [ id ] 37 | - [ project_id, submitter_id ] 38 | 39 | properties: 40 | type: 41 | enum: [ "keyword" ] 42 | id: 43 | $ref: "_definitions.yaml#/UUID" 44 | systemAlias: node_id 45 | state: 46 | $ref: "_definitions.yaml#/state" 47 | submitter_id: 48 | type: 49 | - string 50 | - "null" 51 | keyword_name: 52 | description: "The name of the keyword." 53 | type: string 54 | projects: 55 | $ref: "_definitions.yaml#/to_many_project" 56 | project_id: 57 | type: string 58 | created_datetime: 59 | $ref: "_definitions.yaml#/datetime" 60 | updated_datetime: 61 | $ref: "_definitions.yaml#/datetime" 62 | -------------------------------------------------------------------------------- /smoke_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | help() { 4 | cat - < **NOTE**: You can use docker compose override to configure the Postgres database container and publish the db service port to the host machine by changing the `ports` block under the `postgres` service in `docker-compose.override.yml`, then run `docker-compose up -d postgres`: 19 | ``` 20 | cp docker-compose.override.sample.yml docker-compose.override.yml 21 | ``` 22 | The container host can connect to the database after the port is published - ex: 23 | ``` 24 | psql -h localhost -U fence_user -d fence_db 25 | ``` 26 | 27 | > **Heads-up**: Similarly, you can add/override your custom docker compose config parameters/values in `docker-compose.override.yml` and keep the base config clean. 
See [docker compose documentation](https://docs.docker.com/compose/extends/) for more. 28 | 29 | -------------------------------------------------------------------------------- /docs/cheat_sheet.md: -------------------------------------------------------------------------------- 1 | # Docker compose services cheat sheet 2 | 3 | **Quick start** 4 | 5 | * bash ./creds_setup.sh (setup secrets) 6 | * docker-compose up (start with logs) 7 | * docker-compose up -d (start without logs) 8 | * docker-compose down (stop) 9 | * docker-compose down -v (stop and wipe existing data) 10 | 11 | **Useful commands** 12 | 13 | * docker ps 14 | * docker logs [-f] xxx-service 15 | * docker-compose restart xxx-service 16 | * docker exec -it fence-service fence-create xxx 17 | 18 | **Update images** 19 | 20 | * docker-compose pull 21 | * docker image prune -f (optional - to free up some space…) 22 | 23 | **Access DB** 24 | 25 | * docker exec -it compose-services_postgres_1 psql -U postgres 26 | * \c DB_name 27 | 28 | **Sync users** 29 | 30 | * docker exec -it fence-service fence-create sync --arborist http://arborist-service --yaml user.yaml 31 | 32 | **Change dictionary** 33 | 34 | Update in docker-compose.yml: 35 | * DICTIONARY_URL 36 | * APP (to get the [corresponding portal setup](https://github.com/uc-cdis/data-portal/tree/master/data/config)), for example: 37 | * dev (goes to "default" config -> Dev data commons) 38 | * edc (Environmental data commons) 39 | 40 | **Use local code (example with fence)** 41 | 42 | Update in docker-compose.yml: 43 | ``` 44 | fence-service: 45 | image: "my-fence:latest" 46 | ``` 47 | Rerun the following commands after changing the code: 48 | * cd fence; docker build . 
-t my-fence -f Dockerfile 49 | * docker stop fence-service 50 | * docker-compose up -d fence-service 51 | 52 | **Dump logs and config in a zip file** 53 | 54 | * bash dump.sh 55 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/case.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "case" 4 | title: Case 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: administrative 8 | program: '*' 9 | project: '*' 10 | description: > 11 | The collection of all data related to a specific subject in the 12 | context of a specific experiment. 13 | additionalProperties: false 14 | submittable: true 15 | validators: null 16 | 17 | systemProperties: 18 | - id 19 | - project_id 20 | - created_datetime 21 | - updated_datetime 22 | - state 23 | 24 | links: 25 | - name: experiments 26 | backref: cases 27 | label: member_of 28 | target_type: experiment 29 | multiplicity: many_to_one 30 | required: true 31 | 32 | required: 33 | - submitter_id 34 | - type 35 | - experiments 36 | 37 | uniqueKeys: 38 | - [id] 39 | - [project_id, submitter_id] 40 | 41 | # Case properties 42 | properties: 43 | type: 44 | type: string 45 | id: 46 | $ref: "_definitions.yaml#/UUID" 47 | systemAlias: node_id 48 | state: 49 | $ref: "_definitions.yaml#/state" 50 | submitter_id: 51 | type: 52 | - string 53 | - "null" 54 | consent_codes: 55 | type: array 56 | items: 57 | type: string 58 | primary_site: 59 | description: "Primary site for the case." 60 | type: string 61 | disease_type: 62 | description: "Name of the disease for the case." 
63 | type: string 64 | experiments: 65 | $ref: "_definitions.yaml#/to_one" 66 | project_id: 67 | $ref: "_definitions.yaml#/project_id" 68 | created_datetime: 69 | $ref: "_definitions.yaml#/datetime" 70 | updated_datetime: 71 | $ref: "_definitions.yaml#/datetime" 72 | -------------------------------------------------------------------------------- /templates/indexd_settings.py: -------------------------------------------------------------------------------- 1 | from indexd.index.drivers.alchemy import SQLAlchemyIndexDriver 2 | from indexd.alias.drivers.alchemy import SQLAlchemyAliasDriver 3 | from indexd.auth.drivers.alchemy import SQLAlchemyAuthDriver 4 | import config_helper 5 | from os import environ 6 | import json 7 | 8 | APP_NAME='indexd' 9 | def load_json(file_name): 10 | return config_helper.load_json(file_name, APP_NAME) 11 | 12 | conf_data = load_json('creds.json') 13 | 14 | usr = conf_data.get('db_username', '{{db_username}}') 15 | db = conf_data.get('db_database', '{{db_database}}') 16 | psw = conf_data.get('db_password', '{{db_password}}') 17 | pghost = conf_data.get('db_host', '{{db_host}}') 18 | pgport = 5432 19 | index_config = conf_data.get('index_config') 20 | CONFIG = {} 21 | 22 | CONFIG['JSONIFY_PRETTYPRINT_REGULAR'] = False 23 | 24 | dist = environ.get('DIST', None) 25 | if dist: 26 | CONFIG['DIST'] = json.loads(dist) 27 | 28 | CONFIG['INDEX'] = { 29 | 'driver': SQLAlchemyIndexDriver('postgresql+psycopg2://{usr}:{psw}@{pghost}:{pgport}/{db}'.format( 30 | usr=usr, 31 | psw=psw, 32 | pghost=pghost, 33 | pgport=pgport, 34 | db=db, 35 | ), index_config=index_config), 36 | } 37 | 38 | CONFIG['ALIAS'] = { 39 | 'driver': SQLAlchemyAliasDriver('postgresql+psycopg2://{usr}:{psw}@{pghost}:{pgport}/{db}'.format( 40 | usr=usr, 41 | psw=psw, 42 | pghost=pghost, 43 | pgport=pgport, 44 | db=db, 45 | )), 46 | } 47 | 48 | AUTH = SQLAlchemyAuthDriver( 49 | 'postgresql+psycopg2://{usr}:{psw}@{pghost}:{pgport}/{db}'.format( 50 | usr=usr, 51 | psw=psw, 52 | 
pghost=pghost, 53 | pgport=pgport, 54 | db=db, 55 | ), 56 | arborist="http://arborist-service/", 57 | ) 58 | 59 | settings = {'config': CONFIG, 'auth': AUTH} 60 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/experimental_metadata.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "experimental_metadata" 4 | title: Experimental Metadata 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: metadata_file 8 | project: '*' 9 | program: '*' 10 | description: > 11 | Data file containing the metadata for the experiment performed. 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | - file_state 23 | - error_type 24 | 25 | links: 26 | - exclusive: false 27 | required: true 28 | subgroup: 29 | - name: core_metadata_collections 30 | backref: experiment_metadata_files 31 | label: data_from 32 | target_type: core_metadata_collection 33 | multiplicity: many_to_many 34 | required: false 35 | - name: experiments 36 | backref: experiment_metadata_files 37 | label: derived_from 38 | target_type: experiment 39 | multiplicity: many_to_many 40 | required: false 41 | 42 | required: 43 | - submitter_id 44 | - type 45 | - file_name 46 | - file_size 47 | - md5sum 48 | - data_category 49 | - data_type 50 | - data_format 51 | 52 | uniqueKeys: 53 | - [ id ] 54 | - [ project_id, submitter_id ] 55 | 56 | properties: 57 | $ref: "_definitions.yaml#/data_file_properties" 58 | type: 59 | enum: [ "experimental_metadata" ] 60 | data_category: 61 | term: 62 | $ref: "_terms.yaml#/data_category" 63 | type: 64 | - string 65 | data_type: 66 | term: 67 | $ref: "_terms.yaml#/data_type" 68 | enum: [ "Experimental Metadata" ] 69 | data_format: 70 | term: 71 | $ref: 
"_terms.yaml#/data_format" 72 | type: 73 | - string 74 | experiments: 75 | $ref: "_definitions.yaml#/to_one" 76 | core_metadata_collections: 77 | $ref: "_definitions.yaml#/to_many" 78 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/aligned_reads_index.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "aligned_reads_index" 4 | title: Aligned Reads Index 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: index_file 8 | program: '*' 9 | project: '*' 10 | description: "Data file containing the index for a set of aligned reads." 11 | additionalProperties: false 12 | submittable: true 13 | validators: null 14 | 15 | systemProperties: 16 | - id 17 | - project_id 18 | - created_datetime 19 | - updated_datetime 20 | - state 21 | - file_state 22 | - error_type 23 | 24 | links: 25 | - exclusive: false 26 | required: true 27 | subgroup: 28 | - name: submitted_aligned_reads_files 29 | backref: aligned_reads_indexes 30 | label: derived_from 31 | target_type: submitted_aligned_reads 32 | multiplicity: one_to_one 33 | required: false 34 | - name: core_metadata_collections 35 | backref: aligned_reads_indexes 36 | label: data_from 37 | target_type: core_metadata_collection 38 | multiplicity: many_to_many 39 | required: false 40 | 41 | required: 42 | - submitter_id 43 | - type 44 | - file_name 45 | - file_size 46 | - md5sum 47 | - data_category 48 | - data_type 49 | - data_format 50 | 51 | uniqueKeys: 52 | - [ id ] 53 | - [ project_id, submitter_id ] 54 | 55 | properties: 56 | $ref: "_definitions.yaml#/data_file_properties" 57 | type: 58 | enum: [ "aligned_reads_index" ] 59 | data_category: 60 | term: 61 | $ref: "_terms.yaml#/data_category" 62 | enum: 63 | - Sequencing Data 64 | - Sequencing Reads 65 | - Raw Sequencing Data 66 | data_type: 67 | term: 68 | $ref: "_terms.yaml#/data_type" 69 | 
enum: [ "Aligned Reads Index" ] 70 | data_format: 71 | term: 72 | $ref: "_terms.yaml#/data_format" 73 | enum: [ "BAI" ] 74 | submitted_aligned_reads_files: 75 | $ref: "_definitions.yaml#/to_one" 76 | core_metadata_collections: 77 | $ref: "_definitions.yaml#/to_many" 78 | -------------------------------------------------------------------------------- /datadictionary/design_notes.md: -------------------------------------------------------------------------------- 1 | One important aspect worth mentioning is that it is purposely chosen to model the dictionary using Directed Acyclic Graph (DAG). The idea behind it is simplicity! From a practical point of view, the data dictionary is not meant to model every aspect of the real world GDC entities and their relations with fine grain semantics. Rather, it's important to be able to express and enforce data integrity rules. So far, DAG seems to be a good fit although we should look harder to see whether there are any show-stoppers, are there any relations/rules that cannot be expressed using DAG and there is no way around it? 2 | 3 | Choosing a simpler design will always bring benefits in software development at all levels and phases. In our case, compared to a general graph, DAG will be much easier to work with. Querying a DAG will be simply searching up to find parents or searching down to find children; finding siblings will require one step up and one step down, which should be performant. As any other graph search, querying a DAG with very wide or deep structure can be expensive. There should be tricks we can play to make the queries we care about performant. We can also keep it in mind while designing the model to avoid very wide or deep DAG whenever possible. 4 | 5 | Taking one step further, practically, type of relations between nodes may not be so important to us. Just like ER modeling in RDBMS, uniqueness/cardinality is the only thing that matters.
With this thinking, it's possible to entirely eliminate the need for an edge table while implement the DAG model. We will still need to support some not so straightforward relations such as conditional relations, such as when there is A, there should (or shouldn't) be B etc. However such business logic is not necessarily harder for DAG to handle. 6 | 7 | One last point is that converting data in a DAG to JSON should be easier comparing to a general graph. Data in a JSON document is essentially a tree. When converting a DAG to a tree, it is mainly to denormalize child nodes with multiple parents into multiple copies, each parent will have a materialized local child node copy. This should make the logic cleaner when we export data in graph db to JSON to build Elasticsearch indexes. 8 | -------------------------------------------------------------------------------- /dump.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copies config and logs into a zip file 3 | 4 | help="$(basename "$0") [help] [--logs-only] 5 | where: 6 | help show this help text 7 | --logs-only do not copy configuration files" 8 | 9 | if [[ "$OSTYPE" != "linux-gnu" && "$OSTYPE" != "darwin"* ]]; then 10 | echo "This script only works on MacOS/Linux" 11 | exit 1 12 | fi 13 | 14 | get_config=true 15 | while [ -n "$1" ]; do 16 | case "$1" in 17 | --logs-only) 18 | get_config=false 19 | ;; 20 | help) 21 | echo "$help" 22 | exit 0 23 | ;; 24 | *) 25 | echo "ignoring unknown option $1" 26 | ;; 27 | esac 28 | shift 29 | done 30 | 31 | dirname=compose-services_dump_`date '+%Y-%m-%d_%H:%M:%S'` 32 | mkdir -p $dirname 33 | mkdir -p $dirname/logs/ 34 | if $get_config; then 35 | mkdir -p $dirname/config/ 36 | fi 37 | 38 | if $get_config; then 39 | echo "Copying config files" 40 | cp docker-compose.yml $dirname/config/ 41 | cp Secrets/etlMapping.yaml $dirname/config/ 42 | cp Secrets/gitops.json $dirname/config/ 43 | cp Secrets/user.yaml $dirname/config/ 44 
| cp Secrets/*config.* $dirname/config/ 45 | cp Secrets/*settings.* $dirname/config/ 46 | 47 | # remove lines containing creds 48 | if [[ "$OSTYPE" == "linux-gnu" ]]; then 49 | sed -i "/key/Id" $dirname/config/* 50 | sed -i "/secret/Id" $dirname/config/* 51 | sed -i "/password/Id" $dirname/config/* 52 | elif [[ "$OSTYPE" == "darwin"* ]]; then # MacOS 53 | sed -i "" "/[Kk][Ee][Yy]/d" $dirname/config/* 54 | sed -i "" "/[Ss][Ee][Cc][Rr][Ee][Tt]/d" $dirname/config/* 55 | sed -i "" "/[Pp][Aa][Ss][Ss][Ww][Oo][Rr][Dd]/d" $dirname/config/* 56 | else 57 | echo "WARNING: did not remove lines with creds (unknown OS $OSTYPE)" 58 | fi 59 | fi 60 | 61 | echo "Dumping logs" 62 | cat docker-compose.yml | grep "container_name" | while read -r line ; do 63 | name=$(expr "$line" : ".* \([a-z]*-service\)") 64 | docker-compose logs $name > $dirname/logs/logs-$name.txt 65 | done 66 | 67 | echo "Getting environment details" 68 | # pip freeze > $dirname/pip-freeze.txt 69 | # env > $dirname/env-vars.txt 70 | git rev-parse HEAD > $dirname/latest-commit.txt 71 | 72 | echo "Saving as zip file $dirname.zip" 73 | zip -r $dirname.zip $dirname 74 | 75 | echo "Cleaning up" 76 | rm -r $dirname 77 | 78 | echo "Done" 79 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/submitted_somatic_mutation.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "submitted_somatic_mutation" 4 | title: Submitted Somatic Mutation 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: data_file 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Data file containing somatic mutation calls from a read group. 
12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | - file_state 23 | - error_type 24 | 25 | links: 26 | - exclusive: false 27 | required: true 28 | subgroup: 29 | - name: core_metadata_collections 30 | backref: submitted_somatic_mutations 31 | label: data_from 32 | target_type: core_metadata_collection 33 | multiplicity: many_to_many 34 | required: false 35 | - name: read_groups 36 | backref: submitted_somatic_mutations 37 | label: derived_from 38 | target_type: read_group 39 | multiplicity: many_to_many 40 | required: false 41 | 42 | required: 43 | - submitter_id 44 | - type 45 | - file_name 46 | - file_size 47 | - data_format 48 | - md5sum 49 | - data_category 50 | - data_type 51 | - experimental_strategy 52 | 53 | uniqueKeys: 54 | - [ id ] 55 | - [ project_id, submitter_id ] 56 | 57 | properties: 58 | $ref: "_definitions.yaml#/data_file_properties" 59 | type: 60 | enum: [ "submitted_somatic_mutation" ] 61 | data_category: 62 | term: 63 | $ref: "_terms.yaml#/data_category" 64 | type: string 65 | data_type: 66 | term: 67 | $ref: "_terms.yaml#/data_type" 68 | type: string 69 | data_format: 70 | term: 71 | $ref: "_terms.yaml#/data_format" 72 | type: string 73 | experimental_strategy: 74 | term: 75 | $ref: "_terms.yaml#/experimental_strategy" 76 | type: string 77 | total_variants: 78 | description: "The total number of variants detected carrying a base change difference from the reference genome." 
79 | type: integer 80 | read_groups: 81 | $ref: "_definitions.yaml#/to_many" 82 | core_metadata_collections: 83 | $ref: "_definitions.yaml#/to_many" 84 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/family_history.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "family_history" 4 | title: Family History 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: clinical 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Record of a patient's background regarding cancer events of blood relatives. 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - state 20 | - created_datetime 21 | - updated_datetime 22 | 23 | required: 24 | - submitter_id 25 | - type 26 | 27 | links: 28 | - name: cases 29 | backref: family_histories 30 | label: describes 31 | target_type: case 32 | multiplicity: many_to_one 33 | required: true 34 | 35 | 36 | uniqueKeys: 37 | #unclear if want submitter ID for clinical 38 | - [id] 39 | - [project_id, submitter_id] 40 | 41 | properties: 42 | type: 43 | enum: [ "family_history" ] 44 | 45 | id: 46 | $ref: "_definitions.yaml#/UUID" 47 | systemAlias: node_id 48 | 49 | state: 50 | $ref: "_definitions.yaml#/state" 51 | 52 | submitter_id: 53 | type: 54 | - string 55 | - "null" 56 | 57 | relative_with_cancer_history: 58 | term: 59 | $ref: "_terms.yaml#/relative_with_cancer_history" 60 | enum: 61 | - "yes" 62 | - "no" 63 | - unknown 64 | - not reported 65 | 66 | relationship_type: 67 | term: 68 | $ref: "_terms.yaml#/relationship_type" 69 | type: string 70 | 71 | relationship_gender: 72 | term: 73 | $ref: "_terms.yaml#/gender" 74 | enum: 75 | - female 76 | - male 77 | - unknown 78 | - unspecified 79 | - not reported 80 | 81 | relationship_age_at_diagnosis: 82 | term: 83 | $ref: 
"_terms.yaml#/relationship_age_at_diagnosis" 84 | type: number 85 | 86 | relationship_primary_diagnosis: 87 | term: 88 | $ref: "_terms.yaml#/primary_diagnosis" 89 | type: string 90 | 91 | cases: 92 | $ref: "_definitions.yaml#/to_one" 93 | project_id: 94 | $ref: "_definitions.yaml#/project_id" 95 | created_datetime: 96 | $ref: "_definitions.yaml#/datetime" 97 | updated_datetime: 98 | $ref: "_definitions.yaml#/datetime" 99 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/submitted_unaligned_reads.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "submitted_unaligned_reads" 4 | title: Submitted Unaligned Reads 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: data_file 8 | program: '*' 9 | project: '*' 10 | description: "Data file containing unaligned reads that have not been GDC Harmonized." 11 | additionalProperties: false 12 | submittable: true 13 | validators: null 14 | 15 | systemProperties: 16 | - id 17 | - project_id 18 | - created_datetime 19 | - updated_datetime 20 | - state 21 | - file_state 22 | - error_type 23 | 24 | links: 25 | - exclusive: false 26 | required: true 27 | subgroup: 28 | - name: read_groups 29 | backref: submitted_unaligned_reads_files # pretty ugly 30 | label: data_from 31 | target_type: read_group 32 | multiplicity: many_to_one 33 | required: false 34 | - name: core_metadata_collections 35 | backref: submitted_unaligned_reads_files 36 | label: data_from 37 | target_type: core_metadata_collection 38 | multiplicity: many_to_many 39 | required: false 40 | 41 | required: 42 | - submitter_id 43 | - type 44 | - file_name 45 | - file_size 46 | - md5sum 47 | - data_category 48 | - data_type 49 | - data_format 50 | - experimental_strategy 51 | 52 | uniqueKeys: 53 | - [ id ] 54 | - [ project_id, submitter_id ] 55 | 56 | properties: 57 | $ref: 
"_definitions.yaml#/data_file_properties" 58 | type: 59 | enum: [ "submitted_unaligned_reads" ] 60 | data_category: 61 | term: 62 | $ref: "_terms.yaml#/data_category" 63 | enum: 64 | - Sequencing Data 65 | - Sequencing Reads 66 | - Raw Sequencing Data 67 | data_type: 68 | term: 69 | $ref: "_terms.yaml#/data_type" 70 | enum: [ "Unaligned Reads" ] 71 | data_format: 72 | term: 73 | $ref: "_terms.yaml#/data_format" 74 | enum: 75 | - BAM 76 | - FASTQ 77 | experimental_strategy: 78 | term: 79 | $ref: "_terms.yaml#/experimental_strategy" 80 | enum: 81 | - WGS 82 | - WXS 83 | - Low Pass WGS 84 | - Validation 85 | - RNA-Seq 86 | - miRNA-Seq 87 | - Total RNA-Seq 88 | - DNA Panel 89 | read_groups: 90 | $ref: "_definitions.yaml#/to_one" 91 | core_metadata_collections: 92 | $ref: "_definitions.yaml#/to_many" 93 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/submitted_aligned_reads.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "submitted_aligned_reads" 4 | title: Submitted Aligned Reads 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: data_file 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Data file containing aligned reads that are used as input to GDC workflows. 
12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | - file_state 23 | - error_type 24 | 25 | links: 26 | - exclusive: false 27 | required: true 28 | subgroup: 29 | - name: read_groups 30 | backref: submitted_aligned_reads_files # pretty ugly 31 | label: data_from 32 | target_type: read_group 33 | multiplicity: one_to_many 34 | required: false 35 | - name: core_metadata_collections 36 | backref: submitted_aligned_reads_files 37 | label: data_from 38 | target_type: core_metadata_collection 39 | multiplicity: many_to_many 40 | required: false 41 | 42 | required: 43 | - submitter_id 44 | - type 45 | - file_name 46 | - file_size 47 | - data_format 48 | - md5sum 49 | - data_category 50 | - data_type 51 | - experimental_strategy 52 | 53 | uniqueKeys: 54 | - [ id ] 55 | - [ project_id, submitter_id ] 56 | 57 | properties: 58 | $ref: "_definitions.yaml#/data_file_properties" 59 | type: 60 | enum: [ "submitted_aligned_reads" ] 61 | data_category: 62 | term: 63 | $ref: "_terms.yaml#/data_category" 64 | enum: 65 | - Sequencing Data 66 | - Sequencing Reads 67 | - Raw Sequencing Data 68 | data_type: 69 | term: 70 | $ref: "_terms.yaml#/data_type" 71 | enum: 72 | - Aligned Reads 73 | - Alignment Coordinates 74 | data_format: 75 | term: 76 | $ref: "_terms.yaml#/data_format" 77 | enum: 78 | - BAM 79 | - BED 80 | experimental_strategy: 81 | term: 82 | $ref: "_terms.yaml#/experimental_strategy" 83 | enum: 84 | - WGS 85 | - WXS 86 | - Low Pass WGS 87 | - Validation 88 | - RNA-Seq 89 | - miRNA-Seq 90 | - Total RNA-Seq 91 | - DNA Panel 92 | read_groups: 93 | $ref: "_definitions.yaml#/to_many" 94 | core_metadata_collections: 95 | $ref: "_definitions.yaml#/to_many" 96 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/submitted_copy_number.yaml: 
-------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "submitted_copy_number" 4 | title: Submitted Copy Number 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: data_file 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Data file containing normalized copy number information from an aliquot. 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | - file_state 23 | - error_type 24 | 25 | links: 26 | - exclusive: false 27 | required: true 28 | subgroup: 29 | - name: core_metadata_collections 30 | backref: submitted_copy_number_files 31 | label: data_from 32 | target_type: core_metadata_collection 33 | multiplicity: many_to_many 34 | required: false 35 | - exclusive: true 36 | required: false 37 | subgroup: 38 | - name: aliquots 39 | backref: submitted_copy_number_files 40 | label: derived_from 41 | target_type: aliquot 42 | multiplicity: one_to_one 43 | required: false 44 | - name: read_groups 45 | backref: submitted_copy_number_files 46 | label: derived_from 47 | target_type: read_group 48 | multiplicity: many_to_many 49 | required: false 50 | 51 | required: 52 | - submitter_id 53 | - type 54 | - file_name 55 | - file_size 56 | - data_format 57 | - md5sum 58 | - data_category 59 | - data_type 60 | - experimental_strategy 61 | 62 | uniqueKeys: 63 | - [ id ] 64 | - [ project_id, submitter_id ] 65 | 66 | properties: 67 | $ref: "_definitions.yaml#/data_file_properties" 68 | type: 69 | enum: [ "submitted_copy_number" ] 70 | data_category: 71 | term: 72 | $ref: "_terms.yaml#/data_category" 73 | type: string 74 | data_type: 75 | term: 76 | $ref: "_terms.yaml#/data_type" 77 | type: string 78 | data_format: 79 | term: 80 | $ref: "_terms.yaml#/data_format" 81 | type: string 82 | experimental_strategy: 83 | term: 
84 | $ref: "_terms.yaml#/experimental_strategy" 85 | type: string 86 | aliquots: 87 | $ref: "_definitions.yaml#/to_one" 88 | read_groups: 89 | $ref: "_definitions.yaml#/to_many" 90 | core_metadata_collections: 91 | $ref: "_definitions.yaml#/to_many" 92 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/submitted_methylation.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "submitted_methylation" 4 | title: Submitted Methylation 5 | type: object 6 | namespace: https://www.bloodpac.org/ 7 | category: data_file 8 | program: '*' 9 | project: '*' 10 | description: "DNA methylation data files contain information on raw and normalized signal intensities, detection confidence and calculated beta values for methylated and unmethylated probes. DNA methylation is an epigenetic mark which can be associated with transcriptional inactivity when located in promoter regions." 
11 | additionalProperties: false 12 | submittable: true 13 | validators: null 14 | 15 | systemProperties: 16 | - id 17 | - project_id 18 | - created_datetime 19 | - updated_datetime 20 | - state 21 | - file_state 22 | - error_type 23 | 24 | links: 25 | - exclusive: false 26 | required: true 27 | subgroup: 28 | - name: core_metadata_collections 29 | backref: submitted_methylation_files 30 | label: data_from 31 | target_type: core_metadata_collection 32 | multiplicity: many_to_many 33 | required: false 34 | - name: aliquots 35 | backref: submitted_methylation_files 36 | label: data_from 37 | target_type: aliquot 38 | multiplicity: many_to_one 39 | required: false 40 | 41 | required: 42 | - submitter_id 43 | - type 44 | - file_name 45 | - file_size 46 | - md5sum 47 | - data_category 48 | - data_type 49 | - data_format 50 | 51 | uniqueKeys: 52 | - [ id ] 53 | - [ project_id, submitter_id ] 54 | 55 | properties: 56 | $ref: "_definitions.yaml#/data_file_properties" 57 | type: 58 | enum: [ "submitted_methylation" ] 59 | data_category: 60 | term: 61 | $ref: "_terms.yaml#/data_category" 62 | enum: 63 | - Methylation Data 64 | data_type: 65 | term: 66 | $ref: "_terms.yaml#/data_type" 67 | enum: [ "Methylation Intensity Values" ] 68 | data_format: 69 | term: 70 | $ref: "_terms.yaml#/data_format" 71 | enum: 72 | - IDAT 73 | assay_method: 74 | enum: 75 | - Methylation Array 76 | assay_instrument: 77 | enum: 78 | - Illumina 79 | assay_instrument_model: 80 | enum: 81 | - Illumina Infinium HumanMethylation450 82 | - Illumina Infinium HumanMethylation450K 83 | aliquots: 84 | $ref: "_definitions.yaml#/to_one" 85 | core_metadata_collections: 86 | $ref: "_definitions.yaml#/to_many" 87 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/aliquot.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "aliquot" 4 | title: 
Aliquot 5 | type: object 6 | category: biospecimen 7 | program: '*' 8 | project: '*' 9 | description: > 10 | Pertaining to a portion of the whole; any one of two or more samples of something, of the same 11 | volume or weight. 12 | additionalProperties: false 13 | submittable: true 14 | validators: [] 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - state 20 | - created_datetime 21 | - updated_datetime 22 | 23 | required: 24 | - submitter_id 25 | - type 26 | - samples 27 | 28 | uniqueKeys: 29 | - [id] 30 | - [project_id, submitter_id] 31 | 32 | links: 33 | - name: samples 34 | backref: aliquots 35 | label: derived_from 36 | multiplicity: many_to_many 37 | target_type: sample 38 | required: true 39 | 40 | constraints: null 41 | 42 | # Aliquot properties 43 | properties: 44 | type: 45 | type: string 46 | id: 47 | $ref: "_definitions.yaml#/UUID" 48 | systemAlias: node_id 49 | state: 50 | $ref: "_definitions.yaml#/state" 51 | submitter_id: 52 | type: 53 | - string 54 | - "null" 55 | description: > 56 | The legacy barcode used before prior to the use 57 | UUIDs. For TCGA this is bcraliquotbarcode. 
58 | aliquot_quantity: 59 | term: 60 | $ref: "_terms.yaml#/aliquot_quantity" 61 | type: number 62 | aliquot_volume: 63 | term: 64 | $ref: "_terms.yaml#/aliquot_volume" 65 | type: number 66 | amount: 67 | term: 68 | $ref: "_terms.yaml#/amount" 69 | type: number 70 | analyte_type: 71 | term: 72 | $ref: "_terms.yaml#/analyte_type" 73 | type: string 74 | analyte_type_id: 75 | term: 76 | $ref: "_terms.yaml#/analyte_type_id" 77 | enum: 78 | - D 79 | - E 80 | - G 81 | - H 82 | - R 83 | - S 84 | - T 85 | - W 86 | - X 87 | - Y 88 | concentration: 89 | term: 90 | $ref: "_terms.yaml#/concentration" 91 | type: number 92 | project_id: 93 | $ref: "_definitions.yaml#/project_id" 94 | source_center: 95 | term: 96 | $ref: "_terms.yaml#/source_center" 97 | type: string 98 | samples: 99 | $ref: "_definitions.yaml#/to_one" 100 | created_datetime: 101 | $ref: "_definitions.yaml#/datetime" 102 | updated_datetime: 103 | $ref: "_definitions.yaml#/datetime" 104 | -------------------------------------------------------------------------------- /docs/useful_links.md: -------------------------------------------------------------------------------- 1 | 2 | # Useful links 3 | 4 | Find below a list of links that show the capabilities of our Gen3 software stack tested by and further developed by users and initiatives in the research community all over the globe. These are fantastic resources to explore use cases of Gen3 and may be of help to new and experienced users/operators alike of a Gen3 Data Commons. 5 | 6 | > 🟢 Note: We emphasize that we are not responsible for the content and opinions on the third-party webpages listed below. 7 | 8 | 1. Working with on premises data and servers: 9 | The gen3 system is optimized to deploy on cloud systems and work with cloud buckets. The Oregon Health & Science University (OHSU) has developed [a collection of extensions](https://github.com/ohsu-comp-bio/compose-services/tree/onprem) to enable gen3 to work in a non aws environment. 
Read this [overview](https://github.com/ohsu-comp-bio/compose-services/blob/onprem/onprem/README.md) for more information. 10 | 2. A group of users shared their experiences with setting up their Gen3 Data Commons on a local desktop using Compose Services in August 2020 in form of three videos: [Gen3 Data Commons Setup Part 1](https://www.youtube.com/watch?v=xM54O4aMpWY), [Gen3 Data Commons Setup Part 2](https://www.youtube.com/watch?v=iMmCxnbHpGo), and [Data Upload](https://www.youtube.com/watch?v=F2EOtHPg6g8&feature=youtu.be). Please note, that the content in these videos might not reflect the current status of the Compose-Services repository. Referring to the video part 1, the following is outdated: the format of the `user.yaml` reflects the one shown in the Fence repository and the arborist DB setup is up to date. 11 | 3. A stand-alone data dictionary viewer for schema.json artifacts was published [here](https://github.com/bioteam/dictionary-visualizer). 12 | 4. The [Australian BioCommons group](https://www.biocommons.org.au/gen3-project) has implemented the Gen3 software stack for easier management and sharing of human genome data in Australia. Browse through the [detailed documentation](https://github.com/umccr/gen3-doc) of how they setup and deployed Gen3 including a [customized Data Dictionary](https://github.com/umccr/umccr-dictionary) and guides for users and admins on working with the [production environment](https://github.com/umccr/gen3-doc/tree/main/cloud). This group also wrote a [client "g3po"](https://github.com/umccr/g3po) to interact with Gen3 APIs and tested Gen3's capability to set [granular access to data files using authz and GA4GH Passport Visa consent codes](https://github.com/umccr/gen3-doc/tree/main/submit/agha-gdr-demo) in combination with CILogon as [a new authentication method](https://github.com/uc-cdis/fence/pull/896). 
13 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/demographic.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "demographic" 4 | title: Demographic 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: clinical 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Data for the characterization of the patient by means of segmenting the population (e.g., 12 | characterization by age, sex, or race). 13 | additionalProperties: false 14 | submittable: true 15 | validators: null 16 | 17 | systemProperties: 18 | - id 19 | - project_id 20 | - state 21 | - created_datetime 22 | - updated_datetime 23 | 24 | links: 25 | - name: cases 26 | backref: demographics 27 | label: describes 28 | target_type: case 29 | multiplicity: one_to_one 30 | required: true 31 | 32 | required: 33 | - submitter_id 34 | - type 35 | - cases 36 | 37 | preferred: 38 | - year_of_death 39 | 40 | uniqueKeys: 41 | #unclear if want submitter ID for clinical 42 | - [id] 43 | - [project_id, submitter_id] 44 | 45 | properties: 46 | type: 47 | type: string 48 | 49 | id: 50 | $ref: "_definitions.yaml#/UUID" 51 | systemAlias: node_id 52 | 53 | state: 54 | $ref: "_definitions.yaml#/state" 55 | 56 | submitter_id: 57 | type: 58 | - string 59 | - "null" 60 | 61 | gender: 62 | term: 63 | $ref: "_terms.yaml#/gender" 64 | enum: 65 | - female 66 | - male 67 | - unknown 68 | - unspecified 69 | - not reported 70 | 71 | race: 72 | term: 73 | $ref: "_terms.yaml#/race" 74 | enum: 75 | - white 76 | - american indian or alaska native 77 | - black or african american 78 | - asian 79 | - native hawaiian or other pacific islander 80 | - other 81 | - Unknown 82 | - not reported 83 | - not allowed to collect 84 | 85 | ethnicity: 86 | term: 87 | $ref: "_terms.yaml#/ethnicity" 88 | enum: 89 | - hispanic or latino 90 | - not hispanic or
latino 91 | - Unknown 92 | - not reported 93 | - not allowed to collect 94 | 95 | year_of_birth: 96 | term: 97 | $ref: "_terms.yaml#/year_of_birth" 98 | type: 99 | - number 100 | - "null" 101 | 102 | year_of_death: 103 | term: 104 | $ref: "_terms.yaml#/year_of_death" 105 | type: number 106 | 107 | cases: 108 | $ref: "_definitions.yaml#/to_one" 109 | project_id: 110 | $ref: "_definitions.yaml#/project_id" 111 | created_datetime: 112 | $ref: "_definitions.yaml#/datetime" 113 | updated_datetime: 114 | $ref: "_definitions.yaml#/datetime" 115 | -------------------------------------------------------------------------------- /templates/etlMapping.yaml: -------------------------------------------------------------------------------- 1 | mappings: 2 | - name: etl 3 | doc_type: case 4 | type: aggregator 5 | root: case 6 | props: 7 | - name: submitter_id 8 | - name: project_id 9 | - name: disease_type 10 | - name: primary_site 11 | flatten_props: 12 | - path: demographics 13 | props: 14 | - name: gender 15 | value_mappings: 16 | - female: F 17 | - male: M 18 | - name: race 19 | value_mappings: 20 | - american indian or alaska native: Indian 21 | - name: ethnicity 22 | - name: year_of_birth 23 | aggregated_props: 24 | - name: _samples_count 25 | path: samples 26 | fn: count 27 | - name: _aliquots_count 28 | path: samples.aliquots 29 | fn: count 30 | - name: _submitted_methylations_count 31 | path: samples.aliquots.submitted_methylation_files 32 | fn: count 33 | - name: _submitted_copy_number_files_on_aliquots_count 34 | path: samples.aliquots.submitted_copy_number_files 35 | fn: count 36 | - name: _read_groups_count 37 | path: samples.aliquots.read_groups 38 | fn: count 39 | - name: _submitted_aligned_reads_count 40 | path: samples.aliquots.read_groups.submitted_aligned_reads_files 41 | fn: count 42 | - name: _submitted_unaligned_reads_count 43 | path: samples.aliquots.read_groups.submitted_unaligned_reads_files 44 | fn: count 45 | - name:
_submitted_copy_number_files_on_read_groups_count 46 | path: samples.aliquots.read_groups.submitted_copy_number_files 47 | fn: count 48 | - name: _submitted_somatic_mutations_count 49 | path: samples.aliquots.read_groups.submitted_somatic_mutations 50 | fn: count 51 | joining_props: 52 | - index: file 53 | join_on: _case_id 54 | props: 55 | - name: data_format 56 | src: data_format 57 | fn: set 58 | - name: data_type 59 | src: data_type 60 | fn: set 61 | - name: _file_id 62 | src: file_id 63 | fn: set 64 | - name: file 65 | doc_type: file 66 | type: collector 67 | root: None 68 | category: data_file 69 | props: 70 | - name: object_id 71 | - name: md5sum 72 | - name: file_name 73 | - name: file_size 74 | - name: data_format 75 | - name: data_type 76 | - name: state 77 | injecting_props: 78 | case: 79 | props: 80 | - name: _case_id 81 | src: id 82 | fn: set 83 | - name: project_id 84 | target_nodes: 85 | - name: slide_image 86 | path: slides.samples.cases 87 | -------------------------------------------------------------------------------- /templates/gitops.json: -------------------------------------------------------------------------------- 1 | { 2 | "gaTrackingId": "UA-119127212-1", 3 | "dataExplorerConfig": { 4 | "charts": { 5 | "project_id": { 6 | "chartType": "count", 7 | "title": "Projects" 8 | }, 9 | "node_id": { 10 | "chartType": "count", 11 | "title": "Cases" 12 | }, 13 | "gender": { 14 | "chartType": "pie", 15 | "title": "Gender" 16 | }, 17 | "race": { 18 | "chartType": "bar", 19 | "title": "Race" 20 | } 21 | }, 22 | "filters": { 23 | "tabs": [ 24 | { 25 | "title": "Case", 26 | "fields":[ 27 | "project_id", 28 | "gender", 29 | "race", 30 | "ethnicity" 31 | ] 32 | } 33 | ] 34 | }, 35 | "table": { 36 | "enabled": false 37 | }, 38 | "dropdowns": {}, 39 | "buttons": [], 40 | "guppyConfig": { 41 | "dataType": "case", 42 | "nodeCountTitle": "Cases", 43 | "fieldMapping": [ 44 | { "field": "disease_type", "name": "Disease type" }, 45 | { "field": "primary_site", 
"name": "Site where samples were collected"} 46 | ], 47 | "manifestMapping": { 48 | "resourceIndexType": "file", 49 | "resourceIdField": "object_id", 50 | "referenceIdFieldInResourceIndex": "_case_id", 51 | "referenceIdFieldInDataIndex": "_case_id" 52 | }, 53 | "accessibleFieldCheckList": ["project_id"], 54 | "accessibleValidationField": "project_id" 55 | } 56 | }, 57 | "fileExplorerConfig": { 58 | "charts": { 59 | "data_type": { 60 | "chartType": "stackedBar", 61 | "title": "File Type" 62 | }, 63 | "data_format": { 64 | "chartType": "stackedBar", 65 | "title": "File Format" 66 | } 67 | }, 68 | "filters": { 69 | "tabs": [ 70 | { 71 | "title": "File", 72 | "fields": [ 73 | "project_id", 74 | "data_type", 75 | "data_format" 76 | ] 77 | } 78 | ] 79 | }, 80 | "table": { 81 | "enabled": true, 82 | "fields": [ 83 | "project_id", 84 | "file_name", 85 | "file_size", 86 | "object_id" 87 | ] 88 | }, 89 | "dropdowns": {}, 90 | "guppyConfig": { 91 | "dataType": "file", 92 | "fieldMapping": [ 93 | { "field": "object_id", "name": "GUID" } 94 | ], 95 | "nodeCountTitle": "Files", 96 | "manifestMapping": { 97 | "resourceIndexType": "case", 98 | "resourceIdField": "_case_id", 99 | "referenceIdFieldInResourceIndex": "object_id", 100 | "referenceIdFieldInDataIndex": "object_id" 101 | }, 102 | "accessibleFieldCheckList": ["project_id"], 103 | "accessibleValidationField": "project_id", 104 | "downloadAccessor": "object_id" 105 | } 106 | } 107 | } -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/exposure.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "exposure" 4 | title: Exposure 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: clinical 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Clinically relevant patient information not immediately resulting from genetic predispositions. 
12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - state 20 | - created_datetime 21 | - updated_datetime 22 | 23 | required: 24 | - submitter_id 25 | - type 26 | 27 | links: 28 | - name: cases 29 | backref: exposures 30 | label: describes 31 | target_type: case 32 | multiplicity: many_to_one 33 | required: true 34 | 35 | preferred: 36 | - cigarettes_per_day 37 | - years_smoked 38 | 39 | uniqueKeys: 40 | #unclear if want submitter ID for clinical 41 | - [id] 42 | - [project_id, submitter_id] 43 | 44 | properties: 45 | type: 46 | enum: [ "exposure" ] 47 | 48 | id: 49 | $ref: "_definitions.yaml#/UUID" 50 | systemAlias: node_id 51 | 52 | state: 53 | $ref: "_definitions.yaml#/state" 54 | 55 | submitter_id: 56 | type: 57 | - string 58 | - "null" 59 | 60 | alcohol_history: 61 | term: 62 | $ref: "_terms.yaml#/alcohol_history" 63 | type: string 64 | 65 | alcohol_intensity: 66 | term: 67 | $ref: "_terms.yaml#/alcohol_intensity" 68 | type: string 69 | 70 | bmi: 71 | term: 72 | $ref: "_terms.yaml#/bmi" 73 | type: number 74 | 75 | cigarettes_per_day: 76 | term: 77 | $ref: "_terms.yaml#/cigarettes_per_day" 78 | type: number 79 | 80 | height: 81 | term: 82 | $ref: "_terms.yaml#/height" 83 | type: number 84 | 85 | pack_years_smoked: 86 | term: 87 | $ref: "_terms.yaml#/pack_years_smoked" 88 | type: number 89 | 90 | tobacco_smoking_onset_year: 91 | term: 92 | $ref: "_terms.yaml#/tobacco_smoking_onset_year" 93 | type: integer 94 | 95 | tobacco_smoking_quit_year: 96 | term: 97 | $ref: "_terms.yaml#/tobacco_smoking_quit_year" 98 | type: integer 99 | 100 | tobacco_smoking_status: 101 | term: 102 | $ref: "_terms.yaml#/tobacco_smoking_status" 103 | enum: 104 | - "1" 105 | - "2" 106 | - "3" 107 | - "4" 108 | - "5" 109 | - "6" 110 | - "7" 111 | - Unknown 112 | - Not Reported 113 | - Not Allowed To Collect 114 | 115 | weight: 116 | term: 117 | $ref: "_terms.yaml#/weight" 118 | type: number 119 | 120 | 
years_smoked: 121 | term: 122 | $ref: "_terms.yaml#/years_smoked" 123 | type: number 124 | 125 | cases: 126 | $ref: "_definitions.yaml#/to_one" 127 | project_id: 128 | $ref: "_definitions.yaml#/project_id" 129 | created_datetime: 130 | $ref: "_definitions.yaml#/datetime" 131 | updated_datetime: 132 | $ref: "_definitions.yaml#/datetime" 133 | -------------------------------------------------------------------------------- /templates/sheepdog_settings.py: -------------------------------------------------------------------------------- 1 | from sheepdog.api import app, app_init 2 | from os import environ 3 | import config_helper 4 | 5 | APP_NAME='sheepdog' 6 | def load_json(file_name): 7 | return config_helper.load_json(file_name, APP_NAME) 8 | 9 | conf_data = load_json('creds.json') 10 | config = app.config 11 | 12 | config["AUTH"] = 'https://auth.service.consul:5000/v3/' 13 | config["AUTH_ADMIN_CREDS"] = None 14 | config["INTERNAL_AUTH"] = None 15 | 16 | # SIGNPOST is deprecated, replaced by INDEX_CLIENT (sheepdog>=1.1.12) 17 | config['SIGNPOST'] = { 18 | 'host': environ.get('SIGNPOST_HOST', 'http://indexd-service'), 19 | 'version': 'v0', 20 | 'auth': ('indexd_client', conf_data.get('indexd_password', '{{indexd_password}}')), 21 | } 22 | config["INDEX_CLIENT"] = { 23 | 'host': environ.get('INDEX_CLIENT_HOST', 'http://indexd-service'), 24 | 'version': 'v0', 25 | 'auth': ('indexd_client', conf_data.get('indexd_password', '{{indexd_password}}')), 26 | } 27 | config["FAKE_AUTH"] = False 28 | config["PSQLGRAPH"] = { 29 | 'host': conf_data['db_host'], 30 | 'user': conf_data['db_username'], 31 | 'password': conf_data['db_password'], 32 | 'database': conf_data['db_database'], 33 | } 34 | 35 | config['HMAC_ENCRYPTION_KEY'] = conf_data.get('hmac_key', '{{hmac_key}}') 36 | config['FLASK_SECRET_KEY'] = conf_data.get('gdcapi_secret_key', '{{gdcapi_secret_key}}') 37 | config['PSQL_USER_DB_CONNECTION'] = 'postgresql://%s:%s@%s:5432/%s' % tuple([ conf_data.get(key, key) for key in 
['fence_username', 'fence_password', 'fence_host', 'fence_database']]) 38 | config['OIDC_ISSUER'] = 'https://%s/user' % conf_data['hostname'] 39 | 40 | config['OAUTH2'] = { 41 | 'client_id': conf_data.get('oauth2_client_id', '{{oauth2_client_id}}'), 42 | 'client_secret': conf_data.get('oauth2_client_secret', '{{oauth2_client_secret}}'), 43 | 'api_base_url': 'https://%s/user/' % conf_data['hostname'], 44 | 'authorize_url': 'https://%s/user/oauth2/authorize' % conf_data['hostname'], 45 | 'access_token_url': 'https://%s/user/oauth2/token' % conf_data['hostname'], 46 | 'refresh_token_url': 'https://%s/user/oauth2/token' % conf_data['hostname'], 47 | 'client_kwargs': { 48 | 'redirect_uri': 'https://%s/api/v0/oauth2/authorize' % conf_data['hostname'], 49 | 'scope': 'openid data user', 50 | }, 51 | # deprecated key values, should be removed after all commons use new oidc 52 | 'internal_oauth_provider': 'http://fence-service/oauth2/', 53 | 'oauth_provider': 'https://%s/user/oauth2/' % conf_data['hostname'], 54 | 'redirect_uri': 'https://%s/api/v0/oauth2/authorize' % conf_data['hostname'] 55 | } 56 | config['USER_API'] = 'http://fence-service/' 57 | # option to force authutils to prioritize USER_API setting over the issuer from 58 | # token when redirecting, used during local docker compose setup when the 59 | # services are on different containers but the hostname is still localhost 60 | config['FORCE_ISSUER'] = True 61 | 62 | if environ.get('DICTIONARY_URL'): 63 | config['DICTIONARY_URL'] = environ.get('DICTIONARY_URL') 64 | else: 65 | config['PATH_TO_SCHEMA_DIR'] = environ.get('PATH_TO_SCHEMA_DIR') 66 | 67 | app_init(app) 68 | application = app 69 | application.debug = (environ.get('GEN3_DEBUG') == "True") 70 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/slide_image.yaml: -------------------------------------------------------------------------------- 1 | $schema: 
"http://json-schema.org/draft-04/schema#" 2 | 3 | id: "slide_image" 4 | title: Slide Image 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: data_file 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Data file containing image of a slide. 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | - file_state 23 | - error_type 24 | 25 | links: 26 | - exclusive: false 27 | required: true 28 | subgroup: 29 | - name: slides 30 | backref: slide_images 31 | label: data_from 32 | target_type: slide 33 | multiplicity: many_to_one 34 | required: false 35 | - name: core_metadata_collections 36 | backref: slide_images 37 | label: data_from 38 | target_type: core_metadata_collection 39 | multiplicity: many_to_many 40 | required: false 41 | 42 | required: 43 | - submitter_id 44 | - type 45 | - file_name 46 | - file_size 47 | - md5sum 48 | - data_category 49 | - data_type 50 | - data_format 51 | 52 | uniqueKeys: 53 | - [ id ] 54 | - [ project_id, submitter_id ] 55 | 56 | properties: 57 | $ref: "_definitions.yaml#/data_file_properties" 58 | type: 59 | enum: [ "slide_image" ] 60 | data_category: 61 | term: 62 | $ref: "_terms.yaml#/data_category" 63 | enum: 64 | - Biospecimen 65 | - Slide Image 66 | - Mass Cytometry 67 | data_type: 68 | term: 69 | $ref: "_terms.yaml#/data_type" 70 | enum: 71 | - image 72 | - Single Cell Image 73 | - Raw IMC Data 74 | - Single Channel IMC Image 75 | - Antibody Panel Added 76 | data_format: 77 | term: 78 | $ref: "_terms.yaml#/data_format" 79 | type: string 80 | experimental_strategy: 81 | description: "Classification of the slide type with respect to its experimental use." 82 | enum: 83 | - Diagnostic Slide 84 | - Tissue Slide 85 | cell_type: 86 | description: "The type of cell being imaged or otherwised analysed." 
87 | type: string 88 | cell_identifier: 89 | description: "An alternative identifier for a given cell type." 90 | type: string 91 | cell_count: 92 | description: "Count of the cell type being imaged or otherwise analysed." 93 | type: integer 94 | frame_identifier: 95 | description: "Name, number, or other identifier given to the frame of the slide from which this image was taken." 96 | type: string 97 | panel_used: 98 | description: "Name or other identifier given to the panel used during an IMC run." 99 | type: string 100 | protocol_used: 101 | description: "Name or other identifier given to the protocol used during an IMC run." 102 | type: string 103 | run_name: 104 | description: "Name, number, or other identifier given to the run that generated this slide image." 105 | type: string 106 | slides: 107 | $ref: "_definitions.yaml#/to_one" 108 | core_metadata_collections: 109 | $ref: "_definitions.yaml#/to_many" 110 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/slide_count.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "slide_count" 4 | title: Slide Count 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: notation 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Information pertaining to processed results obtained from slides; often in the form of counts. 
12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | 23 | links: 24 | - name: slides 25 | backref: slide_counts 26 | label: data_from 27 | target_type: slide 28 | multiplicity: many_to_many 29 | required: true 30 | 31 | required: 32 | - submitter_id 33 | - type 34 | - slides 35 | 36 | uniqueKeys: 37 | - [ id ] 38 | - [ project_id, submitter_id ] 39 | 40 | properties: 41 | type: 42 | enum: [ "slide_count" ] 43 | id: 44 | $ref: "_definitions.yaml#/UUID" 45 | systemAlias: node_id 46 | state: 47 | $ref: "_definitions.yaml#/state" 48 | submitter_id: 49 | type: 50 | - string 51 | - "null" 52 | cell_type: 53 | description: "The type of cell being counted or measured." 54 | type: string 55 | cell_identifier: 56 | description: "An alternative identifier for a given cell type." 57 | type: string 58 | cell_count: 59 | description: "Raw count of a particular cell type." 60 | type: integer 61 | ck_signal: 62 | description: "Numeric quantification of the CK signal." 63 | type: number 64 | biomarker_signal: 65 | description: "Numeric quantification of the biomarker signal." 66 | type: number 67 | er_localization: 68 | description: "Cellular localization of the endoplasmic reticulum as determined by staining." 69 | enum: 70 | - Nuclear 71 | - Cytoplasmic 72 | - Both 73 | - None 74 | - Not Determined 75 | frame_identifier: 76 | description: "Name, number, or other identifier given to the frame of the slide from which this image was taken." 77 | type: string 78 | relative_nuclear_size: 79 | description: "The ratio of the single cell's nucleus size to the average of the surrounding cells." 80 | type: number 81 | relative_nuclear_intensity: 82 | description: "The ratio of the single cell's nuclear staining intensity to the average of the surrounding cells." 
83 | type: number 84 | relative_cytokeratin_intensity: 85 | description: "The ratio of the single cell's cytokeratin staining intensity to the average of the surrounding cells." 86 | type: number 87 | relative_er_intensity: 88 | description: "The ratio of the single cell's endoplasmic reticulum staining intensity to the average of the surrounding cells." 89 | type: number 90 | run_name: 91 | description: "The name or identifier given to the run that was used to generate this slide count." 92 | type: string 93 | slides: 94 | $ref: "_definitions.yaml#/to_many" 95 | project_id: 96 | type: string 97 | created_datetime: 98 | $ref: "_definitions.yaml#/datetime" 99 | updated_datetime: 100 | $ref: "_definitions.yaml#/datetime" 101 | -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | #!groovy 2 | 3 | pipeline { 4 | agent any 5 | 6 | 7 | stages { 8 | stage('FetchCode') { 9 | steps { 10 | checkout scm 11 | dir('cloud-automation') { 12 | git( 13 | url: 'https://github.com/uc-cdis/cloud-automation.git', 14 | branch: 'master' 15 | ) 16 | } 17 | script { 18 | env.GEN3_HOME=env.WORKSPACE+'/cloud-automation' 19 | env.GEN3_NOPROXY='true' 20 | env.KLOCK_USER = "jenkins" + new Random().nextInt() 21 | } 22 | } 23 | } 24 | stage('docker pull') { 25 | steps { 26 | sh('sudo docker-compose pull') 27 | } 28 | } 29 | stage('AcquireLock') { 30 | steps { 31 | script { 32 | // acquire global lock to launch docker services on Jenkins host node 33 | def lockStatus = sh( script: "bash cloud-automation/gen3/bin/klock.sh lock dockerTest ${env.KLOCK_USER} 3600 -w 600", returnStatus: true) 34 | if (lockStatus != 0) { 35 | error("unable to acquire dockerTest lock") 36 | } 37 | } 38 | } 39 | } 40 | stage('docker up') { 41 | steps { 42 | sh 'sudo docker-compose down || true' 43 | sh 'sudo docker-compose config' 44 | //sh 'sudo docker-compose up -d' // see note below - this 
fails on k8s node 45 | } 46 | } 47 | stage('smoke test') { 48 | when { 49 | expression { 50 | return false // docker-compose -up above fails, because k8s owns the host node networking 51 | // + sudo docker-compose up -d 52 | // Creating network "ithub_org_compose-services_pr-20_devnet" with the default driver 53 | // Failed to program FILTER chain: iptables failed: iptables --wait -I FORWARD -o br-fa829e600aec -j DOCKER: iptables v1.4.21: Couldn't load target `DOCKER':No such file or directory 54 | } 55 | } 56 | steps { 57 | dir('testResults') { 58 | script { 59 | // get the IP address of the node Jenkins is running on 60 | def ipAddress = sh(script: "kubectl describe pod -l app=jenkins | grep Node: | sed 's@^.*/@@'", returnStdout: true) 61 | retry(10) { // retry smoke_test up to 10 times 62 | sleep(60) // give the services some time to start up 63 | sh(script: "bash ./smoke_test.sh ${ipAddress}") 64 | } 65 | } 66 | } 67 | } 68 | } 69 | } 70 | post { 71 | success { 72 | echo "https://jenkins.planx-pla.net/ $env.JOB_NAME pipeline succeeded" 73 | } 74 | failure { 75 | echo "Failure!" 76 | //archiveArtifacts artifacts: '**/output/*.png', fingerprint: true 77 | //slackSend color: 'bad', message: "https://jenkins.planx-pla.net $env.JOB_NAME pipeline failed" 78 | } 79 | unstable { 80 | echo "Unstable!" 
81 | //slackSend color: 'bad', message: "https://jenkins.planx-pla.net $env.JOB_NAME pipeline unstable" 82 | } 83 | always { 84 | script { 85 | uid = env.service+"-"+env.quaySuffix+"-"+env.BUILD_NUMBER 86 | withEnv(['GEN3_NOPROXY=true', "GEN3_HOME=$env.WORKSPACE/cloud-automation"]) { 87 | sh("bash cloud-automation/gen3/bin/klock.sh unlock dockerTest ${env.KLOCK_USER} || true") 88 | } 89 | } 90 | echo "done" 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/experiment.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "experiment" 4 | title: Experiment 5 | type: object 6 | namespace: http://bloodprofilingatlas.org/bpa/ 7 | category: administrative 8 | program: '*' 9 | project: '*' 10 | description: > 11 | A coordinated set of actions and observations designed to generate data, with the ultimate goal 12 | of discovery or hypothesis testing. 13 | additionalProperties: false 14 | submittable: true 15 | validators: null 16 | 17 | systemProperties: 18 | - id 19 | - project_id 20 | - created_datetime 21 | - updated_datetime 22 | - state 23 | 24 | links: 25 | - name: projects 26 | backref: experiments 27 | label: performed_for 28 | target_type: project 29 | multiplicity: many_to_one 30 | required: true 31 | 32 | required: 33 | - submitter_id 34 | - type 35 | - projects 36 | 37 | uniqueKeys: 38 | - [ id ] 39 | - [ project_id, submitter_id ] 40 | 41 | properties: 42 | type: 43 | enum: [ "experiment" ] 44 | id: 45 | $ref: "_definitions.yaml#/UUID" 46 | systemAlias: node_id 47 | state: 48 | $ref: "_definitions.yaml#/state" 49 | submitter_id: 50 | type: 51 | - string 52 | - "null" 53 | number_experimental_group: 54 | description: "The number denoting this experiment's place within the group within the whole." 
55 | type: 56 | - integer 57 | number_samples_per_experimental_group: 58 | description: "The number of samples contained within this experimental group." 59 | type: 60 | - integer 61 | experimental_description: 62 | description: "A brief description of the experiment being performed." 63 | type: 64 | - string 65 | experimental_intent: 66 | description: "Summary of the goals the experiment is designed to discover." 67 | type: 68 | - string 69 | associated_experiment: 70 | description: "The submitter_id for any experiment with which this experiment is associated, paired, or matched." 71 | type: 72 | - string 73 | type_of_sample: 74 | description: "String indicator identifying the types of samples as contrived or clinical." 75 | type: 76 | - string 77 | type_of_specimen: 78 | description: "Broad description of the specimens used in the experiment." 79 | type: 80 | - string 81 | marker_panel_description: 82 | description: "Brief description of the marker panel used in this experiment." 83 | type: string 84 | somatic_mutations_identified: 85 | description: "Are somatic mutations identified for this experiment?" 86 | type: boolean 87 | indels_identified: 88 | description: "Are indels identified in this experiment?" 89 | type: boolean 90 | copy_numbers_identified: 91 | description: "Are copy number variations identified in this experiment?" 92 | type: boolean 93 | type_of_data: 94 | description: "Is the data raw or processed?" 95 | enum: 96 | - Raw 97 | - Processed 98 | data_description: 99 | description: "Brief description of the data being provided for this experiment." 
100 | type: string 101 | projects: 102 | $ref: "_definitions.yaml#/to_one_project" 103 | project_id: 104 | $ref: "_definitions.yaml#/project_id" 105 | created_datetime: 106 | $ref: "_definitions.yaml#/datetime" 107 | updated_datetime: 108 | $ref: "_definitions.yaml#/datetime" 109 | -------------------------------------------------------------------------------- /datadictionary/README.md: -------------------------------------------------------------------------------- 1 | # Data Dictionary 2 | 3 | The data dictionary provides the first level of validation for all data 4 | stored in and generated by the BPA. Written in YAML, JSON schemas define all the individual entities 5 | (nodes) in the data model. Moreover, these schemas define all of the relationships (links) 6 | between the nodes. Finally, the schemas define the valid key-value pairs that can be used to 7 | describe the nodes. 8 | 9 | ## Data Dictionary Structure 10 | 11 | The Data Model covers all of the nodes within the as well as the relationships between 12 | the different types of nodes. All of the nodes in the data model are strongly typed and individually 13 | defined for a specific data type. For example, submitted files can come in different forms, such as 14 | aligned or unaligned reads; within the model we have two separately defined nodes for 15 | `Submitted Unaligned Reads` and `Submitted Aligned Reads`. Doing such allows for faster querying of 16 | the data model as well as providing a clear and concise representation of the data in the BPA. 17 | 18 | Beyond node type, there are also a number of extensions used to further define the nodes within 19 | the data model. Nodes are grouped up into categories that represent broad roles for the node such 20 | as `analysis` or `biospecimen`. Additionally, nodes are defined within their `Program` or `Project` 21 | and have descriptions of their use. 
All nodes also have a series of `systemProperties`; these 22 | properties are those that will be automatically filled by the system unless otherwise defined by 23 | the user. These basic properties define the node itself but still need to be placed into the model. 24 | 25 | The model itself is represented as a graph. Within the schema are defined `links`; these links 26 | point from child to parent with Program being the root of the graph. The links also contain a 27 | `backref` that allows for a parent to point back to a child. Other features of the link include a 28 | semantic `label` that describes the relationship between the two nodes, a `multiplicity` property 29 | that describes the numeric relationship from the child to the parent, and a requirement property 30 | to define whether a node must have that link. Taken all together the nodes and links create the 31 | directed graph of the Data Model. 32 | 33 | ## Node Properties and Examples 34 | 35 | Each node contains a series of potential key-value pairs (`properties`) that can be used to 36 | characterize the data they represent. Some properties are categorized as `required` or `preferred`. 37 | If a submission lacks a required property, it cannot be accepted. Preferred properties can denote 38 | two things: the property is being highlighted as it has become more desired by the community or 39 | the property is being promoted to required. All properties not designated either `required` or 40 | `preferred` are still sought by BPA, but submissions without them are allowed. 41 | 42 | The properties have further validation through their entries. Legal values are defined in each 43 | property. For the most part these are represented in the `enum` categories although some keys, 44 | such as `submitter_id`, will allow any string value as a valid entry. Other numeric properties 45 | can have maximum and minimum values to limit valid entries. 
For examples of what a valid entry 46 | would look like, each node has a mock submission located in the `examples/valid/` directory. 47 | 48 | ## Contributing 49 | 50 | Read how to contribute [here](https://github.com/NCI-GDC/portal-ui/blob/develop/CONTRIBUTING.md). 51 | -------------------------------------------------------------------------------- /templates/peregrine_settings.py: -------------------------------------------------------------------------------- 1 | from peregrine.api import app, app_init 2 | from os import environ 3 | import config_helper 4 | 5 | APP_NAME='peregrine' 6 | def load_json(file_name): 7 | return config_helper.load_json(file_name, APP_NAME) 8 | 9 | conf_data = load_json('creds.json') 10 | config = app.config 11 | 12 | config["AUTH"] = 'https://auth.service.consul:5000/v3/' 13 | config["AUTH_ADMIN_CREDS"] = None 14 | config["INTERNAL_AUTH"] = None 15 | 16 | # SIGNPOST is deprecated, replaced by INDEX_CLIENT (peregrine>=1.3.0) 17 | config['SIGNPOST'] = { 18 | 'host': environ.get('SIGNPOST_HOST', 'http://indexd-service'), 19 | 'version': 'v0', 20 | 'auth': ('indexd_client', conf_data.get('indexd_password', '{{indexd_password}}')), 21 | } 22 | config['INDEX_CLIENT'] = { 23 | 'host': environ.get('INDEX_CLIENT_HOST', 'http://indexd-service'), 24 | 'version': 'v0', 25 | 'auth': ('indexd_client', conf_data.get('indexd_password', '{{indexd_password}}')), 26 | } 27 | config["FAKE_AUTH"] = False 28 | config["PSQLGRAPH"] = { 29 | 'host': conf_data.get( 'db_host', '{{db_host}}' ), 30 | 'user': conf_data.get( 'db_username', '{{db_username}}' ), 31 | 'password': conf_data.get( 'db_password', '{{db_password}}' ), 32 | 'database': conf_data.get( 'db_database', '{{db_database}}' ), 33 | } 34 | 35 | config['HMAC_ENCRYPTION_KEY'] = conf_data.get( 'hmac_key', '{{hmac_key}}' ) 36 | config['FLASK_SECRET_KEY'] = conf_data.get( 'gdcapi_secret_key', '{{gdcapi_secret_key}}' ) 37 | config['PSQL_USER_DB_CONNECTION'] = 'postgresql://%s:%s@%s:5432/%s' % tuple([ 
conf_data.get(key, key) for key in ['fence_username', 'fence_password', 'fence_host', 'fence_database']]) 38 | 39 | if environ.get('DICTIONARY_URL'): 40 | config['DICTIONARY_URL'] = environ.get('DICTIONARY_URL') 41 | else: 42 | config['PATH_TO_SCHEMA_DIR'] = environ.get('PATH_TO_SCHEMA_DIR') 43 | 44 | config['SUBMISSION'] = { 45 | 'bucket': conf_data.get( 'bagit_bucket', '{{bagit_bucket}}' ) 46 | } 47 | 48 | config['STORAGE'] = { 49 | "s3": 50 | { 51 | "access_key": conf_data.get( 's3_access', '{{s3_access}}' ), 52 | 'secret_key': conf_data.get( 's3_secret', '{{s3_secret}}' ) 53 | } 54 | } 55 | 56 | config['OIDC_ISSUER'] = 'https://%s/user' % conf_data['hostname'] 57 | 58 | config['OAUTH2'] = { 59 | 'client_id': conf_data.get('oauth2_client_id', '{{oauth2_client_id}}'), 60 | 'client_secret': conf_data.get('oauth2_client_secret', '{{oauth2_client_secret}}'), 61 | 'api_base_url': 'https://%s/user/' % conf_data['hostname'], 62 | 'authorize_url': 'https://%s/user/oauth2/authorize' % conf_data['hostname'], 63 | 'access_token_url': 'https://%s/user/oauth2/token' % conf_data['hostname'], 64 | 'refresh_token_url': 'https://%s/user/oauth2/token' % conf_data['hostname'], 65 | 'client_kwargs': { 66 | 'redirect_uri': 'https://%s/api/v0/oauth2/authorize' % conf_data['hostname'], 67 | 'scope': 'openid data user', 68 | }, 69 | # deprecated key values, should be removed after all commons use new oidc 70 | 'internal_oauth_provider': 'http://fence-service/oauth2/', 71 | 'oauth_provider': 'https://%s/user/oauth2/' % conf_data['hostname'], 72 | 'redirect_uri': 'https://%s/api/v0/oauth2/authorize' % conf_data['hostname'] 73 | } 74 | 75 | config['USER_API'] = 'http://fence-service/' 76 | # option to force authutils to prioritize USER_API setting over the issuer from 77 | # token when redirecting, used during local docker compose setup when the 78 | # services are on different containers but the hostname is still localhost 79 | config['FORCE_ISSUER'] = True 80 | 81 | app_init(app) 82 | 
application = app 83 | application.debug = (environ.get('GEN3_DEBUG') == "True") 84 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/read_group_qc.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "read_group_qc" 4 | title: Read Group QC 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: notation 8 | project: '*' 9 | program: '*' 10 | description: "GDC QC run metadata." 11 | additionalProperties: false 12 | submittable: false 13 | validators: null 14 | 15 | systemProperties: 16 | - id 17 | - project_id 18 | - created_datetime 19 | - updated_datetime 20 | - state 21 | 22 | links: 23 | - exclusive: true 24 | required: true 25 | subgroup: 26 | - name: submitted_aligned_reads_files 27 | backref: read_group_qcs 28 | label: data_from 29 | target_type: submitted_aligned_reads 30 | multiplicity: one_to_one 31 | required: false 32 | - name: submitted_unaligned_reads_files 33 | backref: read_group_qcs 34 | label: data_from 35 | target_type: submitted_unaligned_reads 36 | multiplicity: one_to_many 37 | required: false 38 | - name: read_groups 39 | label: generated_from 40 | target_type: read_group 41 | multiplicity: many_to_one 42 | required: true 43 | backref: read_group_qcs 44 | 45 | required: 46 | - submitter_id 47 | - workflow_link 48 | - type 49 | - percent_gc_content 50 | - encoding 51 | - total_sequences 52 | - basic_statistics 53 | - per_base_sequence_quality 54 | - per_tile_sequence_quality 55 | - per_sequence_quality_score 56 | - per_base_sequence_content 57 | - per_sequence_gc_content 58 | - per_base_n_content 59 | - sequence_length_distribution 60 | - sequence_duplication_levels 61 | - overrepresented_sequences 62 | - adapter_content 63 | - kmer_content 64 | - read_groups 65 | 66 | uniqueKeys: 67 | - [ id ] 68 | - [ project_id, submitter_id ] 69 | 70 | properties: 71 | 
$ref: "_definitions.yaml#/workflow_properties" 72 | type: 73 | enum: [ "read_group_qc" ] 74 | workflow_type: 75 | term: 76 | $ref: "_terms.yaml#/workflow_type" 77 | enum: [ "Read Group Quality Control" ] 78 | fastq_name: 79 | term: 80 | $ref: "_terms.yaml#/file_name" 81 | type: string 82 | percent_aligned: 83 | description: "The percent of reads with at least one reported alignment." 84 | type: integer 85 | minimum: 0 86 | maximum: 100 87 | percent_gc_content: 88 | term: 89 | $ref: "_terms.yaml#/percent_gc_content" 90 | type: integer 91 | minimum: 0 92 | maximum: 100 93 | encoding: 94 | term: 95 | $ref: "_terms.yaml#/encoding" 96 | type: string 97 | total_aligned_reads: 98 | description: "The total number of reads with at least one reported alignment." 99 | type: integer 100 | total_sequences: 101 | term: 102 | $ref: "_terms.yaml#/total_sequences" 103 | type: integer 104 | basic_statistics: 105 | $ref: "_definitions.yaml#/qc_metrics_state" 106 | per_base_sequence_quality: 107 | $ref: "_definitions.yaml#/qc_metrics_state" 108 | per_tile_sequence_quality: 109 | $ref: "_definitions.yaml#/qc_metrics_state" 110 | per_sequence_quality_score: 111 | $ref: "_definitions.yaml#/qc_metrics_state" 112 | per_base_sequence_content: 113 | $ref: "_definitions.yaml#/qc_metrics_state" 114 | per_sequence_gc_content: 115 | $ref: "_definitions.yaml#/qc_metrics_state" 116 | per_base_n_content: 117 | $ref: "_definitions.yaml#/qc_metrics_state" 118 | sequence_length_distribution: 119 | $ref: "_definitions.yaml#/qc_metrics_state" 120 | sequence_duplication_levels: 121 | $ref: "_definitions.yaml#/qc_metrics_state" 122 | overrepresented_sequences: 123 | $ref: "_definitions.yaml#/qc_metrics_state" 124 | adapter_content: 125 | $ref: "_definitions.yaml#/qc_metrics_state" 126 | kmer_content: 127 | $ref: "_definitions.yaml#/qc_metrics_state" 128 | submitted_aligned_reads_files: 129 | $ref: "_definitions.yaml#/to_one" 130 | submitted_unaligned_reads_files: 131 | $ref: "_definitions.yaml#/to_many" 
132 | read_groups: 133 | $ref: "_definitions.yaml#/to_one" 134 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/slide.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "slide" 4 | title: Slide 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: biospecimen 8 | program: '*' 9 | project: '*' 10 | description: > 11 | A digital image, microscopic or otherwise, of any sample, portion, or sub-part thereof. (GDC) 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - state 20 | - created_datetime 21 | - updated_datetime 22 | 23 | links: 24 | - name: samples 25 | backref: slides 26 | label: derived_from 27 | target_type: sample 28 | multiplicity: many_to_many 29 | required: true 30 | 31 | required: 32 | - submitter_id 33 | - type 34 | - samples 35 | 36 | uniqueKeys: 37 | - [id] 38 | - [project_id, submitter_id] 39 | 40 | # slide properties 41 | properties: 42 | type: 43 | type: string 44 | id: 45 | $ref: "_definitions.yaml#/UUID" 46 | systemAlias: node_id 47 | state: 48 | $ref: "_definitions.yaml#/state" 49 | submitter_id: 50 | type: 51 | - string 52 | - "null" 53 | apoptotic_concentration: 54 | description: "The concentration, in cells/mL, of apoptotic cells in the slide blood." 55 | type: number 56 | ctc_concentration: 57 | description: "The concentration, in cells/mL, of traditional CTC cells (intact and enlarged cell and nucleus, cytokeratin positive, and CD45 negative) in the slide blood." 58 | type: number 59 | ctc_low_concentration: 60 | description: "The concentration, in cells/mL, of CTC-low cells (those with low cytokeratin levels compared to traditional CTCs) in the slide blood." 
61 | type: number 62 | ctc_small_concentration: 63 | description: "The concentration, in cells/mL, of CTC-small cells (those with a small nuclear and cellular size relative to traditional CTCs) in the slide blood." 64 | type: number 65 | section_location: 66 | term: 67 | $ref: "_terms.yaml#/section_location" 68 | type: string 69 | methanol_added: 70 | description: "True/False indicator for if methanol was used in the slide preparation process." 71 | type: boolean 72 | number_proliferating_cells: 73 | term: 74 | $ref: "_terms.yaml#/number_proliferating_cells" 75 | type: integer 76 | number_nucleated_cells: 77 | description: "The total number of nucleated cells identified on the slide." 78 | type: integer 79 | percent_tumor_cells: 80 | term: 81 | $ref: "_terms.yaml#/percent_tumor_cells" 82 | type: number 83 | percent_tumor_nuclei: 84 | term: 85 | $ref: "_terms.yaml#/percent_tumor_nuclei" 86 | type: number 87 | percent_normal_cells: 88 | term: 89 | $ref: "_terms.yaml#/percent_normal_cells" 90 | type: number 91 | percent_necrosis: 92 | term: 93 | $ref: "_terms.yaml#/percent_necrosis" 94 | type: number 95 | percent_stromal_cells: 96 | term: 97 | $ref: "_terms.yaml#/percent_stromal_cells" 98 | type: number 99 | percent_inflam_infiltration: 100 | term: 101 | $ref: "_terms.yaml#/percent_inflam_infiltration" 102 | type: number 103 | percent_lymphocyte_infiltration: 104 | term: 105 | $ref: "_terms.yaml#/percent_lymphocyte_infiltration" 106 | type: number 107 | percent_monocyte_infiltration: 108 | term: 109 | $ref: "_terms.yaml#/percent_monocyte_infiltration" 110 | type: number 111 | percent_granulocyte_infiltration: 112 | term: 113 | $ref: "_terms.yaml#/percent_granulocyte_infiltration" 114 | type: number 115 | percent_neutrophil_infiltration: 116 | term: 117 | $ref: "_terms.yaml#/percent_neutrophil_infiltration" 118 | type: number 119 | percent_eosinophil_infiltration: 120 | term: 121 | $ref: "_terms.yaml#/percent_eosinophil_infiltration" 122 | type: number 123 | 
run_datetime: 124 | $ref: "_definitions.yaml#/datetime" 125 | run_name: 126 | description: "Name, number, or other identifier given to this slide's run." 127 | type: string 128 | slide_identifier: 129 | description: "Unique identifier given to the this slide." 130 | type: string 131 | samples: 132 | $ref: "_definitions.yaml#/to_many" 133 | project_id: 134 | $ref: "_definitions.yaml#/project_id" 135 | created_datetime: 136 | $ref: "_definitions.yaml#/datetime" 137 | updated_datetime: 138 | $ref: "_definitions.yaml#/datetime" 139 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/project.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "project" 4 | title: Project 5 | type: object 6 | program: '*' 7 | project: '*' 8 | category: administrative 9 | description: > 10 | Any specifically defined piece of work that is undertaken or attempted to meet a single 11 | requirement. (NCIt C47885) 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - state 19 | - released 20 | - releasable 21 | - intended_release_date 22 | 23 | required: 24 | - code 25 | - name 26 | - dbgap_accession_number 27 | - programs 28 | 29 | uniqueKeys: 30 | - [ id ] 31 | - [ code ] 32 | 33 | links: 34 | - name: programs 35 | backref: projects 36 | label: member_of 37 | target_type: program 38 | multiplicity: many_to_one 39 | required: true 40 | 41 | constraints: null 42 | 43 | properties: 44 | type: 45 | type: string 46 | id: 47 | $ref: "_definitions.yaml#/UUID" 48 | systemAlias: node_id 49 | description: "UUID for the project." # TOREVIEW 50 | name: 51 | type: string 52 | description: "Display name/brief description for the project." # TOREVIEW 53 | code: 54 | type: string 55 | description: "Unique identifier for the project." 
56 | investigator_name: 57 | description: "Name of the principal investigator for the project." 58 | type: string 59 | investigator_affiliation: 60 | description: "The investigator's affiliation with respect to a research institution." 61 | type: string 62 | date_collected: 63 | description: "The date or date range in which the project data was collected." 64 | type: string 65 | availability_type: 66 | description: "Is the project open or restricted?" 67 | enum: 68 | - Open 69 | - Restricted 70 | availability_mechanism: 71 | description: "Mechanism by which the project will be made avilable." 72 | type: string 73 | support_source: 74 | description: "The name of source providing support/grant resources." 75 | type: string 76 | support_id: 77 | description: "The ID of the source providing support/grant resources." 78 | type: string 79 | programs: 80 | $ref: "_definitions.yaml#/to_one" 81 | description: > 82 | Indicates that the project is logically part of the indicated project. 83 | state: 84 | description: | 85 | The possible states a project can be in. All but `open` are 86 | equivalent to some type of locked state. 87 | default: open 88 | enum: 89 | # open: the only state users can perform 'upload' actions 90 | # possible actions in `open`: 91 | # - upload (no state change) 92 | # - review -> review 93 | # - release (project.released -> true) 94 | - open 95 | 96 | # locked: admin has locked project for review 97 | # possible actions in `locked`: 98 | # - open -> open 99 | # - submit -> submitted 100 | # - release (project.released -> true) 101 | - review 102 | 103 | # submitted: An admin has submitted project, it is locked against 104 | # upload. 
105 | # possible actions in `submitted`: 106 | # - process -> processing 107 | # - release (project.released -> true) 108 | - submitted 109 | 110 | # processing: The system is processing data in the project and 111 | # is locked against upload and submission 112 | # - (system transition to open) 113 | # - release (project.released -> true) 114 | - processing 115 | 116 | 117 | # closed: The closed state is introduced to replace the 118 | # ``legacy`` state and means that no further action 119 | # can be taken on the project 120 | # - (system transition to open) 121 | # - release (project.released -> true) 122 | - closed 123 | 124 | # DEPRECATED(2016-03-01): synonymous with closed. included for 125 | # backwards compatibility 126 | - legacy 127 | 128 | released: 129 | description: | 130 | To release a project is to tell the GDC to include all submitted 131 | entities in the next GDC index. 132 | default: false 133 | type: boolean 134 | 135 | releasable: 136 | description: | 137 | A project can only be released by the user when `releasable` is true. 138 | default: false 139 | type: boolean 140 | 141 | intended_release_date: 142 | description: Tracks a Project's intended release date. 143 | type: string 144 | format: date-time 145 | dbgap_accession_number: 146 | type: string 147 | description: "The dbgap accession number provided for the project." 
148 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/core_metadata_collection.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "core_metadata_collection" 4 | title: Core Metadata Collection 5 | type: object 6 | namespace: https://dcp.bionimbus.org/ 7 | category: administrative 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Structured description of a collection of several datasets 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - state 20 | - created_datetime 21 | - updated_datetime 22 | 23 | links: 24 | - name: projects 25 | backref: core_metadata_collections 26 | label: data_from 27 | target_type: project 28 | multiplicity: many_to_one 29 | required: true 30 | 31 | uniqueKeys: 32 | - [id] 33 | - [project_id, submitter_id] 34 | 35 | required: 36 | - submitter_id 37 | - type 38 | - projects 39 | 40 | properties: 41 | $ref: "_definitions.yaml#/ubiquitous_properties" 42 | 43 | contributor: 44 | description: > 45 | An entity responsible for making contributions to the resource. Examples of a Contributor include a person, an organization, or a service. Typically, the name of a Contributor should be used to indicate the entity. 46 | type: string 47 | 48 | coverage: 49 | description: > 50 | The spatial or temporal topic of the resource, the spatial applicability of the resource, or the jurisdiction under which the resource is relevant. Spatial topic and spatial applicability may be a named place or a location specified by its geographic coordinates. Temporal topic may be a named period, date, or date range. A jurisdiction may be a named administrative entity or a geographic place to which the resource applies. 
Recommended best practice is to use a controlled vocabulary such as the Thesaurus of Geographic Names [TGN] (http://www.getty.edu/research/tools/vocabulary/tgn/index.html). Where appropriate, named places or time periods can be used in preference to numeric identifiers such as sets of coordinates or date ranges. 51 | type: string 52 | 53 | creator: 54 | description: > 55 | An entity primarily responsible for making the resource. Examples of a Creator include a person, an organization, or a service. Typically, the name of a Creator should be used to indicate the entity. 56 | type: string 57 | 58 | date: 59 | $ref: "_definitions.yaml#/datetime" 60 | 61 | description: 62 | description: > 63 | An account of the resource. Description may include but is not limited to: an abstract, a table of contents, a graphical representation, or a free-text account of the resource. 64 | type: string 65 | 66 | format: 67 | description: > 68 | The file format, physical medium, or dimensions of the resource. Examples of dimensions include size and duration. Recommended best practice is to use a controlled vocabulary such as the list of Internet Media Types [MIME] (http://www.iana.org/assignments/media-types/). 69 | type: string 70 | 71 | language: 72 | description: > 73 | A language of the resource. Recommended best practice is to use a controlled vocabulary such as RFC 4646 (http://www.ietf.org/rfc/rfc4646.txt). 74 | type: string 75 | 76 | publisher: 77 | description: > 78 | An entity responsible for making the resource available. Examples of a Publisher include a person, an organization, or a service. Typically, the name of a Publisher should be used to indicate the entity. 79 | type: string 80 | 81 | relation: 82 | description: > 83 | A related resource. Recommended best practice is to identify the related resource by means of a string conforming to a formal identification system.  
84 | type: string 85 | 86 | rights: 87 | description: > 88 | Information about rights held in and over the resource. Typically, rights information includes a statement about various property rights associated with the resource, including intellectual property rights. 89 | type: string 90 | 91 | source: 92 | description: > 93 | A related resource from which the described resource is derived. The described resource may be derived from the related resource in whole or in part. Recommended best practice is to identify the related resource by means of a string conforming to a formal identification system. 94 | type: string 95 | 96 | subject: 97 | description: > 98 | The topic of the resource. Typically, the subject will be represented using keywords, key phrases, or classification codes. Recommended best practice is to use a controlled vocabulary. 99 | type: string 100 | 101 | title: 102 | description: > 103 | A name given to the resource. Typically, a Title will be a name by which the resource is formally known. 104 | type: string 105 | 106 | data_type: 107 | description: > 108 | The nature or genre of the resource. Recommended best practice is to use a controlled vocabulary such as the DCMI Type Vocabulary [DCMITYPE]. To describe the file format, physical medium, or dimensions of the resource, use the Format element. 109 | type: string 110 | 111 | projects: 112 | $ref: "_definitions.yaml#/to_one_project" 113 | 114 | -------------------------------------------------------------------------------- /docs/dev_tips.md: -------------------------------------------------------------------------------- 1 | # Dev Tips 2 | 3 | You can quickly find commonly used commands for compose services in our [cheat sheet](https://github.com/uc-cdis/compose-services/blob/master/docs/cheat_sheet.md). 
4 | 5 | When developing, you can have local repositories of the services you are working on and use volumes to mount your local repository files onto the containers to override the containers' code (which is built from GitHub using quay.io). Then, you can restart a single container with 6 | ``` 7 | docker-compose restart [CONTAINER_NAME] 8 | ``` 9 | after you update some code in order to see changes without having to rebuild all the microservices. Keep in mind that running `docker-compose restart` does not apply changes you make in the docker-compose file. Look up the Docker documentation for more information about [volumes](https://docs.docker.com/storage/). 10 | 11 | ## Spark service hdfs reformatting issue 12 | 13 | On startup, the `spark-service` runs `hdfs namenode -format`, which is a compute-intensive operation. If your `spark-service` fails to start due to being killed by docker daemon, e.g. the container status is `Exited (255)`, then tail the last lines of log as follows: 14 | 15 | ``` 16 | docker logs spark-service --tail=5 17 | /************************************************************ 18 | SHUTDOWN_MSG: Shutting down NameNode at 3b8d38960f74/172.20.0.2 19 | ************************************************************/ 20 | 2021-04-07 02:30:55,414 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 21 | safemode: Your endpoint configuration is wrong; For more details see: http://wiki.apache.org/hadoop/UnsetHostnameOrPort 22 | ``` 23 | 24 | Before attempting to (re)start the `spark-service`, make sure to delete the exited/failed container first. 25 | 26 | ``` 27 | docker rm spark-service 28 | docker-compose up -d 29 | ``` 30 | 31 | Otherwise, you may encounter the following looping in the container log: 32 | 33 | ``` 34 | docker logs spark-service --tail=5 35 | Re-format filesystem in Storage Directory root= /hadoop/hdfs/data/dfs/namenode; location= null ? 
(Y or N) Invalid input: 36 | Re-format filesystem in Storage Directory root= /hadoop/hdfs/data/dfs/namenode; location= null ? (Y or N) Invalid input: 37 | Re-format filesystem in Storage Directory root= /hadoop/hdfs/data/dfs/namenode; location= null ? (Y or N) Invalid input: 38 | Re-format filesystem in Storage Directory root= /hadoop/hdfs/data/dfs/namenode; location= null ? (Y or N) Invalid input: 39 | Re-format filesystem in Storage Directory root= /hadoop/hdfs/data/dfs/namenode; location= null ? (Y or N) Invalid input: 40 | ``` 41 | 42 | ## Running Docker Compose on a Remote Machine 43 | 44 | To run Docker Compose on a remote machine, modify the `BASE_URL` field in `fence-config.yaml`, and the `hostname` field in `peregrine_creds.json` and `sheepdog_creds.json` in the `Secrets` directory. 45 | 46 | ## Dumping config files and logs (MacOS/Linux) 47 | 48 | If you are encountering difficulties while setting up Docker Compose and need help from the Gen3 team, you can use the `dump.sh` script to create a zip file of your configuration and current logs, which you can share to get help. 49 | ``` 50 | bash dump.sh 51 | ``` 52 | Note that if docker-compose is not running, the logs will be empty. 53 | 54 | The following configuration files will be included: 55 | * docker-compose.yml 56 | * user.yaml 57 | * any file ending with "settings" or "config" 58 | 59 | Credentials files are NOT included and lines containing "password", "secret" or "key" are removed from other files. 60 | If your files contain other kinds of sensitive credentials, make sure to remove them before running the script. 61 | 62 | ## Environment Details 63 | 64 | The sandbox ecosystem deployed thus architecturally looks as shown below: 65 | ![Sandbox](https://github.com/uc-cdis/compose-services/blob/master/SandboxContainers.jpg) 66 | 67 | 68 | All the microservices communicate with the Postgres Container based on the configuration specified above. 
Once the services are up and running, the environment can be visualized using the windmill microservice running on port 80 by typing the URL of the machine on which the containers are deployed. Please see example screenshot below as an example: 69 | 70 | ![Launch Portal](https://github.com/uc-cdis/compose-services/blob/master/LaunchPortal.jpg) 71 | 72 | Upon clicking 'Login from Google' and providing Google Credentials (if the same Google Account is used where the developer credentials came from), the system redirects the user to their landing page as shown below: 73 | 74 | 75 | ![Logged Into Portal](https://github.com/uc-cdis/compose-services/blob/master/LoggedInScreenshot.jpg) 76 | 77 | 78 | ## Revproxy-service cannot start 79 | 80 | If revproxy-service cannot start an error will occur. It may be useful to 81 | ``` 82 | docker-compose down 83 | docker-compose up -d 84 | ``` 85 | If the error still occurs, make sure that apache2 and revproxy-service do not share the same port. You can change the port for revproxy-service and any other service in the `docker-compose.yaml` [file](https://github.com/uc-cdis/compose-services/blob/bf1dbc0f43519c1d6bc25d9cb331b78c3b35ecca/docker-compose.yml#L215). For revproxy you would also need to change the port in the `nginx.conf` [here](https://github.com/uc-cdis/compose-services/blob/bf1dbc0f43519c1d6bc25d9cb331b78c3b35ecca/nginx.conf#L29). 86 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/treatment.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "treatment" 4 | title: Treatment 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: clinical 8 | program: '*' 9 | project: '*' 10 | description: > 11 | Record of the administration and intention of therapeutic agents provided to a patient to alter 12 | the course of a pathologic process. 
13 | additionalProperties: false 14 | submittable: true 15 | validators: null 16 | 17 | systemProperties: 18 | - id 19 | - project_id 20 | - state 21 | - created_datetime 22 | - updated_datetime 23 | 24 | required: 25 | - submitter_id 26 | - type 27 | 28 | links: 29 | - name: diagnoses 30 | backref: treatments 31 | label: describes #need better term here 32 | target_type: diagnosis 33 | multiplicity: many_to_one 34 | required: true 35 | 36 | 37 | 38 | uniqueKeys: 39 | #unclear if want submitter ID for clinical 40 | - [id] 41 | - [project_id, submitter_id] 42 | 43 | properties: 44 | type: 45 | enum: [ "treatment" ] 46 | 47 | id: 48 | $ref: "_definitions.yaml#/UUID" 49 | systemAlias: node_id 50 | 51 | state: 52 | $ref: "_definitions.yaml#/state" 53 | 54 | submitter_id: 55 | type: 56 | - string 57 | - "null" 58 | 59 | days_to_treatment: 60 | term: 61 | $ref: "_terms.yaml#/days_to_treatment" 62 | type: number 63 | 64 | days_to_treatment_end: 65 | term: 66 | $ref: "_terms.yaml#/days_to_treatment_end" 67 | type: number 68 | 69 | days_to_treatment_start: 70 | term: 71 | $ref: "_terms.yaml#/days_to_treatment_start" 72 | type: number 73 | 74 | therapeutic_agents: 75 | term: 76 | $ref: "_terms.yaml#/therapeutic_agents" 77 | type: string 78 | 79 | treatment_anatomic_site: 80 | term: 81 | $ref: "_terms.yaml#/treatment_anatomic_site" 82 | enum: 83 | - Abdomen, total 84 | - Arm 85 | - Ascites 86 | - Axillary 87 | - Body, total 88 | - Bone 89 | - Bone, non-spine 90 | - Brain, focal 91 | - Brain, whole 92 | - Brain-C2 93 | - Breast 94 | - Cervical 95 | - Chest Wall 96 | - Effusion 97 | - Epitrochlear 98 | - Eye 99 | - Femoral 100 | - Gastrointestinal, Colon 101 | - Gastrointestinal, Gallbladder 102 | - Gastrointestinal, Intestine 103 | - Gastrointestinal, Liver 104 | - Gastrointestinal, NOS 105 | - Gastrointestinal, Pancreas 106 | - Gastrointestinal, Rectum 107 | - Gastrointestinal, Stomach 108 | - Genitourinary, Bladder 109 | - Genitourinary, Kidney 110 | - Genitourinary, NOS 111 
| - Genitourinary, Prostate 112 | - Genitourinary, Prostate and Seminal Vesicles 113 | - Head 114 | - Head, Face, or Neck 115 | - Hilar 116 | - Iliac-common 117 | - Iliac-external 118 | - Inguinal 119 | - Internal Mammary Nodes 120 | - Leg 121 | - Lung 122 | - Lymph Nodes 123 | - Lymph node, distant (specify site) 124 | - Lymph node, locoregional (specify site) 125 | - Mantle 126 | - Mediastinal 127 | - Mediastinum 128 | - Mesenteric 129 | - Occipital 130 | - Other 131 | - Paraaortic 132 | - Parametrium 133 | - Parotid 134 | - Pelvis 135 | - Popliteal 136 | - Primary tumor site 137 | - Prostate 138 | - Prostate Bed 139 | - Prostate, Seminal Vesicles and Lymph Nodes 140 | - Rectum 141 | - Retroperitoneal 142 | - Sacrum 143 | - Seminal vesicles 144 | - Shoulder 145 | - Skin, lower extremity, local 146 | - Skin, total 147 | - Skin, trunk, local 148 | - Skin, upper extremity, local 149 | - Spine 150 | - Spine, whole 151 | - Splenic 152 | - Submandibular 153 | - Supraclavicular 154 | - Supraclavicular/Axillary Level 3 155 | - Thorax 156 | - Trunk 157 | - Unknown 158 | - Not Reported 159 | - Not Allowed To Collect 160 | 161 | treatment_intent_type: 162 | term: 163 | $ref: "_terms.yaml#/treatment_intent_type" 164 | type: string 165 | 166 | treatment_or_therapy: 167 | term: 168 | $ref: "_terms.yaml#/treatment_or_therapy" 169 | enum: 170 | - "yes" 171 | - "no" 172 | - unknown 173 | - not reported 174 | 175 | treatment_outcome: 176 | term: 177 | $ref: "_terms.yaml#/treatment_outcome" 178 | enum: 179 | - Complete Response 180 | - Partial Response 181 | - Treatment Ongoing 182 | - Treatment Stopped Due to Toxicity 183 | - Unknown 184 | 185 | treatment_type: 186 | term: 187 | $ref: "_terms.yaml#/treatment_type" 188 | enum: 189 | - Ablation 190 | - Chemotherapy 191 | - Concurrent Chemoradiation 192 | - Cryoablation 193 | - Embolization 194 | - Hormone Therapy 195 | - Internal Radiation 196 | - Immunotherapy (Including Vaccines) 197 | - Other 198 | - Pharmaceutical Therapy 199 | 
- Radiation Therapy 200 | - Stem Cell Treatment 201 | - Surgery 202 | - Targeted Molecular Therapy 203 | - Unknown 204 | - Not Reported 205 | - Not Allowed To Collect 206 | 207 | diagnoses: 208 | $ref: "_definitions.yaml#/to_one" 209 | 210 | project_id: 211 | $ref: "_definitions.yaml#/project_id" 212 | 213 | # ======== Timestamps ======== 214 | created_datetime: 215 | $ref: "_definitions.yaml#/datetime" 216 | updated_datetime: 217 | $ref: "_definitions.yaml#/datetime" 218 | -------------------------------------------------------------------------------- /templates/user.yaml: -------------------------------------------------------------------------------- 1 | authz: 2 | # policies automatically given to anyone, even if they are not authenticated 3 | anonymous_policies: 4 | - open_data_reader 5 | 6 | # policies automatically given to authenticated users (in addition to their other policies) 7 | all_users_policies: [] 8 | 9 | groups: 10 | # can CRUD programs and projects and upload data files 11 | - name: data_submitters 12 | policies: 13 | - services.sheepdog-admin 14 | - data_upload 15 | - MyFirstProject_submitter 16 | users: 17 | - username1@gmail.com 18 | 19 | # can create/update/delete indexd records 20 | - name: indexd_admins 21 | policies: 22 | - indexd_admin 23 | users: 24 | - username1@gmail.com 25 | 26 | resources: 27 | - name: workspace 28 | - name: data_file 29 | - name: services 30 | subresources: 31 | - name: sheepdog 32 | subresources: 33 | - name: submission 34 | subresources: 35 | - name: program 36 | - name: project 37 | - name: open 38 | - name: programs 39 | subresources: 40 | - name: MyFirstProgram 41 | subresources: 42 | - name: projects 43 | subresources: 44 | - name: MyFirstProject 45 | - name: jnkns 46 | subresources: 47 | - name: projects 48 | subresources: 49 | - name: jenkins 50 | - name: program1 51 | subresources: 52 | - name: projects 53 | subresources: 54 | - name: P1 55 | 56 | policies: 57 | - id: workspace 58 | description: be able to 
use workspace 59 | resource_paths: 60 | - /workspace 61 | role_ids: 62 | - workspace_user 63 | - id: data_upload 64 | description: upload raw data files to S3 65 | role_ids: 66 | - file_uploader 67 | resource_paths: 68 | - /data_file 69 | - id: services.sheepdog-admin 70 | description: CRUD access to programs and projects 71 | role_ids: 72 | - sheepdog_admin 73 | resource_paths: 74 | - /services/sheepdog/submission/program 75 | - /services/sheepdog/submission/project 76 | - id: indexd_admin 77 | description: full access to indexd API 78 | role_ids: 79 | - indexd_admin 80 | resource_paths: 81 | - /programs 82 | - id: open_data_reader 83 | role_ids: 84 | - reader 85 | - storage_reader 86 | resource_paths: 87 | - /open 88 | - id: all_programs_reader 89 | role_ids: 90 | - reader 91 | - storage_reader 92 | resource_paths: 93 | - /programs 94 | - id: MyFirstProject_submitter 95 | role_ids: 96 | - reader 97 | - creator 98 | - updater 99 | - deleter 100 | - storage_reader 101 | - storage_writer 102 | resource_paths: 103 | - /programs/MyFirstProgram/projects/MyFirstProject 104 | - id: jnkns 105 | role_ids: 106 | - reader 107 | - creator 108 | - updater 109 | - deleter 110 | - storage_reader 111 | - storage_writer 112 | resource_paths: 113 | - /programs/jnkns 114 | - /programs/jnkns/projects/jenkins 115 | - id: program1 116 | role_ids: 117 | - reader 118 | - creator 119 | - updater 120 | - deleter 121 | - storage_reader 122 | - storage_writer 123 | resource_paths: 124 | - /programs/program1 125 | - /programs/program1/projects/P1 126 | 127 | roles: 128 | - id: file_uploader 129 | permissions: 130 | - id: file_upload 131 | action: 132 | service: fence 133 | method: file_upload 134 | - id: workspace_user 135 | permissions: 136 | - id: workspace_access 137 | action: 138 | service: jupyterhub 139 | method: access 140 | - id: sheepdog_admin 141 | description: CRUD access to programs and projects 142 | permissions: 143 | - id: sheepdog_admin_action 144 | action: 145 | service: 
sheepdog 146 | method: '*' 147 | - id: indexd_admin 148 | description: full access to indexd API 149 | permissions: 150 | - id: indexd_admin 151 | action: 152 | service: indexd 153 | method: '*' 154 | - id: admin 155 | permissions: 156 | - id: admin 157 | action: 158 | service: '*' 159 | method: '*' 160 | - id: creator 161 | permissions: 162 | - id: creator 163 | action: 164 | service: '*' 165 | method: create 166 | - id: reader 167 | permissions: 168 | - id: reader 169 | action: 170 | service: '*' 171 | method: read 172 | - id: updater 173 | permissions: 174 | - id: updater 175 | action: 176 | service: '*' 177 | method: update 178 | - id: deleter 179 | permissions: 180 | - id: deleter 181 | action: 182 | service: '*' 183 | method: delete 184 | - id: storage_writer 185 | permissions: 186 | - id: storage_creator 187 | action: 188 | service: '*' 189 | method: write-storage 190 | - id: storage_reader 191 | permissions: 192 | - id: storage_reader 193 | action: 194 | service: '*' 195 | method: read-storage 196 | 197 | clients: 198 | wts: 199 | policies: 200 | - all_programs_reader 201 | - open_data_reader 202 | 203 | users: 204 | username1@gmail.com: 205 | tags: 206 | name: User One 207 | # email: mustbe@differentemail.com 208 | policies: 209 | - workspace 210 | - data_upload 211 | - MyFirstProject_submitter 212 | - jnkns 213 | - program1 214 | username2: 215 | tags: 216 | name: John Doe 217 | email: johndoe@gmail.com 218 | 219 | cloud_providers: {} 220 | groups: {} 221 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schemas/clinical_test.yaml: -------------------------------------------------------------------------------- 1 | $schema: "http://json-schema.org/draft-04/schema#" 2 | 3 | id: "clinical_test" 4 | title: Clinical Test 5 | type: object 6 | namespace: http://gdc.nci.nih.gov 7 | category: clinical 8 | project: '*' 9 | program: '*' 10 | description: > 11 | Metadata concerning any clinical tests 
used in relation to a case diagnosis. 12 | additionalProperties: false 13 | submittable: true 14 | validators: null 15 | 16 | systemProperties: 17 | - id 18 | - project_id 19 | - created_datetime 20 | - updated_datetime 21 | - state 22 | 23 | links: 24 | - name: cases 25 | backref: clinical_tests 26 | label: performed_for 27 | target_type: case 28 | multiplicity: many_to_one 29 | required: true 30 | - name: diagnoses 31 | backref: clinical_tests 32 | label: relates_to 33 | target_type: diagnosis 34 | multiplicity: many_to_many 35 | required: false 36 | 37 | required: 38 | - submitter_id 39 | - type 40 | - biomarker_name 41 | - biomarker_result 42 | - biomarker_test_method 43 | - cases 44 | 45 | uniqueKeys: 46 | - [id] 47 | - [project_id, submitter_id] 48 | 49 | properties: 50 | type: 51 | enum: [ "clinical_test" ] 52 | 53 | id: 54 | $ref: "_definitions.yaml#/UUID" 55 | systemAlias: node_id 56 | 57 | state: 58 | $ref: "_definitions.yaml#/state" 59 | 60 | submitter_id: 61 | type: 62 | - string 63 | - "null" 64 | 65 | biomarker_name: 66 | term: 67 | $ref: "_terms.yaml#/biomarker_name" 68 | type: string 69 | 70 | biomarker_result: 71 | term: 72 | $ref: "_terms.yaml#/biomarker_result" 73 | enum: 74 | - Amplification 75 | - Gain 76 | - Loss 77 | - Normal 78 | - Other 79 | - Translocation 80 | - Not Reported 81 | - Not Allowed To Collect 82 | - Pending 83 | 84 | biomarker_test_method: 85 | term: 86 | $ref: "_terms.yaml#/biomarker_test_method" 87 | enum: 88 | - Cytogenetics 89 | - FISH 90 | - IHC 91 | - Karyotype 92 | - NGS 93 | - Nuclear Staining 94 | - Other 95 | - RT-PCR 96 | - Southern 97 | - Not Reported 98 | - Not Allowed To Collect 99 | - Pending 100 | 101 | cea_level_preoperative: 102 | term: 103 | $ref: "_terms.yaml#/cea_level_preoperative" 104 | type: number 105 | 106 | dlco_ref_predictive_percent: 107 | term: 108 | $ref: "_terms.yaml#/dlco_ref_predictive_percent" 109 | type: number 110 | 111 | estrogen_receptor_percent_positive_ihc: 112 | term: 113 | $ref: 
"_terms.yaml#/estrogen_receptor_percent_positive_ihc" 114 | enum: 115 | - <1% 116 | - 1-10% 117 | - 11-20% 118 | - 21-30% 119 | - 31-40% 120 | - 41-50% 121 | - 51-60% 122 | - 61-70% 123 | - 71-80% 124 | - 81-90% 125 | - 91-100% 126 | 127 | estrogen_receptor_result_ihc: 128 | term: 129 | $ref: "_terms.yaml#/estrogen_receptor_result_ihc" 130 | enum: 131 | - Negative 132 | - Not Performed 133 | - Positive 134 | - Unknown 135 | 136 | fev1_ref_post_bronch_percent: 137 | term: 138 | $ref: "_terms.yaml#/fev1_ref_post_bronch_percent" 139 | type: number 140 | 141 | fev1_ref_pre_bronch_percent: 142 | term: 143 | $ref: "_terms.yaml#/fev1_ref_pre_bronch_percent" 144 | type: number 145 | 146 | fev1_fvc_post_bronch_percent: 147 | term: 148 | $ref: "_terms.yaml#/fev1_fvc_post_bronch_percent" 149 | type: number 150 | 151 | fev1_fvc_pre_bronch_percent: 152 | term: 153 | $ref: "_terms.yaml#/fev1_fvc_pre_bronch_percent" 154 | type: number 155 | 156 | her2_erbb2_percent_positive_ihc: 157 | term: 158 | $ref: "_terms.yaml#/her2_erbb2_percent_positive_ihc" 159 | enum: 160 | - <1% 161 | - 1-10% 162 | - 11-20% 163 | - 21-30% 164 | - 31-40% 165 | - 41-50% 166 | - 51-60% 167 | - 61-70% 168 | - 71-80% 169 | - 81-90% 170 | - 91-100% 171 | 172 | her2_erbb2_result_fish: 173 | term: 174 | $ref: "_terms.yaml#/her2_erbb2_result_fish" 175 | enum: 176 | - Negative 177 | - Not Performed 178 | - Positive 179 | - Unknown 180 | 181 | her2_erbb2_result_ihc: 182 | term: 183 | $ref: "_terms.yaml#/her2_erbb2_result_ihc" 184 | enum: 185 | - Negative 186 | - Not Performed 187 | - Positive 188 | - Unknown 189 | 190 | ldh_level_at_diagnosis: 191 | term: 192 | $ref: "_terms.yaml#/ldh_level_at_diagnosis" 193 | type: number 194 | 195 | ldh_normal_range_upper: 196 | term: 197 | $ref: "_terms.yaml#/ldh_normal_range_upper" 198 | type: number 199 | 200 | microsatellite_instability_abnormal: 201 | term: 202 | $ref: "_terms.yaml#/microsatellite_instability_abnormal" 203 | enum: 204 | - "Yes" 205 | - "No" 206 | - Unknown 
207 | 208 | progesterone_receptor_percent_positive_ihc: 209 | term: 210 | $ref: "_terms.yaml#/progesterone_receptor_percent_positive_ihc" 211 | enum: 212 | - <1% 213 | - 1-10% 214 | - 11-20% 215 | - 21-30% 216 | - 31-40% 217 | - 41-50% 218 | - 51-60% 219 | - 61-70% 220 | - 71-80% 221 | - 81-90% 222 | - 91-100% 223 | 224 | progesterone_receptor_result_ihc: 225 | term: 226 | $ref: "_terms.yaml#/progesterone_receptor_result_ihc" 227 | enum: 228 | - Negative 229 | - Not Performed 230 | - Positive 231 | - Unknown 232 | 233 | cases: 234 | $ref: "_definitions.yaml#/to_one" 235 | diagnoses: 236 | $ref: "_definitions.yaml#/to_many" 237 | project_id: 238 | $ref: "_definitions.yaml#/project_id" 239 | created_datetime: 240 | $ref: "_definitions.yaml#/datetime" 241 | updated_datetime: 242 | $ref: "_definitions.yaml#/datetime" 243 | -------------------------------------------------------------------------------- /creds_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to setup keys for fence as well as ssl credentials 3 | 4 | if [[ ! -d ./templates ]]; then 5 | echo "ERROR: ./templates not found - run in compose-services folder" 6 | exit 1 7 | fi 8 | if [[ -d Secrets ]]; then 9 | # make a backup 10 | bak="Secrets$(date +%Y%m%d%H%M%S).bak" 11 | if [[ -e "$bak" ]]; then 12 | echo "ERROR: ./Secrets and $bak already exist" 13 | exit 1 14 | fi 15 | echo "Backing up ./Secrets/ to ./$bak/" 16 | cp -r ./Secrets "./$bak" 17 | fi 18 | 19 | mkdir -p Secrets 20 | 21 | for path in templates/*; do 22 | target="Secrets/$(basename "$path")" 23 | if [[ "$path" =~ \.py$ ]]; then # update python files 24 | echo "Copying $path to $target" 25 | cp "$path" "$target" 26 | elif [[ ! -e "$target" ]]; then 27 | echo "Copying $path to $target" 28 | cp -r "$path" "$target" 29 | else 30 | echo "$target already exists" 31 | fi 32 | done 33 | 34 | tempFile="gen3scratch.tmp" 35 | if [ ! 
-z $1 ]; then 36 | customHost="$1" 37 | shift 38 | # be careful with sed -i on Mac: https://stackoverflow.com/questions/19456518/invalid-command-code-despite-escaping-periods-using-sed 39 | for name in Secrets/fence-config.yaml Secrets/*_creds.json; do 40 | sed "s/localhost/$customHost/g" "$name" > "$tempFile" && \ 41 | cp "$tempFile" "$name" 42 | done 43 | fi 44 | 45 | configFile=./Secrets/fence-config.yaml 46 | if grep "^ENCRYPTION_KEY: ''" "$configFile" > /dev/null; then 47 | # be careful with sed on Mac: https://stackoverflow.com/questions/19456518/invalid-command-code-despite-escaping-periods-using-sed 48 | key="$(python ./scripts/fence_key_helper.py)" && \ 49 | sed "s/^ENCRYPTION_KEY: ''/ENCRYPTION_KEY: '$key'/" "$configFile" > "$tempFile" && \ 50 | cp "$tempFile" "$configFile" 51 | fi 52 | rm "$tempFile" 53 | 54 | cd Secrets 55 | 56 | # make directories for temporary credentials 57 | timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") 58 | 59 | # generate private and public key for fence 60 | yearMonth="$(date +%Y-%m)" 61 | if [[ ! -d ./fenceJwtKeys ]] || ! 
(ls ./fenceJwtKeys | grep "$yearMonth" > /dev/null 2>&1); then 62 | echo "Generating fence OAUTH key pairs under Secrets/fenceJwtKeys" 63 | mkdir -p fenceJwtKeys 64 | mkdir -p fenceJwtKeys/${timestamp} 65 | 66 | openssl genpkey -algorithm RSA -out fenceJwtKeys/${timestamp}/jwt_private_key.pem \ 67 | -pkeyopt rsa_keygen_bits:2048 68 | openssl rsa -pubout -in fenceJwtKeys/${timestamp}/jwt_private_key.pem \ 69 | -out fenceJwtKeys/${timestamp}/jwt_public_key.pem 70 | chmod -R a+rx fenceJwtKeys 71 | fi 72 | 73 | # generate certs for nginx ssl 74 | ( 75 | mkdir -p TLS 76 | cd TLS 77 | 78 | OS=$(uname) 79 | OPTS="" 80 | if [[ $OS == "Darwin" ]]; then 81 | cp /etc/ssl/openssl.cnf openssl-with-ca.cnf 82 | 83 | __v3_ca=" 84 | [ v3_ca ] 85 | basicConstraints = critical,CA:TRUE 86 | subjectKeyIdentifier = hash 87 | authorityKeyIdentifier = keyid:always,issuer:always 88 | " 89 | echo "$__v3_ca" >> openssl-with-ca.cnf 90 | OPTS=" -extensions v3_ca -config openssl-with-ca.cnf" 91 | fi 92 | 93 | if ! [[ -f openssl.cnf && -f ca.pem && -f ca-key.pem ]]; then 94 | echo "Generating a local certificate authority, and TLS certificates under Secrets/TLS/" 95 | # erase old certs if they exist 96 | /bin/rm -rf service.key service.crt 97 | commonName=${1:-localhost} 98 | SUBJ="/countryName=US/stateOrProvinceName=IL/localityName=Chicago/organizationName=CDIS/organizationalUnitName=PlanX/commonName=$commonName/emailAddress=cdis@uchicago.edu" 99 | openssl req -new -x509 -nodes -extensions v3_ca -keyout ca-key.pem \ 100 | -out ca.pem -days 365 -subj $SUBJ $OPTS 101 | if [[ $? -eq 1 ]]; then 102 | echo "problem with creds_setup.sh script, refer to compose-services wiki" 103 | rm -rf temp* 104 | exit 1 105 | fi 106 | 107 | mkdir -p CA/newcerts 108 | touch CA/index.txt 109 | touch CA/index.txt.attr 110 | echo 1000 > CA/serial 111 | cat > openssl.cnf < 222 | Optional comment about why the file is in the 223 | current state, mainly for invalid state. 
224 | project_id: 225 | $ref: "#/project_id" 226 | created_datetime: 227 | $ref: "#/datetime" 228 | updated_datetime: 229 | $ref: "#/datetime" 230 | 231 | workflow_properties: 232 | id: 233 | $ref: "#/UUID" 234 | systemAlias: node_id 235 | submitter_id: 236 | type: 237 | - string 238 | - "null" 239 | description: "The file ID assigned by the submitter." # TOREVIEW 240 | workflow_link: 241 | description: "Link to Github hash for the CWL workflow used." 242 | type: string 243 | workflow_version: 244 | description: "Major version for a GDC workflow." 245 | type: string 246 | workflow_start_datetime: 247 | $ref: "#/datetime" 248 | workflow_end_datetime: 249 | $ref: "#/datetime" 250 | state: 251 | $ref: "#/state" 252 | project_id: 253 | $ref: "#/project_id" 254 | created_datetime: 255 | $ref: "#/datetime" 256 | updated_datetime: 257 | $ref: "#/datetime" 258 | 259 | ubiquitous_properties: 260 | type: 261 | type: string 262 | id: 263 | $ref: "#/UUID" 264 | systemAlias: node_id 265 | submitter_id: 266 | type: 267 | - string 268 | description: > 269 | A project-specific identifier for a node. This property is the calling card/nickname/alias for 270 | a unit of submission. It can be used in place of the UUID for identifying or recalling a node. 271 | state: 272 | $ref: "#/state" 273 | project_id: 274 | $ref: "#/project_id" 275 | created_datetime: 276 | $ref: "#/datetime" 277 | updated_datetime: 278 | $ref: "#/datetime" 279 | -------------------------------------------------------------------------------- /datadictionary/gdcdictionary/schema_test.py: -------------------------------------------------------------------------------- 1 | """This is an example of json schema for the GDC using schemas defined 2 | in local yaml files. 3 | 4 | Included are a few functions to augment jsonschema and the python 5 | validator. 6 | 7 | Examples are at the end. 
8 | 9 | """ 10 | 11 | 12 | from jsonschema import validate, ValidationError 13 | import copy 14 | import yaml 15 | import glob 16 | import os 17 | import argparse 18 | import json 19 | import unittest 20 | from gdcdictionary import gdcdictionary 21 | 22 | 23 | 24 | def load_yaml_schema(path): 25 | with open(path, 'r') as f: 26 | return yaml.load(f) 27 | CUR_DIR = os.path.dirname(os.path.realpath(__file__)) 28 | DATA_DIR = os.path.join(CUR_DIR, 'examples') 29 | project1 = load_yaml_schema(os.path.join(CUR_DIR, 'schemas/projects/project1.yaml')) 30 | projects = {'project1': project1} 31 | 32 | def merge_schemas(a, b, path=None): 33 | """Recursively zip schemas together 34 | 35 | """ 36 | path = path if path is not None else [] 37 | for key in b: 38 | if key in a: 39 | if isinstance(a[key], dict) and isinstance(b[key], dict): 40 | merge_schemas(a[key], b[key], path + [str(key)]) 41 | elif a[key] == b[key]: 42 | pass 43 | else: 44 | print("Overriding '{}':\n\t- {}\n\t+ {}".format( 45 | '.'.join(path + [str(key)]), a[key], b[key])) 46 | a[key] = b[key] 47 | else: 48 | print("Adding '{}':\n\t+ {}".format( 49 | '.'.join(path + [str(key)]), b[key])) 50 | a[key] = b[key] 51 | return a 52 | 53 | 54 | def get_project_specific_schema(projects, project, schema, entity_type): 55 | """Look up the core schema for its type and override it with any 56 | project level overrides 57 | 58 | """ 59 | root = copy.deepcopy(schema) 60 | project_overrides = projects.get(project) 61 | if project_overrides: 62 | overrides = project_overrides.get(entity_type) 63 | if overrides: 64 | merge_schemas(root, overrides, [entity_type]) 65 | return root 66 | 67 | 68 | def validate_entity(entity, schemata, project=None, name=''): 69 | """Validate an entity by looking up the core schema for its type and 70 | overriding it with any project level overrides 71 | 72 | """ 73 | local_schema = get_project_specific_schema( 74 | projects, project, schemata[entity['type']], entity['type']) 75 | result = 
validate(entity, local_schema) 76 | return result 77 | 78 | 79 | def validate_schemata(schemata, metaschema): 80 | # validate schemata 81 | print('Validating schemas against metaschema... '), 82 | for s in schemata.values(): 83 | validate(s, metaschema) 84 | 85 | def assert_link_is_also_prop(link): 86 | assert link in s['properties'],\ 87 | "Entity '{}' has '{}' as a link but not property".format( 88 | s['id'], link) 89 | 90 | for link in [l['name'] for l in s['links'] if 'name' in l]: 91 | assert_link_is_also_prop(link) 92 | for subgroup in [l['subgroup'] for l in s['links'] if 'name' not in l]: 93 | for link in [l['name'] for l in subgroup if 'name' in l]: 94 | assert_link_is_also_prop(link) 95 | 96 | 97 | class SchemaTest(unittest.TestCase): 98 | def setUp(self): 99 | self.dictionary = gdcdictionary 100 | self.definitions = yaml.load(open(os.path.join(CUR_DIR, 'schemas','_definitions.yaml'),'r')) 101 | 102 | def test_schemas(self): 103 | validate_schemata(self.dictionary.schema, self.dictionary.metaschema) 104 | 105 | def test_valid_files(self): 106 | for path in glob.glob(os.path.join(DATA_DIR, 'valid', '*.json')): 107 | print("Validating {}".format(path)) 108 | doc = json.load(open(path, 'r')) 109 | print(doc) 110 | if type(doc) == dict: 111 | self.add_system_props(doc) 112 | validate_entity(doc, self.dictionary.schema) 113 | elif type(doc) == list: 114 | for entity in doc: 115 | self.add_system_props(entity) 116 | validate_entity(entity, self.dictionary.schema) 117 | else: 118 | raise Exception("Invalid json") 119 | 120 | def test_invalid_files(self): 121 | for path in glob.glob(os.path.join(DATA_DIR, 'invalid', '*.json')): 122 | print("Validating {}".format(path)) 123 | doc = json.load(open(path, 'r')) 124 | if type(doc) == dict: 125 | self.add_system_props(doc) 126 | with self.assertRaises(ValidationError): 127 | validate_entity(doc, self.dictionary.schema) 128 | elif type(doc) == list: 129 | for entity in doc: 130 | self.add_system_props(entity) 131 | with 
self.assertRaises(ValidationError): 132 | validate_entity(entity, self.dictionary.schema) 133 | else: 134 | raise Exception("Invalid json") 135 | 136 | def add_system_props(self, doc): 137 | schema = self.dictionary.schema[doc['type']] 138 | for key in schema['systemProperties']: 139 | use_def_default = ( 140 | '$ref' in schema['properties'][key] and 141 | key in self.definitions and 142 | 'default' in self.definitions[key] 143 | ) 144 | if use_def_default: 145 | doc[key] = self.definitions[key]['default'] 146 | 147 | if __name__ == '__main__': 148 | 149 | #################### 150 | # Setup 151 | #################### 152 | 153 | 154 | parser = argparse.ArgumentParser(description='Validate JSON') 155 | parser.add_argument('jsonfiles', metavar='file', 156 | type=argparse.FileType('r'), nargs='*', 157 | help='json files to test if (in)valid') 158 | 159 | parser.add_argument('--invalid', action='store_true', default=False, 160 | help='expect the files to be invalid instead of valid') 161 | 162 | args = parser.parse_args() 163 | 164 | #################### 165 | # Example validation 166 | #################### 167 | 168 | # Load schemata 169 | dictionary = gdcdictionary 170 | 171 | for f in args.jsonfiles: 172 | doc = json.load(f) 173 | if args.invalid: 174 | try: 175 | print("CHECK if {0} is invalid:".format(f.name)), 176 | print(type(doc)) 177 | if type(doc) == dict: 178 | validate_entity(doc, dictionary.schema) 179 | elif type(doc) == list: 180 | for entity in doc: 181 | validate_entity(entity, dictionary.schema) 182 | else: 183 | raise ValidationError("Invalid json") 184 | except ValidationError as e: 185 | print("Invalid as expected.") 186 | pass 187 | else: 188 | raise Exception("Expected invalid, but validated.") 189 | else: 190 | print ("CHECK if {0} is valid:".format(f.name)), 191 | if type(doc) == dict: 192 | validate_entity(doc, dictionary.schema) 193 | elif type(doc) == list: 194 | for entity in doc: 195 | validate_entity(entity, dictionary.schema) 196 | else: 
197 | print("Invalid json") 198 | 199 | print("Valid as expected") 200 | print('ok.') 201 | -------------------------------------------------------------------------------- /docs/setup.md: -------------------------------------------------------------------------------- 1 | # Setup 2 | 3 | ## Dependencies 4 | 5 | - OpenSSL 6 | - Docker and Docker Compose 7 | 8 | ## Docker and Docker Compose Setup 9 | 10 | If you've never used Docker before, it may be helpful to read some of the Docker documentation to familiarize yourself with containers. You can also read an overview of what Docker Compose is [here](https://docs.docker.com/compose/overview/) if you want some extra background information. 11 | 12 | The official *Docker* installation page can be found [here](https://docs.docker.com/install/#supported-platforms). The official *Docker Compose* installation page can be found [here](https://docs.docker.com/compose/install/#prerequisites). For Windows and Mac, Docker Compose is included into Docker Desktop. If you are using Linux, then the official Docker installation does not come with Docker Compose; you will need to install Docker Engine before installing Docker Compose. 13 | Go through the steps of installing Docker Compose for your platform, then proceed to set up credentials. Note, that Docker Desktop is set to use 2 GB runtime memory by default. 14 | 15 | > **NOTE:** 16 | > 17 | > 🛑 As a minimum, make sure to increase the size of the **memory to 6 GB** (or more) as described [here](https://docs.docker.com/docker-for-mac/#resources). 18 | 19 | > ElasticSearch and ETL/Spark jobs through tube/guppy/spark-service are particularly resource intensive. If you are running Compose-Services on your laptop, we recommend minimizing/stopping background jobs/services during running ETL jobs or hdfs formatting phase during `spark-service` startup, etc. Please do observe with `docker stats` and `top` / `htop`. 
20 | 21 | ## Docker ElasticSearch 22 | 23 | If you are running on an AWS EC2 instance (Amazon Linux), consider setting up the [Docker ElasticSearch prerequisites](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html#docker-prod-prerequisites). The following settings are known to be required on the Docker host: 24 | ``` 25 | grep vm.max_map_count /etc/sysctl.conf 26 | vm.max_map_count=262144 27 | ``` 28 | 29 | ## Setting up Credentials 30 | 31 | Setup credentials for Fence, a custom root CA and SSL certs with the provided script by running either: 32 | ``` 33 | bash ./creds_setup.sh 34 | OR 35 | bash ./creds_setup.sh YOUR-CUSTOM-DOMAIN 36 | ``` 37 | This script will create a `Secrets` folder that holds various secrets and configuration files. 38 | The script by default generates an SSL certificate to access the gen3 stack at `https://localhost`. 39 | If you are running this on a remote server with an actual domain, you can run `bash creds_setup.sh YOUR_DOMAIN`. This will create an SSL cert signed by the custom CA so that the microservices can talk to each other without bypassing SSL verification. If you are setting this up on AWS, ensure that you use an Elastic IP address BEFORE you set up and use that as your domain. On an EC2 instance, for example, this would be your ec2-YOUR-Elastic-IP-Addr.us-region-number.compute.amazonaws.com. This will save a lot of time and avoid [editing the individual files](https://github.com/uc-cdis/compose-services/blob/master/docs/dev_tips.md#Running-Docker-Compose-on-a-Remote-Machine) to set up the hostname (`fence-config.yaml`, `peregrine_creds.json`, and `sheepdog_creds.json`) when the machine is rebooted. This is because each of the microservices can be configured to run on separate machines and thus have their respective configuration files. You will still need to bypass SSL verification when you hit the services from the browser.
If you have real certs for your domain, you can copy them to `Secrets/TLS/service.key` and `Secrets/TLS/service.crt` to overwrite our dev certs. 40 | 41 | If you are using MacOS, you may run into an error with the default MacOS OpenSSL config not including the configuration for v3_ca certificate generation. OpenSSL should create the `jwt_private_key.pem` and `jwt_public_key.pem` in the `Secrets/fenceJwtKeys/{dateTtimeZ}` folder. If you do not see them, check whether your version of OpenSSL is correct. For a solution, you can refer to [this GitHub issue](https://github.com/jetstack/cert-manager/issues/279) about a related problem in Jetstack's cert-manager. 42 | 43 | Support for multi-tenant fence (configure another fence as an IDP for this fence) is available and can be edited in the `fence-config.yaml`. If this is not the case, we recommend removing the [relevant section](https://github.com/uc-cdis/compose-services/blob/fa3dcc95a4244805c7a02f315cd330447e189945/templates/fence-config.yaml#L81). 44 | 45 | ## Setting up Google OAuth Client-Id for Fence 46 | 47 | This Docker Compose setup requires Google API Credentials in order for the Fence microservice to complete its authentication. 48 | To set up Google API Credentials, go to [the Credentials page of the Google Developer Console](https://console.developers.google.com/apis/credentials) and click the 'Create Credentials' button. Follow the prompts to create a new OAuth Client ID for a Web Application. Add `https://localhost/user/login/google/login/` OR `https://YOUR_REMOTE_MACHINE_DOMAIN/user/login/google/login/` to your Authorized redirect URIs in the Credentials and click 'Create'. Then copy your client ID and client secret and use them to fill in the 'google.client_secret' and 'google.client_id' fields in the `Secrets/fence-config.yaml` file. 49 | See the image below for an example on a sample Google account.
50 | 51 | ![Redirection Set up](https://github.com/uc-cdis/compose-services/blob/master/Authorization_URL_2020.jpg) 52 | 53 | If you have Google API credentials set up already that you would like to use with the local gen3 Docker Compose setup, simply add `https://localhost/user/login/google/login/` OR `https://YOUR_REMOTE_MACHINE_DOMAIN/user/login/google/login/` to your Authorized redirect URIs in your credentials and copy your client ID and client secret from your credentials to the 'client_secret' and 'client_id' fields in the `Secrets/fence-config.yaml` under `OPENID_CONNECT` and `google`. 54 | 55 | ## Setting up Users 56 | 57 | To set up user privileges for the services, please edit the `Secrets/user.yaml` file, following [this guide](https://github.com/uc-cdis/fence/blob/master/docs/user.yaml_guide.md). In particular, you should change all occurrences of `username1@gmail.com` to the email you intend to log in with, so that you can create administrative nodes later on. 58 | 59 | Fence container will automatically sync this file to the `fence_db` database on startup. If you wish to update user privileges while the containers are running (without restarting the container), just edit the `Secrets/user.yaml` file and then run 60 | ``` 61 | docker exec -it fence-service fence-create sync --arborist http://arborist-service --yaml user.yaml 62 | ``` 63 | This command will enter Fence container to run the fence-create sync command, which will update your user privileges. If you are logged in to your commons on a browser, you may need to log out and log back in again or clear your cookies in order to see the changes. 
64 | 65 | 66 | ## Start running your local Gen3 Docker Compose environment 67 | 68 | > **NOTE**: 69 | > 70 | > 🛑 If your Gen3 Data Commons does not host any data, yet, we recommend commenting out the [kibana-service section](https://github.com/uc-cdis/compose-services/blob/454d06358a49b4455097e34ddc060e76903e1aa3/docker-compose.yml#L309-L320) in the `docker-compose.yaml` and the [guppy section](https://github.com/uc-cdis/compose-services/blob/454d06358a49b4455097e34ddc060e76903e1aa3/nginx.conf#L140-L142) in the `nginx.conf` file. After having setup the first program/project and uploaded the first data, we recommend enabling these sections. Precisely, re-enable both services after you completed the following two steps: 71 | > 1. [Generate Test Metadata](https://github.com/uc-cdis/compose-services/blob/master/docs/using_the_commons.md#generating-test-metadata) 72 | > 2. Upload the simulated test metadata to the Data Portal UI. Follow [gen3.org](https://gen3.org/resources/user/submit-data/) and [Useful links](https://github.com/uc-cdis/compose-services/blob/master/docs/useful_links.md) for how-to guides and tutorials. 73 | 74 | > 🟢 Finally, re-enable kibana and guppy services before continuing with the section [Configuring guppy for exploration page](https://github.com/uc-cdis/compose-services/blob/master/docs/using_the_commons.md#configuring-guppy-for-exploration-page). 75 | 76 | Now that you are done with the setup, all Docker Compose features should be available. If you are a non-root user you may need to add yourself to the 'docker' group: `sudo usermod -aG docker your-user`, and the log out and log back in. 77 | Here are some useful commands: 78 | 79 | The basic command of Docker Compose is 80 | ``` 81 | docker-compose up 82 | ``` 83 | which can be useful for debugging errors. 
To detach output from the containers, run 84 | ``` 85 | docker-compose up -d 86 | ``` 87 | When doing this, the logs for each service can be accessed using 88 | ``` 89 | docker logs 90 | ``` 91 | To stop the services use 92 | ``` 93 | docker-compose down 94 | ``` 95 | As the Docker images are pulled from quay.io, they do not update automatically. To update your Docker images, run 96 | ``` 97 | docker-compose pull 98 | docker image prune -f 99 | ``` 100 | These commands may take a while, and they also may fail. If they do fail, simply rerun them, or just update/remove images one at a time manually. 101 | Sheepdog and Peregrine services download the dictionary schema at startup, and the 102 | portal service runs a series of pre-launch compilations that depend on Sheepdog and Peregrine, 103 | so it may take several minutes for the portal to finally come up at https://localhost 104 | 105 | Following the portal logs is one way to monitor its startup progress: 106 | ``` 107 | docker logs -f portal-service 108 | ``` 109 | When you see that `bundle.js` and `index.html` were successfully built in the logs, you should be able to log into https://localhost and see the data commons. You are now ready to setup the [first program and project](https://github.com/uc-cdis/compose-services/blob/master/docs/using_the_commons.md#programs-and-projects). 110 | 111 | 112 | ## Update tips 113 | 114 | You should of course `git pull` compose-services if you have not done so for a while. You also need to `docker-compose pull` new images from Quay--this will not happen automatically. If your git pull pulled new commits, and you already have a `Secrets` folder, you may also need to delete your old `Secrets` and rerun `creds_setup.sh` (see [Setting up Credentials](https://github.com/uc-cdis/compose-services/blob/master/docs/setup.md#Setting-up-Credentials)) to recreate it. 
115 | -------------------------------------------------------------------------------- /nginx.conf: -------------------------------------------------------------------------------- 1 | 2 | user nginx; 3 | worker_processes 1; 4 | 5 | error_log /var/log/nginx/error.log warn; 6 | pid /var/run/nginx.pid; 7 | 8 | load_module modules/ngx_http_perl_module.so; 9 | load_module modules/ngx_http_js_module.so; 10 | load_module modules/ngx_http_headers_more_filter_module.so; 11 | 12 | events { 13 | worker_connections 1024; 14 | } 15 | 16 | http { 17 | include /etc/nginx/mime.types; 18 | default_type application/octet-stream; 19 | 20 | log_format main '$remote_addr - $remote_user [$time_local] "$request" ' 21 | '$status $body_bytes_sent "$http_referer" ' 22 | '"$http_user_agent" "$http_x_forwarded_for"'; 23 | 24 | access_log /var/log/nginx/access.log main; 25 | 26 | sendfile on; 27 | #tcp_nopush on; 28 | 29 | keepalive_timeout 65; 30 | 31 | server { 32 | listen 80; 33 | server_name revproxy-service; 34 | resolver 127.0.0.11; 35 | 36 | listen 443 ssl; 37 | 38 | ssl_certificate /etc/nginx/ssl/nginx.crt; 39 | ssl_certificate_key /etc/nginx/ssl/nginx.key; 40 | 41 | set $access_token ""; 42 | set $csrf_check "ok-tokenauth"; 43 | if ($cookie_access_token) { 44 | set $access_token "bearer $cookie_access_token"; 45 | # cookie auth requires csrf check 46 | set $csrf_check "fail"; 47 | } 48 | if ($http_authorization) { 49 | # Authorization header is present - prefer that token over cookie token 50 | set $access_token "$http_authorization"; 51 | } 52 | 53 | proxy_set_header Authorization "$access_token"; 54 | # proxy_set_header X-Forwarded-For "$realip"; 55 | # proxy_set_header X-UserId "$userid"; 56 | 57 | # 58 | # Accomodate large jwt token headers 59 | # * http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_buffer_size 60 | # * https://ma.ttias.be/nginx-proxy-upstream-sent-big-header-reading-response-header-upstream/ 61 | # 62 | proxy_buffer_size 16k; 63 | proxy_buffers 8 
16k; 64 | proxy_busy_buffers_size 32k; 65 | # 66 | # also incoming from client: 67 | # * https://fullvalence.com/2016/07/05/cookie-size-in-nginx/ 68 | # * https://nginx.org/en/docs/http/ngx_http_core_module.html#client_header_buffer_size 69 | large_client_header_buffers 4 8k; 70 | client_header_buffer_size 4k; 71 | 72 | # 73 | # CSRF check 74 | # This block requires a csrftoken for all POST requests. 75 | # 76 | if ($cookie_csrftoken = $http_x_csrf_token) { 77 | # this will fail further below if cookie_csrftoken is empty 78 | set $csrf_check "ok-$cookie_csrftoken"; 79 | } 80 | if ($request_method != "POST") { 81 | set $csrf_check "ok-$request_method"; 82 | } 83 | if ($cookie_access_token = "") { 84 | # do this again here b/c empty cookie_csrftoken == empty http_x_csrf_token - ugh 85 | set $csrf_check "ok-tokenauth"; 86 | } 87 | 88 | location / { 89 | proxy_pass http://portal-service/; 90 | } 91 | 92 | location /user/ { 93 | proxy_pass http://fence-service/; 94 | } 95 | 96 | location /api/ { 97 | proxy_pass http://sheepdog-service/; 98 | } 99 | 100 | location /mds/ { 101 | proxy_pass http://metadata-service/; 102 | } 103 | 104 | location /mds-admin/ { 105 | rewrite ^/mds-admin/(.*) /$1 break; 106 | proxy_pass http://metadata-service; 107 | proxy_redirect http://$host/ https://$host/mds-admin/; 108 | } 109 | 110 | location /coremetadata/ { 111 | # redirect to coremetadata landing page if header does not specify otherwise 112 | if ($http_accept !~ (application/json|x-bibtex|application/vnd\.schemaorg\.ld\+json)) { 113 | rewrite ^/coremetadata/(.*) /files/$1 redirect; 114 | } 115 | 116 | rewrite ^/coremetadata/(.*) /$1 break; 117 | proxy_pass http://pidgin-service; 118 | } 119 | 120 | location /index/ { 121 | proxy_pass http://indexd-service/; 122 | } 123 | 124 | location = /_status { 125 | default_type application/json; 126 | return 200 "{ \"message\": \"Feelin good!\" }\n"; 127 | } 128 | 129 | location /peregrine/_status { 130 | proxy_pass 
http://peregrine-service/_status; 131 | } 132 | location /pidgin/_status { 133 | proxy_pass http://pidgin-service/_status; 134 | } 135 | 136 | location /api/v0/submission/getschema { 137 | proxy_pass http://peregrine-service/v0/submission/getschema; 138 | } 139 | 140 | location /guppy/ { 141 | proxy_pass http://guppy-service/; 142 | } 143 | 144 | location /api/v0/submission/graphql { 145 | if ($cookie_csrftoken = "") { 146 | add_header Set-Cookie "csrftoken=$request_id$request_length$request_time$time_iso8601;Path=/"; 147 | } 148 | proxy_next_upstream off; 149 | # Forward the host and set Subdir header so api 150 | # knows the original request path for hmac signing 151 | proxy_set_header Host $host; 152 | proxy_set_header Subdir /api; 153 | proxy_set_header Authorization "$access_token"; 154 | proxy_connect_timeout 300; 155 | proxy_send_timeout 300; 156 | proxy_read_timeout 300; 157 | send_timeout 300; 158 | proxy_pass http://peregrine-service/v0/submission/graphql; 159 | } 160 | 161 | location /api/search { 162 | if ($csrf_check !~ ^ok-\S.+$) { 163 | return 403 "failed csrf check"; 164 | } 165 | 166 | gzip off; 167 | proxy_next_upstream off; 168 | proxy_set_header Host $host; 169 | proxy_set_header Authorization "$access_token"; 170 | 171 | proxy_connect_timeout 300; 172 | proxy_send_timeout 300; 173 | proxy_read_timeout 300; 174 | send_timeout 300; 175 | 176 | rewrite ^/api/search/(.*) /$1 break; 177 | proxy_pass http://peregrine-service; 178 | } 179 | 180 | location @errorworkspace { 181 | return 302 https://$host/no-workspace-access; 182 | } 183 | 184 | # 185 | # workspace AuthZ-proxy uses arborist to provide authorization to workpace services 186 | # that don't implement our authn or authz i.e. shiny, jupyter. 
187 | # 188 | location = /gen3-authz { 189 | internal; 190 | error_page 400 =403 @errorworkspace; 191 | error_page 500 =403 @errorworkspace; 192 | 193 | proxy_pass http://arborist-service/auth/proxy?resource=$authz_resource&method=$authz_method&service=$authz_service; 194 | 195 | proxy_pass_request_body off; 196 | proxy_set_header Authorization "$access_token"; 197 | proxy_set_header Content-Length ""; 198 | proxy_intercept_errors on; 199 | 200 | # nginx bug that it checks even if request_body off 201 | client_max_body_size 0; 202 | } 203 | 204 | # 205 | # authorization endpoint 206 | # https://hostname/authz?resource=programs/blah&method=acb&service=xyz 207 | # 208 | location ~ /authz/? { 209 | if ($csrf_check !~ ^ok-\S.+$) { 210 | return 403 "failed csrf check"; 211 | } 212 | set $proxy_service "arborist"; 213 | 214 | proxy_pass http://arborist-service/auth/proxy?resource=$arg_resource&method=$arg_method&service=$arg_service; 215 | } 216 | 217 | location = /authz/resources { 218 | if ($csrf_check !~ ^ok-\S.+$) { 219 | return 403 "failed csrf check"; 220 | } 221 | 222 | proxy_pass http://arborist-service/auth/resources; 223 | } 224 | 225 | location = /authz/mapping { 226 | if ($csrf_check !~ ^ok-\S.+$) { 227 | return 403 "failed csrf check"; 228 | } 229 | 230 | # Do not expose POST /auth/mapping 231 | limit_except GET { 232 | deny all; 233 | } 234 | 235 | # Do not pass the username arg here! Otherwise anyone can see anyone's access. 236 | # Arborist will fall back to parsing the jwt for username. 
237 | proxy_pass http://arborist-service/auth/mapping; 238 | } 239 | 240 | location = /lw-workspace/status { 241 | default_type application/json; 242 | return 200 "{ \"message\": \"Feelin good!\" }\n"; 243 | } 244 | 245 | 246 | location /lw-workspace/proxy { 247 | set $authz_resource "/workspace"; 248 | set $authz_method "access"; 249 | set $authz_service "jupyterhub"; 250 | # be careful - sub-request runs in same context as this request 251 | auth_request_set $remoteUser $upstream_http_REMOTE_USER; 252 | auth_request_set $saved_set_cookie $upstream_http_set_cookie; 253 | auth_request /gen3-authz; 254 | 255 | if ($saved_set_cookie != "") { 256 | add_header Set-Cookie $saved_set_cookie always; 257 | } 258 | 259 | proxy_set_header REMOTE_USER $remoteUser; 260 | error_page 403 = @errorworkspace; 261 | 262 | # 263 | # jupyter notebooks use websockets 264 | # See https://aptro.github.io/server/architecture/2016/06/21/Jupyter-Notebook-Nginx-Setup.html 265 | # 266 | proxy_pass http://jupyter-service:8888/lw-workspace/proxy; 267 | proxy_http_version 1.1; 268 | proxy_set_header Host $host; 269 | #proxy_set_header X-Real-IP $remote_addr; 270 | #proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 271 | proxy_set_header Upgrade $http_upgrade; 272 | proxy_set_header Connection $http_connection; 273 | #client_max_body_size 0; 274 | } 275 | 276 | location /lw-workspace/ { 277 | return 302 /lw-workspace/proxy; 278 | } 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /.secrets.baseline: -------------------------------------------------------------------------------- 1 | { 2 | "exclude": { 3 | "files": null, 4 | "lines": null 5 | }, 6 | "generated_at": "2021-06-25T20:50:57Z", 7 | "plugins_used": [ 8 | { 9 | "name": "AWSKeyDetector" 10 | }, 11 | { 12 | "name": "ArtifactoryDetector" 13 | }, 14 | { 15 | "base64_limit": 4.5, 16 | "name": "Base64HighEntropyString" 17 | }, 18 | { 19 | "name": "BasicAuthDetector" 20 | }, 21 | { 
22 | "name": "CloudantDetector" 23 | }, 24 | { 25 | "hex_limit": 3, 26 | "name": "HexHighEntropyString" 27 | }, 28 | { 29 | "name": "IbmCloudIamDetector" 30 | }, 31 | { 32 | "name": "IbmCosHmacDetector" 33 | }, 34 | { 35 | "name": "JwtTokenDetector" 36 | }, 37 | { 38 | "keyword_exclude": null, 39 | "name": "KeywordDetector" 40 | }, 41 | { 42 | "name": "MailchimpDetector" 43 | }, 44 | { 45 | "name": "PrivateKeyDetector" 46 | }, 47 | { 48 | "name": "SlackDetector" 49 | }, 50 | { 51 | "name": "SoftlayerDetector" 52 | }, 53 | { 54 | "name": "StripeDetector" 55 | }, 56 | { 57 | "name": "TwilioKeyDetector" 58 | } 59 | ], 60 | "results": { 61 | "creds_setup.sh": [ 62 | { 63 | "hashed_secret": "2e9ee120fd25e31048598693aca91d5473898a99", 64 | "is_verified": false, 65 | "line_number": 122, 66 | "type": "Secret Keyword" 67 | } 68 | ], 69 | "datadictionary/gdcdictionary/examples/valid/aligned_reads_index.json": [ 70 | { 71 | "hashed_secret": "a1ba33896d16eda8522e531edbaf3b625c1f4c31", 72 | "is_verified": false, 73 | "line_number": 6, 74 | "type": "Hex High Entropy String" 75 | } 76 | ], 77 | "datadictionary/gdcdictionary/examples/valid/experimental_metadata.json": [ 78 | { 79 | "hashed_secret": "daef34f66b6e909f3a22ffd063d48eb428067b6e", 80 | "is_verified": false, 81 | "line_number": 6, 82 | "type": "Hex High Entropy String" 83 | } 84 | ], 85 | "datadictionary/gdcdictionary/examples/valid/slide_image.json": [ 86 | { 87 | "hashed_secret": "daef34f66b6e909f3a22ffd063d48eb428067b6e", 88 | "is_verified": false, 89 | "line_number": 6, 90 | "type": "Hex High Entropy String" 91 | } 92 | ], 93 | "datadictionary/gdcdictionary/examples/valid/submitted_aligned_reads.json": [ 94 | { 95 | "hashed_secret": "e3f181b6b92d74e30d524d03029e785d0c7c7535", 96 | "is_verified": false, 97 | "line_number": 7, 98 | "type": "Hex High Entropy String" 99 | } 100 | ], 101 | "datadictionary/gdcdictionary/examples/valid/submitted_copy_number.json": [ 102 | { 103 | "hashed_secret": 
"e3f181b6b92d74e30d524d03029e785d0c7c7535", 104 | "is_verified": false, 105 | "line_number": 6, 106 | "type": "Hex High Entropy String" 107 | } 108 | ], 109 | "datadictionary/gdcdictionary/examples/valid/submitted_methylation.json": [ 110 | { 111 | "hashed_secret": "e3f181b6b92d74e30d524d03029e785d0c7c7535", 112 | "is_verified": false, 113 | "line_number": 7, 114 | "type": "Hex High Entropy String" 115 | } 116 | ], 117 | "datadictionary/gdcdictionary/examples/valid/submitted_somatic_mutation.json": [ 118 | { 119 | "hashed_secret": "a1ba33896d16eda8522e531edbaf3b625c1f4c31", 120 | "is_verified": false, 121 | "line_number": 9, 122 | "type": "Hex High Entropy String" 123 | } 124 | ], 125 | "datadictionary/gdcdictionary/examples/valid/submitted_unaligned_reads.json": [ 126 | { 127 | "hashed_secret": "88e3a7adc1779a311467797f00d2edc5e9697d9c", 128 | "is_verified": false, 129 | "line_number": 7, 130 | "type": "Hex High Entropy String" 131 | } 132 | ], 133 | "docker-compose.override.sample.yml": [ 134 | { 135 | "hashed_secret": "afc848c316af1a89d49826c5ae9d00ed769415f3", 136 | "is_verified": false, 137 | "line_number": 6, 138 | "type": "Secret Keyword" 139 | } 140 | ], 141 | "docker-compose.yml": [ 142 | { 143 | "hashed_secret": "afc848c316af1a89d49826c5ae9d00ed769415f3", 144 | "is_verified": false, 145 | "line_number": 21, 146 | "type": "Secret Keyword" 147 | }, 148 | { 149 | "hashed_secret": "cb93dace47db45078164ade928ba21cf27c1d8cf", 150 | "is_verified": false, 151 | "line_number": 75, 152 | "type": "Secret Keyword" 153 | }, 154 | { 155 | "hashed_secret": "f60aa0266ec9d2734d854b9dd3047b4b002d18aa", 156 | "is_verified": false, 157 | "line_number": 94, 158 | "type": "Secret Keyword" 159 | }, 160 | { 161 | "hashed_secret": "9b5925ea817163740dfb287a9894e8ab3aba2c18", 162 | "is_verified": false, 163 | "line_number": 242, 164 | "type": "Secret Keyword" 165 | } 166 | ], 167 | "docs/using_the_commons.md": [ 168 | { 169 | "hashed_secret": 
"6d9c68c603e465077bdd49c62347fe54717f83a3", 170 | "is_verified": false, 171 | "line_number": 88, 172 | "type": "Secret Keyword" 173 | } 174 | ], 175 | "scripts/postgres_always.sh": [ 176 | { 177 | "hashed_secret": "f60aa0266ec9d2734d854b9dd3047b4b002d18aa", 178 | "is_verified": false, 179 | "line_number": 30, 180 | "type": "Secret Keyword" 181 | } 182 | ], 183 | "scripts/postgres_init.sql": [ 184 | { 185 | "hashed_secret": "f60aa0266ec9d2734d854b9dd3047b4b002d18aa", 186 | "is_verified": false, 187 | "line_number": 11, 188 | "type": "Secret Keyword" 189 | }, 190 | { 191 | "hashed_secret": "8414234c06141597b7dc1b3410b69cc49773e042", 192 | "is_verified": false, 193 | "line_number": 15, 194 | "type": "Secret Keyword" 195 | }, 196 | { 197 | "hashed_secret": "c9ed73071942a54e7ec610d5a93d4a22e83e1da7", 198 | "is_verified": false, 199 | "line_number": 19, 200 | "type": "Secret Keyword" 201 | }, 202 | { 203 | "hashed_secret": "8aedff83e21726bb3591555105f3d2b0c9b83e18", 204 | "is_verified": false, 205 | "line_number": 23, 206 | "type": "Secret Keyword" 207 | }, 208 | { 209 | "hashed_secret": "bf41596f893a5f6ed0f66addb555cba581413c56", 210 | "is_verified": false, 211 | "line_number": 27, 212 | "type": "Secret Keyword" 213 | }, 214 | { 215 | "hashed_secret": "cb93dace47db45078164ade928ba21cf27c1d8cf", 216 | "is_verified": false, 217 | "line_number": 31, 218 | "type": "Secret Keyword" 219 | } 220 | ], 221 | "templates/config_helper.py": [ 222 | { 223 | "hashed_secret": "bf21a9e8fbc5a3846fb05b4fa0859e0917b2202f", 224 | "is_verified": false, 225 | "line_number": 66, 226 | "type": "Basic Auth Credentials" 227 | } 228 | ], 229 | "templates/etl_creds.json": [ 230 | { 231 | "hashed_secret": "8aedff83e21726bb3591555105f3d2b0c9b83e18", 232 | "is_verified": false, 233 | "line_number": 4, 234 | "type": "Secret Keyword" 235 | } 236 | ], 237 | "templates/fence-config.yaml": [ 238 | { 239 | "hashed_secret": "8414234c06141597b7dc1b3410b69cc49773e042", 240 | "is_verified": false, 241 | 
"line_number": 31, 242 | "type": "Basic Auth Credentials" 243 | }, 244 | { 245 | "hashed_secret": "5d07e1b80e448a213b392049888111e1779a52db", 246 | "is_verified": false, 247 | "line_number": 296, 248 | "type": "Secret Keyword" 249 | }, 250 | { 251 | "hashed_secret": "87942aadb396f068f7bc17acdf1c6ca4b93ae89b", 252 | "is_verified": false, 253 | "line_number": 355, 254 | "type": "Secret Keyword" 255 | } 256 | ], 257 | "templates/indexd_creds.json": [ 258 | { 259 | "hashed_secret": "bf41596f893a5f6ed0f66addb555cba581413c56", 260 | "is_verified": false, 261 | "line_number": 4, 262 | "type": "Secret Keyword" 263 | } 264 | ], 265 | "templates/indexd_settings.py": [ 266 | { 267 | "hashed_secret": "0a0d18c85e096611b5685b62bc60ec534d19bacc", 268 | "is_verified": false, 269 | "line_number": 49, 270 | "type": "Basic Auth Credentials" 271 | } 272 | ], 273 | "templates/peregrine_creds.json": [ 274 | { 275 | "hashed_secret": "8414234c06141597b7dc1b3410b69cc49773e042", 276 | "is_verified": false, 277 | "line_number": 4, 278 | "type": "Secret Keyword" 279 | }, 280 | { 281 | "hashed_secret": "c9ed73071942a54e7ec610d5a93d4a22e83e1da7", 282 | "is_verified": false, 283 | "line_number": 8, 284 | "type": "Secret Keyword" 285 | }, 286 | { 287 | "hashed_secret": "1b691ca20ade79740ab622b50690458c609018ce", 288 | "is_verified": false, 289 | "line_number": 10, 290 | "type": "Base64 High Entropy String" 291 | } 292 | ], 293 | "templates/peregrine_settings.py": [ 294 | { 295 | "hashed_secret": "347cd9c53ff77d41a7b22aa56c7b4efaf54658e3", 296 | "is_verified": false, 297 | "line_number": 37, 298 | "type": "Basic Auth Credentials" 299 | } 300 | ], 301 | "templates/sheepdog_creds.json": [ 302 | { 303 | "hashed_secret": "8414234c06141597b7dc1b3410b69cc49773e042", 304 | "is_verified": false, 305 | "line_number": 4, 306 | "type": "Secret Keyword" 307 | }, 308 | { 309 | "hashed_secret": "8aedff83e21726bb3591555105f3d2b0c9b83e18", 310 | "is_verified": false, 311 | "line_number": 8, 312 | "type": "Secret 
Keyword" 313 | }, 314 | { 315 | "hashed_secret": "1b691ca20ade79740ab622b50690458c609018ce", 316 | "is_verified": false, 317 | "line_number": 10, 318 | "type": "Base64 High Entropy String" 319 | }, 320 | { 321 | "hashed_secret": "87942aadb396f068f7bc17acdf1c6ca4b93ae89b", 322 | "is_verified": false, 323 | "line_number": 12, 324 | "type": "Secret Keyword" 325 | }, 326 | { 327 | "hashed_secret": "50f013532a9770a2c2cfdc38b7581dd01df69b70", 328 | "is_verified": false, 329 | "line_number": 15, 330 | "type": "Secret Keyword" 331 | } 332 | ], 333 | "templates/sheepdog_settings.py": [ 334 | { 335 | "hashed_secret": "347cd9c53ff77d41a7b22aa56c7b4efaf54658e3", 336 | "is_verified": false, 337 | "line_number": 37, 338 | "type": "Basic Auth Credentials" 339 | } 340 | ] 341 | }, 342 | "version": "0.13.1", 343 | "word_list": { 344 | "file": null, 345 | "hash": null 346 | } 347 | } 348 | --------------------------------------------------------------------------------