├── .gitattributes ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── __init__.py ├── buildspec_master.yml ├── buildspec_staging.yml ├── docker ├── alignment │ ├── build.sh │ ├── conda_requirements.yml │ └── dockerfile_template ├── annotation │ ├── build.sh │ ├── conda_requirements.yml │ └── dockerfile_template ├── breakpoint │ ├── build.sh │ ├── conda_base_requirements.yml │ ├── conda_lumpy_requirements.yml │ └── dockerfile_template ├── cohort_qc │ ├── conda_requirements.txt │ └── python_requirements.txt ├── haplotypes │ ├── build.sh │ ├── dockerfile_template │ └── requirements.yml ├── hmmcopy │ ├── build.sh │ ├── conda_requirements.yml │ └── dockerfile_template ├── qc │ ├── build.sh │ ├── conda_requirements.txt │ ├── dockerfile_template │ ├── oncokb-annotator │ │ ├── AnnotatorCore.py │ │ └── MafAnnotator.py │ └── pip_requirements.txt ├── sample_qc │ ├── conda_requirements.txt │ └── python_requirements.txt ├── variant │ ├── build.sh │ ├── conda_base_requirements.yml │ ├── conda_museq_requirements.yml │ └── dockerfile_template └── vcf2maf │ ├── conda_requirements.txt │ └── python_requirements.txt ├── docs ├── Makefile ├── make.bat └── source │ ├── alignment_metrics.md │ ├── annotation_metrics.md │ ├── conf.py │ ├── gc_metrics.md │ ├── hmmcopy_metrics.md │ ├── hmmcopy_reads.md │ ├── hmmcopy_segments.md │ ├── index.md │ ├── install.md │ ├── organism_filter.md │ ├── quality_classifier.md │ └── readme_data │ ├── alignment.png │ ├── alignment.tikz │ ├── annotation.png │ ├── annotation.tikz │ ├── breakpoint_calling.png │ ├── breakpoint_calling.tikz │ ├── dlp_cohort_pipeline.png │ ├── germline.png │ ├── germline.tikz │ ├── hmmcopy.png │ ├── hmmcopy.tikz │ ├── infer_haps.png │ ├── infer_haps.tikz │ ├── merge_cell_bams.png │ ├── merge_cell_bams.tikz │ ├── pseudo_bulk_qc.png │ ├── split_wgs_bam.png │ ├── split_wgs_bam.tikz │ ├── variant_calling.png │ ├── variant_calling.tikz │ ├── variant_counting.png │ └── variant_counting.tikz ├── setup.cfg ├── setup.py ├── single_cell ├── __init__.py ├── _version.py ├── alignment.py ├── annotation.py ├── breakpoint_calling.py ├── clean_sentinels.py ├── cmdline.py ├── cohort_qc.py ├── config │ ├── __init__.py │ ├── batch.py │ ├── config_reference.py │ ├── generate_batch_config.py │ ├── generate_pipeline_config.py │ └── pipeline_config.py ├── generate_config.py ├── germline_calling.py ├── hmmcopy.py ├── infer_haps.py ├── merge_bams.py ├── run.py ├── sample_qc.py ├── snv_genotyping.py ├── split_bam.py ├── sv_genotyping.py ├── tests │ ├── __init__.py │ └── codebuild │ │ ├── __init__.py │ │ ├── align │ │ ├── align.sh │ │ ├── inputs.yaml │ │ └── test_alignment.py │ │ ├── annotation │ │ ├── annotation.sh │ │ ├── inputs.yaml │ │ └── test_annotation.py │ │ ├── breakpoint_calling │ │ ├── breakpoint_calling.sh │ │ ├── inputs.yaml │ │ └── test_breakpoint_calling.py │ │ ├── cohort_qc │ │ ├── cohort_qc.sh │ │ └── inputs.yaml │ │ ├── compare.py │ │ ├── count_haps │ │ ├── count_haps.sh │ │ ├── inputs.yaml │ │ └── test_count_haps.py │ │ ├── hmmcopy │ │ ├── hmmcopy.sh │ │ ├── inputs.yaml │ │ └── test_hmmcopy.py │ │ ├── infer_haps │ │ ├── infer_haps.sh │ │ ├── inputs.yaml │ │ └── test_infer_haps.py │ │ ├── merge_cell_bams │ │ ├── inputs.yaml │ │ ├── merge_cell_bams.sh │ │ └── test_merge_cell_bams.py │ │ ├── preflight │ │ └── preflight.sh │ │ ├── pseudo_bulk_qc │ │ ├── inputs.yaml │ │ └── pseudo_bulk_qc.sh │ │ ├── refdata │ │ └── download.sh │ │ ├── snv_genotyping │ │ ├── inputs.yaml │ │ └── snv_genotyping.sh │ │ ├── split_wgs_bam │ 
│ ├── inputs.yaml │ │ ├── split_wgs_bam.sh │ │ └── test_split_wgs_bam.py │ │ └── variant_calling │ │ ├── inputs.yaml │ │ ├── test_variant_calling.py │ │ └── variant_calling.sh ├── utils │ ├── __init__.py │ ├── bamutils.py │ ├── csvutils.py │ ├── fastqutils.py │ ├── gatkutils.py │ ├── helpers.py │ ├── inpututils.py │ ├── ltmutils.py │ ├── pdfutils.py │ ├── picardutils.py │ ├── pysamutils.py │ ├── refgenome.py │ ├── singlecell_copynumber_plot_utils │ │ ├── __init__.py │ │ ├── heatmap.py │ │ ├── plot_hmmcopy.py │ │ ├── plot_kernel_density.py │ │ ├── plot_metrics.py │ │ ├── plot_pcolormesh.py │ │ └── utils.py │ ├── storageutils.py │ ├── tests │ │ ├── __init__.py │ │ ├── csvutils_test.py │ │ └── test_helpers.py │ ├── validator │ │ ├── __init__.py │ │ ├── utils.py │ │ └── validate.py │ └── vcfutils.py ├── variant_calling.py └── workflows │ ├── __init__.py │ ├── align │ ├── __init__.py │ ├── align_tasks.py │ ├── coverage_metrics.py │ ├── dtypes.py │ ├── fastqscreen.py │ ├── fastqscreen_test.py │ ├── fastqscreen_utils.py │ ├── scripts │ │ ├── __init__.py │ │ ├── collect_metrics.py │ │ ├── gen_cn_matrix.py │ │ ├── run_trimgalore.py │ │ └── summary_metrics.py │ └── tasks.py │ ├── cohort_qc │ ├── __init__.py │ ├── scripts │ │ ├── oncoplot.R │ │ ├── report.Rmd │ │ ├── vcf2maf │ │ └── vcf2maf.sh │ └── tasks.py │ ├── db_annotation │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py │ ├── destruct_singlecell │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py │ ├── extract_allele_readcounts │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py │ ├── germline │ ├── __init__.py │ └── tasks.py │ ├── hmmcopy │ ├── __init__.py │ ├── dtypes.py │ ├── scripts │ │ ├── __init__.py │ │ ├── classify.py │ │ ├── convert_csv_to_seg.py │ │ ├── correct_read_count.py │ │ ├── hmmcopy_single_cell.R │ │ └── read_counter.py │ └── tasks.py │ ├── infer_haps │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py │ ├── lumpy │ ├── __init__.py │ ├── dtypes.py │ ├── generate_histogram.py │ ├── merge_histograms.py │ ├── parse_lumpy_to_csv.py │ └── tasks.py │ ├── mappability_annotation │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py │ ├── merge_bams │ ├── __init__.py │ ├── scripts │ │ ├── __init__.py │ │ └── collect_metrics.py │ └── tasks.py │ ├── mutationseq │ ├── __init__.py │ ├── dtypes.py │ ├── scripts │ │ ├── __init__.py │ │ ├── parse_museq.py │ │ └── vizutils │ │ │ ├── __init__.py │ │ │ ├── parseutils.py │ │ │ ├── utils.py │ │ │ └── vcf.py │ └── tasks.py │ ├── pseudo_bulk_qc │ ├── __init__.py │ ├── scripts │ │ ├── mergemafs.R │ │ ├── mergesnvs.R │ │ ├── mutationreport.Rmd │ │ ├── report.Rmd │ │ ├── single_cell_qc_plots.py │ │ └── vcf2maf.sh │ └── tasks.py │ ├── qc_annotation │ ├── __init__.py │ ├── dtypes.py │ ├── scripts │ │ ├── __init__.py │ │ ├── classify.py │ │ ├── fastqscreen_classify.py │ │ └── generate_qc.py │ ├── tasks.py │ └── tests.py │ ├── snpeff_annotation │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py │ ├── snv_allele_counts │ ├── __init__.py │ └── dtypes.py │ ├── snv_annotate │ └── __init__.py │ ├── split_bams │ ├── __init__.py │ └── tasks.py │ ├── strelka │ ├── __init__.py │ ├── _merge.py │ ├── components_utils.py │ ├── dtypes.py │ ├── scripts │ │ ├── __init__.py │ │ ├── parse_strelka.py │ │ └── vizutils │ │ │ ├── __init__.py │ │ │ ├── parseutils.py │ │ │ ├── utils.py │ │ │ └── vcf.py │ ├── strelkautils.py │ ├── tasks.py │ └── vcf_tasks.py │ ├── sv_genotyping │ ├── __init__.py │ └── tasks.py │ └── trinuc_annotation │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py └── versioneer.py /.gitattributes: 
-------------------------------------------------------------------------------- 1 | single_cell/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.DS_Store 3 | *.egg-info 4 | *.project 5 | *.pydevproject 6 | *.swp 7 | build 8 | dist 9 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at grewald@mskcc.org. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. 
The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include versioneer.py 2 | include single_cell/_version.py 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Single Cell Pipeline 3 | We've stopped development on this project. Please check out mondrian for the latest DLP+ workflows: 4 | https://github.com/mondrian-scwgs/mondrian 5 | 6 | 7 | For a detailed guide see [INSTALL](docs/source/index.md) 8 | 9 | [Changelog](CHANGELOG.md) 10 | 11 | 12 | ## What is it? 13 | 14 | The single cell pipeline is a suite of workflows for analysing the single cell data generated by DLP+. 15 | 16 | ## Where to get it 17 | The source code is currently hosted on GitHub at: 18 | https://github.com/shahcompbio/single_cell_pipeline 19 | 20 | Docker containers are available at 21 | https://quay.io/organization/singlecellpipeline 22 | 23 | Conda packages are available at 24 | https://anaconda.org/shahcompbio 25 | 26 | ## License 27 | [GPL v3.0](LICENSE) 28 | 29 | ## Documentation 30 | The official documentation is hosted at http://single_cell_pipeline.readthedocs.io/ 31 | 32 | ## Getting Help 33 | 34 | Please contact the developers: 35 | * Diljot Grewal 36 | * Andrew Mcpherson 37 | 38 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/__init__.py -------------------------------------------------------------------------------- /buildspec_master.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | pre_build: 5 | commands: 6 | - bash single_cell/tests/codebuild/preflight/preflight.sh 7 | - cd docker/alignment/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 8 | - cd docker/hmmcopy/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 9 | - cd docker/annotation/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 10 | - cd docker/variant/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 11 | - cd docker/breakpoint/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 12 | - cd docker/haplotypes/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 13 | - cd docker/qc/ && bash build.sh quay.io
singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 14 | build: 15 | commands: 16 | - bash single_cell/tests/codebuild/refdata/download.sh 17 | - bash single_cell/tests/codebuild/align/align.sh quay.io/singlecellpipelinetest 18 | - bash single_cell/tests/codebuild/hmmcopy/hmmcopy.sh quay.io/singlecellpipelinetest 19 | - bash single_cell/tests/codebuild/annotation/annotation.sh quay.io/singlecellpipelinetest 20 | - bash single_cell/tests/codebuild/merge_cell_bams/merge_cell_bams.sh quay.io/singlecellpipelinetest 21 | - bash single_cell/tests/codebuild/split_wgs_bam/split_wgs_bam.sh quay.io/singlecellpipelinetest 22 | - bash single_cell/tests/codebuild/variant_calling/variant_calling.sh quay.io/singlecellpipelinetest 23 | - bash single_cell/tests/codebuild/breakpoint_calling/breakpoint_calling.sh quay.io/singlecellpipelinetest 24 | - bash single_cell/tests/codebuild/infer_haps/infer_haps.sh quay.io/singlecellpipelinetest 25 | - bash single_cell/tests/codebuild/count_haps/count_haps.sh quay.io/singlecellpipelinetest 26 | - bash single_cell/tests/codebuild/pseudo_bulk_qc/pseudo_bulk_qc.sh quay.io/singlecellpipelinetest 27 | - bash single_cell/tests/codebuild/cohort_qc/cohort_qc.sh quay.io/singlecellpipelinetest 28 | - bash single_cell/tests/codebuild/snv_genotyping/snv_genotyping.sh quay.io/singlecellpipelinetest 29 | post_build: 30 | commands: 31 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/alignment/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 32 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/hmmcopy/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 33 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/annotation/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 34 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/variant/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 35 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/breakpoint/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 36 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/haplotypes/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 37 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/qc/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 38 | -------------------------------------------------------------------------------- /buildspec_staging.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | pre_build: 5 | commands: 6 | - cd docker/alignment/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 7 | - cd docker/hmmcopy/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 8 | - cd docker/annotation/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 9 | - cd docker/variant/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 10 | - cd docker/breakpoint/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 11 | - cd docker/haplotypes/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 12 | - cd docker/qc/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 13 | build: 14 | commands: 15 | - bash single_cell/tests/codebuild/refdata/download.sh 16 | - bash 
single_cell/tests/codebuild/align/align.sh quay.io/singlecellpipelinetest 17 | - bash single_cell/tests/codebuild/hmmcopy/hmmcopy.sh quay.io/singlecellpipelinetest 18 | - bash single_cell/tests/codebuild/annotation/annotation.sh quay.io/singlecellpipelinetest 19 | - bash single_cell/tests/codebuild/merge_cell_bams/merge_cell_bams.sh quay.io/singlecellpipelinetest 20 | - bash single_cell/tests/codebuild/split_wgs_bam/split_wgs_bam.sh quay.io/singlecellpipelinetest 21 | - bash single_cell/tests/codebuild/variant_calling/variant_calling.sh quay.io/singlecellpipelinetest 22 | - bash single_cell/tests/codebuild/breakpoint_calling/breakpoint_calling.sh quay.io/singlecellpipelinetest 23 | - bash single_cell/tests/codebuild/infer_haps/infer_haps.sh quay.io/singlecellpipelinetest 24 | - bash single_cell/tests/codebuild/count_haps/count_haps.sh quay.io/singlecellpipelinetest 25 | - bash single_cell/tests/codebuild/pseudo_bulk_qc/pseudo_bulk_qc.sh quay.io/singlecellpipelinetest 26 | - bash single_cell/tests/codebuild/cohort_qc/cohort_qc.sh quay.io/singlecellpipelinetest 27 | - bash single_cell/tests/codebuild/snv_genotyping/snv_genotyping.sh quay.io/singlecellpipelinetest 28 | -------------------------------------------------------------------------------- /docker/alignment/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_alignment:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_alignment:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/alignment/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | ADD . /app 3 | 4 | RUN apt-get update -y && apt-get install -y libltdl7 parallel && rm -rf /var/lib/apt/lists/* 5 | 6 | RUN conda install --file /app/conda_requirements.yml 7 | 8 | RUN pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 9 | RUN pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 10 | RUN pip install azure-batch azure-common azure-core azure-storage-blob azure-identity 11 | -------------------------------------------------------------------------------- /docker/annotation/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_annotation:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_annotation:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/annotation/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | ADD . 
/app 3 | RUN conda install --file /app/conda_requirements.yml 4 | RUN wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py 5 | RUN pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 6 | RUN pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 7 | RUN pip install git+https://github.com/shahcompbio/cell_cycle_classifier.git@v0.0.3 8 | RUN pip install azure-batch azure-common azure-core azure-storage-blob azure-identity 9 | -------------------------------------------------------------------------------- /docker/breakpoint/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_breakpoint:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_breakpoint:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/breakpoint/conda_lumpy_requirements.yml: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | @EXPLICIT 5 | https://conda.anaconda.org/bioconda/linux-64/sambamba-0.6.6-2.tar.bz2 6 | https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda 7 | https://repo.anaconda.com/pkgs/main/linux-64/blas-1.0-mkl.conda 8 | https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2021.1.19-h06a4308_0.conda 9 | https://repo.anaconda.com/pkgs/main/linux-64/intel-openmp-2020.2-254.conda 10 | https://repo.anaconda.com/pkgs/main/linux-64/libgfortran-ng-7.3.0-hdf63c60_0.conda 11 | https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-9.1.0-hdf63c60_0.conda 12 | https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-9.1.0-hdf63c60_0.conda 13 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-2020.2-256.conda 14 | https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h7b6447c_0.tar.bz2 15 | https://repo.anaconda.com/pkgs/main/linux-64/gawk-5.1.0-h7b6447c_0.conda 16 | https://conda.anaconda.org/bioconda/linux-64/libdeflate-1.0-h14c3975_1.tar.bz2 17 | https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.3-he6710b0_2.conda 18 | https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.2-he6710b0_1.conda 19 | https://repo.anaconda.com/pkgs/main/linux-64/openssl-1.1.1i-h27cfd23_0.conda 20 | https://conda.anaconda.org/bioconda/linux-64/samblaster-0.1.26-hc9558a2_0.tar.bz2 21 | https://repo.anaconda.com/pkgs/main/linux-64/xz-5.2.5-h7b6447c_0.conda 22 | https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.2.11-h7b6447c_3.conda 23 | https://repo.anaconda.com/pkgs/main/linux-64/libedit-3.1.20191231-h14c3975_1.conda 24 | https://repo.anaconda.com/pkgs/main/linux-64/libssh2-1.9.0-h1ba5d50_1.conda 25 | https://repo.anaconda.com/pkgs/main/linux-64/readline-8.1-h27cfd23_0.conda 26 | https://repo.anaconda.com/pkgs/main/linux-64/tk-8.6.10-hbc83047_0.conda 27 | https://repo.anaconda.com/pkgs/main/linux-64/krb5-1.18.2-h173b8e3_0.conda 28 | https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.33.0-h62c20be_0.conda 29 | https://repo.anaconda.com/pkgs/main/linux-64/libcurl-7.71.1-h20c2e04_1.conda 30 | 
https://repo.anaconda.com/pkgs/main/linux-64/python-2.7.18-h15b4118_1.conda 31 | https://repo.anaconda.com/pkgs/main/noarch/certifi-2020.6.20-pyhd3eb1b0_3.conda 32 | https://repo.anaconda.com/pkgs/main/linux-64/curl-7.71.1-hbc83047_1.conda 33 | https://repo.anaconda.com/pkgs/main/noarch/six-1.15.0-pyhd3eb1b0_0.tar.bz2 34 | https://repo.anaconda.com/pkgs/main/noarch/wheel-0.36.2-pyhd3eb1b0_0.conda 35 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-service-2.3.0-py27he904b0f_0.conda 36 | https://conda.anaconda.org/bioconda/linux-64/pysam-0.15.3-py27hda2845c_1.tar.bz2 37 | https://repo.anaconda.com/pkgs/main/linux-64/setuptools-44.0.0-py27_0.conda 38 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-base-1.16.6-py27hde5b4d6_0.conda 39 | https://repo.anaconda.com/pkgs/main/linux-64/pip-19.3.1-py27_0.conda 40 | https://conda.anaconda.org/componc/linux-64/lumpy-sv-0.2.12-h14c3975_0.tar.bz2 41 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_fft-1.0.15-py27ha843d7b_0.conda 42 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_random-1.1.0-py27hd6b4f25_0.conda 43 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-1.16.6-py27hbc911f0_0.conda 44 | -------------------------------------------------------------------------------- /docker/breakpoint/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | ADD . /app 3 | 4 | 5 | # this does not work because bioconda package for lumpy is missing a commit w fix that's needed for bed output 6 | #RUN conda create --name lumpy python=2.7 7 | #SHELL ["conda", "run", "-n", "lumpy", "/bin/bash", "-c"] 8 | #RUN conda install --file /app/conda_lumpy_requirements.yml 9 | #RUN sed 's/usr\/bin\/env python/usr\/bin\/env python2/' /opt/conda/envs/lumpy/bin/lumpy_extractSplitReads_BwaMem > /opt/conda/envs/lumpy/bin/extractSplitReads_BwaMem && chmod 777 /opt/conda/envs/lumpy/bin/extractSplitReads_BwaMem 10 | #SHELL ["conda", "run", "-n", "base", "/bin/bash", "-c"] 11 | 12 | RUN apt-get update -y && apt install autoconf make gcc zlib1g-dev libcurl3-dev libssl-dev g++ samtools -y && rm -rf /var/lib/apt/lists/* 13 | RUN git clone --recursive https://github.com/arq5x/lumpy-sv.git && cd ./lumpy-sv && make && cp bin/* /usr/local/bin/. && cp ./scripts/extractSplitReads_BwaMem /usr/local/bin/. 
14 | 15 | 16 | 17 | RUN conda install --file /app/conda_base_requirements.yml 18 | 19 | ENV PATH="${PATH}:/opt/conda/envs/lumpy/bin" 20 | 21 | RUN pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 22 | RUN pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 23 | RUN pip install git+https://github.com/shahcompbio/biowrappers.git@master 24 | RUN pip install -e git+https://github.com/amcpherson/blossomv.git@download_link_fix#egg=blossomv 25 | RUN pip install networkx==2.1 26 | RUN pip install azure-batch azure-common azure-core azure-storage-blob azure-identity 27 | -------------------------------------------------------------------------------- /docker/cohort_qc/python_requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2021.5.30 2 | cffi==1.14.5 3 | chardet==4.0.0 4 | click==7.1.2 5 | cycler==0.10.0 6 | decorator==4.4.2 7 | dill==0.3.3 8 | idna==2.10 9 | Jinja2==3.0.0 10 | kiwisolver==1.3.1 11 | mafannotator @ git+https://github.com/shahcompbio/mafannotator.git@2d773b4fe77b1408d60916ef70f47183adbba5b0 12 | MarkupSafe==2.0.0 13 | matplotlib==3.4.1 14 | networkx==2.5.1 15 | numexpr==2.7.3 16 | numpy==1.20.2 17 | pandas==1.2.4 18 | Pillow==8.2.0 19 | pycparser==2.20 20 | pyparsing==2.4.7 21 | pypeliner @ git+https://github.com/shahcompbio/pypeliner.git@b452c14c4abc6e653ac8e8f52d3c9b9a158becd1 22 | python-dateutil==2.8.1 23 | pytz==2021.1 24 | PyYAML==5.4.1 25 | requests==2.25.1 26 | rpy2==3.4.4 27 | #scgenome @ git+https://github.com/shahcompbio/scgenome.git@179017b23b423b17c9a40450927ed6bbbd21cc7b 28 | scipy==1.6.3 29 | seaborn==0.11.1 30 | # Editable install with no version control (single-cell==0.7.6+14.gc0a7879.dirty) 31 | #-e /juno/home/abramsd/miniconda3/envs/scp_cohort_qc/lib/python3.9/site-packages/single_cell-0.7.6+14.gc0a7879.dirty-py3.9.egg 32 | six==1.15.0 33 | tables==3.6.1 34 | tzlocal==2.1 35 | urllib3==1.26.4 36 | wgs-analysis @ git+https://github.com/amcpherson/wgs_analysis.git@e86b3a158f4cbc2e43fab0e24b8c2b7dded360ad 37 | #classifycopynumber @ git+https://github.com/shahcompbio/classifycopynumber.git@1c7c81ada82b885b8da6d540cd6cd3ccf2656f1e 38 | 39 | -------------------------------------------------------------------------------- /docker/haplotypes/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_haplotypes:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_haplotypes:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/haplotypes/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | 3 | ADD . 
/app 4 | 5 | RUN rm -rf /opt/conda/lib/python2.7/site-packages/remixt* && apt-get update -y && apt install libc-dev libz-dev build-essential -y && rm -rf /var/lib/apt/lists/* && conda install -c bioconda cython 6 | 7 | RUN conda install --file /app/requirements.yml 8 | 9 | RUN pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 10 | RUN pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 11 | 12 | RUN pip install git+https://github.com/amcpherson/remixt.git@0.5.13r2 13 | RUN mkdir -p /root/.config/matplotlib && echo "backend : Agg" > /root/.config/matplotlib/matplotlibrc 14 | RUN pip install azure-batch azure-common azure-core azure-storage-blob azure-identity 15 | -------------------------------------------------------------------------------- /docker/hmmcopy/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_hmmcopy:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_hmmcopy:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/hmmcopy/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | ADD . /app 3 | RUN conda install --file /app/conda_requirements.yml 4 | 5 | RUN pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 6 | RUN pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 7 | RUN pip install azure-batch azure-common azure-core azure-storage-blob azure-identity 8 | -------------------------------------------------------------------------------- /docker/qc/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_qc:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_qc:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/qc/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | 3 | ADD . 
/app 4 | 5 | RUN apt update && apt install build-essential samtools -y 6 | 7 | RUN conda install --file /app/conda_requirements.txt 8 | RUN pip install -r /app/pip_requirements.txt 9 | 10 | RUN rm -rf /opt/conda/lib/python3.7/site-packages/pypeliner* && pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 11 | RUN rm -rf /opt/conda/lib/python3.7/site-packages/single_cell* && pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 12 | RUN rm -rf /opt/conda/lib/python3.7/site-packages/biowrappers* && pip install git+https://github.com/shahcompbio/biowrappers.git@master 13 | 14 | RUN pip install git+https://github.com/amcpherson/wgs_analysis.git@v0.0.2 15 | RUN pip install git+https://github.com/shahcompbio/scgenome.git@v0.0.1 16 | RUN pip install git+https://github.com/shahcompbio/classifycopynumber.git@v0.0.5 17 | RUN pip install git+https://github.com/shahcompbio/mafannotator.git@master 18 | 19 | RUN cp /app/oncokb-annotator/MafAnnotator.py /app/oncokb-annotator/AnnotatorCore.py /opt/conda/bin && chmod 777 /opt/conda/bin/MafAnnotator.py /opt/conda/bin/AnnotatorCore.py 20 | RUN rm -rf /opt/conda/bin/samtools -------------------------------------------------------------------------------- /docker/qc/pip_requirements.txt: -------------------------------------------------------------------------------- 1 | adal==1.2.7 2 | adjusttext==0.7.3 3 | azure-batch==12.0.0 4 | azure-common==1.1.28 5 | azure-core==1.24.0 6 | azure-identity==1.10.0 7 | azure-storage-blob==12.12.0 8 | backports-zoneinfo==0.2.1 9 | brewer2mpl==1.4.1 10 | click==8.1.3 11 | cython==0.29.30 12 | dill==0.3.5.1 13 | fonttools==4.33.3 14 | hdbscan==0.8.28 15 | importlib-metadata==4.11.4 16 | isodate==0.6.1 17 | jinja2==3.1.2 18 | kiwisolver==1.4.2 19 | lda==2.0.0 20 | markupsafe==2.1.1 21 | matplotlib==3.5.2 22 | msal==1.17.0 23 | msal-extensions==1.0.0 24 | msrest==0.6.21 25 | msrestazure==0.6.4 26 | networkx==2.6.3 27 | numexpr==2.8.1 28 | numpy==1.21.6 29 | oauthlib==3.2.0 30 | packaging==21.3 31 | pandas==1.3.5 32 | pbr==3.1.1 33 | pillow==9.1.1 34 | portalocker==2.4.0 35 | pyjwt==2.4.0 36 | pytz==2022.1 37 | pytz-deprecation-shim==0.1.0.post0 38 | pyyaml==5.4.1 39 | requests-oauthlib==1.3.1 40 | rpy2==3.5.2 41 | scikit-learn==1.0.2 42 | scipy==1.7.3 43 | seaborn==0.11.2 44 | tables==3.7.0 45 | typing-extensions==4.2.0 46 | tzdata==2022.1 47 | tzlocal==4.2 48 | umap==0.1.1 49 | zipp==3.8.0 -------------------------------------------------------------------------------- /docker/sample_qc/python_requirements.txt: -------------------------------------------------------------------------------- 1 | adjustText==0.7.3 2 | brewer2mpl==1.4.1 3 | certifi==2020.12.5 4 | cffi==1.14.5 5 | click==8.0.0a1 6 | cycler==0.10.0 7 | Cython==0.29.22 8 | decorator==4.4.2 9 | dill==0.3.3 10 | hdbscan==0.8.27 11 | hmmlearn==0.2.5 12 | Jinja2==2.11.3 13 | joblib==1.0.1 14 | kiwisolver==1.3.1 15 | lda==2.0.0 16 | MarkupSafe==1.1.1 17 | matplotlib==3.4.1 18 | networkx==2.5 19 | numexpr==2.7.3 20 | numpy==1.20.2 21 | packaging==20.9 22 | pandas==1.2.3 23 | pbr==3.1.1 24 | Pillow==8.1.2 25 | pycparser==2.20 26 | pyparsing==2.4.7 27 | pypeliner @ git+https://github.com/shahcompbio/pypeliner.git@b452c14c4abc6e653ac8e8f52d3c9b9a158becd1 28 | python-dateutil==2.8.1 29 | pytz==2021.1 30 | PyYAML==5.4.1 31 | rpy2==3.4.3 32 | scgenome @ git+https://github.com/DouglasAbrams/scgenome.git@fb2e01e16bce038367d8e45184f2d14dde200fb5 33 | scikit-learn==0.24.1 34 | scipy==1.6.2 35 | seaborn==0.11.1 36 | single-cell @ 
git+https://github.com/shahcompbio/single_cell_pipeline.git@f3ac2b7b1857a64279fe2b2b8a7ae3d9c13df45d 37 | six==1.15.0 38 | sklearn==0.0 39 | tables==3.6.1 40 | threadpoolctl==2.1.0 41 | tzlocal==2.1 42 | umap==0.1.1 43 | -e git+https://github.com/amcpherson/wgs_analysis.git@c73a9bd0268b5e6fb55a8c18a58ac28e5f918482#egg=wgs_analysis 44 | -------------------------------------------------------------------------------- /docker/variant/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_variant:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_variant:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/variant/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | ADD . /app 3 | 4 | 5 | RUN conda create --name museq python=2.7 6 | SHELL ["conda", "run", "-n", "museq", "/bin/bash", "-c"] 7 | RUN conda install --file /app/conda_museq_requirements.yml 8 | RUN conda install -c bioconda variantbam 9 | SHELL ["conda", "run", "-n", "base", "/bin/bash", "-c"] 10 | 11 | RUN conda install --file /app/conda_base_requirements.yml 12 | RUN apt update -y && apt install samtools -y && rm -rf /var/lib/apt/lists/* 13 | ENV PATH="${PATH}:/opt/conda/envs/museq/bin" 14 | 15 | RUN pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 16 | RUN pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 17 | RUN pip install git+https://github.com/shahcompbio/biowrappers.git@master 18 | RUN pip install pyvcf bx-python==0.8.9 numpy==1.19.5 pandas==0.25.3 --force-reinstall 19 | RUN pip install azure-batch azure-common azure-core azure-storage-blob azure-identity 20 | -------------------------------------------------------------------------------- /docker/vcf2maf/python_requirements.txt: -------------------------------------------------------------------------------- 1 | analytics-python==1.2.9 2 | azure-core==1.0.0 3 | azure-identity==1.2.0 4 | azure-keyvault-secrets==4.0.0 5 | azure-storage-blob==1.5.0 6 | azure-storage-common==1.4.2 7 | cached-property==1.4.2 8 | certifi==2020.12.5 9 | idna==2.7 10 | msal==1.0.0 11 | msal-extensions==0.1.3 12 | msrest==0.6.10 13 | munch==2.3.2 14 | numpy==1.20.2 15 | pandas==1.2.3 16 | python-dateutil==2.8.1 17 | python-slugify==1.1.2 18 | pytz==2018.4 19 | PyYAML==5.3 20 | requests==2.19.1 21 | six==1.11.0 22 | Unidecode==1.2.0 23 | urllib3==1.23 24 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | PAPER = 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/alignment_metrics.md: -------------------------------------------------------------------------------- 1 | # QC pipeline metrics 2 | 3 | |Column|Description| 4 | |------|-----------| 5 | |cell_id|label of the cell| 6 | |index_sequence|index sequence of the adaptor sequence| 7 | |column|column of the cell on the nanowell chip| 8 | |img_col|column of the cell from the perspective of the microscope| 9 | |index_i5|id of the i5 index adapter sequence| 10 | |sample_type|type of the sample| 11 | |primer_i7|id of the i5 index primer sequence| 12 | |experimental_condition|experimental treatment of the cell, includes controls| 13 | |index_i7|id of the i7 index adapter sequence| 14 | |cell_call|living/dead classification of the cell based on staining usually, C1 == living, C2 == dead| 15 | |sample_id|name of the sample| 16 | |primer_i5|id of the i5 index primer sequence| 17 | |row|row of the cell on the nanowell chip| 18 | |estimated_library_size|scaled total number of mapped reads| 19 | |total_mapped_reads|total number of mapped reads| 20 | |nohit|number of reads with no organism match| 21 | |salmon_multihit|number of reads that were classified as salmon and something else| 22 | |total_duplicate_reads|number of duplicate reads| 23 | |percent_duplicate_reads|percentage of duplicate reads| 24 | |total_properly_paired|number of properly paired reads| 25 | |mean_insert_size|mean insert size between paired reads| 26 | |coverage_breadth|percentage of genome covered by some read| 27 | |grch37|number of reads that were classified as human| 28 | |unpaired_duplicate_reads|number of unpaired duplicated reads| 29 | |unpaired_mapped_reads|number of unpaired mapped reads| 30 | |unmapped_reads|number of unmapped reads| 31 | |coverage_depth|average reads per nucleotide position in the genome| 32 | |median_insert_size|median insert size between paired reads| 33 | |salmon|number of reads that were classified as salmon| 34 | |grch37_multihit|number of reads that were classified as human and 
something else| 35 | |mm10|number of reads that were classified as mouse| 36 | |total_reads|total number of reads, regardless of mapping status| 37 | |standard_deviation_insert_size|standard deviation of the insert size between paired reads| 38 | |paired_mapped_reads|number of mapped reads that were properly paired| 39 | |mm10_multihit|number of reads classified as mouse and something else| 40 | |paired_duplicate_reads|number of paired reads that were also marked as duplicate| 41 | -------------------------------------------------------------------------------- /docs/source/gc_metrics.md: -------------------------------------------------------------------------------- 1 | ## GC metrics table 2 | 3 | 4 | 5 | For each run, the corresponding reference sequence is divided into bins or windows based on the percentage of G + C content ranging from 0 - 100%. The data is collected with the picard tools CollectGcBiasMetrics tool and you can read more about it [here](https://gatk.broadinstitute.org/hc/en-us/articles/360036801531-CollectGcBiasMetrics-Picard-) -------------------------------------------------------------------------------- /docs/source/hmmcopy_metrics.md: -------------------------------------------------------------------------------- 1 | # hmmcopy metrics 2 | 3 | |Column|Description| 4 | |------|-----------| 5 | |multiplier|during parameter searching, the set [1..6] that was chosen| 6 | |MSRSI_non_integerness|median of segment residuals from segment integer copy number states| 7 | |MBRSI_dispersion_non_integerness|median of bin residuals from segment integer copy number states| 8 | |MBRSM_dispersion|median of bin residuals from segment median copy number values| 9 | |autocorrelation_hmmcopy|hmmcopy copy autocorrelation| 10 | |cv_hmmcopy|coefficient of variation of hmmcopy copy| 11 | |empty_bins_hmmcopy|number of empty bins in hmmcopy| 12 | |mad_hmmcopy|median absolute deviation of hmmcopy copy| 13 | |mean_hmmcopy_reads_per_bin|mean reads per hmmcopy bin| 14 | |median_hmmcopy_reads_per_bin|median reads per hmmcopy bin| 15 | |std_hmmcopy_reads_per_bin|standard deviation value of reads in hmmcopy bins| 16 | |total_mapped_reads_hmmcopy|total mapped reads in all hmmcopy bins| 17 | |total_halfiness|summed halfiness penalty score of the cell| 18 | |scaled_halfiness|summed scaled halfiness penalty score of the cell| 19 | |mean_state_mads|mean value for all median absolute deviation scores for each state| 20 | |mean_state_vars|variance value for all median absolute deviation scores for each state| 21 | |mad_neutral_state|median absolute deviation score of the neutral 2 copy state| 22 | |breakpoints|number of breakpoints, as indicated by state changes not at the ends of chromosomes| 23 | |mean_copy|mean hmmcopy copy value| 24 | |state_mode|the most commonly occurring state| 25 | |log_likelihood|hmmcopy log likelihood for the cell| 26 | |true_multiplier|the exact decimal value used to scale the copy number for segmentation| 27 | |cell_id|label of the cell| 28 | |order|order of the cell in the hierarchical clustering tree| 29 | |index_sequence|index sequence of the adaptor sequence| 30 | |column|column of the cell on the nanowell chip| 31 | |img_col|column of the cell from the perspective of the microscope| 32 | |index_i5|id of the i5 index adapter sequence| 33 | |sample_type|type of the sample| 34 | |primer_i7|id of the i5 index primer sequence| 35 | |experimental_condition|experimental treatment of the cell, includes controls| 36 | |index_i7|id of the i7 index adapter sequence| 37 | |cell_call|living/dead classification of the cell based on
staining usually, C1 == living, C2 == dead| 38 | |sample_id|name of the sample| 39 | |primer_i5|id of the i5 index primer sequence| 40 | |row|row of the cell on the nanowell chip| 41 | |is_contaminated|boolean, set to True if most reads belong to a different genome| 42 | -------------------------------------------------------------------------------- /docs/source/hmmcopy_reads.md: -------------------------------------------------------------------------------- 1 | # HMMCopy Reads 2 | 3 | |Column|Description| 4 | |------|-----------| 5 | |chr|chromosome| 6 | |start|start position| 7 | |end|end position| 8 | |width|width of the genomic segment that comprises the bin| 9 | |reads|number of reads that start in the bin| 10 | |gc|average GC content of all bases in the bin, -1 if N is present| 11 | |map|average mappability value of bin| 12 | |cor_gc|gc-corrected copy number value| 13 | |copy|final output copy number value| 14 | |valid|TRUE if reads > 0 & gc > 0, else FALSE| 15 | |ideal|TRUE if bin is VALID with good mappability and non-outlier gc and read values| 16 | |modal_curve|value of the gc-correction modal curve given the bin's gc| 17 | |modal_quantile|| 18 | |cor_map|mappability-corrected gc-corrected copy number value| 19 | |multiplier|hmmcopy parameter set used [1..6]| 20 | |state|the copy number state of the bin| 21 | |cell_id|label of the cell| 22 | |is_low_mappability|bool, set to True if the segment has a low mappability score| 23 | -------------------------------------------------------------------------------- /docs/source/hmmcopy_segments.md: -------------------------------------------------------------------------------- 1 | # HMMCopy Segments 2 | |Column|Description| 3 | |------|-----------| 4 | |chr|chromosome| 5 | |start|start position| 6 | |end|end position| 7 | |state|copy number state| 8 | |median|median copy number value of segment| 9 | |multiplier|hmmcopy parameter set used [1..6]| 10 | |cell_id|label of the cell| -------------------------------------------------------------------------------- /docs/source/organism_filter.md: -------------------------------------------------------------------------------- 1 | ## Organism Filter 2 | 3 | The pipeline uses [FastqScreen](https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/) to classify and filter non-human reads. 4 | 5 | The QC pipeline runs fastq screen on each single cell fastq pair. Fastq screen takes fastq inputs and outputs fastqs with tags added to read names. Each read in a pair is classified independently. We run our classification against the human, mouse and salmon genomes. The bam files generated by the pipeline will be tagged with the fastqscreen tag to specify the species that they belong to. 6 | 7 | | Fastq Screen Flag| Explanation| 8 | |----|----| 9 | |0|Read does not map| 10 | |1|Read maps uniquely| 11 | |2|Read multi maps| 12 | 13 | #### Fastq format 14 | Flag Format: 15 | The Flag information is appended to the read id in the fastq file. The very first read will have the following format: 16 | 17 | 18 | 19 | ``` 20 | @#FQST:grch37:mm10:salmon:100 21 | ``` 22 | In this example, the read uniquely maps to the human genome and doesn't align to the Mouse or Salmon genomes at all.
23 | 24 | All subsequent reads will have the following format: 25 | ``` 26 | @#FQST:100 27 | ``` 28 | 29 | #### Bam format 30 | 31 | Each read in the bam file will contain the following tag: 32 | 33 | ``` 34 | FS:Z:mm10_0,salmon_0,grch37_1 35 | ``` 36 | 37 | 38 | ## Pipeline features: 39 | 40 | #### Metrics: 41 | 42 | ###### Detailed Metrics: 43 | 44 | The pipeline generates a csv file with detailed counts for every flag option. The counts are also split by read direction. The table columns depend on the references that we're checking against. For instance, the table will have the following columns for a run against the Human, Mouse and Salmon genomes: 45 | 46 | * cell_id: id of the cell 47 | * read_end: end 1 or 2 of read pairs 48 | * Human: The column will have values {0,1,2}. Please see the fastq screen flag table above for details 49 | * Mouse: The column will have values {0,1,2}. Please see the fastq screen flag table above for details 50 | * Salmon: The column will have values {0,1,2}. Please see the fastq screen flag table above for details 51 | * count: number of reads 52 | 53 | ###### Summary Metrics: 54 | 55 | The pipeline will also add some summary metrics to the main alignment metrics table. The column names depend on the references. For instance, the table will have the following columns for a run against the Human, Mouse and Salmon genomes: 56 | 57 | * human: count of reads that align to the human genome (uniquely or multi-map) 58 | * human_multihit: count of reads that align to the human genome (uniquely or multi-map) and also align to another genome at the same time (uniquely or multi-map) 59 | * mouse: count of reads that align to the mouse genome (uniquely or multi-map) 60 | * mouse_multihit: count of reads that align to the mouse genome (uniquely or multi-map) and also align to another genome at the same time (uniquely or multi-map) 61 | * salmon: count of reads that align to the salmon genome (uniquely or multi-map) 62 | * salmon_multihit: count of reads that align to the salmon genome (uniquely or multi-map) and also align to another genome at the same time (uniquely or multi-map) 63 | * nohit: count of reads that do not align to any genome 64 | 65 | 66 | #### Options 67 | 68 | ###### Default functionality: 69 | 70 | By default, the pipeline does not filter the files at all. The output bam files will have the information in their read tags. 71 | 72 | 73 | ###### Filter options: 74 | 75 | * filter_contaminated_reads flag in the config file. 76 | When set, keep the following read pairs (see the sketch below): 77 | 78 | * Both R1 and R2 match human only (remove reads that match multiple references) 79 | * One of the mates matches human only and the other one doesn't match anything.
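To make the filtering rule above concrete, here is a minimal sketch of how the `FS` tag could be parsed and the keep/discard decision applied to a read pair. This is illustrative only and is not the pipeline's implementation (the real logic lives under `single_cell/workflows/align/`, e.g. `fastqscreen.py`); it assumes the tag layout shown above, uses `grch37` as the human genome label, and the function names and file path are made up for this example.

```python
# Illustrative sketch only -- not the pipeline's actual filtering code.
# Assumes reads are pysam.AlignedSegment objects carrying an FS tag of the
# form "mm10_0,salmon_0,grch37_1", as described above.
import pysam


def parse_fs_tag(read):
    """Parse an FS tag like 'mm10_0,salmon_0,grch37_1' into {genome: flag}."""
    flags = {}
    for entry in read.get_tag('FS').split(','):
        genome, flag = entry.rsplit('_', 1)
        flags[genome] = int(flag)
    return flags


def matches_human_only(flags, human='grch37'):
    """Read hits the human genome (uniquely or multi-mapped) and nothing else."""
    return flags.get(human, 0) > 0 and all(
        flag == 0 for genome, flag in flags.items() if genome != human)


def matches_nothing(flags):
    """Read does not hit any of the screened genomes."""
    return all(flag == 0 for flag in flags.values())


def keep_pair(read1, read2, human='grch37'):
    """Apply the filter_contaminated_reads rule to one read pair."""
    flags1, flags2 = parse_fs_tag(read1), parse_fs_tag(read2)
    both_human = matches_human_only(flags1, human) and matches_human_only(flags2, human)
    one_human_one_unmapped = (
        (matches_human_only(flags1, human) and matches_nothing(flags2)) or
        (matches_nothing(flags1) and matches_human_only(flags2, human)))
    return both_human or one_human_one_unmapped


if __name__ == '__main__':
    # Hypothetical usage: count kept pairs in a name-sorted cell bam, assuming
    # primary alignments arrive as consecutive mate pairs.
    kept = 0
    with pysam.AlignmentFile('cell.bam', 'rb') as bam:
        primary = (r for r in bam
                   if r.is_paired and not r.is_secondary and not r.is_supplementary)
        for read1, read2 in zip(primary, primary):
            if keep_pair(read1, read2):
                kept += 1
    print('kept read pairs:', kept)
```

As described above, the default mode drops nothing; a decision like this would only be applied when `filter_contaminated_reads` is set in the config.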
-------------------------------------------------------------------------------- /docs/source/quality_classifier.md: -------------------------------------------------------------------------------- 1 | # Cell Quality Classifier 2 | 3 | 4 | |Feature name|Source|Description | 5 | | ----| ----|----| 6 | |percent_duplicate_reads|picard|percentage of reads marked as PCR duplicate by MarkDuplicates| 7 | |total_mapped_reads | samtools|number of reads mapped by the bwa mem alignment algorithm | 8 | |total_duplicate_reads | samtools|number of reads marked as PCR duplicate by MarkDuplicates | 9 | |standard_deviation_insert_size| picard| read insert size standard deviation | 10 | |MSRSI_non_integerness| hmmcopy| median of segment residuals from segment integer copy number states| 11 | |MBRSI_dispersion_non_integerness| hmmcopy| median of bin residuals from segment integer copy number states| 12 | |MBRSM_dispersion| hmmcopy | median of bin residuals from segment median copy number values| 13 | |autocorrelation_hmmcopy| hmmcopy | autocorrelation of CNV results| 14 | |cv_hmmcopy| hmmcopy| coefficient of variation of CNV results| 15 | |mad_hmmcopy| hmmcopy| median absolute deviation of CNV results| 16 | |total_halfiness|hmmcopy | halfiness score but without copy number state scaling| 17 | |scaled_halfiness| hmmcopy| a scaled metric to assess integer goodness of fit, described in text| 18 | |mean_state_mads| hmmcopy| the mean across all MADs of each copy number state| 19 | |mean_state_vars| hmmcopy| the mean across all variances of each copy number state| 20 | |breakpoints| hmmcopy| number of intrachromosomal breakpoints| 21 | |mean_copy| hmmcopy| mean copy number of all genomic bin segments| 22 | |state_mode| hmmcopy| the most commonly occurring copy number state| 23 | |log_likelihood| hmmcopy| log-likelihood of HMMcopy CNV fit| 24 | 25 | 26 | ## Percent Duplicate Reads 27 | 28 | Calculated from the output of Mark Duplicates from picard tools.
Please see [mark duplicates](http://broadinstitute.github.io/picard/picard-metric-definitions.html#DuplicationMetrics) 29 | 30 | 31 | Formula: 32 | 33 | UNPAIRED_READ_DUPLICATES + ((READ_PAIR_DUPLICATES + READ_PAIR_OPTICAL_DUPLICATES)*2) / (UNPAIRED_READS_EXAMINED + (READ_PAIRS_EXAMINED * 2)) 34 | 35 | -------------------------------------------------------------------------------- /docs/source/readme_data/alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/alignment.png -------------------------------------------------------------------------------- /docs/source/readme_data/annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/annotation.png -------------------------------------------------------------------------------- /docs/source/readme_data/breakpoint_calling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/breakpoint_calling.png -------------------------------------------------------------------------------- /docs/source/readme_data/breakpoint_calling.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | \begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (normal_bams) [io, text width=3cm] {Normal (WGS/cell) Bam}; 27 | \node (tumour_bams) [io, text width=3cm, xshift=6cm] {Tumour Cell Bams}; 28 | 29 | \node (destruct) [process, below of=normal_bams] {destruct}; 30 | \draw[-latex] (normal_bams) -- coordinate (ab) (destruct); 31 | \draw[-latex] (tumour_bams) -- coordinate (ab) (destruct); 32 | 33 | \node (lumpy) [process, below of=tumour_bams] {Lumpy}; 34 | \draw[-latex] (normal_bams) -- coordinate (ab) (lumpy); 35 | \draw[-latex] (tumour_bams) -- coordinate (ab) (lumpy); 36 | 37 | \node (lumpy_bed) [io, below of=lumpy, text 
width=2cm] {Breakpoints}; 38 | \draw[-latex] (lumpy) -- coordinate (ab) (lumpy_bed); 39 | 40 | \node (lumpy_csv) [io, below of=lumpy, text width=2cm, xshift=4cm] {Breakpoints csv}; 41 | \draw[-latex] (lumpy) -- coordinate (ab) (lumpy_csv); 42 | 43 | \node (lumpy_counts) [io, below of=lumpy, text width=1.5cm, xshift=8cm] {Cell counts}; 44 | \draw[-latex] (lumpy) -- coordinate (ab) (lumpy_counts); 45 | 46 | 47 | 48 | \node (destruct_breakpoints) [io, below of=destruct, text width=2cm] {Breakpoints}; 49 | \draw[-latex] (destruct) -- coordinate (ab) (destruct_breakpoints); 50 | 51 | \node (destruct_breakpoints_lib) [io, below of=destruct, text width=2cm, xshift=-4cm] {Breakpoints Library}; 52 | \draw[-latex] (destruct) -- coordinate (ab) (destruct_breakpoints_lib); 53 | 54 | \node (destruct_counts) [io, below of=destruct, text width=1.5cm, xshift=-8cm] {Cell counts}; 55 | \draw[-latex] (destruct) -- coordinate (ab) (destruct_counts); 56 | 57 | \end{tikzpicture} 58 | 59 | \end{adjustbox} 60 | \end{document} -------------------------------------------------------------------------------- /docs/source/readme_data/dlp_cohort_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/dlp_cohort_pipeline.png -------------------------------------------------------------------------------- /docs/source/readme_data/germline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/germline.png -------------------------------------------------------------------------------- /docs/source/readme_data/germline.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | \begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (normal_bams) [io, text width=3cm] {Normal Region Bams}; 27 | 28 | \node (samtools_germline) [process, below of=normal_bams] {Samtools Germline Calling}; 29 | \draw[-latex] (normal_bams) -- coordinate (ab) (samtools_germline); 30 
| 31 | \node (germline_out) [io, below of=normal_bams, text width=1.5cm, xshift=4cm] {Germline vcf}; 32 | \draw[-latex] (samtools_germline) -- coordinate (ab) (germline_out); 33 | 34 | \node (mapp) [process, below of=samtools_germline] {Annotate Mappability}; 35 | \draw[-latex] (samtools_germline) -- coordinate (ab) (mapp); 36 | 37 | \node (mapp_out) [io, below of=mapp, text width=2cm] {Mappability vcf}; 38 | \draw[-latex] (mapp) -- coordinate (ab) (mapp_out); 39 | 40 | \node (geno) [process, below of=samtools_germline, xshift=4cm] {Annotate Genotype}; 41 | \draw[-latex] (samtools_germline) -- coordinate (ab) (geno); 42 | 43 | \node (geno_out) [io, below of=geno, text width=2cm] {Genotype vcf}; 44 | \draw[-latex] (geno) -- coordinate (ab) (geno_out); 45 | 46 | \node (snpeff) [process, below of=samtools_germline, xshift=-4cm] {Snpeff}; 47 | \draw[-latex] (samtools_germline) -- coordinate (ab) (snpeff); 48 | 49 | \node (snpeff_out) [io, below of=snpeff, text width=1.5cm] {snpeff vcf}; 50 | \draw[-latex] (snpeff) -- coordinate (ab) (snpeff_out); 51 | 52 | \end{tikzpicture} 53 | 54 | \end{adjustbox} 55 | \end{document} -------------------------------------------------------------------------------- /docs/source/readme_data/hmmcopy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/hmmcopy.png -------------------------------------------------------------------------------- /docs/source/readme_data/hmmcopy.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | \begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (in1) [io] {Input}; 27 | 28 | \node (readcounter) [pprocess, below of=in1] {Readcounter}; 29 | \draw[-latex] (in1) -- coordinate (ab) (readcounter); 30 | \draw (ab) -- ++(0.5,-0.3)coordinate[pos=.3](ab1) coordinate[pos=.6](ab2); 31 | \draw[-latex] (ab1) -- ($(readcounter.north east)!(ab1)!(readcounter.north west)$); 32 | \draw[-latex] (ab2) -- ($(readcounter.north west)!(ab2)!(readcounter.north east)$); 33 | 34 | \node (hmmcopy) [pprocess, below of=readcounter] {HMMCopy}; 35 | 
\draw[-latex] (readcounter.south) -- (hmmcopy.north); 36 | \draw[-latex] ([xshift=0.2 cm]readcounter.south) -- ([xshift=0.2 cm]hmmcopy.north); 37 | \draw[-latex] ([xshift=-0.2 cm]readcounter.south) -- ([xshift=-0.2 cm]hmmcopy.north); 38 | 39 | \node (segs) [io, below of=hmmcopy, xshift = 3cm] {Segments}; 40 | \draw[-latex] (hmmcopy) -- (segs.north); 41 | 42 | \node (params) [io, below of=hmmcopy, xshift = 7cm] {Params}; 43 | \draw[-latex] (hmmcopy) -- (params.north); 44 | 45 | \node (reads) [io, below of=hmmcopy, xshift = -3cm] {Reads}; 46 | \draw[-latex] (hmmcopy) -- (reads.north); 47 | 48 | \node (metrics) [io, below of=hmmcopy, xshift = -7cm] {Metrics}; 49 | \draw[-latex] (hmmcopy) -- (metrics.north); 50 | 51 | \node (merge_metrics) [decision, below of=hmmcopy, yshift=-2cm] {Merge}; 52 | \draw[-latex] (metrics.south) -- (merge_metrics); 53 | \draw[-latex] (params.south) -- (merge_metrics); 54 | \draw[-latex] (reads.south) -- (merge_metrics); 55 | \draw[-latex] (segs.south) -- (merge_metrics); 56 | 57 | \node (plot_metrics) [process, below of=merge_metrics,xshift=7cm] {Plot metrics}; 58 | \draw[-latex] (merge_metrics) -- (plot_metrics.north); 59 | 60 | \node (plot_kernel) [process, below of=merge_metrics, xshift=3cm] {Plot kernel density}; 61 | \draw[-latex] (merge_metrics) -- (plot_kernel.north); 62 | 63 | 64 | \node (plot_hmap) [process, below of=merge_metrics,xshift=-3cm] {Plot heatmap}; 65 | \draw[-latex] (merge_metrics) -- (plot_hmap.north); 66 | 67 | \node (plot_hmm) [process, below of=merge_metrics,xshift=-7cm] {Plot hmmcopy}; 68 | \draw[-latex] (merge_metrics) -- (plot_hmm.north); 69 | 70 | \end{tikzpicture} 71 | 72 | \end{adjustbox} 73 | \end{document} -------------------------------------------------------------------------------- /docs/source/readme_data/infer_haps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/infer_haps.png -------------------------------------------------------------------------------- /docs/source/readme_data/infer_haps.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | 
\begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (normal_bams) [io, text width=3cm] {Normal (WGS/cell) Bam}; 27 | \node (tumour_bams) [io, text width=3cm, xshift=6cm] {Tumour Cell Bams}; 28 | 29 | \node (infer_haps) [process, below of=normal_bams] {Infer Haps}; 30 | \draw[-latex] (normal_bams) -- coordinate (ab) (infer_haps); 31 | 32 | \node (readcounts) [process, below of=tumour_bams] {Extract Read Counts}; 33 | \draw[-latex] (tumour_bams) -- coordinate (ab) (readcounts); 34 | \draw[-latex] (infer_haps) -- coordinate (ab) (readcounts); 35 | 36 | \node (haplotypes) [io, below of=infer_haps, text width=2cm] {Haplotypes}; 37 | \draw[-latex] (infer_haps) -- coordinate (ab) (haplotypes); 38 | 39 | \node (allele_counts) [io, below of=readcounts, text width=2cm] {Read Counts}; 40 | \draw[-latex] (readcounts) -- coordinate (ab) (allele_counts); 41 | 42 | \end{tikzpicture} 43 | 44 | \end{adjustbox} 45 | \end{document} -------------------------------------------------------------------------------- /docs/source/readme_data/merge_cell_bams.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/merge_cell_bams.png -------------------------------------------------------------------------------- /docs/source/readme_data/merge_cell_bams.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | \begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (cell_bams) [io, text width=2.5cm] {Cell Bam files}; 27 | 28 | \node (merge_bams) [process, below of=cell_bams] {merge bams}; 29 | \draw[-latex] (cell_bams) -- coordinate (ab) (merge_bams); 30 | 31 | \node (region_bams) [io, right of=merge_bams, xshift=4cm] {Region bams}; 32 | \draw[-latex] (merge_bams) -- coordinate (ab) (region_bams); 33 | 34 | 35 | \node (index_bams) [process, below of=merge_bams] {index bams}; 36 | \draw[-latex] (merge_bams) -- coordinate (ab) (index_bams); 37 | 38 | \node (region_bams_index) [io, right of=index_bams, xshift=4cm, text width=3cm] {Region bam Indexes}; 39 | \draw[-latex] 
(index_bams) -- coordinate (ab) (region_bams_index); 40 | 41 | 42 | \end{tikzpicture} 43 | 44 | \end{adjustbox} 45 | \end{document} -------------------------------------------------------------------------------- /docs/source/readme_data/pseudo_bulk_qc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/pseudo_bulk_qc.png -------------------------------------------------------------------------------- /docs/source/readme_data/split_wgs_bam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/split_wgs_bam.png -------------------------------------------------------------------------------- /docs/source/readme_data/split_wgs_bam.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | \begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (wgs_bam) [io] {WGS bam}; 27 | 28 | \node (split_bam) [process, below of=wgs_bam] {split bam}; 29 | \draw[-latex] (wgs_bam) -- coordinate (ab) (split_bam); 30 | 31 | \node (region_bams) [io, right of=split_bam, xshift=4cm] {Region bams}; 32 | \draw[-latex] (split_bam) -- coordinate (ab) (region_bams); 33 | 34 | 35 | \node (index_bams) [process, below of=split_bam] {index bams}; 36 | \draw[-latex] (split_bam) -- coordinate (ab) (index_bams); 37 | 38 | \node (region_bams_index) [io, right of=index_bams, xshift=4cm, text width=3cm] {Region bam Indexes}; 39 | \draw[-latex] (index_bams) -- coordinate (ab) (region_bams_index); 40 | 41 | 42 | \end{tikzpicture} 43 | 44 | \end{adjustbox} 45 | \end{document} -------------------------------------------------------------------------------- /docs/source/readme_data/variant_calling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/variant_calling.png 
-------------------------------------------------------------------------------- /docs/source/readme_data/variant_counting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/variant_counting.png -------------------------------------------------------------------------------- /docs/source/readme_data/variant_counting.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | \begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (museq_vcf) [io, text width=1.5cm, xshift=4cm] {Museq vcf file}; 27 | \node (strelka_vcf) [io, text width=1.5cm, xshift=8cm] {strelka vcf}; 28 | 29 | \node (merge_snv) [decision, text width=1.5cm, below of=museq_vcf, xshift=2cm] {Merge calls}; 30 | \draw[-latex] (museq_vcf) -- coordinate (ab) (merge_snv); 31 | \draw[-latex] (strelka_vcf) -- coordinate (ab) (merge_snv); 32 | 33 | \node (tumour_bams) [io, text width=2cm, below of=merge_snv, xshift=-4cm, yshift=-1cm] {Tumour Region Bams}; 34 | 35 | \node (counting) [pprocess, text width=1.5cm, below of=merge_snv, yshift=-1cm] {Generate Count}; 36 | \draw[-latex] (merge_snv) -- coordinate (ab) (counting); 37 | \draw[-latex] (tumour_bams) -- coordinate (ab) (counting); 38 | 39 | \node (counts) [io, text width=1.5cm, below of=counting] {Counts csv}; 40 | \draw[-latex] (counting) -- coordinate (ab) (counts); 41 | 42 | \end{tikzpicture} 43 | 44 | \end{adjustbox} 45 | \end{document} -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | [versioneer] 4 | VCS = git 5 | style = pep440 6 | versionfile_source = single_cell/_version.py 7 | versionfile_build = single_cell/_version.py 8 | tag_prefix = v -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import versioneer 3 | 4 | 5 | setup( 6 | 
name='single_cell', 7 | packages=find_packages(), 8 | version=versioneer.get_version(), 9 | cmdclass=versioneer.get_cmdclass(), 10 | description='Single cell pipeline', 11 | author='Andrew McPherson', 12 | author_email='andrew.mcpherson@gmail.com', 13 | entry_points={'console_scripts': ['single_cell = single_cell.run:main']}, 14 | package_data={'':['scripts/*.py', 'scripts/*.R', 'scripts/*.npz', "config/*.yaml", 'scripts/*.Rmd', 'scripts/*.sh', "data/*"]} 15 | ) 16 | -------------------------------------------------------------------------------- /single_cell/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from ._version import get_versions 3 | __version__ = get_versions()['version'] 4 | del get_versions 5 | -------------------------------------------------------------------------------- /single_cell/annotation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Feb 19, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | import os 7 | import sys 8 | 9 | import pypeliner.managed as mgd 10 | from single_cell.utils import inpututils 11 | from single_cell.workflows import qc_annotation 12 | 13 | import pypeliner 14 | 15 | 16 | def annotation_workflow(args): 17 | config = inpututils.load_config(args) 18 | 19 | annotation_infiles = inpututils.load_yaml(args['input_yaml']) 20 | 21 | lib = args["library_id"] 22 | 23 | workflow = pypeliner.workflow.Workflow() 24 | 25 | annotation_dir = args["output_prefix"] 26 | 27 | input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml') 28 | annotation_files = get_output_files(annotation_dir) 29 | annotation_meta = os.path.join(args['out_dir'], 'metadata.yaml') 30 | 31 | workflow.subworkflow( 32 | name='annotation_workflow', 33 | func=qc_annotation.create_qc_annotation_workflow, 34 | args=( 35 | mgd.InputFile(annotation_infiles['hmmcopy_metrics']), 36 | mgd.InputFile(annotation_infiles['hmmcopy_reads']), 37 | mgd.InputFile(annotation_infiles['alignment_metrics']), 38 | mgd.InputFile(annotation_infiles['gc_metrics']), 39 | mgd.InputFile(annotation_infiles['segs_pdf_tar']), 40 | mgd.OutputFile(annotation_files['merged_metrics_csvs']), 41 | mgd.OutputFile(annotation_files['qc_report']), 42 | mgd.OutputFile(annotation_files['segs_pass']), 43 | mgd.OutputFile(annotation_files['segs_fail']), 44 | mgd.OutputFile(annotation_files['heatmap_filt_pdf']), 45 | config['annotation'], 46 | ) 47 | ) 48 | 49 | workflow.transform( 50 | name='generate_meta_files_results', 51 | func='single_cell.utils.helpers.generate_and_upload_metadata', 52 | args=( 53 | sys.argv[0:], 54 | args['out_dir'], 55 | list(annotation_files.values()), 56 | mgd.OutputFile(annotation_meta) 57 | ), 58 | kwargs={ 59 | 'input_yaml_data': inpututils.load_yaml(args['input_yaml']), 60 | 'input_yaml': mgd.OutputFile(input_yaml_blob), 61 | 'metadata': { 62 | 'library_id': lib, 63 | 'type': 'annotation' 64 | } 65 | } 66 | ) 67 | 68 | return workflow 69 | 70 | 71 | def get_output_files(outdir): 72 | data = { 73 | 'merged_metrics_csvs': outdir + 'metrics.csv.gz', 74 | 'qc_report': outdir + 'QC_report.html', 75 | 'segs_pass': outdir + 'segs_pass.tar.gz', 76 | 'segs_fail': outdir + 'segs_fail.tar.gz', 77 | 'heatmap_filt_pdf': outdir + 'heatmap_by_ec_filtered.pdf', 78 | } 79 | 80 | return data 81 | 82 | 83 | def annotation_pipeline(args): 84 | pyp = pypeliner.app.Pypeline(config=args) 85 | 86 | workflow = annotation_workflow(args) 87 | 88 | pyp.run(workflow) 89 | 
-------------------------------------------------------------------------------- /single_cell/clean_sentinels.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 9, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | import fnmatch 8 | import os 9 | 10 | from pypeliner.sqlitedb import SqliteDb 11 | 12 | 13 | def clean_sentinels(args): 14 | dirname = args["pipelinedir"] 15 | 16 | rundir, pattern = args["pattern"] 17 | 18 | rundir = os.path.join(dirname, rundir) 19 | 20 | if args["mode"] == "list": 21 | list_sentinels(rundir, pattern) 22 | else: 23 | delete_sentinels(rundir, pattern) 24 | 25 | 26 | def list_sentinels(dirname, pattern): 27 | jobs_shelf = os.path.join(dirname, "jobs.db") 28 | 29 | jobs = SqliteDb(jobs_shelf) 30 | 31 | job_matches = [v for v in jobs.keys() if fnmatch.fnmatch(v, pattern)] 32 | 33 | jobs.close() 34 | 35 | matches = job_matches 36 | 37 | matches = '\n'.join(matches) 38 | 39 | print(matches) 40 | 41 | 42 | def delete_sentinels(dirname, pattern): 43 | jobs_shelf = os.path.join(dirname, "jobs.db") 44 | 45 | jobs = SqliteDb(jobs_shelf) 46 | 47 | for job in jobs.keys(): 48 | if fnmatch.fnmatch(job, pattern): 49 | jobs.delete(job) 50 | 51 | jobs.close() 52 | -------------------------------------------------------------------------------- /single_cell/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/single_cell/config/__init__.py -------------------------------------------------------------------------------- /single_cell/config/generate_batch_config.py: -------------------------------------------------------------------------------- 1 | 2 | import logging 3 | import os 4 | 5 | from single_cell.config import batch 6 | from single_cell.utils import helpers 7 | 8 | 9 | def generate_submit_config_in_temp(args): 10 | 11 | if args['which'] in ['clean_sentinels', 'generate_config']: 12 | return args 13 | 14 | if args.get("submit_config", None): 15 | return args 16 | 17 | azure_submit = ['azurebatch', 18 | 'pypeliner.contrib.azure.batchqueue.AzureJobQueue'] 19 | if not args.get("submit", None) in azure_submit: 20 | return args 21 | 22 | batch_yaml = "batch.yaml" 23 | tmpdir = args.get("tmpdir", None) 24 | pipelinedir = args.get("pipelinedir", None) 25 | 26 | # use pypeliner tmpdir to store yaml 27 | if pipelinedir: 28 | batch_yaml = os.path.join(pipelinedir, batch_yaml) 29 | elif tmpdir: 30 | batch_yaml = os.path.join(tmpdir, batch_yaml) 31 | else: 32 | logging.getLogger("single_cell.generate_batch_config").warn( 33 | "no tmpdir specified, generating configs in working dir" 34 | ) 35 | batch_yaml = os.path.join(os.getcwd(), batch_yaml) 36 | 37 | helpers.makedirs(batch_yaml, isfile=True) 38 | 39 | batch_yaml = helpers.get_incrementing_filename(batch_yaml) 40 | 41 | params_override = args["config_override"] 42 | if not params_override: 43 | params_override = {} 44 | 45 | config_params = batch.get_batch_params(override=params_override) 46 | config = batch.get_batch_config(config_params, override=params_override) 47 | batch.write_config(config, batch_yaml) 48 | 49 | args["submit_config"] = batch_yaml 50 | 51 | return args 52 | -------------------------------------------------------------------------------- /single_cell/config/generate_pipeline_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 |
from single_cell.config import pipeline_config 4 | from single_cell.utils import helpers 5 | 6 | 7 | def generate_pipeline_config_in_temp(args): 8 | 9 | if args['which'] in ['clean_sentinels', 'generate_config']: 10 | return args 11 | 12 | if args.get("config_file", None): 13 | return args 14 | 15 | config_yaml = "config.yaml" 16 | tmpdir = args.get("tmpdir", None) 17 | pipelinedir = args.get("pipelinedir", None) 18 | 19 | # use pypeliner tmpdir to store yaml 20 | if pipelinedir: 21 | config_yaml = os.path.join(pipelinedir, config_yaml) 22 | elif tmpdir: 23 | config_yaml = os.path.join(tmpdir, config_yaml) 24 | else: 25 | logging.getLogger("single_cell.generate_pipeline_config").warn( 26 | "no tmpdir specified, generating configs in working dir" 27 | ) 28 | config_yaml = os.path.join(os.getcwd(), config_yaml) 29 | 30 | config_yaml = helpers.get_incrementing_filename(config_yaml) 31 | 32 | params_override = args["config_override"] 33 | 34 | helpers.makedirs(config_yaml, isfile=True) 35 | 36 | config_params = pipeline_config.get_config_params(override=params_override) 37 | config = pipeline_config.get_singlecell_pipeline_config(config_params, override=params_override) 38 | pipeline_config.write_config(config, config_yaml) 39 | 40 | args["config_file"] = config_yaml 41 | 42 | return args 43 | -------------------------------------------------------------------------------- /single_cell/generate_config.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 9, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | from single_cell.config import pipeline_config 8 | from single_cell.config import batch 9 | 10 | 11 | def generate_config(args): 12 | config_yaml = args.get("pipeline_config") 13 | batch_yaml = args.get("batch_config") 14 | params_override = args.get("config_override") 15 | 16 | if config_yaml: 17 | config_params = pipeline_config.get_config_params(override=params_override) 18 | config = pipeline_config.get_singlecell_pipeline_config(config_params) 19 | pipeline_config.write_config(config, config_yaml) 20 | 21 | if batch_yaml: 22 | config_params = batch.get_batch_params(override=params_override) 23 | config = batch.get_batch_config(config_params) 24 | batch.write_config(config, batch_yaml) 25 | -------------------------------------------------------------------------------- /single_cell/hmmcopy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Feb 19, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | import os 7 | import sys 8 | 9 | import pypeliner 10 | import pypeliner.managed as mgd 11 | from single_cell.utils import inpututils 12 | from single_cell.workflows import hmmcopy 13 | 14 | 15 | def get_output_files(outdir): 16 | data = { 17 | 'reads_csvs': outdir + 'reads.csv.gz', 18 | 'segs_csvs': outdir + 'segments.csv.gz', 19 | 'params_csvs': outdir + 'params.csv.gz', 20 | 'metrics_csvs': outdir + 'hmmcopy_metrics.csv.gz', 21 | 'hmmcopy_data_tar': outdir + 'hmmcopy_data.tar.gz', 22 | 'igv_csvs': outdir + 'igv_segments.seg', 23 | 'segs_pdf': outdir + 'segs.tar.gz', 24 | 'bias_pdf': outdir + 'bias.tar.gz', 25 | 'heatmap_pdf': outdir + 'heatmap_by_ec.pdf', 26 | 'metrics_pdf': outdir + 'hmmcopy_metrics.pdf', 27 | 'kernel_density_pdf': outdir + 'kernel_density.pdf', 28 | } 29 | 30 | return data 31 | 32 | 33 | def hmmcopy_workflow(args): 34 | config = inpututils.load_config(args) 35 | config = config['hmmcopy'] 36 | 37 | sampleinfo = inpututils.get_sample_info(args['input_yaml']) 38 | cellids = 
inpututils.get_samples(args['input_yaml']) 39 | bam_files = inpututils.get_bams(args['input_yaml']) 40 | 41 | lib = args["library_id"] 42 | 43 | workflow = pypeliner.workflow.Workflow() 44 | 45 | hmmcopy_prefix = args["output_prefix"] 46 | 47 | hmmcopy_files = get_output_files(hmmcopy_prefix) 48 | hmmcopy_meta = os.path.join(args['out_dir'], 'metadata.yaml') 49 | input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml') 50 | 51 | workflow.setobj( 52 | obj=mgd.OutputChunks('cell_id'), 53 | value=list(bam_files.keys()), 54 | ) 55 | 56 | workflow.subworkflow( 57 | name='hmmcopy_workflow', 58 | func=hmmcopy.create_hmmcopy_workflow, 59 | args=( 60 | mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files, extensions=['.bai']), 61 | mgd.OutputFile(hmmcopy_files['reads_csvs']), 62 | mgd.OutputFile(hmmcopy_files['segs_csvs']), 63 | mgd.OutputFile(hmmcopy_files['metrics_csvs']), 64 | mgd.OutputFile(hmmcopy_files['params_csvs']), 65 | mgd.OutputFile(hmmcopy_files['igv_csvs']), 66 | mgd.OutputFile(hmmcopy_files['segs_pdf']), 67 | mgd.OutputFile(hmmcopy_files['bias_pdf']), 68 | mgd.OutputFile(hmmcopy_files['heatmap_pdf']), 69 | mgd.OutputFile(hmmcopy_files['metrics_pdf']), 70 | mgd.OutputFile(hmmcopy_files['kernel_density_pdf']), 71 | mgd.OutputFile(hmmcopy_files['hmmcopy_data_tar']), 72 | cellids, 73 | config, 74 | sampleinfo 75 | ), 76 | ) 77 | 78 | workflow.transform( 79 | name='generate_meta_files_results', 80 | func='single_cell.utils.helpers.generate_and_upload_metadata', 81 | args=( 82 | sys.argv[0:], 83 | args['out_dir'], 84 | list(hmmcopy_files.values()), 85 | mgd.OutputFile(hmmcopy_meta) 86 | ), 87 | kwargs={ 88 | 'input_yaml_data': inpututils.load_yaml(args['input_yaml']), 89 | 'input_yaml': mgd.OutputFile(input_yaml_blob), 90 | 'metadata': { 91 | 'library_id': lib, 92 | 'cell_ids': list(bam_files.keys()), 93 | 'type': 'hmmcopy', 94 | } 95 | } 96 | ) 97 | 98 | return workflow 99 | 100 | 101 | def hmmcopy_pipeline(args): 102 | pyp = pypeliner.app.Pypeline(config=args) 103 | 104 | workflow = hmmcopy_workflow(args) 105 | 106 | pyp.run(workflow) 107 | -------------------------------------------------------------------------------- /single_cell/merge_bams.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Feb 22, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | import os 7 | import sys 8 | 9 | import pypeliner 10 | import pypeliner.managed as mgd 11 | from single_cell.utils import inpututils 12 | from single_cell.workflows import merge_bams 13 | 14 | 15 | def merge_bams_workflow(args): 16 | config = inpututils.load_config(args) 17 | config = config['merge_bams'] 18 | 19 | ctx = {'mem_retry_increment': 2, 'disk_retry_increment': 50, 20 | 'ncpus': 1, 'mem': config["memory"]['low']} 21 | workflow = pypeliner.workflow.Workflow(ctx=ctx) 22 | 23 | bam_files = inpututils.load_merge_cell_bams(args['input_yaml']) 24 | 25 | merge_out_template = args['output_prefix'] + '{region}.bam' 26 | meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml') 27 | input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml') 28 | 29 | workflow.setobj( 30 | obj=mgd.OutputChunks('cell_id'), 31 | value=list(bam_files.keys()), 32 | ) 33 | 34 | workflow.transform( 35 | name="get_regions", 36 | func="single_cell.utils.pysamutils.get_regions_from_reference", 37 | ret=pypeliner.managed.OutputChunks('region'), 38 | args=( 39 | config["ref_genome"], 40 | config["split_size"], 41 | config["chromosomes"], 42 | ) 43 | ) 44 | 45 | workflow.transform( 46 | 
name="remove_softclipped_reads", 47 | func="single_cell.utils.pysamutils.remove_softclipped_reads", 48 | axes=('cell_id',), 49 | args=( 50 | mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files, extensions=['.bai']), 51 | mgd.TempOutputFile('bam_rm_softclipped.bam', 'cell_id', extensions=['.bai']), 52 | args['softclipped_reads_threshold'] 53 | ) 54 | ) 55 | 56 | workflow.subworkflow( 57 | name="wgs_merge_workflow", 58 | func=merge_bams.create_merge_bams_workflow, 59 | args=( 60 | mgd.TempInputFile('bam_rm_softclipped.bam', 'cell_id', extensions=['.bai']), 61 | mgd.OutputFile("merged.bam", "region", axes_origin=[], extensions=['.bai'], template=merge_out_template), 62 | mgd.InputChunks("region"), 63 | config, 64 | ) 65 | ) 66 | 67 | workflow.transform( 68 | name='generate_meta_files_results', 69 | func='single_cell.utils.helpers.generate_and_upload_metadata', 70 | args=( 71 | sys.argv[0:], 72 | args['out_dir'], 73 | mgd.Template('bam_filenames', 'region', template=merge_out_template), 74 | mgd.OutputFile(meta_yaml) 75 | ), 76 | kwargs={ 77 | 'input_yaml_data': inpututils.load_yaml(args['input_yaml']), 78 | 'input_yaml': mgd.OutputFile(input_yaml_blob), 79 | 'template': (mgd.InputChunks('region'), merge_out_template, 'region'), 80 | 'metadata': { 81 | 'type': 'pseudowgs_regionbams', 82 | 'cell_ids': list(bam_files.keys())} 83 | 84 | } 85 | ) 86 | 87 | return workflow 88 | 89 | 90 | def merge_bams_pipeline(args): 91 | pyp = pypeliner.app.Pypeline(config=args) 92 | 93 | workflow = merge_bams_workflow(args) 94 | 95 | pyp.run(workflow) 96 | -------------------------------------------------------------------------------- /single_cell/run.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from single_cell.alignment import alignment_pipeline 4 | from single_cell.annotation import annotation_pipeline 5 | from single_cell.breakpoint_calling import breakpoint_calling_pipeline 6 | from single_cell.clean_sentinels import clean_sentinels 7 | from single_cell.cmdline import parse_args 8 | from single_cell.generate_config import generate_config 9 | from single_cell.germline_calling import germline_calling_pipeline 10 | from single_cell.hmmcopy import hmmcopy_pipeline 11 | from single_cell.infer_haps import count_haps_pipeline 12 | from single_cell.infer_haps import infer_haps_pipeline 13 | from single_cell.merge_bams import merge_bams_pipeline 14 | from single_cell.sample_qc import sample_qc_pipeline 15 | from single_cell.snv_genotyping import snv_genotyping_pipeline 16 | from single_cell.split_bam import split_bam_pipeline 17 | from single_cell.sv_genotyping import sv_genotyping_pipeline 18 | from single_cell.variant_calling import variant_calling_pipeline 19 | from single_cell.cohort_qc import cohort_qc_pipeline 20 | 21 | 22 | def main(): 23 | args = parse_args() 24 | 25 | if args["which"] == "generate_config": 26 | generate_config(args) 27 | return 28 | 29 | if args["which"] == "clean_sentinels": 30 | clean_sentinels(args) 31 | return 32 | 33 | if args["which"] == "alignment": 34 | alignment_pipeline(args) 35 | 36 | if args["which"] == "hmmcopy": 37 | hmmcopy_pipeline(args) 38 | 39 | if args["which"] == "annotation": 40 | annotation_pipeline(args) 41 | 42 | if args["which"] == "merge_cell_bams": 43 | merge_bams_pipeline(args) 44 | 45 | if args["which"] == "split_wgs_bam": 46 | split_bam_pipeline(args) 47 | 48 | if args["which"] == "variant_calling": 49 | variant_calling_pipeline(args) 50 | 51 | if args["which"] == "germline_calling": 52 | 
germline_calling_pipeline(args) 53 | 54 | if args["which"] == "infer_haps": 55 | infer_haps_pipeline(args) 56 | 57 | if args["which"] == "count_haps": 58 | count_haps_pipeline(args) 59 | 60 | if args["which"] == "breakpoint_calling": 61 | breakpoint_calling_pipeline(args) 62 | 63 | if args["which"] == "snv_genotyping": 64 | snv_genotyping_pipeline(args) 65 | 66 | if args["which"] == "sv_genotyping": 67 | sv_genotyping_pipeline(args) 68 | 69 | if args["which"] == "sample_qc": 70 | sample_qc_pipeline(args) 71 | 72 | if args["which"] == "cohort_qc": 73 | cohort_qc_pipeline(args) 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /single_cell/snv_genotyping.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pypeliner 5 | import pypeliner.managed as mgd 6 | from single_cell.utils import inpututils 7 | 8 | 9 | def create_variant_counting_workflow(args): 10 | """ Count variant reads for multiple sets of variants across cells. 11 | """ 12 | 13 | vcf_files, tumour_cell_bams, sample_library = inpututils.load_variant_counting_input( 14 | args['input_yaml'] 15 | ) 16 | 17 | counts_template = '{sample_id}_{library_id}_counts.csv.gz' 18 | counts_output_template = args['output_prefix'] + counts_template 19 | 20 | meta_yaml = os.path.join(args["out_dir"], 'metadata.yaml') 21 | input_yaml_blob = os.path.join(args["out_dir"], 'input.yaml') 22 | 23 | config = inpututils.load_config(args) 24 | config = config['variant_calling'] 25 | 26 | workflow = pypeliner.workflow.Workflow() 27 | 28 | workflow.setobj( 29 | obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'), 30 | value=list(tumour_cell_bams.keys()), 31 | ) 32 | 33 | workflow.transform( 34 | name='merge_snvs_museq', 35 | func='single_cell.utils.vcfutils.merge_vcf', 36 | args=( 37 | [mgd.InputFile(vcf_file, extensions=['.tbi','.csi']) for vcf_file in vcf_files], 38 | mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']), 39 | mgd.TempSpace("merge_vcf_temp") 40 | ), 41 | ) 42 | 43 | workflow.subworkflow( 44 | name='count_alleles', 45 | axes=('sample_id', 'library_id'), 46 | func='single_cell.workflows.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow', 47 | args=( 48 | mgd.InputFile('tumour_cells.bam', 'sample_id', 'library_id', 'cell_id', extensions=['.bai'], 49 | fnames=tumour_cell_bams, axes_origin=[]), 50 | mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']), 51 | mgd.OutputFile('counts.csv.gz', 'sample_id', 'library_id', template=counts_output_template), 52 | mgd.Instance('sample_id'), 53 | mgd.Instance('library_id'), 54 | config['memory'], 55 | ), 56 | ) 57 | 58 | workflow.transform( 59 | name='generate_meta_files_results', 60 | func='single_cell.utils.helpers.generate_and_upload_metadata', 61 | args=( 62 | sys.argv[0:], 63 | args['out_dir'], 64 | mgd.Template('counts.csv.gz', 'sample_id', 'library_id', template=counts_output_template), 65 | mgd.OutputFile(meta_yaml) 66 | ), 67 | kwargs={ 68 | 'input_yaml_data': inpututils.load_yaml(args['input_yaml']), 69 | 'input_yaml': mgd.OutputFile(input_yaml_blob), 70 | 'metadata': { 71 | 'type': 'snv_genotyping', 72 | 'counts': { 73 | 'template': counts_template, 74 | 'instances': sample_library, 75 | } 76 | } 77 | } 78 | ) 79 | 80 | return workflow 81 | 82 | 83 | def snv_genotyping_pipeline(args): 84 | pyp = pypeliner.app.Pypeline(config=args) 85 | 86 | workflow = create_variant_counting_workflow(args) 
87 | 88 | pyp.run(workflow) 89 | -------------------------------------------------------------------------------- /single_cell/split_bam.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 6, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | import os 7 | import sys 8 | 9 | import pypeliner 10 | import pypeliner.managed as mgd 11 | from single_cell.utils import inpututils 12 | from single_cell.workflows import split_bams 13 | 14 | 15 | def split_bam_workflow(args): 16 | config = inpututils.load_config(args) 17 | config = config['split_bam'] 18 | 19 | bam_file = inpututils.load_split_wgs_input(args['input_yaml']) 20 | 21 | split_bam_template = args['output_prefix'] + '{region}.bam' 22 | 23 | meta_yaml = os.path.join(args["out_dir"], 'metadata.yaml') 24 | input_yaml_blob = os.path.join(args["out_dir"], 'input.yaml') 25 | 26 | workflow = pypeliner.workflow.Workflow() 27 | 28 | workflow.transform( 29 | name="get_regions", 30 | ctx={'mem': config['memory']['low'], 'ncpus': 1}, 31 | func="single_cell.utils.pysamutils.get_regions_from_reference", 32 | ret=pypeliner.managed.OutputChunks('region'), 33 | args=( 34 | config["ref_genome"], 35 | config["split_size"], 36 | config["chromosomes"], 37 | ) 38 | ) 39 | 40 | workflow.subworkflow( 41 | name="split_normal", 42 | func=split_bams.create_split_workflow, 43 | ctx={'mem': config['memory']['low'], 'ncpus': 1}, 44 | args=( 45 | mgd.InputFile(bam_file), 46 | mgd.OutputFile( 47 | "normal.split.bam", 'region', 48 | template=split_bam_template, axes_origin=[] 49 | ), 50 | pypeliner.managed.InputChunks('region'), 51 | config, 52 | ), 53 | ) 54 | 55 | workflow.transform( 56 | name='generate_meta_files_results', 57 | func='single_cell.utils.helpers.generate_and_upload_metadata', 58 | args=( 59 | sys.argv[0:], 60 | args['output_prefix'], 61 | mgd.Template('bam_filenames', 'region', template=split_bam_template), 62 | mgd.OutputFile(meta_yaml) 63 | ), 64 | kwargs={ 65 | 'input_yaml_data': inpututils.load_yaml(args['input_yaml']), 66 | 'input_yaml': mgd.OutputFile(input_yaml_blob), 67 | 'metadata': {'type': 'wgs_regionbams'}, 68 | 'template': (mgd.InputChunks('region'), split_bam_template, 'region'), 69 | } 70 | ) 71 | 72 | return workflow 73 | 74 | 75 | def split_bam_pipeline(args): 76 | pyp = pypeliner.app.Pypeline(config=args) 77 | 78 | workflow = split_bam_workflow(args) 79 | 80 | pyp.run(workflow) 81 | -------------------------------------------------------------------------------- /single_cell/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/single_cell/tests/__init__.py -------------------------------------------------------------------------------- /single_cell/tests/codebuild/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/single_cell/tests/codebuild/__init__.py -------------------------------------------------------------------------------- /single_cell/tests/codebuild/align/align.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p ALIGN/ref_test_data 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e 
AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/alignment ALIGN/ref_test_data --recursive --quiet 13 | 14 | 15 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 16 | $1/single_cell_pipeline_alignment:$TAG \ 17 | single_cell alignment --input_yaml single_cell/tests/codebuild/align/inputs.yaml \ 18 | --library_id A97318A --maxjobs 1 --nocleanup --sentinel_only \ 19 | --submit local --loglevel DEBUG \ 20 | --tmpdir ALIGN/temp \ 21 | --pipelinedir ALIGN/pipeline \ 22 | --submit local \ 23 | --output_prefix ALIGN/output/A97318A \ 24 | --bams_dir ALIGN/bams \ 25 | --sequencing_center TEST --trim 26 | 27 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 28 | $1/single_cell_pipeline_alignment:$TAG \ 29 | python single_cell/tests/codebuild/align/test_alignment.py ALIGN/output A97318A ALIGN/ref_test_data/refdata/bwa-mem 30 | 31 | 32 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_alignment:$TAG rm -rf ALIGN 33 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/align/inputs.yaml: -------------------------------------------------------------------------------- 1 | SA1090-A96213A-R20-C28: 2 | column: 28 3 | condition: B 4 | fastqs: 5 | HHCJ7CCXY_5.HGTJJCCXY_8.HYG5LCCXY_6.HYG5LCCXY_7.HYG5LCCXY_5: 6 | fastq_1: ALIGN/ref_test_data/testdata/SA1090-A96213A-R20-C28_1.fastq.gz 7 | fastq_2: ALIGN/ref_test_data/testdata/SA1090-A96213A-R20-C28_2.fastq.gz 8 | img_col: 45 9 | index_i5: i5-20 10 | index_i7: i7-28 11 | pick_met: C1 12 | primer_i5: GTATAG 13 | primer_i7: CTATCT 14 | row: 20 15 | sample_id: SA1090 16 | library_id: A96213A 17 | is_control: True 18 | SA1090-A96213A-R20-C62: 19 | column: 62 20 | condition: B 21 | fastqs: 22 | HHCJ7CCXY_5.HGTJJCCXY_8.HYG5LCCXY_6.HYG5LCCXY_7.HYG5LCCXY_5: 23 | fastq_1: ALIGN/ref_test_data/testdata/SA1090-A96213A-R20-C62_1.fastq.gz 24 | fastq_2: ALIGN/ref_test_data/testdata/SA1090-A96213A-R20-C62_2.fastq.gz 25 | img_col: 11 26 | index_i5: i5-20 27 | index_i7: i7-62 28 | pick_met: C1 29 | primer_i5: GTATAG 30 | primer_i7: AAGCTA 31 | row: 20 32 | sample_id: SA1090 33 | library_id: A96213A 34 | is_control: False 35 | SA1090-A96213A-R22-C43: 36 | column: 43 37 | condition: B 38 | fastqs: 39 | HHCJ7CCXY_5.HGTJJCCXY_8.HYG5LCCXY_6.HYG5LCCXY_7.HYG5LCCXY_5: 40 | fastq_1: ALIGN/ref_test_data/testdata/SA1090-A96213A-R22-C43_1.fastq.gz 41 | fastq_2: ALIGN/ref_test_data/testdata/SA1090-A96213A-R22-C43_2.fastq.gz 42 | img_col: 30 43 | index_i5: i5-22 44 | index_i7: i7-43 45 | pick_met: C2 46 | primer_i5: GCTGTA 47 | primer_i7: ATTCCG 48 | row: 22 49 | sample_id: SA1090 50 | library_id: A96213A 51 | is_control: False 52 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/align/test_alignment.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | 5 | def get_inputs(path, library_id): 6 | ''' 7 | get metrics and gc metrics given a directory and library 8 | :param path: path to metrics files 9 | :param library_id: library id associated with metrics files 10 | ''' 11 | metrics = os.path.join(path, library_id) 12 | metrics += "_alignment_metrics.csv.gz" 13 | 14 | gc_metrics = os.path.join(path, library_id) 15 | gc_metrics += "_gc_metrics.csv.gz" 16 | 17 | return metrics, gc_metrics 18 | 19 | def compare_alignment(ref_metrics, 
metrics, 20 | ref_gc_metrics, gc_metrics): 21 | 22 | compare.compare_metrics(ref_metrics, metrics) 23 | compare.compare_metrics(ref_gc_metrics, gc_metrics) 24 | 25 | if __name__ == "__main__": 26 | 27 | output_path = sys.argv[1] 28 | output_lib = sys.argv[2] 29 | 30 | ref_path = sys.argv[3] 31 | ref_lib = "A97318A" 32 | 33 | ref_metrics, ref_gc_metrics = get_inputs(ref_path, "A97318A") 34 | metrics, gc_metrics = get_inputs(output_path, output_lib) 35 | 36 | compare_alignment(ref_metrics, metrics, 37 | ref_gc_metrics, gc_metrics) 38 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/annotation/annotation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | NUMCORES=`nproc --all` 7 | 8 | mkdir -p ANNOTATION/ref_test_data 9 | 10 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 11 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/annotation ANNOTATION/ref_test_data --recursive --quiet 12 | 13 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 14 | $1/single_cell_pipeline_annotation:$TAG \ 15 | single_cell annotation --input_yaml single_cell/tests/codebuild/annotation/inputs.yaml \ 16 | --library_id A97318A --maxjobs $NUMCORES --nocleanup --sentinel_only \ 17 | --submit local --loglevel DEBUG \ 18 | --tmpdir ANNOTATION/temp \ 19 | --pipelinedir ANNOTATION/pipeline \ 20 | --submit local \ 21 | --output_prefix ANNOTATION/output/A97318A \ 22 | --config_override '{"annotation": {"chromosomes": ["6", "8", "17"]}}' \ 23 | --no_corrupt_tree 24 | 25 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 26 | $1/single_cell_pipeline_annotation:$TAG \ 27 | python single_cell/tests/codebuild/annotation/test_annotation.py ANNOTATION/output A97318A ANNOTATION/ref_test_data/refdata 28 | 29 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_annotation:$TAG rm -rf ANNOTATION 30 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/annotation/inputs.yaml: -------------------------------------------------------------------------------- 1 | hmmcopy_metrics: ANNOTATION/ref_test_data/testdata/A96213A_hmmcopy_metrics.csv.gz 2 | hmmcopy_reads: ANNOTATION/ref_test_data/testdata/A96213A_reads.csv.gz 3 | alignment_metrics: ANNOTATION/ref_test_data/testdata/A96213A_alignment_metrics.csv.gz 4 | gc_metrics: ANNOTATION/ref_test_data/testdata/A96213A_gc_metrics.csv.gz 5 | segs_pdf_tar: ANNOTATION/ref_test_data/testdata/A96213A_segs.tar.gz 6 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/annotation/test_annotation.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | 5 | 6 | def get_inputs(path, library_id): 7 | ''' 8 | get metrics and gc metrics given a directory and library 9 | :param path: path to metrics files 10 | :param library_id: library id associated with metrics files 11 | ''' 12 | metrics = os.path.join(path, library_id) 13 | metrics += "_metrics.csv.gz" 14 | 15 | return metrics 16 | 17 | if __name__ == "__main__": 18 | output_path = sys.argv[1] 19 | output_lib = sys.argv[2] 20 | 21 | ref_path = sys.argv[3] 22 | ref_lib = "A97318A" 23 | 24 | ref_metrics = 
get_inputs(ref_path, "A97318A") 25 | metrics = get_inputs(output_path, output_lib) 26 | 27 | compare.compare_annotation_metrics(ref_metrics, metrics) 28 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/breakpoint_calling/breakpoint_calling.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p BREAKPOINT_CALLING/ref_test_data 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/breakpoint-calling BREAKPOINT_CALLING/ref_test_data --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_breakpoint:$TAG \ 16 | single_cell breakpoint_calling \ 17 | --input_yaml single_cell/tests/codebuild/breakpoint_calling/inputs.yaml \ 18 | --maxjobs $NUMCORES \ 19 | --nocleanup \ 20 | --sentinel_only \ 21 | --submit local \ 22 | --loglevel DEBUG \ 23 | --tmpdir BREAKPOINT_CALLING/temp \ 24 | --pipelinedir BREAKPOINT_CALLING/pipeline \ 25 | --submit local \ 26 | --output_prefix BREAKPOINT_CALLING/output/ \ 27 | --config_override '{"variant_calling": {"chromosomes": ["6", "8", "17"]}}' 28 | 29 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 30 | $1/single_cell_pipeline_breakpoint:$TAG \ 31 | python single_cell/tests/codebuild/breakpoint_calling/test_breakpoint_calling.py BREAKPOINT_CALLING/output BREAKPOINT_CALLING/ref_test_data/refdata 32 | 33 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_breakpoint:$TAG rm -rf BREAKPOINT_CALLING 34 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/breakpoint_calling/test_breakpoint_calling.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | from single_cell.utils import csvutils 5 | 6 | def get_inputs(path): 7 | """" 8 | get metrics and gc metrics given a directory and library 9 | :param path: path to metrics files 10 | """ 11 | 12 | must_exist = ["destruct_breakpoints_library.csv.gz", 13 | "destruct_breakpoints_library.csv.gz.yaml", 14 | "destruct_cell_counts.csv.gz", 15 | "destruct_cell_counts.csv.gz.yaml", 16 | "input.yaml", 17 | "lumpy_breakpoints.bed", 18 | "lumpy_breakpoints_evidence.csv.gz", 19 | "lumpy_breakpoints_evidence.csv.gz.yaml", 20 | "metadata.yaml"] 21 | 22 | lumpy_breakpoints = os.path.join(path, "lumpy_breakpoints.csv.gz") 23 | destruct_breakpoints = os.path.join(path, "destruct_breakpoints.csv.gz") 24 | 25 | must_exist = [os.path.join(path, f) for f in must_exist] 26 | 27 | return must_exist, lumpy_breakpoints, destruct_breakpoints 28 | 29 | 30 | def test_breakpoint_calling(args): 31 | output_path = args[1] 32 | ref_path = args[2] 33 | 34 | ref_must_exist, ref_lumpy, ref_destruct = get_inputs(ref_path) 35 | must_exist, lumpy, destruct = get_inputs(output_path) 36 | 37 | assert all(map(os.path.exists, ref_must_exist)) 38 | assert all(map(os.path.exists, must_exist)) 39 | 40 | compare.compare_breakpoint_calls(ref_lumpy, lumpy) 41 | 42 | ref_destruct = csvutils.read_csv_and_yaml(ref_destruct) 43 | destruct = csvutils.read_csv_and_yaml(destruct) 44 | 45 | assert 
ref_destruct.empty and destruct.empty 46 | 47 | if __name__ == "__main__": 48 | test_breakpoint_calling(sys.argv) 49 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/cohort_qc/cohort_qc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p COHORT_QC/testdata 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/cohort-qc-2 COHORT_QC/testdata --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_qc:$TAG \ 16 | single_cell cohort_qc --input_yaml single_cell/tests/codebuild/cohort_qc/inputs.yaml \ 17 | --maxjobs $NUMCORES --nocleanup --sentinel_only \ 18 | --loglevel DEBUG \ 19 | --tmpdir COHORT_QC/temp \ 20 | --pipelinedir COHORT_QC/pipeline \ 21 | --submit local \ 22 | --output_prefix COHORT_QC/output \ 23 | --config_override '{"refdir":"/refdata"}' \ 24 | --API_key $ONCOKB_KEY 25 | 26 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_qc:$TAG rm -rf COHORT_QC 27 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/cohort_qc/inputs.yaml: -------------------------------------------------------------------------------- 1 | SIGNATURES: 2 | DG1134: 3 | libdata: 4 | A96168B: 5 | hmmcopy_reads: COHORT_QC/testdata/A96168B_reads.csv.gz 6 | hmmcopy_metrics: COHORT_QC/testdata/A96168B_hmmcopy_metrics.csv.gz 7 | museq: COHORT_QC/testdata/museq.vcf.gz 8 | strelka_snv: COHORT_QC/testdata/strelka_s.vcf.gz 9 | strelka_indel: COHORT_QC/testdata/strelka_i.vcf.gz 10 | A96168C: 11 | hmmcopy_reads: COHORT_QC/testdata/A96168B_reads.csv.gz 12 | hmmcopy_metrics: COHORT_QC/testdata/A96168B_hmmcopy_metrics.csv.gz 13 | museq: COHORT_QC/testdata/museq.vcf.gz 14 | strelka_snv: COHORT_QC/testdata/strelka_s.vcf.gz 15 | strelka_indel: COHORT_QC/testdata/strelka_i.vcf.gz 16 | germline_maf: COHORT_QC/testdata/germline_small.maf 17 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/count_haps/count_haps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | NUMCORES=`nproc --all` 7 | 8 | mkdir -p COUNT_HAPS/ref_test_data 9 | 10 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 11 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/count-haps-new COUNT_HAPS/ref_test_data --recursive --quiet 12 | 13 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 14 | $1/single_cell_pipeline_haplotypes:$TAG \ 15 | single_cell count_haps \ 16 | --input_yaml single_cell/tests/codebuild/count_haps/inputs.yaml \ 17 | --maxjobs $NUMCORES \ 18 | --nocleanup \ 19 | --sentinel_only \ 20 | --submit local \ 21 | --loglevel DEBUG \ 22 | --tmpdir COUNT_HAPS/temp \ 23 | --config_override '{"count_haps":{"chromosomes":["15"], "extract_seqdata": {"genome_fai_template": "/refdata/human/infer_haps/GRCh37-lite.fa.fai", "genome_fasta_template": "/refdata/human/infer_haps/GRCh37-lite.fa"}, "ref_data_dir": 
"/refdata/human/infer_haps/"}}' \ 24 | --pipelinedir COUNT_HAPS/pipeline \ 25 | --submit local \ 26 | --output_prefix COUNT_HAPS/output/ 27 | 28 | 29 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 30 | $1/single_cell_pipeline_haplotypes:$TAG \ 31 | python single_cell/tests/codebuild/count_haps/test_count_haps.py COUNT_HAPS/output COUNT_HAPS/ref_test_data 32 | 33 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_haplotypes:$TAG rm -rf COUNT_HAPS 34 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/count_haps/inputs.yaml: -------------------------------------------------------------------------------- 1 | haplotypes: COUNT_HAPS/ref_test_data/haps.csv.gz 2 | tumour: 3 | SA607_3X10XB02284-A108843A-R03-C03: 4 | bam: COUNT_HAPS/ref_test_data/SA607_3X10XB02284-A108843A-R03-C03.bam 5 | SA607_3X10XB02284-A108843A-R03-C10: 6 | bam: COUNT_HAPS/ref_test_data/SA607_3X10XB02284-A108843A-R03-C10.bam 7 | SA607_3X10XB02284-A108843A-R03-C08: 8 | bam: COUNT_HAPS/ref_test_data/SA607_3X10XB02284-A108843A-R03-C08.bam 9 | SA607_3X10XB02284-A108843A-R03-C09: 10 | bam: COUNT_HAPS/ref_test_data/SA607_3X10XB02284-A108843A-R03-C09.bam -------------------------------------------------------------------------------- /single_cell/tests/codebuild/count_haps/test_count_haps.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | 5 | def compare_count_haps(): 6 | output_path = sys.argv[1] 7 | ref_path = sys.argv[2] 8 | 9 | refhaps = os.path.join(ref_path, "allele_counts_ref.csv.gz") 10 | haps = os.path.join(output_path, "allele_counts.csv.gz") 11 | 12 | compare.compare_count_haps(haps, refhaps) 13 | 14 | if __name__ == "__main__": 15 | compare_count_haps() 16 | 17 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/hmmcopy/hmmcopy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p HMMCOPY/ref_test_data 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/hmmcopy HMMCOPY/ref_test_data --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_hmmcopy:$TAG \ 16 | single_cell hmmcopy \ 17 | --input_yaml single_cell/tests/codebuild/hmmcopy/inputs.yaml \ 18 | --library_id A97318A \ 19 | --maxjobs $NUMCORES \ 20 | --nocleanup \ 21 | --sentinel_only \ 22 | --submit local \ 23 | --loglevel DEBUG \ 24 | --config_override '{"hmmcopy": {"chromosomes": ["6", "8", "17"]}}' \ 25 | --tmpdir HMMCOPY/temp \ 26 | --pipelinedir HMMCOPY/pipeline \ 27 | --submit local \ 28 | --output_prefix HMMCOPY/output/A97318A 29 | 30 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 31 | $1/single_cell_pipeline_hmmcopy:$TAG \ 32 | python single_cell/tests/codebuild/hmmcopy/test_hmmcopy.py HMMCOPY/output A97318A HMMCOPY/ref_test_data/refdata 33 | 34 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_hmmcopy:$TAG rm -rf HMMCOPY 35 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/hmmcopy/inputs.yaml: 
-------------------------------------------------------------------------------- 1 | SA1090-A96213A-R20-C28: 2 | bam: HMMCOPY/ref_test_data/testdata/SA1090-A96213A-R20-C28.bam 3 | column: 28 4 | condition: B 5 | img_col: 45 6 | index_i5: i5-20 7 | index_i7: i7-28 8 | pick_met: C1 9 | primer_i5: GTATAG 10 | primer_i7: CTATCT 11 | row: 20 12 | sample_id: SA1090 13 | library_id: A96213A 14 | is_control: True 15 | SA1090-A96213A-R20-C62: 16 | bam: HMMCOPY/ref_test_data/testdata/SA1090-A96213A-R20-C62.bam 17 | column: 62 18 | condition: B 19 | img_col: 11 20 | index_i5: i5-20 21 | index_i7: i7-62 22 | pick_met: C1 23 | primer_i5: GTATAG 24 | primer_i7: AAGCTA 25 | row: 20 26 | sample_id: SA1090 27 | library_id: A96213A 28 | is_control: False 29 | SA1090-A96213A-R22-C43: 30 | bam: HMMCOPY/ref_test_data/testdata/SA1090-A96213A-R22-C43.bam 31 | column: 43 32 | condition: B 33 | img_col: 30 34 | index_i5: i5-22 35 | index_i7: i7-43 36 | pick_met: C2 37 | primer_i5: GCTGTA 38 | primer_i7: ATTCCG 39 | row: 22 40 | sample_id: SA1090 41 | library_id: A96213A 42 | is_control: False 43 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/hmmcopy/test_hmmcopy.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | 5 | 6 | def get_inputs(path, library_id): 7 | ''' 8 | get metrics and gc metrics given a directory and library 9 | :param path: path to metrics files 10 | :param library_id: library id associated with metrics files 11 | ''' 12 | metrics = os.path.join(path, library_id) 13 | metrics += "_hmmcopy_metrics.csv.gz" 14 | 15 | reads = os.path.join(path, library_id) 16 | reads += "_reads.csv.gz" 17 | 18 | return metrics, reads 19 | 20 | 21 | if __name__ == "__main__": 22 | output_path = sys.argv[1] 23 | output_lib = sys.argv[2] 24 | 25 | ref_path = sys.argv[3] 26 | ref_lib = "A97318A" 27 | 28 | ref_metrics, ref_reads = get_inputs(ref_path, "A97318A") 29 | metrics, reads = get_inputs(output_path, output_lib) 30 | 31 | compare.compare_metrics(ref_metrics, metrics) 32 | compare.compare_reads(ref_reads, reads) 33 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/infer_haps/infer_haps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p INFER_HAPS/ref_test_data 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/infer-haps INFER_HAPS/ref_test_data/ --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_haplotypes:$TAG \ 16 | single_cell infer_haps --input_yaml single_cell/tests/codebuild/infer_haps/inputs.yaml \ 17 | --maxjobs $NUMCORES --nocleanup --sentinel_only \ 18 | --submit local --loglevel DEBUG \ 19 | --tmpdir INFER_HAPS/temp \ 20 | --pipelinedir INFER_HAPS/pipeline \ 21 | --submit local \ 22 | --output_prefix INFER_HAPS/output/ \ 23 | --config_override '{"infer_haps":{"chromosomes":["15"], "ref_data_dir": "/refdata/human/infer_haps/"}}' \ 24 | 25 | 26 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 27 | 
$1/single_cell_pipeline_haplotypes:$TAG \ 28 | python single_cell/tests/codebuild/infer_haps/test_infer_haps.py INFER_HAPS/output INFER_HAPS/ref_test_data 29 | 30 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_haplotypes:$TAG rm -rf INFER_HAPS 31 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/infer_haps/inputs.yaml: -------------------------------------------------------------------------------- 1 | normal: 2 | bam: INFER_HAPS/ref_test_data/HCC1395BL_chr15.bam 3 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/infer_haps/test_infer_haps.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | 4 | from single_cell.tests.codebuild import compare 5 | 6 | 7 | def compare_infer_haps(): 8 | output_path = sys.argv[1] 9 | ref_path = sys.argv[2] 10 | 11 | refhaps = os.path.join(ref_path, "ref_haplotypes.csv.gz") 12 | haps = os.path.join(output_path, "haplotypes.csv.gz") 13 | 14 | compare.compare_infer_haps(haps, refhaps) 15 | 16 | 17 | if __name__ == "__main__": 18 | compare_infer_haps() 19 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/merge_cell_bams/merge_cell_bams.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | NUMCORES=`nproc --all` 7 | 8 | mkdir -p MERGE_CELL_BAMS/ref_test_data 9 | 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/merge-bams MERGE_CELL_BAMS/ref_test_data --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_alignment:$TAG \ 16 | single_cell merge_cell_bams \ 17 | --input_yaml single_cell/tests/codebuild/merge_cell_bams/inputs.yaml \ 18 | --maxjobs $NUMCORES --nocleanup --sentinel_only \ 19 | --submit local --loglevel DEBUG \ 20 | --tmpdir MERGE_CELL_BAMS/temp \ 21 | --pipelinedir MERGE_CELL_BAMS/pipeline \ 22 | --submit local \ 23 | --output_prefix MERGE_CELL_BAMS/output/ --config_override '{"merge_bams": {"chromosomes": ["6", "8", "17"]}}' 24 | 25 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 26 | $1/single_cell_pipeline_alignment:$TAG \ 27 | python single_cell/tests/codebuild/merge_cell_bams/test_merge_cell_bams.py MERGE_CELL_BAMS/output MERGE_CELL_BAMS/ref_test_data/refdata 28 | 29 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_alignment:$TAG rm -rf MERGE_CELL_BAMS 30 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/merge_cell_bams/test_merge_cell_bams.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | import pandas as pd 5 | import pysam 6 | 7 | def get_merged_counts(path): 8 | bam_fnames = [os.path.join(path, file) for file in os.listdir(path) if file.endswith(".bam")] 9 | bams = [pysam.AlignmentFile(bam, "rb") for bam in bam_fnames] 10 | 11 | regions = [os.path.basename(fname).split(".")[0] for fname in bam_fnames] 12 | mapped = [bam.mapped for bam in bams] 13 | unmapped = [bam.unmapped for bam in bams] 14 | return 
pd.DataFrame({"interval":regions, "mapped": mapped, "unmapped": unmapped}) 15 | 16 | def compare_merge_counts(): 17 | output_path = sys.argv[1] 18 | ref_path = sys.argv[2] 19 | 20 | refcounts = os.path.join(ref_path, "counts.csv") 21 | 22 | counts = get_merged_counts(output_path) 23 | refcounts = pd.read_csv(refcounts) 24 | 25 | counts = counts.sort_values("interval", ascending=True) 26 | counts = counts.set_index("interval") 27 | 28 | refcounts = refcounts.sort_values("interval", ascending=True) 29 | refcounts = refcounts.set_index("interval") 30 | 31 | compare.compare_tables(counts, refcounts) 32 | 33 | 34 | if __name__ == "__main__": 35 | compare_merge_counts() 36 | 37 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/preflight/preflight.sh: -------------------------------------------------------------------------------- 1 | CURR_HEAD=$(git rev-parse $(git rev-parse --abbrev-ref HEAD)) 2 | TAG=$(git describe --tags $(git rev-list --tags --max-count=1)) 3 | TAG_HEAD=$(git rev-parse $TAG^{commit}) 4 | 5 | if test $CURR_HEAD != $TAG_HEAD; then 6 | echo "Branch is not tagged" 7 | exit -1 8 | fi 9 | 10 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/pseudo_bulk_qc/inputs.yaml: -------------------------------------------------------------------------------- 1 | PT1: 2 | '271592': 3 | '11574': 4 | alignment_metrics: PSEUDO_BULK_QC/ref_test_data/11574_alignment_metrics.csv.gz 5 | annotation_metrics: PSEUDO_BULK_QC/ref_test_data/11574_metrics.csv.gz 6 | cosmic_status: PSEUDO_BULK_QC/ref_test_data/snv_cosmic_status.csv.gz 7 | counts: PSEUDO_BULK_QC/ref_test_data/271592_11574_counts.csv.gz 8 | dbsnp_status: PSEUDO_BULK_QC/ref_test_data/snv_dbsnp_status.csv.gz 9 | destruct_breakpoint_annotation: PSEUDO_BULK_QC/ref_test_data/destruct_breakpoints.csv.gz 10 | destruct_breakpoint_counts: PSEUDO_BULK_QC/ref_test_data/destruct_cell_counts.csv.gz 11 | gc_metrics: PSEUDO_BULK_QC/ref_test_data/11574_gc_metrics.csv.gz 12 | haplotype_allele_data: PSEUDO_BULK_QC/ref_test_data/allele_counts.csv.gz 13 | hmmcopy_metrics: PSEUDO_BULK_QC/ref_test_data/11574_hmmcopy_metrics.csv.gz 14 | hmmcopy_reads: PSEUDO_BULK_QC/ref_test_data/11574_reads.csv.gz 15 | hmmcopy_segs: PSEUDO_BULK_QC/ref_test_data/11574_segments.csv.gz 16 | indel_file: PSEUDO_BULK_QC/ref_test_data/strelka_indel.vcf.gz 17 | isabl_id: '271592' 18 | lumpy_breakpoint_annotation: PSEUDO_BULK_QC/ref_test_data/lumpy_breakpoints.csv.gz 19 | lumpy_breakpoint_evidence: PSEUDO_BULK_QC/ref_test_data/lumpy_breakpoints_evidence.csv.gz 20 | mappability: PSEUDO_BULK_QC/ref_test_data/snv_mappability.csv.gz 21 | museq: PSEUDO_BULK_QC/ref_test_data/snv_museq.csv.gz 22 | snpeff: PSEUDO_BULK_QC/ref_test_data/snv_snpeff.csv.gz 23 | strelka: PSEUDO_BULK_QC/ref_test_data/snv_strelka.csv.gz 24 | trinuc: PSEUDO_BULK_QC/ref_test_data/snv_trinuc.csv.gz 25 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/pseudo_bulk_qc/pseudo_bulk_qc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p PSEUDO_BULK_QC/ref_test_data 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp 
s3://singlecelltestsets/TESTDATA_CODEBUILD/sample_qc PSEUDO_BULK_QC/ref_test_data --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_qc:$TAG \ 16 | single_cell sample_qc --input_yaml single_cell/tests/codebuild/pseudo_bulk_qc/inputs.yaml \ 17 | --maxjobs $NUMCORES --nocleanup --sentinel_only \ 18 | --submit local --loglevel DEBUG \ 19 | --tmpdir PSEUDO_BULK_QC/temp \ 20 | --pipelinedir PSEUDO_BULK_QC/pipeline \ 21 | --submit local \ 22 | --output_prefix PSEUDO_BULK_QC/output \ 23 | --config_override '{"annotation": {"chromosomes": ["6", "8", "17"]}}' \ 24 | 25 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_qc:$TAG rm -rf PSEUDO_BULK_QC 26 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/refdata/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -d "/refdata" ]; then 4 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v /refdata:/refdata quay.io/singlecellpipeline/awscli:v0.0.1 aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/refdata /refdata --recursive --quiet 5 | fi 6 | 7 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/snv_genotyping/inputs.yaml: -------------------------------------------------------------------------------- 1 | vcf_files: 2 | - SNV_GENOTYPING/testdata/vcf/museq.vcf.gz 3 | - SNV_GENOTYPING/testdata/vcf/strelka_snv.vcf.gz 4 | tumour_cells: 5 | SA1090: 6 | A96213A: 7 | SA1090-A96213A-R20-C28: 8 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R20-C28.bam 9 | SA1090-A96213A-R22-C43: 10 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R22-C43.bam 11 | SA1090-A96213A-R22-C44: 12 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R22-C44.bam 13 | SA1090-A96213A-R24-C12: 14 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R24-C12.bam 15 | SA1090-A96213A-R24-C20: 16 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R24-C20.bam 17 | SA1090-A96213A-R24-C58: 18 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R24-C58.bam 19 | SA1090-A96213A-R25-C14: 20 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R25-C14.bam 21 | SA1090-A96213A-R25-C22: 22 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R25-C22.bam 23 | SA1090-A96213A-R25-C40: 24 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R25-C40.bam 25 | SA1090-A96213A-R25-C64: 26 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R25-C64.bam 27 | SA1090-A96213A-R26-C49: 28 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R26-C49.bam 29 | SA1090-A96213A-R26-C50: 30 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R26-C50.bam 31 | SA1090-A96213A-R26-C64: 32 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R26-C64.bam 33 | SA1090-A96213A-R27-C14: 34 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R27-C14.bam 35 | SA1090-A96213A-R27-C21: 36 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R27-C21.bam 37 | SA1090-A96213A-R27-C45: 38 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R27-C45.bam 39 | SA1090-A96213A-R28-C23: 40 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R28-C23.bam 41 | SA1090-A96213A-R28-C39: 42 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R28-C39.bam 43 | SA1090-A96213A-R28-C64: 44 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R28-C64.bam 
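
The inputs.yaml above shows the nested sample -> library -> cell_id -> bam layout (plus a flat vcf_files list) that the snv_genotyping subcommand consumes and that validate_snv_genotyping, further down in this listing, checks. For larger test sets the file can be generated rather than hand-written; a minimal sketch, assuming a flat directory of per-cell BAMs named <cell_id>.bam (the helper name and directory layout are illustrative, not part of the pipeline):

    import os
    import yaml

    def build_snv_genotyping_inputs(bam_dir, sample_id, library_id, vcf_files, out_yaml):
        # collect every <cell_id>.bam under bam_dir into the nested cell mapping
        cells = {}
        for fname in sorted(os.listdir(bam_dir)):
            if fname.endswith('.bam'):
                cell_id = fname[:-len('.bam')]
                cells[cell_id] = {'bam': os.path.join(bam_dir, fname)}

        data = {
            'vcf_files': list(vcf_files),
            'tumour_cells': {sample_id: {library_id: cells}},
        }

        with open(out_yaml, 'wt') as handle:
            yaml.safe_dump(data, handle, default_flow_style=False)

    build_snv_genotyping_inputs(
        'SNV_GENOTYPING/testdata/cell_bams', 'SA1090', 'A96213A',
        ['SNV_GENOTYPING/testdata/vcf/museq.vcf.gz',
         'SNV_GENOTYPING/testdata/vcf/strelka_snv.vcf.gz'],
        'inputs.yaml')
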
-------------------------------------------------------------------------------- /single_cell/tests/codebuild/snv_genotyping/snv_genotyping.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p SNV_GENOTYPING/testdata 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/snv_genotyping SNV_GENOTYPING/testdata/ --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_variant:$TAG \ 16 | single_cell snv_genotyping --input_yaml single_cell/tests/codebuild/snv_genotyping/inputs.yaml \ 17 | --maxjobs $NUMCORES --nocleanup --sentinel_only \ 18 | --submit local --loglevel DEBUG \ 19 | --tmpdir SNV_GENOTYPING/temp \ 20 | --pipelinedir SNV_GENOTYPING/pipeline --submit local --output_prefix SNV_GENOTYPING/output \ 21 | --config_override '{"variant_calling": {"chromosomes": ["6", "8", "17"]}, "version": '\"$TAG\"'}' 22 | 23 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_variant:$TAG rm -rf SNV_GENOTYPING 24 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/split_wgs_bam/inputs.yaml: -------------------------------------------------------------------------------- 1 | normal: 2 | bam: SPLIT_WGS_BAM/ref_test_data/DAH370N_A41086.bam 3 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/split_wgs_bam/split_wgs_bam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p SPLIT_WGS_BAM/ref_test_data 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/split-bam SPLIT_WGS_BAM/ref_test_data --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_alignment:$TAG \ 16 | single_cell split_wgs_bam --input_yaml single_cell/tests/codebuild/split_wgs_bam/inputs.yaml \ 17 | --maxjobs $NUMCORES --nocleanup --sentinel_only \ 18 | --submit local --loglevel DEBUG \ 19 | --tmpdir SPLIT_WGS_BAM/temp \ 20 | --pipelinedir SPLIT_WGS_BAM/pipeline \ 21 | --submit local \ 22 | --output_prefix SPLIT_WGS_BAM/output/ --config_override '{"split_bam": {"chromosomes": ["6", "8", "17"]}}' 23 | 24 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 25 | $1/single_cell_pipeline_alignment:$TAG \ 26 | python single_cell/tests/codebuild/split_wgs_bam/test_split_wgs_bam.py SPLIT_WGS_BAM/output SPLIT_WGS_BAM/ref_test_data/refdata 27 | 28 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_alignment:$TAG rm -rf SPLIT_WGS_BAM 29 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/split_wgs_bam/test_split_wgs_bam.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | import pandas as pd 5 | import pysam 6 | 
7 | def get_merged_counts(path):
8 |     bam_fnames = [os.path.join(path, file) for file in os.listdir(path) if file.endswith(".bam")]
9 |     bams = [pysam.AlignmentFile(bam, "rb") for bam in bam_fnames]
10 | 
11 |     regions = [os.path.basename(fname).split(".")[0] for fname in bam_fnames]
12 |     mapped = [bam.mapped for bam in bams]
13 |     unmapped = [bam.unmapped for bam in bams]
14 |     return pd.DataFrame({"interval":regions, "mapped": mapped, "unmapped": unmapped})
15 | 
16 | def compare_merge_counts():
17 |     output_path = sys.argv[1]
18 |     ref_path = sys.argv[2]
19 | 
20 |     refcounts = os.path.join(ref_path, "counts.csv")
21 | 
22 |     counts = get_merged_counts(output_path)
23 |     refcounts = pd.read_csv(refcounts)
24 | 
25 |     counts = counts.sort_values("interval", ascending=True)
26 |     counts = counts.set_index("interval")
27 | 
28 |     refcounts = refcounts.sort_values("interval", ascending=True)
29 |     refcounts = refcounts.set_index("interval")
30 | 
31 |     compare.compare_tables(counts, refcounts)
32 | 
33 | 
34 | if __name__ == "__main__":
35 |     compare_merge_counts()
36 | 
37 | 
--------------------------------------------------------------------------------
/single_cell/tests/codebuild/variant_calling/test_variant_calling.py:
--------------------------------------------------------------------------------
1 | import os.path
2 | import sys
3 | from single_cell.tests.codebuild import compare
4 | from single_cell.utils import csvutils
5 | 
6 | def get_inputs(path):
7 |     """
8 |     get paths to the strelka, museq and snpeff variant tables given a directory
9 |     :param path: path to the variant calling output files
10 |     """
11 |     strelka = os.path.join(path, "snv_strelka.csv.gz")
12 |     museq = os.path.join(path, "snv_museq.csv.gz")
13 |     snpeff = os.path.join(path, "snv_snpeff.csv.gz")
14 | 
15 |     return strelka, museq, snpeff
16 | 
17 | 
18 | def test_variant_calling(args):
19 |     output_path = args[1]
20 |     ref_path = args[2]
21 | 
22 |     ref_strelka, ref_museq, ref_snpeff = get_inputs(ref_path)
23 |     strelka, museq, snpeff = get_inputs(output_path)
24 | 
25 |     compare.compare_variant_calls(ref_snpeff, snpeff)
26 | 
27 |     ref_strelka = csvutils.read_csv_and_yaml(ref_strelka)
28 |     strelka = csvutils.read_csv_and_yaml(strelka)
29 | 
30 |     assert ref_strelka.empty and strelka.empty
31 | 
32 | if __name__ == "__main__":
33 |     test_variant_calling(sys.argv)
34 | 
--------------------------------------------------------------------------------
/single_cell/tests/codebuild/variant_calling/variant_calling.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -o pipefail
4 | 
5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)`
6 | NUMCORES=`nproc --all`
7 | 
8 | mkdir -p VARIANT_CALLING/ref_test_data
9 | 
10 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \
11 |     aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/variant-calling VARIANT_CALLING/ref_test_data/ --recursive --quiet
12 | 
13 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \
14 |     $1/single_cell_pipeline_variant:$TAG \
15 |     single_cell variant_calling --input_yaml single_cell/tests/codebuild/variant_calling/inputs.yaml \
16 |     --maxjobs $NUMCORES --nocleanup --sentinel_only \
17 |     --submit local --loglevel DEBUG \
18 |     --tmpdir VARIANT_CALLING/temp \
19 |     --pipelinedir VARIANT_CALLING/pipeline --submit local --output_prefix VARIANT_CALLING/output/ \
20 |     --config_override '{"variant_calling": {"chromosomes": ["6", "8", "17"]}, "version": '\"$TAG\"'}'
21 | 
22 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \
23 |     $1/single_cell_pipeline_variant:$TAG \
24 |     python single_cell/tests/codebuild/variant_calling/test_variant_calling.py VARIANT_CALLING/output VARIANT_CALLING/ref_test_data/refdata
25 | 
26 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_variant:$TAG rm -rf VARIANT_CALLING
27 | 
--------------------------------------------------------------------------------
/single_cell/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/single_cell/utils/gatkutils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Feb 19, 2018
3 | 
4 | @author: dgrewal
5 | '''
6 | import os
7 | import pypeliner
8 | 
9 | def generate_targets(input_bams, config, intervals, interval, **kwargs):
10 |     # generate positions
11 |     cmd = ['gatk', '-Xmx8G',
12 |            '-T', 'RealignerTargetCreator',
13 |            '-R', config['ref_genome'],
14 |            '-o', intervals, '-L', interval,
15 |            ]
16 | 
17 |     for _, bamfile in input_bams.items():
18 |         cmd.extend(['-I', bamfile])
19 | 
20 |     pypeliner.commandline.execute(*cmd, **kwargs)
21 | 
22 | 
23 | def gatk_realigner(inputs, config, targets, interval, tempdir, **kwargs):
24 | 
25 | 
26 |     targets = os.path.abspath(targets)
27 |     cmd = ['gatk', '-Xmx8G',
28 |            '-T', 'IndelRealigner',
29 |            '-R', config['ref_genome'],
30 |            '-targetIntervals', targets,
31 |            '--nWayOut', '_indel_realigned.bam', '-L', interval,
32 |            '--maxReadsForRealignment','150000'
33 |            ]
34 | 
35 |     for _, bamfile in inputs.items():
36 |         bamfile = os.path.abspath(bamfile)
37 |         cmd.extend(['-I', bamfile])
38 | 
39 | 
40 |     cwd = os.getcwd()
41 |     os.chdir(tempdir)
42 | 
43 |     pypeliner.commandline.execute(*cmd, **kwargs)
44 | 
45 |     os.chdir(cwd)
--------------------------------------------------------------------------------
/single_cell/utils/ltmutils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on July 31, 2018
3 | 
4 | @author: pwalters
5 | '''
6 | 
7 | import logging
8 | 
9 | import pandas as pd
10 | 
11 | 
12 | def read_input_file(input_file):
13 |     inputs = pd.read_csv(input_file, dtype=str)
14 | 
15 |     for column in ('timepoint', 'hmmcopy',):
16 |         if column not in inputs.columns:
17 |             raise Exception(
18 |                 'input_csv should contain {}'.format(column))
19 | 
20 |     timepoints = list(sorted(inputs['timepoint'].unique()))
21 | 
22 |     if inputs.duplicated(['timepoint']).any():
23 |         raise Exception('duplicate timepoints in input_csv')
24 | 
25 |     hmmcopy = dict()
26 |     for _, row in inputs.iterrows():
27 |         hmmcopy[row['timepoint']] = row['hmmcopy'].strip()
28 | 
29 |     return hmmcopy, timepoints
30 | 
31 | 
32 | def get_cn_matrix_from_hdf(hmmcopy_hdf_file, ploidy='0'):
33 |     df = pd.read_hdf(hmmcopy_hdf_file, '/hmmcopy/reads/' + ploidy)
34 | 
35 |     df["bin"] = list(zip(df.chr, df.start, df.end))
36 |     df = df.pivot(index='cell_id', columns='bin', values='state')
37 |     chromosomes = [str(a) for a in range(1, 23)] + ['X', 'Y']
38 |     bins = pd.DataFrame(df.columns.values.tolist(),
39 |                         columns=['chr', 'start', 'end'])
40 |     bins["chr"] = pd.Categorical(bins["chr"], chromosomes)
41 |     bins = bins.sort_values(['start', ])
42 |     bins = [tuple(v) for v in bins.values.tolist()]
43 |     df = df.sort_values(bins, axis=0).T
44 | 
45 |     dropped_cells = df.columns[df.isna().all()].tolist()
46 | 
47 |     if len(dropped_cells) != 0:
48 |         logging.getLogger("single_cell.helpers.ltmutils").warn(
49 |             'Dropping 
{} cells: {}'.format(len(dropped_cells), dropped_cells) 50 | ) 51 | 52 | df = df.loc[:, ~df.isna().all()].astype(int) 53 | df.columns = df.columns.astype(str) 54 | df = df.reset_index() 55 | 56 | chrom = [] 57 | start = [] 58 | end = [] 59 | width = [] 60 | for i, b in df['bin'].items(): 61 | chrom.append(b[0]) 62 | start.append(b[1]) 63 | end.append(b[2]) 64 | width.append(b[2] - b[1] + 1) 65 | df['chr'] = chrom 66 | df['start'] = start 67 | df['end'] = end 68 | df['width'] = width 69 | 70 | df = df.drop(columns='bin') 71 | 72 | return df, dropped_cells 73 | 74 | 75 | def get_root(cells_list, root_id_file): 76 | for cell in cells_list: 77 | if 'SA928' in cell: 78 | with open(root_id_file, 'w') as outfile: 79 | outfile.write(cell + '\n') 80 | outfile.close() 81 | return cell 82 | 83 | raise Exception('No SA928 cells in the copy number matrix.') 84 | -------------------------------------------------------------------------------- /single_cell/utils/pdfutils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Feb 20, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | import os 8 | 9 | from PyPDF2 import PdfFileMerger, PdfFileWriter, PdfFileReader 10 | 11 | from single_cell.utils import helpers 12 | 13 | 14 | def merge_pdfs(infiles, outfile): 15 | if isinstance(infiles, dict): 16 | infiles = infiles.values() 17 | 18 | merger = PdfFileMerger() 19 | 20 | for infile in infiles: 21 | # add it to list if not empty. skip empty files to avoid errors later 22 | if os.path.getsize(infile): 23 | merger.append(open(infile, 'rb')) 24 | 25 | helpers.makedirs(outfile, isfile=True) 26 | 27 | with open(outfile, 'wb') as fout: 28 | merger.write(fout) 29 | 30 | 31 | def merge_pdfs_with_scaling(infiles, outfile, width=500, height=500): 32 | if isinstance(infiles, dict): 33 | infiles = infiles.values() 34 | 35 | pdf_writer = PdfFileWriter() 36 | 37 | pagenum = 0 38 | 39 | for infile in infiles: 40 | pdf_file = PdfFileReader(open(infile, 'rb')) 41 | num_pages = pdf_file.getNumPages() 42 | 43 | for page_number in range(0, num_pages): 44 | pdf_page = pdf_file.getPage(page_number) 45 | 46 | pdf_page.scaleTo(width, height) 47 | 48 | pdf_writer.addPage(pdf_page) 49 | 50 | pdf_writer.addBookmark(title=infile, pagenum=pagenum) 51 | pagenum += 1 52 | 53 | with open(outfile, 'wb') as fout: 54 | pdf_writer.write(fout) 55 | -------------------------------------------------------------------------------- /single_cell/utils/pysamutils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jun 1, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | import shutil 7 | from collections import OrderedDict 8 | 9 | import pysam 10 | from single_cell.utils.bamutils import bam_index 11 | 12 | 13 | def load_chromosome_lengths(file_name, chromosomes=None): 14 | chromosome_lengths = OrderedDict() 15 | 16 | ref = pysam.Fastafile(file_name) 17 | 18 | for chrom, length in zip(ref.references, ref.lengths): 19 | if chromosomes and chrom not in chromosomes: 20 | continue 21 | 22 | chromosome_lengths[str(chrom)] = int(length) 23 | 24 | return chromosome_lengths 25 | 26 | 27 | def get_regions_from_reference(reference_fastq, split_size, chromosomes): 28 | chromosome_lengths = load_chromosome_lengths( 29 | reference_fastq, 30 | chromosomes=chromosomes 31 | ) 32 | return get_regions(chromosome_lengths, split_size) 33 | 34 | 35 | def get_regions(chromosome_lengths, split_size): 36 | if split_size is None: 37 | return 
dict(enumerate(chromosome_lengths.keys())) 38 | 39 | regions = [] 40 | 41 | for chrom, length in chromosome_lengths.items(): 42 | lside_interval = range(1, length + 1, split_size) 43 | rside_interval = range(split_size, length + split_size, split_size) 44 | 45 | for beg, end in zip(lside_interval, rside_interval): 46 | end = min(end, length) 47 | 48 | regions.append('{}-{}-{}'.format(chrom, beg, end)) 49 | 50 | return regions 51 | 52 | 53 | def _fraction_softclipped(x): 54 | total_softclipped = 0 55 | for a in x.cigar: 56 | if a[0] == 4: 57 | total_softclipped += a[1] 58 | return float(total_softclipped) / x.query_length 59 | 60 | 61 | def remove_softclipped_reads(infile, outfile, softclipped_reads_threshold): 62 | if softclipped_reads_threshold == 1: 63 | shutil.copyfile(infile, outfile) 64 | shutil.copyfile(infile + '.bai', outfile + '.bai') 65 | return 66 | 67 | bamfile = pysam.AlignmentFile(infile, "rb") 68 | 69 | filteredbam = pysam.AlignmentFile(outfile, "wb", template=bamfile) 70 | for read in bamfile.fetch(): 71 | if _fraction_softclipped(read) < softclipped_reads_threshold: 72 | filteredbam.write(read) 73 | filteredbam.close() 74 | 75 | bam_index(outfile, outfile + '.bai') 76 | -------------------------------------------------------------------------------- /single_cell/utils/refgenome.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | default_chromosomes = [str(a) for a in range(1, 23)] + ['X', 'Y'] 4 | 5 | 6 | def read_chromosome_lengths(genome_fasta_index, chromosomes=default_chromosomes): 7 | fai = pd.read_csv(genome_fasta_index, sep='\t', header=None, names=['chrom', 'length', 'V3', 'V4', 'V5']) 8 | fai = fai.set_index('chrom')['length'] 9 | fai = fai.reindex(chromosomes).astype(int) 10 | return fai.to_dict() 11 | 12 | 13 | def get_split_regions(split_size, refgenome, chromosomes=default_chromosomes): 14 | genome_fasta_index = refgenome + '.fai' 15 | 16 | chromosome_lengths = read_chromosome_lengths(genome_fasta_index, chromosomes=chromosomes) 17 | 18 | if split_size is None: 19 | return dict(enumerate(chromosome_lengths.keys())) 20 | 21 | regions = [] 22 | 23 | for chrom, length in chromosome_lengths.items(): 24 | lside_interval = range(1, length + 1, split_size) 25 | rside_interval = range(split_size, length + split_size, split_size) 26 | 27 | for beg, end in zip(lside_interval, rside_interval): 28 | end = min(end, length) 29 | 30 | regions.append('{}-{}-{}'.format(chrom, beg, end)) 31 | 32 | return regions 33 | -------------------------------------------------------------------------------- /single_cell/utils/singlecell_copynumber_plot_utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .plot_metrics import PlotMetrics 3 | from .plot_kernel_density import PlotKernelDensity 4 | from .plot_pcolormesh import PlotPcolor 5 | from .plot_hmmcopy import GenHmmPlots -------------------------------------------------------------------------------- /single_cell/utils/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/single_cell/utils/tests/__init__.py -------------------------------------------------------------------------------- /single_cell/utils/validator/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/single_cell/utils/validator/__init__.py -------------------------------------------------------------------------------- /single_cell/utils/validator/utils.py: -------------------------------------------------------------------------------- 1 | class DtypeException(Exception): 2 | pass 3 | 4 | 5 | class MissingFieldError(Exception): 6 | pass 7 | 8 | 9 | class InvalidBarcode(Exception): 10 | pass 11 | 12 | 13 | class InvalidIndex(Exception): 14 | pass 15 | 16 | 17 | class MissingInput(Exception): 18 | pass 19 | 20 | 21 | class InvalidInstrument(Exception): 22 | pass 23 | 24 | 25 | class DLPIndexError(Exception): 26 | pass 27 | 28 | 29 | def get(data, key): 30 | if key not in data: 31 | raise MissingFieldError('{} key missing in yaml file.'.format(key)) 32 | return data[key] 33 | 34 | 35 | def check_data_type(keys, dtype, data): 36 | for key in keys: 37 | 38 | if not isinstance(get(data, key), dtype): 39 | raise DtypeException('{} value must be {}'.format(key, dtype)) 40 | 41 | 42 | def check_barcodes(barcode_str): 43 | for val in barcode_str: 44 | if val not in ['A', 'C', 'G', 'T']: 45 | raise InvalidBarcode('{} is not a valid varcode'.format(barcode_str)) 46 | 47 | 48 | def check_genomic_regions(region, sep='-'): 49 | chroms = list(map(str, range(1, 23))) + ['X', 'Y'] 50 | 51 | chrom, start, end = region.split(sep) 52 | 53 | assert chrom in chroms, '{} is not a valid chrom'.format(chrom) 54 | 55 | 56 | def check_cells_data(data): 57 | for cell in data: 58 | check_data_type(['bam'], str, data[cell]) 59 | 60 | 61 | def check_normal_data(normal): 62 | if 'bam' in normal: 63 | check_data_type(['bam'], str, normal) 64 | else: 65 | for cell in normal: 66 | check_data_type(['bam'], str, normal[cell]) 67 | -------------------------------------------------------------------------------- /single_cell/utils/validator/validate.py: -------------------------------------------------------------------------------- 1 | from single_cell.utils.validator import utils 2 | 3 | 4 | def validate_alignment_fastqs(data): 5 | for sample, sample_data in data.items(): 6 | for lane, lane_data in sample_data['fastqs'].items(): 7 | if not utils.get(lane_data, 'fastq_1') or not utils.get(lane_data, 'fastq_2'): 8 | raise utils.MissingInput() 9 | 10 | 11 | def validate_sample_info(yamldata): 12 | for cell in yamldata: 13 | celldata = yamldata[cell] 14 | 15 | utils.check_data_type(['column', 'img_col', 'row'], int, celldata) 16 | utils.check_data_type(['condition', 'pick_met', 'index_i5', 'index_i7', 'sample_id', 'library_id'], str, celldata) 17 | utils.check_data_type(['is_control'], bool, celldata) 18 | 19 | utils.check_barcodes(utils.get(celldata, 'primer_i5')) 20 | utils.check_barcodes(utils.get(celldata, 'primer_i7')) 21 | 22 | if not utils.get(celldata, 'index_i5').startswith('i5-'): 23 | raise utils.DLPIndexError() 24 | if not utils.get(celldata, 'index_i7').startswith('i7-'): 25 | raise utils.DLPIndexError() 26 | 27 | 28 | def validate_hmmcopy_bams(yamldata): 29 | for cell, celldata in yamldata.items(): 30 | utils.check_data_type(['bam'], str, celldata) 31 | 32 | 33 | def validate_annotation(yamldata): 34 | utils.check_data_type( 35 | ['hmmcopy_metrics', 'hmmcopy_reads', 'alignment_metrics', 'gc_metrics', 'segs_pdf_tar'], 36 | str, 37 | yamldata 38 | ) 39 | 40 | 41 | def validate_merge_cell_bams(yamldata): 42 | utils.check_cells_data(utils.get(yamldata, 'cell_bams')) 43 | 44 | 45 | def 
validate_split_wgs_bam(yamldata): 46 | data = utils.get(yamldata, 'normal') 47 | utils.check_data_type(['bam'], str, data) 48 | 49 | 50 | def validate_variant_calling(yamldata): 51 | normals = yamldata['normal'] 52 | for region in normals: 53 | utils.check_data_type(['bam'], str, normals[region]) 54 | utils.check_genomic_regions(region) 55 | 56 | tumours = yamldata['tumour'] 57 | for region in tumours: 58 | utils.check_data_type(['bam'], str, tumours[region]) 59 | utils.check_genomic_regions(region) 60 | 61 | 62 | def validate_germline_calling(yamldata): 63 | utils.check_normal_data(utils.get(yamldata, 'normal')) 64 | 65 | 66 | def validate_infer_haps(yamldata): 67 | utils.check_normal_data(utils.get(yamldata, 'normal')) 68 | 69 | 70 | def validate_count_haps(yamldata): 71 | utils.check_cells_data(utils.get(yamldata, 'tumour')) 72 | utils.check_data_type(['haplotypes'], str, yamldata) 73 | 74 | 75 | def validate_breakpoint_calling(yamldata): 76 | utils.check_normal_data(utils.get(yamldata, 'normal')) 77 | utils.check_cells_data(utils.get(yamldata, 'tumour')) 78 | 79 | 80 | def validate_snv_genotyping(yamldata): 81 | tumour_cells = utils.get(yamldata, 'tumour_cells') 82 | for sample in tumour_cells: 83 | for library in tumour_cells[sample]: 84 | utils.check_cells_data(tumour_cells[sample][library]) 85 | 86 | vcf_files = utils.get(yamldata, 'vcf_files') 87 | assert isinstance(vcf_files, list) 88 | for filepath in vcf_files: 89 | assert isinstance(filepath, str) 90 | 91 | 92 | def validate_sv_genotyping(yamldata): 93 | pass 94 | -------------------------------------------------------------------------------- /single_cell/utils/vcfutils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Feb 27, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | import itertools 7 | import logging 8 | import os 9 | 10 | import biowrappers.components.io.vcf.tasks as vcf_tasks 11 | import vcf 12 | from single_cell.utils import helpers 13 | 14 | 15 | def _get_header(infile): 16 | ''' 17 | Extract header from the VCF file 18 | 19 | :param infile: input VCF file 20 | :return: header 21 | ''' 22 | 23 | header = [] 24 | for line in infile: 25 | if line.startswith('##'): 26 | header.append(line) 27 | elif line.startswith('#'): 28 | header.append(line) 29 | return header 30 | else: 31 | raise Exception('invalid header: missing #CHROM line') 32 | 33 | logging.getLogger("single_cell.helpers.vcfutils").warn( 34 | "One of the input files is empty" 35 | ) 36 | return [] 37 | 38 | 39 | def concatenate_vcf(infiles, outfile): 40 | ''' 41 | Concatenate VCF files 42 | 43 | :param infiles: dictionary of input VCF files to be concatenated 44 | :param outfile: output VCF file 45 | ''' 46 | 47 | with open(outfile, 'w') as ofile: 48 | header = None 49 | 50 | for _, ifile in infiles.items(): 51 | 52 | if os.path.getsize(ifile) == 0: 53 | logging.getLogger("single_cell.helpers.vcfutils").warn( 54 | 'input file {} is empty'.format(ifile) 55 | ) 56 | continue 57 | 58 | with open(ifile) as f: 59 | 60 | if not header: 61 | header = _get_header(f) 62 | 63 | for line in header: 64 | ofile.write(line) 65 | else: 66 | if not _get_header(f) == header: 67 | logging.getLogger("single_cell.helpers.vcfutils").warn( 68 | 'merging vcf files with mismatching headers' 69 | ) 70 | 71 | for l in f: 72 | ofile.write(l) 73 | 74 | 75 | def merge_vcf(infiles, outfile, tempdir): 76 | vcf_files = [] 77 | for infile in infiles: 78 | if isinstance(infile, str): 79 | vcf_files.append(infile) 80 | elif 
isinstance(infile, dict): 81 | vcf_files.extend(list(infile.values())) 82 | elif isinstance(infile, (list, tuple)): 83 | vcf_files.extend(list(infile)) 84 | else: 85 | raise Exception("unknown data type") 86 | 87 | helpers.makedirs(tempdir) 88 | temp_output = os.path.join(tempdir, 'merged.vcf') 89 | 90 | vcf_tasks.merge_vcfs(vcf_files, temp_output) 91 | 92 | vcf_tasks.finalise_vcf(temp_output, outfile) 93 | 94 | 95 | def split_vcf(in_file, out_files, lines_per_file): 96 | """ Split a VCF file into smaller files. 97 | 98 | :param in_file: Path of VCF file to split. 99 | 100 | :param out_files: Callback function which supplies file name given index of split. 101 | 102 | :param lines_per_file: Maximum number of lines to be written per file. 103 | 104 | """ 105 | 106 | def line_group(_, line_idx=itertools.count()): 107 | return int(next(line_idx) / lines_per_file) 108 | 109 | reader = vcf.Reader(filename=in_file) 110 | 111 | for file_idx, records in itertools.groupby(reader, key=line_group): 112 | file_name = out_files[file_idx] 113 | 114 | with open(file_name, 'w') as out_fh: 115 | writer = vcf.Writer(out_fh, reader) 116 | 117 | for record in records: 118 | writer.write_record(record) 119 | 120 | writer.close() 121 | -------------------------------------------------------------------------------- /single_cell/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 6, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | -------------------------------------------------------------------------------- /single_cell/workflows/align/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | metrics = { 3 | 'cell_id': 'str', 4 | 'total_mapped_reads': 'int', 5 | 'library_id': 'str', 6 | 'unpaired_mapped_reads': 'int', 7 | 'paired_mapped_reads': 'int', 8 | 'unpaired_duplicate_reads': 'int', 9 | 'paired_duplicate_reads': 'int', 10 | 'unmapped_reads': 'int', 11 | 'percent_duplicate_reads': 'float', 12 | 'estimated_library_size': 'int', 13 | 'total_reads': 'int', 14 | 'total_duplicate_reads': 'int', 15 | 'total_properly_paired': 'int', 16 | 'coverage_breadth': 'float', 17 | 'coverage_depth': 'float', 18 | 'median_insert_size': 'float', 19 | 'mean_insert_size': 'float', 20 | 'standard_deviation_insert_size': 'float', 21 | 'cell_call': 'str', 22 | 'column': 'int', 23 | 'experimental_condition': 'str', 24 | 'img_col': 'int', 25 | 'index_i5': 'str', 26 | 'index_i7': 'str', 27 | 'primer_i5': 'str', 28 | 'primer_i7': 'str', 29 | 'row': 'int', 30 | 'sample_type': 'str', 31 | 'is_contaminated': 'bool', 32 | 'trim': 'bool', 33 | 'sample_id': 'str', 34 | 'aligned': 'float', 35 | 'expected': 'float', 36 | 'overlap_with_all_filters': 'float', 37 | 'overlap_with_all_filters_and_qual': 'float', 38 | 'overlap_with_dups': 'float', 39 | 'overlap_without_dups': 'float', 40 | 'is_control': 'bool', 41 | } 42 | 43 | gc = {str(i): 'float' for i in range(0, 101)} 44 | gc['cell_id'] = 'str' 45 | 46 | dtypes = locals() 47 | 48 | return dtypes 49 | 50 | 51 | def fastqscreen_dtypes(genome_labels): 52 | metrics = {'fastqscreen_nohit': 'int', 'cell_id': 'str'} 53 | for label in genome_labels: 54 | metrics['fastqscreen_{}'.format(label)] = 'int' 55 | metrics['fastqscreen_{}_multihit'.format(label)] = 'int' 56 | 57 | fastqscreen_detailed = { 58 | 'cell_id': 'str', 59 | 'readend': 'str', 60 | 'count': 'int' 61 | } 62 | 63 | for label in genome_labels: 64 | fastqscreen_detailed[label] = 'int' 65 | 66 | dtypes = locals() 
67 | return dtypes 68 | -------------------------------------------------------------------------------- /single_cell/workflows/align/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | from .gen_cn_matrix import GenerateCNMatrix 8 | from .collect_metrics import CollectMetrics 9 | from .run_trimgalore import RunTrimGalore 10 | from .summary_metrics import SummaryMetrics -------------------------------------------------------------------------------- /single_cell/workflows/align/scripts/gen_cn_matrix.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 8, 2015 3 | 4 | @author: dgrewal 5 | ''' 6 | import logging 7 | import numpy as np 8 | import pandas as pd 9 | from single_cell.utils import csvutils 10 | 11 | class GenerateCNMatrix(object): 12 | ''' 13 | merges files. no overlap queries, simple concatenation 14 | since columns are different, select header and insert values at proper 15 | indices. use N/A for missing. 16 | ''' 17 | 18 | def __init__(self, infile, output, sep, colname, sample_id, typ, dtypes): 19 | self.sep = sep 20 | self.output = output 21 | self.column_name = colname 22 | self.input = infile 23 | self.sample_id = sample_id 24 | self.type = typ 25 | self.dtypes = dtypes 26 | 27 | @staticmethod 28 | def replace_missing_vals(input_df, nan_val='N/A'): 29 | ''' 30 | replace NaN values with nan_val 31 | ''' 32 | input_df = input_df.fillna(nan_val) 33 | 34 | return input_df 35 | 36 | def write(self, input_df, transpose=False): 37 | ''' 38 | write the dataframe to output file 39 | ''' 40 | if transpose: 41 | del input_df["gc"] 42 | input_df = input_df.T 43 | input_df["cell_id"] = input_df.index 44 | 45 | input_df.columns = input_df.columns.astype(str) 46 | csvutils.write_dataframe_to_csv_and_yaml(input_df, self.output, self.dtypes) 47 | 48 | 49 | def read_hmmcopy_corrected_read_file(self, sample_id): 50 | """ 51 | 52 | """ 53 | column_name = self.column_name 54 | data = pd.read_csv(self.input) 55 | if column_name in data.columns: 56 | df = data[['chr', 'start', 'end', 'width', column_name]] 57 | else: 58 | df = data[['chr', 'start', 'end', 'width']] 59 | 60 | df[column_name] = float('NaN') 61 | 62 | df = df.rename(columns = {column_name:sample_id}) 63 | 64 | return df 65 | 66 | def read_gcbias_file(self, sample_id): 67 | """ 68 | parses the gcbias data 69 | """ 70 | column_name = self.column_name 71 | 72 | data = open(self.input).readlines() 73 | skiprows = [i for i,v in enumerate(data) if v[0] == '#' or v=='\n'] 74 | 75 | #If the file is empty (only header no data) then return 0s (dummy data) 76 | try: 77 | data = pd.read_csv(self.input, sep='\t', skiprows=skiprows) 78 | except pd.io.common.EmptyDataError: 79 | logging.getLogger("single_cell.align.gcbias").warn( 80 | 'No data in the GCBias output') 81 | #If the file is empty (only header no data) then return 0s (dummy data) 82 | data = np.array([np.arange(100), [0]*100]).T 83 | data = pd.DataFrame(data, columns = ['gc', sample_id]) 84 | return data 85 | 86 | data = pd.DataFrame(data[column_name]) 87 | 88 | data['gc'] = data.index 89 | 90 | df = data.rename(columns={'NORMALIZED_COVERAGE':sample_id}) 91 | 92 | df = df[['gc',sample_id]] 93 | return df 94 | 95 | def main(self): 96 | ''' 97 | main function 98 | ''' 99 | sample_id = self.sample_id 100 | 101 | if self.type == 'hmmcopy_corrected_reads': 102 | data = 
self.read_hmmcopy_corrected_read_file(sample_id) 103 | self.write(data) 104 | else: 105 | data = self.read_gcbias_file(sample_id) 106 | self.write(data, transpose=True) 107 | 108 | -------------------------------------------------------------------------------- /single_cell/workflows/cohort_qc/scripts/oncoplot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(maftools) 4 | 5 | 6 | oncoplot = function(read_maf, oncoplot_path, genes){ 7 | png(filename=oncoplot_path, units="px", width=1600, height=1600, res=300) 8 | 9 | maftools::oncoplot(maf=read_maf,showTumorSampleBarcodes=TRUE,genes=genes) 10 | dev.off() 11 | } 12 | 13 | 14 | 15 | main = function(){ 16 | args = commandArgs(trailingOnly=TRUE) 17 | genes=c("PPM1D", "TP53", "BRCA1", "BRCA2", "MECOM", "RB1", "PTEN", "PALB2","ERBB2", "CDK12", "PIK3CA", "KRAS", "CCNE1", "MYC") 18 | 19 | maf_file = args[1] 20 | vcNames=args[2] 21 | cn=args[3] 22 | oncoplot_path = args[4] 23 | 24 | 25 | vcNames=read.table(vcNames,header=TRUE)$Variant_Classification 26 | 27 | maf = maftools::read.maf(maf=maf_file, cnTable=cn, vc_nonSyn=vcNames) 28 | 29 | oncoplot(maf, oncoplot_path, genes) 30 | 31 | 32 | } 33 | 34 | 35 | main() 36 | 37 | 38 | -------------------------------------------------------------------------------- /single_cell/workflows/cohort_qc/scripts/report.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | date: "`r format(Sys.time(), '%d %B, %Y')`" 3 | output: 4 | html_document 5 | params: 6 | cohort: "" 7 | oncoplot: "" 8 | 9 | --- 10 | --- 11 | title: `r params$cohort` 12 | --- 13 | 14 | ```{r setup, include=FALSE} 15 | knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE, fig.align = 'center') 16 | library(knitr) 17 | 18 | ``` 19 | 20 | ## oncoplot 21 | ```{r adjdist, echo = FALSE, out.width = "100%", out.height = "300",} 22 | # All defaults 23 | include_graphics(params$oncoplot) 24 | ``` 25 | -------------------------------------------------------------------------------- /single_cell/workflows/cohort_qc/scripts/vcf2maf: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | INPUTVCF=$1 4 | OUTPUTMAF=$2 5 | FASTA=$3 6 | VEPDATA=$4 7 | BUFFERSIZE=$5 8 | 9 | vcf2maf.pl --input-vcf $1 --output-maf $2 --ref-fasta $3 --vep-data $4 --vep-path $(dirname `which vep`) --buffer-size $5 10 | -------------------------------------------------------------------------------- /single_cell/workflows/cohort_qc/scripts/vcf2maf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | INPUTVCF=$1 4 | OUTPUTMAF=$2 5 | FASTA=$3 6 | VEPDATA=$4 7 | BUFFERSIZE=$5 8 | 9 | vcf2maf.pl --input-vcf $1 --output-maf $2 --ref-fasta $3 --vep-data $4 --vep-path $(dirname `which vep`) --buffer-size $5 10 | -------------------------------------------------------------------------------- /single_cell/workflows/db_annotation/__init__.py: -------------------------------------------------------------------------------- 1 | import pypeliner 2 | import pypeliner.managed as mgd 3 | 4 | 5 | def create_db_annotation_workflow( 6 | in_vcf_file, 7 | out_csv_file, 8 | db_vcf_file, 9 | split_size=1e4 10 | ): 11 | workflow = pypeliner.workflow.Workflow(ctx=dict(mem=2, num_retry=3, mem_retry_increment=2)) 12 | 13 | workflow.transform( 14 | name='split_vcf', 15 | func='single_cell.utils.vcfutils.split_vcf', 16 | args=( 17 | mgd.InputFile(in_vcf_file), 18 | 
mgd.TempOutputFile('split.vcf', 'split') 19 | ), 20 | kwargs={'lines_per_file': split_size} 21 | ) 22 | 23 | workflow.transform( 24 | name='annotate_db_status', 25 | axes=('split',), 26 | func='single_cell.workflows.db_annotation.tasks.annotate_db_status', 27 | args=( 28 | db_vcf_file, 29 | mgd.TempInputFile('split.vcf', 'split'), 30 | mgd.TempOutputFile('annotated.csv.gz', 'split', extensions=['.yaml']) 31 | ) 32 | ) 33 | 34 | workflow.transform( 35 | name='merge_tables', 36 | func='single_cell.utils.csvutils.concatenate_csv', 37 | args=( 38 | mgd.TempInputFile('annotated.csv.gz', 'split', extensions=['.yaml']), 39 | mgd.OutputFile(out_csv_file, extensions=['.yaml']) 40 | ) 41 | ) 42 | 43 | return workflow 44 | -------------------------------------------------------------------------------- /single_cell/workflows/db_annotation/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_annotate = { 3 | 'cell_id': 'str', 4 | 'chrom': 'str', 5 | 'coord': 'int', 6 | 'ref': 'str', 7 | 'alt': 'str', 8 | 'db_id': 'str', 9 | 'exact_match': 'int', 10 | 'indel': 'int', 11 | 'mappability': 'float', 12 | 'effect': 'str', 13 | 'effect_impact': 'str', 14 | 'functional_class': 'str', 15 | 'codon_change': 'str', 16 | 'amino_acid_change': 'str', 17 | 'amino_acid_length': 'str', 18 | 'gene_name': 'str', 19 | 'transcript_biotype': 'str', 20 | 'gene_coding': 'str', 21 | 'transcript_id': 'str', 22 | 'exon_rank': 'str', 23 | 'genotype': 'str', 24 | 'tri_nucleotide_context': 'str', 25 | } 26 | 27 | 28 | return snv_annotate 29 | -------------------------------------------------------------------------------- /single_cell/workflows/db_annotation/tasks.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import vcf 3 | from single_cell.utils import csvutils 4 | from single_cell.workflows.db_annotation.dtypes import dtypes 5 | 6 | 7 | def annotate_db_status(db_vcf_file, target_vcf_file, out_file): 8 | db_reader = vcf.Reader(filename=db_vcf_file) 9 | 10 | reader = vcf.Reader(filename=target_vcf_file) 11 | 12 | data = [] 13 | 14 | for record in reader: 15 | chrom = record.CHROM 16 | 17 | coord = record.POS 18 | 19 | try: 20 | db_position_records = [x for x in db_reader.fetch(chrom, coord - 1, coord)] 21 | 22 | except ValueError: 23 | db_position_records = [] 24 | 25 | for db_record in db_position_records: 26 | 27 | if (db_record.CHROM != chrom) or (db_record.POS != coord): 28 | continue 29 | 30 | if db_record.is_indel: 31 | indel = 1 32 | 33 | else: 34 | indel = 0 35 | 36 | for alt in record.ALT: 37 | 38 | if (record.REF == db_record.REF) and (alt in db_record.ALT): 39 | exact_match = 1 40 | 41 | else: 42 | exact_match = 0 43 | 44 | out_row = { 45 | 'chrom': chrom, 46 | 'coord': coord, 47 | 'ref': record.REF, 48 | 'alt': str(alt), 49 | 'db_id': db_record.ID, 50 | 'exact_match': exact_match, 51 | 'indel': indel 52 | } 53 | 54 | data.append(out_row) 55 | 56 | data = pd.DataFrame(data) 57 | 58 | csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes()) 59 | -------------------------------------------------------------------------------- /single_cell/workflows/destruct_singlecell/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | cell_counts = { 3 | "cluster_id": "int", 4 | "cell_id": "str", 5 | "read_count": "int" 6 | } 7 | library = { 8 | "prediction_id": "int", 9 | "num_reads": "int", 10 | "num_unique_reads": "int", 11 | 
"library": "str", 12 | "is_normal": "bool", 13 | "patient_id": "float" 14 | } 15 | breakpoints = { 16 | "prediction_id": "int", 17 | "chromosome_1": "str", 18 | "strand_1": "str", 19 | "position_1": "int", 20 | "chromosome_2": "str", 21 | "strand_2": "str", 22 | "position_2": "int", 23 | "homology": "int", 24 | "num_split": "int", 25 | "inserted": "str", 26 | "mate_score": "float", 27 | "template_length_1": "int", 28 | "log_cdf": "float", 29 | "template_length_2": "int", 30 | "log_likelihood": "float", 31 | "template_length_min": "int", 32 | "num_reads": "int", 33 | "num_unique_reads": "int", 34 | "type": "str", 35 | "num_inserted": "int", 36 | "sequence": "str", 37 | "gene_id_1": "str", 38 | "gene_name_1": "str", 39 | "gene_location_1": "str", 40 | "gene_id_2": "str", 41 | "gene_name_2": "str", 42 | "gene_location_2": "str", 43 | "dgv_ids": "float", 44 | "is_germline": "bool", 45 | "is_dgv": "bool", 46 | "num_patients": "int", 47 | "is_filtered": "bool", 48 | "dist_filtered": "float", 49 | "balanced": "bool", 50 | "rearrangement_type": "str" 51 | } 52 | 53 | dtypes = locals() 54 | 55 | return dtypes 56 | -------------------------------------------------------------------------------- /single_cell/workflows/extract_allele_readcounts/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | readcount = { 3 | 'chromosome': 'str', 4 | 'start': 'int', 5 | 'end': 'int', 6 | 'hap_label': 'str', 7 | 'allele_id': 'str', 8 | 'readcount': 'int', 9 | 'cell_id': 'str' 10 | } 11 | 12 | dtypes = locals() 13 | 14 | return dtypes 15 | -------------------------------------------------------------------------------- /single_cell/workflows/extract_allele_readcounts/tasks.py: -------------------------------------------------------------------------------- 1 | from single_cell.utils import csvutils 2 | 3 | 4 | def convert_csv_to_tsv(csv_infile, tsv_outfile): 5 | csvinput = csvutils.CsvInput(csv_infile) 6 | 7 | csvdata = csvinput.read_csv() 8 | 9 | csvdata.to_csv(tsv_outfile, sep='\t', index=False) 10 | -------------------------------------------------------------------------------- /single_cell/workflows/germline/__init__.py: -------------------------------------------------------------------------------- 1 | from pypeliner.workflow import Workflow 2 | 3 | import pypeliner 4 | 5 | default_chromosomes = [str(x) for x in range(1, 23)] + ['X', 'Y'] 6 | 7 | 8 | def create_samtools_germline_workflow( 9 | normal_bam_files, 10 | ref_genome_fasta_file, 11 | vcf_file, 12 | config, 13 | ): 14 | 15 | ctx = {'mem': config["memory"]['low'], 16 | 'mem_retry_increment': 2, 17 | 'disk_retry_increment': 50, 18 | 'ncpus': 1} 19 | 20 | regions = list(normal_bam_files.keys()) 21 | 22 | workflow = Workflow(ctx=ctx) 23 | 24 | workflow.setobj( 25 | obj=pypeliner.managed.OutputChunks('regions'), 26 | value=regions, 27 | ) 28 | 29 | workflow.transform( 30 | name='run_samtools_variant_calling', 31 | axes=('regions',), 32 | func="single_cell.workflows.germline.tasks.run_samtools_variant_calling", 33 | args=( 34 | pypeliner.managed.InputFile('normal.split.bam', 'regions', fnames=normal_bam_files, extensions=['.bai']), 35 | ref_genome_fasta_file, 36 | pypeliner.managed.TempOutputFile('variants.vcf.gz', 'regions'), 37 | ), 38 | kwargs={ 39 | 'region': pypeliner.managed.InputInstance('regions'), 40 | }, 41 | ) 42 | 43 | workflow.transform( 44 | name='concatenate_variants', 45 | func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf", 46 | args=( 47 | 
pypeliner.managed.TempInputFile('variants.vcf.gz', 'regions'), 48 | pypeliner.managed.OutputFile(vcf_file, extensions=['.tbi']), 49 | pypeliner.managed.TempSpace("merge_variants_germline"), 50 | ), 51 | ) 52 | 53 | return workflow 54 | -------------------------------------------------------------------------------- /single_cell/workflows/hmmcopy/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | reads = { 3 | 'chr': 'str', 4 | 'start': 'int', 5 | 'end': 'int', 6 | 'width': 'int', 7 | 'reads': 'int', 8 | 'gc': 'float', 9 | 'cor_gc': 'float', 10 | 'cor_map': 'float', 11 | 'copy': 'float', 12 | 'map': 'float', 13 | 'state': 'float', 14 | 'cell_id': 'str', 15 | 'sample_id': 'str', 16 | 'library_id': 'str', 17 | 'valid': 'bool', 18 | 'ideal': 'bool', 19 | 'modal_curve': 'float', 20 | 'modal_quantile': 'float', 21 | 'multiplier': 'int', 22 | 'is_low_mappability': 'bool' 23 | } 24 | 25 | segs = { 26 | 'chr': 'str', 27 | 'start': 'int', 28 | 'end': 'int', 29 | 'state': 'float', 30 | 'median': 'float', 31 | 'multiplier': 'int', 32 | 'cell_id': 'str', 33 | } 34 | 35 | params = { 36 | 'iteration': 'float', 37 | # 'is_final': 'bool', 38 | 'state':'float', 39 | 'parameter': 'str', 40 | 'cell_id':'str', 41 | 'value':'float', 42 | } 43 | 44 | metrics = { 45 | 'multiplier': 'int', 46 | 'cell_id': 'str', 47 | 'sample_id': 'str', 48 | 'library_id': 'str', 49 | 'MSRSI_non_integerness': 'float', 50 | 'MBRSI_dispersion_non_integerness': 'float', 51 | 'MBRSM_dispersion': 'float', 52 | 'autocorrelation_hmmcopy': 'float', 53 | 'cv_hmmcopy': 'float', 54 | 'empty_bins_hmmcopy': 'int', 55 | 'mad_hmmcopy': 'float', 56 | 'mean_hmmcopy_reads_per_bin': 'float', 57 | 'median_hmmcopy_reads_per_bin': 'float', 58 | 'std_hmmcopy_reads_per_bin': 'float', 59 | 'total_mapped_reads_hmmcopy': 'int', 60 | 'total_halfiness': 'float', 61 | 'scaled_halfiness': 'float', 62 | 'mean_state_mads': 'float', 63 | 'mean_state_vars': 'float', 64 | 'mad_neutral_state': 'float', 65 | 'breakpoints': 'int', 66 | 'mean_copy': 'float', 67 | 'state_mode': 'int', 68 | 'log_likelihood': 'float', 69 | 'true_multiplier': 'float', 70 | 'column': 'int', 71 | 'img_col': 'int', 72 | 'primer_i7': 'str', 73 | 'index_i5': 'str', 74 | 'sample_type': 'str', 75 | 'primer_i5': 'str', 76 | 'experimental_condition': 'str', 77 | 'cell_call': 'str', 78 | 'index_i7': 'str', 79 | 'order': 'int', 80 | 'row': 'int', 81 | 'trim': 'bool', 82 | 'is_control': 'bool' 83 | } 84 | 85 | dtypes = locals() 86 | 87 | return dtypes 88 | -------------------------------------------------------------------------------- /single_cell/workflows/hmmcopy/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | 8 | from .read_counter import ReadCounter 9 | from .convert_csv_to_seg import ConvertCSVToSEG 10 | from .read_counter import ReadCounter 11 | from .correct_read_count import CorrectReadCount 12 | -------------------------------------------------------------------------------- /single_cell/workflows/infer_haps/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | haplotypes = { 3 | 'chromosome': 'str', 4 | 'position': 'int', 5 | 'allele': 'str', 6 | 'hap_label': 'str', 7 | 'allele_id': 'str', 8 | 'ref': 'str', 9 | 'alt': 'str' 10 | } 11 | 12 | dtypes = locals() 13 | 14 | return dtypes 15 | 
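Usage note: the dtypes() helpers above (hmmcopy, infer_haps and the other per-workflow dtypes modules in this dump) all return plain column-name to dtype dicts that are handed to single_cell.utils.csvutils when a table is written, as in the annotate_db_status task earlier. A minimal sketch of that pattern for the infer_haps 'haplotypes' table; the record shown is made up purely for illustration:

import pandas as pd

from single_cell.utils import csvutils
from single_cell.workflows.infer_haps.dtypes import dtypes

# one made-up haplotype record whose keys match the 'haplotypes' dtype dict
df = pd.DataFrame([{
    'chromosome': '1', 'position': 10000, 'allele': '0',
    'hap_label': '1', 'allele_id': '1', 'ref': 'A', 'alt': 'G',
}])

# write the compressed csv together with its yaml dtype/metadata sidecar
csvutils.write_dataframe_to_csv_and_yaml(df, 'haplotypes.csv.gz', dtypes()['haplotypes'])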
-------------------------------------------------------------------------------- /single_cell/workflows/infer_haps/tasks.py: -------------------------------------------------------------------------------- 1 | from single_cell.utils import helpers 2 | import os 3 | 4 | def annotate_ref_alt(haps_csv, refdir, output_csv): 5 | thousand_genomes = os.path.join(refdir, 'thousand_genomes_snps.tsv') 6 | 7 | annotation_data = {} 8 | 9 | with helpers.getFileHandle(thousand_genomes, 'rt') as db: 10 | for line in db: 11 | line = line.strip().split('\t') 12 | 13 | chrom, pos, ref, alt = line 14 | 15 | annotation_data[(chrom, pos)] = (ref, alt) 16 | 17 | with helpers.getFileHandle(haps_csv, 'rt') as reader, helpers.getFileHandle(output_csv, 'wt') as writer: 18 | 19 | header = reader.readline().strip() 20 | header += '\tref\talt\n' 21 | writer.write(header) 22 | 23 | for line in reader: 24 | line = line.strip() 25 | l_split = line.split('\t') 26 | 27 | chrom = l_split[0] 28 | pos = l_split[1] 29 | 30 | if (chrom, pos) in annotation_data: 31 | ref, alt = annotation_data[(chrom, pos)] 32 | else: 33 | ref = 'NA' 34 | alt = 'NA' 35 | 36 | line += '\t{}\t{}\n'.format(ref, alt) 37 | 38 | writer.write(line) 39 | -------------------------------------------------------------------------------- /single_cell/workflows/lumpy/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | evidence = { 3 | "breakpoint_id": "int", 4 | "cell_id": "str", 5 | "count": "int" 6 | } 7 | 8 | breakpoint = { 9 | "breakpoint_id": "int", 10 | "chrom1": "str", 11 | "start1": "int", 12 | "end1": "int", 13 | "strand1": "str", 14 | "max_chr1": "str", 15 | "max_pos1": "int", 16 | "confidence_interval_chr1": "str", 17 | "confidence_interval_start1": "int", 18 | "confidence_interval_end1": "int", 19 | "chrom2": "str", 20 | "start2": "int", 21 | "end2": "int", 22 | "strand2": "str", 23 | "max_chr2": "str", 24 | "max_pos2": "int", 25 | "confidence_interval_chr2": "str", 26 | "confidence_interval_start2": "int", 27 | "confidence_interval_end2": "int", 28 | "type": "str", 29 | "score": "float", 30 | "strands": "str", 31 | "normal_PE": "float", 32 | "tumour_PE": "float", 33 | "tumour_SR": "float", 34 | "normal_SR": "float", 35 | } 36 | dtypes = locals() 37 | 38 | return dtypes 39 | -------------------------------------------------------------------------------- /single_cell/workflows/lumpy/merge_histograms.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | 4 | def parse_histogram(infile): 5 | data = [] 6 | 7 | with open(infile) as inputdata: 8 | for line in inputdata: 9 | if line.startswith('#'): 10 | line = line.strip().split(':') 11 | if line[0] == "#numreads": 12 | numreads = int(line[1]) 13 | elif line[0] == "#mean": 14 | mean = float(line[1]) 15 | elif line[0] == "#stdev": 16 | stdev = float(line[1]) 17 | else: 18 | raise Exception() 19 | continue 20 | 21 | line = line.strip().split(',') 22 | i = int(line[0]) 23 | val = float(line[1]) 24 | data.append((i, val)) 25 | 26 | return data, mean, stdev, numreads 27 | 28 | 29 | def merge_histo(indata, merged_data, numreads): 30 | for (i, val) in indata: 31 | if not i in merged_data: 32 | merged_data[i] = 0 33 | merged_data[i] += (val * numreads) 34 | return merged_data 35 | 36 | 37 | def normalize_histo(merged_data, total_reads): 38 | data = [] 39 | indices = sorted(merged_data.keys()) 40 | for idx in indices: 41 | value = merged_data[idx] 42 | value = value / total_reads 43 | 
data.append((idx, value)) 44 | return data 45 | 46 | 47 | def prune_histogram(histogram): 48 | # towards the tail end, most cells will be 0 49 | # dividing by total reads will make most of these almost 0 50 | # remove these 51 | if not histogram: 52 | return histogram 53 | for idx in range(len(histogram) - 1, -1, -1): 54 | if float(histogram[idx][1]) >= 0.0001: 55 | break 56 | 57 | histogram = histogram[:idx] 58 | 59 | return histogram 60 | 61 | 62 | def write_histo_file(data, outfile): 63 | with open(outfile, 'w') as histo_file: 64 | for i, val in data: 65 | histo_file.write("{}\t{}\n".format(i, val)) 66 | 67 | 68 | def write_metadata(mean, stdev, outfile): 69 | with open(outfile, 'w') as fileoutput: 70 | yaml.safe_dump({'mean': mean, 'stdev': stdev}, fileoutput) 71 | 72 | 73 | def merge_histograms(infiles, outfile, metadata): 74 | merged_data = {} 75 | total_reads = 0 76 | 77 | means = 0 78 | stdevs = 0 79 | 80 | if isinstance(infiles, dict): 81 | infiles = infiles.values() 82 | 83 | # if input is a single file 84 | if isinstance(infiles, str): 85 | infiles = [infiles] 86 | 87 | for infile in infiles: 88 | data, mean, stdev, numreads = parse_histogram(infile) 89 | 90 | merged_data = merge_histo(data, merged_data, numreads) 91 | 92 | total_reads += numreads 93 | 94 | means += (mean * numreads) 95 | stdevs += (stdev * numreads) 96 | 97 | final_histo = normalize_histo(merged_data, total_reads) 98 | final_histo = prune_histogram(final_histo) 99 | 100 | mean = means / total_reads 101 | stdev = stdevs / total_reads 102 | 103 | write_histo_file(final_histo, outfile) 104 | 105 | write_metadata(mean, stdev, metadata) 106 | -------------------------------------------------------------------------------- /single_cell/workflows/mappability_annotation/__init__.py: -------------------------------------------------------------------------------- 1 | import pypeliner 2 | import pypeliner.managed as mgd 3 | 4 | 5 | def create_mappability_annotation_workflow( 6 | in_vcf_file, 7 | out_csv_file, 8 | mappability_file, 9 | split_size=1e4 10 | ): 11 | workflow = pypeliner.workflow.Workflow( 12 | ctx={'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2} 13 | ) 14 | 15 | workflow.transform( 16 | name="get_regions", 17 | func="single_cell.workflows.mappability_annotation.tasks.get_vcf_regions", 18 | ret=mgd.TempOutputObj('regions_obj', 'regions'), 19 | args=( 20 | mgd.InputFile(in_vcf_file, extensions=['.tbi']), 21 | int(split_size), 22 | ), 23 | ) 24 | 25 | workflow.transform( 26 | name='annotate_db_status', 27 | axes=('regions',), 28 | func='single_cell.workflows.mappability_annotation.tasks.get_mappability', 29 | args=( 30 | mappability_file, 31 | mgd.InputFile(in_vcf_file, extensions=['.tbi']), 32 | mgd.TempOutputFile('mappability.csv.gz', 'regions', extensions=['.yaml']) 33 | ), 34 | kwargs={ 35 | 'region': mgd.TempInputObj('regions_obj', 'regions'), 36 | }, 37 | ) 38 | 39 | workflow.transform( 40 | name='merge_tables', 41 | func='single_cell.utils.csvutils.concatenate_csv', 42 | args=( 43 | mgd.TempInputFile('mappability.csv.gz', 'regions', extensions=['.yaml']), 44 | mgd.OutputFile(out_csv_file, extensions=['.yaml']) 45 | ) 46 | ) 47 | 48 | return workflow 49 | -------------------------------------------------------------------------------- /single_cell/workflows/mappability_annotation/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_annotate = { 3 | 'cell_id': 'str', 4 | 'chrom': 'str', 5 | 'coord': 'int', 6 | 'ref': 'str', 7 | 'alt': 
'str', 8 | 'db_id': 'str', 9 | 'exact_match': 'int', 10 | 'indel': 'int', 11 | 'mappability': 'float', 12 | 'effect': 'str', 13 | 'effect_impact': 'str', 14 | 'functional_class': 'str', 15 | 'codon_change': 'str', 16 | 'amino_acid_change': 'str', 17 | 'amino_acid_length': 'str', 18 | 'gene_name': 'str', 19 | 'transcript_biotype': 'str', 20 | 'gene_coding': 'str', 21 | 'transcript_id': 'str', 22 | 'exon_rank': 'str', 23 | 'genotype': 'str', 24 | 'tri_nucleotide_context': 'str', 25 | } 26 | 27 | 28 | return snv_annotate 29 | -------------------------------------------------------------------------------- /single_cell/workflows/merge_bams/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 11, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | import pypeliner.managed as mgd 8 | 9 | import pypeliner 10 | 11 | 12 | def create_merge_bams_workflow( 13 | input_bams, 14 | merged_bams, 15 | regions, 16 | config, 17 | ): 18 | merged_bams = dict([(region, merged_bams[region]) 19 | for region in regions]) 20 | 21 | 22 | workflow = pypeliner.workflow.Workflow() 23 | 24 | workflow.setobj( 25 | obj=mgd.OutputChunks('cell_id'), 26 | value=list(input_bams.keys()), 27 | ) 28 | 29 | workflow.setobj( 30 | obj=mgd.OutputChunks('region'), 31 | value=regions, 32 | ) 33 | 34 | one_split_job = config["one_split_job"] 35 | 36 | if one_split_job: 37 | workflow.transform( 38 | name='merge_bams', 39 | ctx={'mem': config['memory']['med'], 'ncpus': config['max_cores']}, 40 | func="single_cell.workflows.merge_bams.tasks.merge_bams", 41 | args=( 42 | mgd.InputFile('bam', 'cell_id', fnames=input_bams, extensions=['.bai']), 43 | mgd.OutputFile('merged.bam', "region", fnames=merged_bams, axes_origin=[], extensions=['.bai']), 44 | regions, 45 | mgd.TempSpace("merge_bams_tempdir") 46 | ), 47 | kwargs={"ncores": config["max_cores"]} 48 | ) 49 | else: 50 | workflow.transform( 51 | name='split_merge_tumour', 52 | func='single_cell.workflows.merge_bams.tasks.cell_region_merge_bams', 53 | axes=('region',), 54 | args=( 55 | mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=input_bams), 56 | mgd.OutputFile( 57 | 'tumour_regions.bam', 'region', axes_origin=[], extensions=['.bai'], fnames=merged_bams), 58 | mgd.Instance('region'), 59 | ), 60 | ) 61 | 62 | return workflow 63 | -------------------------------------------------------------------------------- /single_cell/workflows/merge_bams/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | 8 | from .collect_metrics import CollectMetrics -------------------------------------------------------------------------------- /single_cell/workflows/merge_bams/tasks.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | import os 7 | 8 | from single_cell.utils import bamutils 9 | from single_cell.utils import helpers 10 | 11 | 12 | def cell_region_merge_bams(cell_bams, region_bam, region): 13 | cell_bams = cell_bams.values() 14 | region = '{}:{}-{}'.format(*region.split('-')) 15 | 16 | bamutils.bam_merge( 17 | cell_bams, region_bam, 18 | region=region) 19 | 20 | bamutils.bam_index( 21 | region_bam, region_bam + '.bai', 22 | ) 23 | 24 | 25 | def merge_bams(bams, outputs, regions, tempdir, ncores=None): 26 | merge_tempdir = os.path.join(tempdir, "merge") 27 | commands = [] 28 | for 
region in regions: 29 | output = outputs[region] 30 | region = '{}:{}-{}'.format(*region.split('-')) 31 | cmd = list(['samtools', 'merge', '-f', '-R', region]) 32 | cmd.append(output) 33 | cmd.extend(bams.values()) 34 | commands.append(cmd) 35 | helpers.run_in_gnu_parallel(commands, merge_tempdir, ncores=ncores) 36 | 37 | index_tempdir = os.path.join(tempdir, "index") 38 | commands = [] 39 | for region in regions: 40 | output = outputs[region] 41 | commands.append(['samtools', 'index', output, output + ".bai"]) 42 | 43 | helpers.run_in_gnu_parallel(commands, index_tempdir, ncores=ncores) 44 | -------------------------------------------------------------------------------- /single_cell/workflows/mutationseq/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_museq = { 3 | "chrom": "str", 4 | "coord": "int", 5 | "ref": "str", 6 | "alt": "str", 7 | "score": "float" 8 | } 9 | 10 | dtypes = locals() 11 | 12 | return dtypes 13 | -------------------------------------------------------------------------------- /single_cell/workflows/mutationseq/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | from .parse_museq import ParseMuseq -------------------------------------------------------------------------------- /single_cell/workflows/mutationseq/scripts/parse_museq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: dgrewal 4 | 5 | Last updated: Diljot Grewal Jun 3 2015 6 | 7 | reads vcf files, filters and write the output in tsv format 8 | """ 9 | 10 | #!/usr/bin/env python 11 | 12 | from vizutils import Utils as pau 13 | from vizutils import Vcf 14 | 15 | class ParseMuseq(object): 16 | ''' 17 | parse, filter and print museq vcf in tsv format 18 | ''' 19 | 20 | def __init__(self, **kwargs): 21 | 22 | self.infiles = pau.get_inputs(kwargs.get('tid'), 23 | kwargs.get('nid'), 24 | kwargs.get('case'), 25 | kwargs.get('infile'), 26 | kwargs.get('all_files'), 27 | fh_names='infile') 28 | 29 | self.output = kwargs.get('output') 30 | self.project = kwargs.get('project') 31 | 32 | self.genes = pau.read_file_to_list(kwargs.get('genes')) 33 | self.snpeff_keywords = kwargs.get('snpeff_keywords') 34 | self.chromosomes = kwargs.get('chromosomes') 35 | self.remove_duplicates = kwargs.get('rm_dups') 36 | self.pr_threshold = kwargs.get('pr_thres') 37 | 38 | self.keep_dbsnp = kwargs.get('keep_dbsnp') 39 | self.keep_1000gen = kwargs.get('keep_1000gen') 40 | 41 | 42 | def main(self): 43 | ''' 44 | loop through files, load, filter and print 45 | ''' 46 | header = False 47 | with open(self.output, 'w') as outfile: 48 | for (case, tum, norm), fname in self.infiles.items(): 49 | 50 | museq = Vcf(tumour_id = tum, 51 | normal_id = norm, 52 | case_id = case, 53 | infile = fname, 54 | snpeff_keywords = self.snpeff_keywords, 55 | keep_dbsnp = self.keep_dbsnp, 56 | keep_1000gen = self.keep_1000gen, 57 | chromosomes = self.chromosomes, 58 | genes = self.genes, 59 | rmdups = self.remove_duplicates, 60 | pr_threshold = self.pr_threshold, 61 | mode = 'museq' 62 | ) 63 | #write header 64 | if not header: 65 | colnames = museq.get_info_header() 66 | pau.write_list(outfile, colnames, sep=",") 67 | header=True 68 | 69 | infos = museq.get_data() 70 | 71 | for info in infos: 72 | pau.write_list(outfile, info, sep=',') 73 | 
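Note on region keys: the merge_bams tasks above and the mutationseq run_museq task further below both expect region identifiers of the form 'chrom-start-end' and rewrite them into the 'chrom:start-end' syntax that samtools and museq take on the command line. A small illustration with a made-up region:

region = '1-1000000-2000000'                        # pipeline-style region key
cli_region = '{}:{}-{}'.format(*region.split('-'))  # -> '1:1000000-2000000'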
-------------------------------------------------------------------------------- /single_cell/workflows/mutationseq/scripts/vizutils/__init__.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | 4 | from parseutils import ParseUtils 5 | from vcf import Vcf 6 | from utils import Utils 7 | -------------------------------------------------------------------------------- /single_cell/workflows/mutationseq/tasks.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | import pypeliner 7 | from single_cell.utils import vcfutils 8 | 9 | 10 | def subsample(input_bam, output_bam, max_coverage=10000): 11 | cmd = ['variant', input_bam, '-m', max_coverage, '-v', '-b', '-o', output_bam] 12 | pypeliner.commandline.execute(*cmd) 13 | 14 | cmd = ['samtools', 'index', output_bam] 15 | pypeliner.commandline.execute(*cmd) 16 | 17 | 18 | def run_museq(tumour, normal, out, log, region, config): 19 | ''' 20 | Run museq script for each chromosome 21 | 22 | :param tumour: path to tumour bam 23 | :param normal: path to normal bam 24 | :param out: path to temporary output VCF file for the chromosome 25 | :param log: path to the log file 26 | :param config: path to the config YAML file 27 | :param chrom: chromosome number 28 | ''' 29 | 30 | reference = config['ref_genome'] 31 | 32 | region = '{}:{}-{}'.format(*region.split('-')) 33 | 34 | cmd = ['museq', 'normal:' + normal, 'tumour:' + tumour, 35 | 'reference:' + reference, '--out', out, 36 | '--log', log, '--interval', region] 37 | 38 | museq_params = config.get('museq_params', {}) 39 | for key, val in museq_params.items(): 40 | if isinstance(val, bool): 41 | if val: 42 | cmd.append('--{}'.format(key)) 43 | else: 44 | cmd.append('--{}'.format(key)) 45 | if isinstance(val, list): 46 | cmd.extend(val) 47 | else: 48 | cmd.append(val) 49 | 50 | pypeliner.commandline.execute(*cmd) 51 | 52 | 53 | def concatenate_vcfs(inputs, output): 54 | vcfutils.concatenate_vcf(inputs, output) 55 | -------------------------------------------------------------------------------- /single_cell/workflows/pseudo_bulk_qc/scripts/mergemafs.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(tidyverse) 4 | library(data.table) 5 | args <- commandArgs(TRUE) 6 | 7 | input = args[1] 8 | output = args[2] 9 | maf = data.table::fread(input) 10 | 11 | filtmaf <- filter(maf, str_detect(Consequence, "frameshift|stop") | IMPACT == "HIGH") %>% 12 | group_by_at(vars(-contains("depth"), -contains("count"))) %>% 13 | summarise(t_depth = sum(t_depth), 14 | t_ref_count = sum(t_ref_count), 15 | t_alt_count = sum(t_alt_count), 16 | n_depth = sum(n_depth), 17 | n_ref_count = sum(n_ref_count), 18 | n_alt_count = sum(n_alt_count), 19 | nlibrary = n() 20 | ) %>% 21 | ungroup() %>% 22 | mutate(tVAF = t_alt_count / t_depth, nVAF = n_alt_count / n_depth) %>% 23 | dplyr::select(id, Hugo_Symbol, Chromosome, Start_Position, 24 | Reference_Allele, Variant_Type, Tumor_Seq_Allele1, 25 | Tumor_Seq_Allele2, Consequence, IMPACT, tVAF, nVAF, nlibrary) %>% 26 | dplyr::arrange(id, Chromosome, Start_Position) 27 | 28 | write_delim(filtmaf, output, delim = "\t") 29 | -------------------------------------------------------------------------------- /single_cell/workflows/pseudo_bulk_qc/scripts/mergesnvs.R: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(tidyverse) 4 | 5 | args <- commandArgs(TRUE) 6 | input = data.table::fread(args[1]) 7 | print(input) 8 | output = args[2] 9 | 10 | 11 | filtsnvs <- input %>% 12 | group_by_at(vars(-contains("counts"), -num_cells)) %>% 13 | summarise(alt_counts = sum(alt_counts), 14 | ref_counts = sum(ref_counts), 15 | total_counts = sum(total_counts), 16 | num_cells = sum(num_cells), 17 | nlibrary = n() 18 | ) %>% 19 | ungroup() %>% 20 | mutate(tVAF = alt_counts / total_counts) %>% 21 | dplyr::select(chrom,coord,ref,alt,gene_name,effect,effect_impact,is_cosmic, 22 | amino_acid_change,num_cells,alt_counts,ref_counts,total_counts, 23 | id, tVAF, nlibrary) %>% 24 | dplyr::arrange(id, chrom, coord) 25 | 26 | write_delim(filtsnvs, output, delim = "\t") 27 | -------------------------------------------------------------------------------- /single_cell/workflows/pseudo_bulk_qc/scripts/vcf2maf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | INPUTVCF=$1 4 | OUTPUTMAF=$2 5 | FASTA=$3 6 | VEPDATA=$4 7 | BUFFERSIZE=$5 8 | 9 | vcf2maf.pl --input-vcf $1 --output-maf $2 --ref-fasta $3 --vep-data $4 --vep-path $(dirname `which vep`) --buffer-size $5 10 | -------------------------------------------------------------------------------- /single_cell/workflows/qc_annotation/dtypes.py: -------------------------------------------------------------------------------- 1 | def metrics_dtypes(): 2 | metrics = { 3 | 'cell_id': 'str', 4 | 'sample_id': 'str', 5 | 'library_id': 'str', 6 | 'multiplier': 'Int64', 7 | 'MSRSI_non_integerness': 'float64', 8 | 'MBRSI_dispersion_non_integerness': 'float64', 9 | 'MBRSM_dispersion': 'float64', 10 | 'autocorrelation_hmmcopy': 'float64', 11 | 'cv_hmmcopy': 'float64', 12 | 'empty_bins_hmmcopy': 'Int64', 13 | 'mad_hmmcopy': 'float64', 14 | 'mean_hmmcopy_reads_per_bin': 'float64', 15 | 'median_hmmcopy_reads_per_bin': 'float64', 16 | 'std_hmmcopy_reads_per_bin': 'float64', 17 | 'total_mapped_reads_hmmcopy': 'Int64', 18 | 'total_halfiness': 'float64', 19 | 'scaled_halfiness': 'float64', 20 | 'mean_state_mads': 'float64', 21 | 'mean_state_vars': 'float64', 22 | 'mad_neutral_state': 'float64', 23 | 'breakpoints': 'Int64', 24 | 'mean_copy': 'float64', 25 | 'state_mode': 'Int64', 26 | 'log_likelihood': 'float64', 27 | 'true_multiplier': 'float64', 28 | 'column': 'Int64', 29 | 'img_col': 'Int64', 30 | 'primer_i7': 'str', 31 | 'index_i5': 'str', 32 | 'sample_type': 'str', 33 | 'primer_i5': 'str', 34 | 'experimental_condition': 'str', 35 | 'cell_call': 'str', 36 | 'index_i7': 'str', 37 | 'order': 'Int64', 38 | 'row': 'Int64', 39 | 'is_s_phase': 'bool', 40 | 'is_s_phase_prob': 'float64', 41 | 'quality': 'float64', 42 | 'coverage_depth': 'float64', 43 | 'paired_duplicate_reads': 'Int64', 44 | 'total_reads': 'Int64', 45 | 'unpaired_duplicate_reads': 'Int64', 46 | 'percent_duplicate_reads': 'float64', 47 | 'coverage_breadth': 'float64', 48 | 'mean_insert_size': 'float64', 49 | 'unpaired_mapped_reads': 'Int64', 50 | 'median_insert_size': 'float64', 51 | 'total_duplicate_reads': 'Int64', 52 | 'is_contaminated': 'bool', 53 | 'is_control': 'bool', 54 | 'estimated_library_size': 'Int64', 55 | 'standard_deviation_insert_size': 'float64', 56 | 'unmapped_reads': 'Int64', 57 | 'total_mapped_reads': 'Int64', 58 | 'total_properly_paired': 'Int64', 59 | 'paired_mapped_reads': 'Int64', 60 | 'order_corrupt_tree': 'Int64', 61 | 'species': 'str', 
62 | 'trim': 'bool', 63 | 'aligned': 'float', 64 | 'expected': 'float', 65 | 'overlap_with_all_filters': 'float', 66 | 'overlap_with_all_filters_and_qual': 'float', 67 | 'overlap_with_dups': 'float', 68 | 'overlap_without_dups': 'float', 69 | } 70 | 71 | return metrics 72 | 73 | 74 | def fastqscreen_dtypes(genome_labels): 75 | metrics = { 76 | 'fastqscreen_nohit': 'int', 77 | 'fastqscreen_nohit_ratio': 'float', 78 | 'cell_id': 'str' 79 | } 80 | for label in genome_labels: 81 | metrics['fastqscreen_{}'.format(label)] = 'int' 82 | metrics['fastqscreen_{}_multihit'.format(label)] = 'int' 83 | metrics['fastqscreen_{}_ratio'.format(label)] = 'float' 84 | 85 | return metrics 86 | 87 | 88 | def dtypes(genome_labels): 89 | return {**metrics_dtypes(), **fastqscreen_dtypes(genome_labels)} 90 | -------------------------------------------------------------------------------- /single_cell/workflows/qc_annotation/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | from . import generate_qc -------------------------------------------------------------------------------- /single_cell/workflows/qc_annotation/scripts/fastqscreen_classify.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from single_cell.utils import csvutils 3 | from sklearn.ensemble import RandomForestClassifier 4 | from sklearn.preprocessing import * 5 | import numpy as np 6 | 7 | def train(training_data_path): 8 | ''' 9 | Train the model using the provided training data. 10 | Return a feature scaler and a classifier. 11 | ''' 12 | data = pd.read_csv(training_data_path) 13 | species_list = ["salmon", "grch37", "mm10"] 14 | labels = data["species"] 15 | features = data.drop('species', axis=1) 16 | 17 | le = LabelEncoder() 18 | le.fit(species_list) 19 | # convert the labels 20 | labels = le.transform(labels) 21 | # train a feature scaler 22 | transformer = RobustScaler().fit(features) 23 | features = transformer.transform(features) 24 | # train the random forest model 25 | rf = RandomForestClassifier(n_estimators=10, random_state=42) 26 | rf.fit(features, labels) 27 | 28 | return features, transformer, rf 29 | 30 | 31 | def classify_fastqscreen(training_data_path, metrics_path, metrics_output, dtypes): 32 | df = csvutils.read_csv_and_yaml(metrics_path) 33 | features_train, feature_transformer, model = train(training_data_path) 34 | 35 | features = ["fastqscreen_nohit_ratio", "fastqscreen_grch37_ratio", "fastqscreen_mm10_ratio", 36 | "fastqscreen_salmon_ratio"] 37 | label_to_species = {0: "grch37", 1: "mm10", 2: "salmon"} 38 | # check if all the features exists, if yes, make predictions, else create an empty species column. 
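    # Each entry of `features` ends with the 6-character suffix "_ratio", so
    # feature[:-6] recovers the raw fastqscreen count column it is derived from
    # (e.g. "fastqscreen_grch37_ratio" -> "fastqscreen_grch37"). The check on
    # the next line only derives the ratio features (count / total_reads) when
    # every one of those raw count columns is present in the metrics table.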
39 | exist = all([feature[:-6] in df for feature in features]) 40 | if exist: 41 | # make the feature columns 42 | for feature in features: 43 | df[feature] = df[feature[:-6]].divide(df["total_reads"]) 44 | # check if there's any missing value 45 | feature_test = df[features] 46 | feature_test = feature_test.replace([np.inf, -np.inf], np.nan) 47 | feature_test.fillna(features_train.mean(), inplace=True) 48 | # scale the features 49 | scaled_features = feature_transformer.transform(feature_test) 50 | df["species"] = model.predict(scaled_features) 51 | df["species"].replace(label_to_species, inplace=True) 52 | csvutils.write_dataframe_to_csv_and_yaml(df, metrics_output, dtypes) 53 | -------------------------------------------------------------------------------- /single_cell/workflows/qc_annotation/tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | from single_cell.utils import csvutils 5 | from single_cell.workflows.qc_annotation import tasks 6 | 7 | 8 | def test_contamination(tmpdir): 9 | data = {} 10 | 11 | cols = [ 12 | 'fastqscreen_nohit', 13 | 'fastqscreen_grch37', 14 | 'fastqscreen_grch37_multihit', 15 | 'fastqscreen_mm10', 16 | 'fastqscreen_mm10_multihit', 17 | 'fastqscreen_salmon', 18 | 'fastqscreen_salmon_multihit' 19 | ] 20 | 21 | for i in range(5): 22 | data[i] = {'cell_id': 'SA123_A123_R{0}_C{0}'.format(i)} 23 | for col in cols: 24 | data[i][col] = i * 10 25 | data[i]['fastqscreen_grch37'] = i * 1000 26 | data[i]['fastqscreen_mm10'] = i * 100 27 | 28 | for i in range(5, 10): 29 | data[i] = {'cell_id': 'SA123_A123_R{0}_C{0}'.format(i)} 30 | for col in cols: 31 | data[i][col] = (i * 10) 32 | data[i]['fastqscreen_grch37'] = i * 1000 33 | 34 | data = pd.DataFrame.from_dict(data, orient='index') 35 | data['total_reads'] = data[cols].sum(axis=1) 36 | 37 | dtypes = {col: 'int' for col in cols} 38 | dtypes['cell_id'] = 'str' 39 | dtypes['total_reads'] = 'int' 40 | 41 | infile = os.path.join(tmpdir, 'input.csv.gz') 42 | outfile = os.path.join(tmpdir, 'output.csv.gz') 43 | 44 | csvutils.write_dataframe_to_csv_and_yaml(data, infile, dtypes) 45 | 46 | config = {'genomes': [{'name': 'grch37'}, {'name': 'mm10'}, {'name': 'salmon'}]} 47 | 48 | tasks.add_contamination_status(infile, outfile, config) 49 | 50 | output = csvutils.read_csv_and_yaml(outfile) 51 | 52 | assert output['is_contaminated'].tolist() == [False] + [True] * 4 + [False] * 5 53 | -------------------------------------------------------------------------------- /single_cell/workflows/snpeff_annotation/__init__.py: -------------------------------------------------------------------------------- 1 | import pypeliner 2 | 3 | import pypeliner.managed as mgd 4 | 5 | 6 | 7 | def create_snpeff_annotation_workflow( 8 | in_vcf_file, 9 | out_csv_file, 10 | db, 11 | data_dir, 12 | split_size=int(1e3) 13 | ): 14 | workflow = pypeliner.workflow.Workflow( 15 | ctx={'num_retry': 3, 'mem_retry_increment': 2} 16 | ) 17 | 18 | workflow.transform( 19 | name='split_vcf', 20 | func='single_cell.utils.vcfutils.split_vcf', 21 | args=( 22 | mgd.InputFile(in_vcf_file), 23 | mgd.TempOutputFile('split.vcf', 'split') 24 | ), 25 | kwargs={'lines_per_file': split_size} 26 | ) 27 | 28 | workflow.transform( 29 | name='run_snpeff', 30 | axes=('split',), 31 | func='single_cell.workflows.snpeff_annotation.tasks.run_snpeff', 32 | args=( 33 | db, 34 | data_dir, 35 | mgd.TempInputFile('split.vcf', 'split'), 36 | mgd.TempOutputFile('snpeff.vcf', 'split') 37 | ), 38 | kwargs={ 39 | 
'classic_mode': True 40 | } 41 | ) 42 | 43 | workflow.transform( 44 | name='convert_vcf_to_csv', 45 | axes=('split',), 46 | func='single_cell.workflows.snpeff_annotation.tasks.convert_vcf_to_table', 47 | args=( 48 | mgd.TempInputFile('snpeff.vcf', 'split'), 49 | mgd.TempOutputFile('snpeff.csv.gz', 'split', extensions=['.yaml']), 50 | ) 51 | ) 52 | 53 | workflow.transform( 54 | name='concatenate_tables', 55 | func='single_cell.utils.csvutils.concatenate_csv', 56 | args=( 57 | mgd.TempInputFile('snpeff.csv.gz', 'split', extensions=['.yaml']), 58 | mgd.OutputFile(out_csv_file, extensions=['.yaml']) 59 | ) 60 | ) 61 | 62 | return workflow 63 | -------------------------------------------------------------------------------- /single_cell/workflows/snpeff_annotation/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_annotate = { 3 | 'cell_id': 'str', 4 | 'chrom': 'str', 5 | 'coord': 'int', 6 | 'ref': 'str', 7 | 'alt': 'str', 8 | 'db_id': 'str', 9 | 'exact_match': 'int', 10 | 'indel': 'int', 11 | 'mappability': 'float', 12 | 'effect': 'str', 13 | 'effect_impact': 'str', 14 | 'functional_class': 'str', 15 | 'codon_change': 'str', 16 | 'amino_acid_change': 'str', 17 | 'amino_acid_length': 'str', 18 | 'gene_name': 'str', 19 | 'transcript_biotype': 'str', 20 | 'gene_coding': 'str', 21 | 'transcript_id': 'str', 22 | 'exon_rank': 'str', 23 | 'genotype': 'str', 24 | 'tri_nucleotide_context': 'str', 25 | } 26 | 27 | 28 | return snv_annotate 29 | -------------------------------------------------------------------------------- /single_cell/workflows/snpeff_annotation/tasks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from collections import OrderedDict 4 | 5 | import pandas as pd 6 | import pypeliner 7 | import vcf 8 | from single_cell.utils import csvutils 9 | from single_cell.workflows.snpeff_annotation.dtypes import dtypes 10 | 11 | 12 | def run_snpeff(db, data_dir, in_vcf_file, out_file, classic_mode=True): 13 | os.environ['MALLOC_ARENA_MAX'] = '2' 14 | data_dir = os.path.abspath(data_dir) 15 | 16 | cmd = [ 17 | 'snpEff', 18 | '-noStats', 19 | '-noLog', 20 | '-Xms2g', 21 | '-Xmx5g', 22 | '-hgvs1LetterAa', 23 | '-dataDir', 24 | data_dir, 25 | ] 26 | 27 | if classic_mode: 28 | cmd.append('-classic') 29 | 30 | cmd.extend([ 31 | db, 32 | in_vcf_file, 33 | '>', 34 | out_file 35 | ]) 36 | 37 | pypeliner.commandline.execute(*cmd) 38 | 39 | 40 | class ClassicSnpEffParser(object): 41 | 42 | def __init__(self, file_name): 43 | self._reader = vcf.Reader(filename=file_name) 44 | 45 | self.fields = self._get_field_names() 46 | 47 | self._buffer = [] 48 | 49 | self._effect_matcher = re.compile(r'(.*)\(') 50 | 51 | self._fields_matcher = re.compile(r'\((.*)\)') 52 | 53 | def __iter__(self): 54 | while True: 55 | try: 56 | yield self.next() 57 | except StopIteration: 58 | break 59 | 60 | def next(self): 61 | while len(self._buffer) == 0: 62 | record = next(self._reader) 63 | 64 | if 'EFF' not in record.INFO: 65 | continue 66 | 67 | for row in self._parse_record(record): 68 | self._buffer.append(row) 69 | 70 | return self._buffer.pop(0) 71 | 72 | def _get_field_names(self): 73 | fields = [] 74 | 75 | match = re.search(r'\((.*)\[', self._reader.infos['EFF'].desc) 76 | 77 | for x in match.groups()[0].split('|'): 78 | fields.append(x.strip().lower()) 79 | 80 | return fields 81 | 82 | def _parse_record(self, record): 83 | for annotation in record.INFO['EFF']: 84 | effect = 
self._effect_matcher.search(annotation).groups()[0] 85 | 86 | out_row = OrderedDict(( 87 | ('chrom', record.CHROM), 88 | ('coord', record.POS), 89 | ('ref', record.REF), 90 | ('alt', ','.join([str(x) for x in record.ALT])), 91 | ('effect', effect), 92 | )) 93 | 94 | fields = self._fields_matcher.search(annotation).groups()[0].split('|') 95 | 96 | for i, key in enumerate(self.fields): 97 | out_row[key] = fields[i] 98 | 99 | yield out_row 100 | 101 | 102 | def convert_vcf_to_table(in_file, out_file): 103 | data = [] 104 | 105 | parser = ClassicSnpEffParser(in_file) 106 | 107 | for row in parser: 108 | data.append(row) 109 | 110 | data = pd.DataFrame(data) 111 | 112 | csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes()) 113 | -------------------------------------------------------------------------------- /single_cell/workflows/snv_allele_counts/__init__.py: -------------------------------------------------------------------------------- 1 | import pypeliner 2 | import pypeliner.managed as mgd 3 | from single_cell.workflows.snv_allele_counts.dtypes import dtypes 4 | 5 | 6 | def create_snv_allele_counts_for_vcf_targets_workflow( 7 | bam_files, 8 | vcf_file, 9 | out_file, 10 | sample_id, 11 | library_id, 12 | memory_cfg, 13 | count_duplicates=False, 14 | min_bqual=0, 15 | min_mqual=0, 16 | vcf_to_bam_chrom_map=None, 17 | ): 18 | ctx = { 19 | 'mem': memory_cfg['low'], 'num_retry': 3, 'mem_retry_increment': 2, 'ncpus': 1, 20 | 'disk_retry_increment': 50, 21 | } 22 | workflow = pypeliner.workflow.Workflow(ctx=ctx) 23 | 24 | workflow.setobj( 25 | obj=mgd.OutputChunks('cell_id'), 26 | value=list(bam_files.keys()), 27 | ) 28 | 29 | workflow.transform( 30 | name='get_snv_allele_counts_for_vcf_targets', 31 | axes=('cell_id',), 32 | func="biowrappers.components.variant_calling.snv_allele_counts.tasks.get_snv_allele_counts_for_vcf_targets", 33 | args=( 34 | mgd.InputFile('tumour.bam', 'cell_id', fnames=bam_files, extensions=['.bai']), 35 | mgd.InputFile(vcf_file), 36 | mgd.TempOutputFile('counts.csv.gz', 'cell_id', extensions=['.yaml']), 37 | ), 38 | kwargs={ 39 | 'count_duplicates': count_duplicates, 40 | 'min_bqual': min_bqual, 41 | 'min_mqual': min_mqual, 42 | 'vcf_to_bam_chrom_map': vcf_to_bam_chrom_map, 43 | 'cell_id': mgd.Instance('cell_id'), 44 | 'sample_id': sample_id, 45 | 'library_id': library_id, 46 | 'report_zero_count_positions': False, 47 | 'dtypes': dtypes()['snv_allele_counts'], 48 | 'write_header': False 49 | } 50 | ) 51 | 52 | workflow.transform( 53 | name='merge_snv_allele_counts', 54 | ctx={'mem': memory_cfg['high'], 'disk': 20}, 55 | func="single_cell.utils.csvutils.concatenate_csv", 56 | args=( 57 | mgd.TempInputFile('counts.csv.gz', 'cell_id', extensions=['.yaml']), 58 | mgd.OutputFile(out_file, extensions=['.yaml']), 59 | ), 60 | kwargs={ 61 | 'write_header': True, 62 | } 63 | ) 64 | 65 | return workflow 66 | -------------------------------------------------------------------------------- /single_cell/workflows/snv_allele_counts/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_allele_counts = { 3 | 'chrom': 'str', 4 | 'coord': 'int', 5 | 'ref': 'str', 6 | 'alt': 'str', 7 | 'ref_counts': 'int', 8 | 'alt_counts': 'int', 9 | 'cell_id': 'str', 10 | 'sample_id': 'str', 11 | 'library_id': 'str', 12 | } 13 | 14 | dtypes = locals() 15 | 16 | return dtypes 17 | -------------------------------------------------------------------------------- /single_cell/workflows/snv_annotate/__init__.py: 
-------------------------------------------------------------------------------- 1 | import pypeliner 2 | import pypeliner.managed as mgd 3 | 4 | 5 | def create_snv_annotate_workflow( 6 | config, 7 | museq_vcf, 8 | strelka_vcf, 9 | mappability_csv, 10 | snpeff_csv, 11 | trinuc_csv, 12 | additional_csv, 13 | memory_config, 14 | ): 15 | ctx = { 16 | 'mem': memory_config['low'], 'num_retry': 3, 'mem_retry_increment': 2, 'ncpus': 1, 17 | 'disk_retry_increment': 50, 18 | } 19 | split_size = config['split_size'] 20 | 21 | workflow = pypeliner.workflow.Workflow(ctx=ctx) 22 | 23 | workflow.transform( 24 | name='merge_snvs', 25 | func='biowrappers.components.io.vcf.tasks.merge_vcfs', 26 | ctx=ctx, 27 | args=( 28 | [ 29 | mgd.InputFile(museq_vcf, extensions=['.tbi', '.csi']), 30 | mgd.InputFile(strelka_vcf, extensions=['.tbi', '.csi']), 31 | ], 32 | mgd.TempOutputFile('all.snv.vcf') 33 | ), 34 | ) 35 | 36 | workflow.transform( 37 | name='finalise_snvs', 38 | func="biowrappers.components.io.vcf.tasks.finalise_vcf", 39 | ctx=ctx, 40 | args=( 41 | mgd.TempInputFile('all.snv.vcf'), 42 | mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']) 43 | ), 44 | ) 45 | 46 | workflow.subworkflow( 47 | name='snpeff_annotation', 48 | func="single_cell.workflows.snpeff_annotation.create_snpeff_annotation_workflow", 49 | args=( 50 | mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']), 51 | mgd.OutputFile(snpeff_csv, extensions=['.yaml']), 52 | config['databases']['snpeff']['db'], 53 | config['databases']['snpeff']['path'], 54 | ) 55 | ) 56 | 57 | workflow.subworkflow( 58 | name='trinuc_annotation', 59 | func="single_cell.workflows.trinuc_annotation.create_trinuc_annotation_workflow", 60 | args=( 61 | mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']), 62 | mgd.OutputFile(trinuc_csv, extensions=['.yaml']), 63 | config['ref_genome'], 64 | ), 65 | kwargs={'split_size': split_size} 66 | ) 67 | 68 | workflow.subworkflow( 69 | name='mappability_annotation', 70 | func="single_cell.workflows.mappability_annotation.create_mappability_annotation_workflow", 71 | args=( 72 | mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']), 73 | mgd.OutputFile(mappability_csv, extensions=['.yaml']), 74 | config['databases']['mappability']['path'], 75 | ), 76 | kwargs={'split_size': split_size} 77 | ) 78 | 79 | for k, v in config['databases']['additional_databases'].items(): 80 | workflow.subworkflow( 81 | name='{}_status'.format(k), 82 | func='single_cell.workflows.db_annotation.create_db_annotation_workflow', 83 | ctx=dict(mem=4, mem_retry_increment=2), 84 | args=( 85 | mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']), 86 | mgd.OutputFile(additional_csv[k], extensions=['.yaml']), 87 | v['path'], 88 | ), 89 | kwargs={'split_size': split_size} 90 | ) 91 | 92 | return workflow 93 | -------------------------------------------------------------------------------- /single_cell/workflows/split_bams/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 21, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | import pypeliner.managed as mgd 7 | 8 | import pypeliner 9 | 10 | 11 | def create_split_workflow( 12 | normal_bam, normal_split_bam, 13 | regions, config, by_reads=False 14 | ): 15 | 16 | normal_split_bam = dict([(ival, normal_split_bam[ival]) 17 | for ival in regions]) 18 | 19 | one_split_job = config["one_split_job"] 20 | 21 | workflow = pypeliner.workflow.Workflow() 22 | 23 | workflow.setobj( 24 | 
obj=mgd.OutputChunks('region'), 25 | value=regions, 26 | ) 27 | 28 | # split by reads always runs no a single node 29 | if by_reads: 30 | workflow.transform( 31 | name='split_normal_bam', 32 | ctx={'mem': config['memory']['low'], 'ncpus': config['max_cores']}, 33 | func="single_cell.workflows.split_bams.tasks.split_bam_file_by_reads", 34 | args=( 35 | mgd.InputFile(normal_bam, extensions=['.bai']), 36 | mgd.OutputFile( 37 | "normal.split.bam", "region", 38 | fnames=normal_split_bam, axes_origin=[], 39 | extensions=['.bai'] 40 | ), 41 | mgd.TempSpace("bam_split_by_reads"), 42 | regions, 43 | ), 44 | ) 45 | 46 | elif one_split_job: 47 | workflow.transform( 48 | name='split_normal_bam', 49 | ctx={'mem': config['memory']['low'], 'ncpus': config['max_cores']}, 50 | func="single_cell.workflows.split_bams.tasks.split_bam_file_one_job", 51 | args=( 52 | mgd.InputFile(normal_bam, extensions=['.bai']), 53 | mgd.OutputFile( 54 | "normal.split.bam", "region", 55 | fnames=normal_split_bam, axes_origin=[], 56 | extensions=['.bai'], 57 | ), 58 | regions, 59 | mgd.TempSpace("one_job_split_tempdir") 60 | ), 61 | kwargs={"ncores": config["max_cores"]} 62 | ) 63 | 64 | else: 65 | workflow.transform( 66 | name='split_normal_bam', 67 | ctx={'mem': config['memory']['low'], 'ncpus': config['max_cores']}, 68 | axes=('region',), 69 | func="single_cell.workflows.split_bams.tasks.split_bam_file", 70 | args=( 71 | mgd.InputFile(normal_bam, extensions=['.bai']), 72 | mgd.OutputFile( 73 | "normal.split.bam", "region", fnames=normal_split_bam, 74 | extensions=['.bai'] 75 | ), 76 | mgd.InputInstance('region') 77 | ) 78 | ) 79 | 80 | return workflow 81 | -------------------------------------------------------------------------------- /single_cell/workflows/strelka/components_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 21, 2015 3 | 4 | @author: Andrew Roth 5 | ''' 6 | import errno 7 | import os 8 | import random 9 | import time 10 | 11 | 12 | def find(name, path): 13 | for root, _, files in os.walk(path): 14 | if name in files: 15 | return os.path.join(root, name) 16 | 17 | 18 | def get_ancestor_directory(path, level=1): 19 | ''' 20 | Get the path of the directory a specified number of levels above the given path. 21 | 22 | >>> get_ancestor_directory('/foo/bar/some/where/my_file.txt', level=2) 23 | '/foo/bar/some' 24 | ''' 25 | ancestor_dir = path 26 | 27 | for _ in range(level): 28 | ancestor_dir = os.path.dirname(ancestor_dir) 29 | 30 | return ancestor_dir 31 | 32 | 33 | def make_directory(target_dir, mode=775): 34 | ''' 35 | Check if a directory exists and make it if not. 36 | 37 | For example, given /some/where make the folder /some/where. If /some does not exist, it will also be made. 38 | ''' 39 | i = 0 40 | 41 | try: 42 | old_umask = os.umask(0000) 43 | 44 | while not os.path.exists(target_dir): 45 | # Randomly sleep for a short random time so multiple simultaneous calls don't try to create the directory. 46 | time.sleep(random.random() * 2) 47 | 48 | try: 49 | os.makedirs(target_dir, mode) 50 | 51 | except OSError: 52 | i += 1 53 | 54 | if i > 10: 55 | raise 56 | 57 | finally: 58 | os.umask(old_umask) 59 | 60 | 61 | def make_parent_directory(file_name, mode=775): 62 | ''' 63 | Given a file name, make the parent directory if it does not exist using make_directory. 64 | 65 | For example, given /some/where/foo.bar make the folder /some/where. 
66 | ''' 67 | parent_dir = os.path.dirname(file_name) 68 | 69 | make_directory(parent_dir, mode=mode) 70 | 71 | 72 | def flatten_input(files): 73 | if type(files) == dict: 74 | parsed_files = [files[x] for x in sorted(files)] 75 | elif type(files) == str: 76 | parsed_files = [files, ] 77 | else: 78 | parsed_files = [] 79 | for x in files: 80 | if type(x) == dict: 81 | parsed_files.extend([x[y] for y in sorted(x)]) 82 | else: 83 | parsed_files.append(x) 84 | return parsed_files 85 | 86 | 87 | def remove(filename): 88 | ''' 89 | Remove a file that may not exist 90 | ''' 91 | try: 92 | os.remove(filename) 93 | except OSError as e: 94 | if e.errno != errno.ENOENT: 95 | raise 96 | 97 | 98 | def symlink(filename, link_name=None, link_directory=None): 99 | ''' 100 | Create a symlink, with additional options for flexibility, 101 | 102 | Args: 103 | filename (str): file to link to 104 | 105 | KwArgs: 106 | link_name (str): base name of the link, defaults to same as link to 107 | link_directory (str): directory of the, defaults to directory of link to 108 | 109 | ''' 110 | if link_name is None: 111 | link_name = os.path.basename(filename) 112 | if link_directory is None: 113 | link_directory = os.getcwd() 114 | link_filename = os.path.join(link_directory, link_name) 115 | remove(link_filename) 116 | filename = os.path.abspath(filename) 117 | os.symlink(filename, link_filename) 118 | return link_filename 119 | 120 | 121 | if __name__ == '__main__': 122 | import doctest 123 | 124 | doctest.testmod() 125 | -------------------------------------------------------------------------------- /single_cell/workflows/strelka/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_strelka = { 3 | "chrom": "str", 4 | "coord": "int", 5 | "ref": "str", 6 | "alt": "str", 7 | "score": "int" 8 | } 9 | 10 | dtypes = locals() 11 | 12 | return dtypes 13 | -------------------------------------------------------------------------------- /single_cell/workflows/strelka/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | from .parse_strelka import ParseStrelka -------------------------------------------------------------------------------- /single_cell/workflows/strelka/scripts/vizutils/__init__.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | 4 | from parseutils import ParseUtils 5 | from vcf import Vcf 6 | from utils import Utils 7 | -------------------------------------------------------------------------------- /single_cell/workflows/trinuc_annotation/__init__.py: -------------------------------------------------------------------------------- 1 | import pypeliner 2 | import pypeliner.managed as mgd 3 | 4 | 5 | def create_trinuc_annotation_workflow( 6 | in_vcf_file, 7 | out_csv_file, 8 | ref_genome, 9 | split_size=int(1e4), 10 | ): 11 | workflow = pypeliner.workflow.Workflow( 12 | ctx={'num_retry': 3, 'mem_retry_increment': 2} 13 | ) 14 | 15 | workflow.transform( 16 | name='split_vcf', 17 | func='single_cell.utils.vcfutils.split_vcf', 18 | args=( 19 | mgd.InputFile(in_vcf_file), 20 | mgd.TempOutputFile('split.vcf', 'split') 21 | ), 22 | kwargs={'lines_per_file': split_size} 23 | ) 24 | 25 | workflow.transform( 26 | name='annotate_db_status', 27 | axes=('split',), 28 | func='single_cell.workflows.trinuc_annotation.tasks.get_tri_nucelotide_context', 29 | 
args=( 30 | ref_genome, 31 | mgd.TempInputFile('split.vcf', 'split'), 32 | mgd.TempOutputFile('tri_nucleotide_context.csv.gz', 'split', extensions=['.yaml']), 33 | ) 34 | ) 35 | 36 | workflow.transform( 37 | name='merge_tables', 38 | func='single_cell.utils.csvutils.concatenate_csv', 39 | args=( 40 | mgd.TempInputFile('tri_nucleotide_context.csv.gz', 'split', extensions=['.yaml']), 41 | mgd.OutputFile(out_csv_file, extensions=['.yaml']) 42 | ) 43 | ) 44 | 45 | return workflow 46 | -------------------------------------------------------------------------------- /single_cell/workflows/trinuc_annotation/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_annotate = { 3 | 'cell_id': 'str', 4 | 'chrom': 'str', 5 | 'coord': 'int', 6 | 'ref': 'str', 7 | 'alt': 'str', 8 | 'db_id': 'str', 9 | 'exact_match': 'int', 10 | 'indel': 'int', 11 | 'mappability': 'float', 12 | 'effect': 'str', 13 | 'effect_impact': 'str', 14 | 'functional_class': 'str', 15 | 'codon_change': 'str', 16 | 'amino_acid_change': 'str', 17 | 'amino_acid_length': 'str', 18 | 'gene_name': 'str', 19 | 'transcript_biotype': 'str', 20 | 'gene_coding': 'str', 21 | 'transcript_id': 'str', 22 | 'exon_rank': 'str', 23 | 'genotype': 'str', 24 | 'tri_nucleotide_context': 'str', 25 | } 26 | 27 | 28 | return snv_annotate 29 | -------------------------------------------------------------------------------- /single_cell/workflows/trinuc_annotation/tasks.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pysam 3 | import vcf 4 | 5 | from single_cell.utils import csvutils 6 | from single_cell.workflows.trinuc_annotation.dtypes import dtypes 7 | 8 | 9 | def get_tri_nucelotide_context(ref_genome_fasta_file, vcf_file, out_file): 10 | vcf_reader = vcf.Reader(filename=vcf_file) 11 | 12 | fasta_reader = pysam.Fastafile(ref_genome_fasta_file) 13 | 14 | data = [] 15 | 16 | for record in vcf_reader: 17 | chrom = record.CHROM 18 | 19 | coord = record.POS 20 | 21 | tri_nucleotide_context = fasta_reader.fetch(chrom, coord - 2, coord + 1) 22 | 23 | data.append({'chrom': record.CHROM, 'coord': record.POS, 'tri_nucleotide_context': tri_nucleotide_context}) 24 | 25 | data = pd.DataFrame(data) 26 | 27 | csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes()) 28 | --------------------------------------------------------------------------------
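Coordinate note on get_tri_nucelotide_context above: VCF records are 1-based while pysam.Fastafile.fetch takes 0-based, half-open coordinates, so fetch(chrom, coord - 2, coord + 1) returns exactly three bases: the one before the variant, the variant base itself and the one after. A minimal sketch with a made-up reference path and position:

import pysam

fasta = pysam.Fastafile('ref.fa')   # hypothetical reference FASTA
coord = 100                         # 1-based position taken from a VCF record
context = fasta.fetch('1', coord - 2, coord + 1)
# fetch spans 0-based indices 98..100, i.e. 1-based positions 99, 100 and 101,
# which is the trinucleotide context centred on the variant.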