├── .gitattributes ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── __init__.py ├── buildspec_master.yml ├── buildspec_staging.yml ├── docker ├── alignment │ ├── build.sh │ ├── conda_requirements.yml │ └── dockerfile_template ├── annotation │ ├── build.sh │ ├── conda_requirements.yml │ └── dockerfile_template ├── breakpoint │ ├── build.sh │ ├── conda_base_requirements.yml │ ├── conda_lumpy_requirements.yml │ └── dockerfile_template ├── cohort_qc │ ├── conda_requirements.txt │ └── python_requirements.txt ├── haplotypes │ ├── build.sh │ ├── dockerfile_template │ └── requirements.yml ├── hmmcopy │ ├── build.sh │ ├── conda_requirements.yml │ └── dockerfile_template ├── qc │ ├── build.sh │ ├── conda_requirements.txt │ ├── dockerfile_template │ ├── oncokb-annotator │ │ ├── AnnotatorCore.py │ │ └── MafAnnotator.py │ └── pip_requirements.txt ├── sample_qc │ ├── conda_requirements.txt │ └── python_requirements.txt ├── variant │ ├── build.sh │ ├── conda_base_requirements.yml │ ├── conda_museq_requirements.yml │ └── dockerfile_template └── vcf2maf │ ├── conda_requirements.txt │ └── python_requirements.txt ├── docs ├── Makefile ├── make.bat └── source │ ├── alignment_metrics.md │ ├── annotation_metrics.md │ ├── conf.py │ ├── gc_metrics.md │ ├── hmmcopy_metrics.md │ ├── hmmcopy_reads.md │ ├── hmmcopy_segments.md │ ├── index.md │ ├── install.md │ ├── organism_filter.md │ ├── quality_classifier.md │ └── readme_data │ ├── alignment.png │ ├── alignment.tikz │ ├── annotation.png │ ├── annotation.tikz │ ├── breakpoint_calling.png │ ├── breakpoint_calling.tikz │ ├── dlp_cohort_pipeline.png │ ├── germline.png │ ├── germline.tikz │ ├── hmmcopy.png │ ├── hmmcopy.tikz │ ├── infer_haps.png │ ├── infer_haps.tikz │ ├── merge_cell_bams.png │ ├── merge_cell_bams.tikz │ ├── pseudo_bulk_qc.png │ ├── split_wgs_bam.png │ ├── split_wgs_bam.tikz │ ├── variant_calling.png │ ├── variant_calling.tikz │ ├── variant_counting.png │ └── variant_counting.tikz ├── setup.cfg ├── setup.py ├── single_cell ├── __init__.py ├── _version.py ├── alignment.py ├── annotation.py ├── breakpoint_calling.py ├── clean_sentinels.py ├── cmdline.py ├── cohort_qc.py ├── config │ ├── __init__.py │ ├── batch.py │ ├── config_reference.py │ ├── generate_batch_config.py │ ├── generate_pipeline_config.py │ └── pipeline_config.py ├── generate_config.py ├── germline_calling.py ├── hmmcopy.py ├── infer_haps.py ├── merge_bams.py ├── run.py ├── sample_qc.py ├── snv_genotyping.py ├── split_bam.py ├── sv_genotyping.py ├── tests │ ├── __init__.py │ └── codebuild │ │ ├── __init__.py │ │ ├── align │ │ ├── align.sh │ │ ├── inputs.yaml │ │ └── test_alignment.py │ │ ├── annotation │ │ ├── annotation.sh │ │ ├── inputs.yaml │ │ └── test_annotation.py │ │ ├── breakpoint_calling │ │ ├── breakpoint_calling.sh │ │ ├── inputs.yaml │ │ └── test_breakpoint_calling.py │ │ ├── cohort_qc │ │ ├── cohort_qc.sh │ │ └── inputs.yaml │ │ ├── compare.py │ │ ├── count_haps │ │ ├── count_haps.sh │ │ ├── inputs.yaml │ │ └── test_count_haps.py │ │ ├── hmmcopy │ │ ├── hmmcopy.sh │ │ ├── inputs.yaml │ │ └── test_hmmcopy.py │ │ ├── infer_haps │ │ ├── infer_haps.sh │ │ ├── inputs.yaml │ │ └── test_infer_haps.py │ │ ├── merge_cell_bams │ │ ├── inputs.yaml │ │ ├── merge_cell_bams.sh │ │ └── test_merge_cell_bams.py │ │ ├── preflight │ │ └── preflight.sh │ │ ├── pseudo_bulk_qc │ │ ├── inputs.yaml │ │ └── pseudo_bulk_qc.sh │ │ ├── refdata │ │ └── download.sh │ │ ├── snv_genotyping │ │ ├── inputs.yaml │ │ └── snv_genotyping.sh │ │ ├── split_wgs_bam │ 
│ ├── inputs.yaml │ │ ├── split_wgs_bam.sh │ │ └── test_split_wgs_bam.py │ │ └── variant_calling │ │ ├── inputs.yaml │ │ ├── test_variant_calling.py │ │ └── variant_calling.sh ├── utils │ ├── __init__.py │ ├── bamutils.py │ ├── csvutils.py │ ├── fastqutils.py │ ├── gatkutils.py │ ├── helpers.py │ ├── inpututils.py │ ├── ltmutils.py │ ├── pdfutils.py │ ├── picardutils.py │ ├── pysamutils.py │ ├── refgenome.py │ ├── singlecell_copynumber_plot_utils │ │ ├── __init__.py │ │ ├── heatmap.py │ │ ├── plot_hmmcopy.py │ │ ├── plot_kernel_density.py │ │ ├── plot_metrics.py │ │ ├── plot_pcolormesh.py │ │ └── utils.py │ ├── storageutils.py │ ├── tests │ │ ├── __init__.py │ │ ├── csvutils_test.py │ │ └── test_helpers.py │ ├── validator │ │ ├── __init__.py │ │ ├── utils.py │ │ └── validate.py │ └── vcfutils.py ├── variant_calling.py └── workflows │ ├── __init__.py │ ├── align │ ├── __init__.py │ ├── align_tasks.py │ ├── coverage_metrics.py │ ├── dtypes.py │ ├── fastqscreen.py │ ├── fastqscreen_test.py │ ├── fastqscreen_utils.py │ ├── scripts │ │ ├── __init__.py │ │ ├── collect_metrics.py │ │ ├── gen_cn_matrix.py │ │ ├── run_trimgalore.py │ │ └── summary_metrics.py │ └── tasks.py │ ├── cohort_qc │ ├── __init__.py │ ├── scripts │ │ ├── oncoplot.R │ │ ├── report.Rmd │ │ ├── vcf2maf │ │ └── vcf2maf.sh │ └── tasks.py │ ├── db_annotation │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py │ ├── destruct_singlecell │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py │ ├── extract_allele_readcounts │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py │ ├── germline │ ├── __init__.py │ └── tasks.py │ ├── hmmcopy │ ├── __init__.py │ ├── dtypes.py │ ├── scripts │ │ ├── __init__.py │ │ ├── classify.py │ │ ├── convert_csv_to_seg.py │ │ ├── correct_read_count.py │ │ ├── hmmcopy_single_cell.R │ │ └── read_counter.py │ └── tasks.py │ ├── infer_haps │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py │ ├── lumpy │ ├── __init__.py │ ├── dtypes.py │ ├── generate_histogram.py │ ├── merge_histograms.py │ ├── parse_lumpy_to_csv.py │ └── tasks.py │ ├── mappability_annotation │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py │ ├── merge_bams │ ├── __init__.py │ ├── scripts │ │ ├── __init__.py │ │ └── collect_metrics.py │ └── tasks.py │ ├── mutationseq │ ├── __init__.py │ ├── dtypes.py │ ├── scripts │ │ ├── __init__.py │ │ ├── parse_museq.py │ │ └── vizutils │ │ │ ├── __init__.py │ │ │ ├── parseutils.py │ │ │ ├── utils.py │ │ │ └── vcf.py │ └── tasks.py │ ├── pseudo_bulk_qc │ ├── __init__.py │ ├── scripts │ │ ├── mergemafs.R │ │ ├── mergesnvs.R │ │ ├── mutationreport.Rmd │ │ ├── report.Rmd │ │ ├── single_cell_qc_plots.py │ │ └── vcf2maf.sh │ └── tasks.py │ ├── qc_annotation │ ├── __init__.py │ ├── dtypes.py │ ├── scripts │ │ ├── __init__.py │ │ ├── classify.py │ │ ├── fastqscreen_classify.py │ │ └── generate_qc.py │ ├── tasks.py │ └── tests.py │ ├── snpeff_annotation │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py │ ├── snv_allele_counts │ ├── __init__.py │ └── dtypes.py │ ├── snv_annotate │ └── __init__.py │ ├── split_bams │ ├── __init__.py │ └── tasks.py │ ├── strelka │ ├── __init__.py │ ├── _merge.py │ ├── components_utils.py │ ├── dtypes.py │ ├── scripts │ │ ├── __init__.py │ │ ├── parse_strelka.py │ │ └── vizutils │ │ │ ├── __init__.py │ │ │ ├── parseutils.py │ │ │ ├── utils.py │ │ │ └── vcf.py │ ├── strelkautils.py │ ├── tasks.py │ └── vcf_tasks.py │ ├── sv_genotyping │ ├── __init__.py │ └── tasks.py │ └── trinuc_annotation │ ├── __init__.py │ ├── dtypes.py │ └── tasks.py └── versioneer.py /.gitattributes: 
-------------------------------------------------------------------------------- 1 | single_cell/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.DS_Store 3 | *.egg-info 4 | *.project 5 | *.pydevproject 6 | *.swp 7 | build 8 | dist 9 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at grewald@mskcc.org. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. 
The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include versioneer.py 2 | include single_cell/_version.py 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Single Cell Pipeline 3 | We've stopped development on this project. Please check out mondrian for the latest DLP+ workflows: 4 | https://github.com/mondrian-scwgs/mondrian 5 | 6 | 7 | For a detailed guide see [INSTALL](docs/source/index.md) 8 | 9 | [Changelog](CHANGELOG.md) 10 | 11 | 12 | ## What is it? 13 | 14 | The single cell pipeline is a suite of workflows for analysing the single cell data generated by DLP+. 15 | 16 | ## Where to get it 17 | The source code is currently hosted on GitHub at: 18 | https://github.com/shahcompbio/single_cell_pipeline 19 | 20 | Docker containers are available at 21 | https://quay.io/organization/singlecellpipeline 22 | 23 | Conda packages are available at 24 | https://anaconda.org/shahcompbio 25 | 26 | ## License 27 | [GPL v3.0](LICENSE) 28 | 29 | ## Documentation 30 | The official documentation is hosted at http://single_cell_pipeline.readthedocs.io/ 31 | 32 | ## Getting Help 33 | 34 | Please contact the developers: 35 | * Diljot Grewal 36 | * Andrew Mcpherson 37 | 38 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/__init__.py -------------------------------------------------------------------------------- /buildspec_master.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | pre_build: 5 | commands: 6 | - bash single_cell/tests/codebuild/preflight/preflight.sh 7 | - cd docker/alignment/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 8 | - cd docker/hmmcopy/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 9 | - cd docker/annotation/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 10 | - cd docker/variant/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 11 | - cd docker/breakpoint/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 12 | - cd docker/haplotypes/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 13 | - cd docker/qc/ && bash build.sh quay.io
singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 14 | build: 15 | commands: 16 | - bash single_cell/tests/codebuild/refdata/download.sh 17 | - bash single_cell/tests/codebuild/align/align.sh quay.io/singlecellpipelinetest 18 | - bash single_cell/tests/codebuild/hmmcopy/hmmcopy.sh quay.io/singlecellpipelinetest 19 | - bash single_cell/tests/codebuild/annotation/annotation.sh quay.io/singlecellpipelinetest 20 | - bash single_cell/tests/codebuild/merge_cell_bams/merge_cell_bams.sh quay.io/singlecellpipelinetest 21 | - bash single_cell/tests/codebuild/split_wgs_bam/split_wgs_bam.sh quay.io/singlecellpipelinetest 22 | - bash single_cell/tests/codebuild/variant_calling/variant_calling.sh quay.io/singlecellpipelinetest 23 | - bash single_cell/tests/codebuild/breakpoint_calling/breakpoint_calling.sh quay.io/singlecellpipelinetest 24 | - bash single_cell/tests/codebuild/infer_haps/infer_haps.sh quay.io/singlecellpipelinetest 25 | - bash single_cell/tests/codebuild/count_haps/count_haps.sh quay.io/singlecellpipelinetest 26 | - bash single_cell/tests/codebuild/pseudo_bulk_qc/pseudo_bulk_qc.sh quay.io/singlecellpipelinetest 27 | - bash single_cell/tests/codebuild/cohort_qc/cohort_qc.sh quay.io/singlecellpipelinetest 28 | - bash single_cell/tests/codebuild/snv_genotyping/snv_genotyping.sh quay.io/singlecellpipelinetest 29 | post_build: 30 | commands: 31 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/alignment/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 32 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/hmmcopy/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 33 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/annotation/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 34 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/variant/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 35 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/breakpoint/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 36 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/haplotypes/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 37 | - if [ $CODEBUILD_BUILD_SUCCEEDING = 1 ]; then cd docker/qc/ && bash build.sh quay.io singlecellpipeline $QUAY_USR $QUAY_PSW && cd ../../; fi 38 | -------------------------------------------------------------------------------- /buildspec_staging.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | pre_build: 5 | commands: 6 | - cd docker/alignment/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 7 | - cd docker/hmmcopy/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 8 | - cd docker/annotation/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 9 | - cd docker/variant/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 10 | - cd docker/breakpoint/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 11 | - cd docker/haplotypes/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 12 | - cd docker/qc/ && bash build.sh quay.io singlecellpipelinetest $QUAY_USR $QUAY_PSW && cd ../../ 13 | build: 14 | commands: 15 | - bash single_cell/tests/codebuild/refdata/download.sh 16 | - bash 
single_cell/tests/codebuild/align/align.sh quay.io/singlecellpipelinetest 17 | - bash single_cell/tests/codebuild/hmmcopy/hmmcopy.sh quay.io/singlecellpipelinetest 18 | - bash single_cell/tests/codebuild/annotation/annotation.sh quay.io/singlecellpipelinetest 19 | - bash single_cell/tests/codebuild/merge_cell_bams/merge_cell_bams.sh quay.io/singlecellpipelinetest 20 | - bash single_cell/tests/codebuild/split_wgs_bam/split_wgs_bam.sh quay.io/singlecellpipelinetest 21 | - bash single_cell/tests/codebuild/variant_calling/variant_calling.sh quay.io/singlecellpipelinetest 22 | - bash single_cell/tests/codebuild/breakpoint_calling/breakpoint_calling.sh quay.io/singlecellpipelinetest 23 | - bash single_cell/tests/codebuild/infer_haps/infer_haps.sh quay.io/singlecellpipelinetest 24 | - bash single_cell/tests/codebuild/count_haps/count_haps.sh quay.io/singlecellpipelinetest 25 | - bash single_cell/tests/codebuild/pseudo_bulk_qc/pseudo_bulk_qc.sh quay.io/singlecellpipelinetest 26 | - bash single_cell/tests/codebuild/cohort_qc/cohort_qc.sh quay.io/singlecellpipelinetest 27 | - bash single_cell/tests/codebuild/snv_genotyping/snv_genotyping.sh quay.io/singlecellpipelinetest 28 | -------------------------------------------------------------------------------- /docker/alignment/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_alignment:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_alignment:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/alignment/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | ADD . /app 3 | 4 | RUN apt-get update -y && apt-get install -y libltdl7 parallel && rm -rf /var/lib/apt/lists/* 5 | 6 | RUN conda install --file /app/conda_requirements.yml 7 | 8 | RUN pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 9 | RUN pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 10 | RUN pip install azure-batch azure-common azure-core azure-storage-blob azure-identity 11 | -------------------------------------------------------------------------------- /docker/annotation/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_annotation:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_annotation:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/annotation/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | ADD . 
/app 3 | RUN conda install --file /app/conda_requirements.yml 4 | RUN wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py 5 | RUN pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 6 | RUN pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 7 | RUN pip install git+https://github.com/shahcompbio/cell_cycle_classifier.git@v0.0.3 8 | RUN pip install azure-batch azure-common azure-core azure-storage-blob azure-identity 9 | -------------------------------------------------------------------------------- /docker/breakpoint/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_breakpoint:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_breakpoint:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/breakpoint/conda_lumpy_requirements.yml: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | @EXPLICIT 5 | https://conda.anaconda.org/bioconda/linux-64/sambamba-0.6.6-2.tar.bz2 6 | https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda 7 | https://repo.anaconda.com/pkgs/main/linux-64/blas-1.0-mkl.conda 8 | https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2021.1.19-h06a4308_0.conda 9 | https://repo.anaconda.com/pkgs/main/linux-64/intel-openmp-2020.2-254.conda 10 | https://repo.anaconda.com/pkgs/main/linux-64/libgfortran-ng-7.3.0-hdf63c60_0.conda 11 | https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-9.1.0-hdf63c60_0.conda 12 | https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-9.1.0-hdf63c60_0.conda 13 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-2020.2-256.conda 14 | https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h7b6447c_0.tar.bz2 15 | https://repo.anaconda.com/pkgs/main/linux-64/gawk-5.1.0-h7b6447c_0.conda 16 | https://conda.anaconda.org/bioconda/linux-64/libdeflate-1.0-h14c3975_1.tar.bz2 17 | https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.3-he6710b0_2.conda 18 | https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.2-he6710b0_1.conda 19 | https://repo.anaconda.com/pkgs/main/linux-64/openssl-1.1.1i-h27cfd23_0.conda 20 | https://conda.anaconda.org/bioconda/linux-64/samblaster-0.1.26-hc9558a2_0.tar.bz2 21 | https://repo.anaconda.com/pkgs/main/linux-64/xz-5.2.5-h7b6447c_0.conda 22 | https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.2.11-h7b6447c_3.conda 23 | https://repo.anaconda.com/pkgs/main/linux-64/libedit-3.1.20191231-h14c3975_1.conda 24 | https://repo.anaconda.com/pkgs/main/linux-64/libssh2-1.9.0-h1ba5d50_1.conda 25 | https://repo.anaconda.com/pkgs/main/linux-64/readline-8.1-h27cfd23_0.conda 26 | https://repo.anaconda.com/pkgs/main/linux-64/tk-8.6.10-hbc83047_0.conda 27 | https://repo.anaconda.com/pkgs/main/linux-64/krb5-1.18.2-h173b8e3_0.conda 28 | https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.33.0-h62c20be_0.conda 29 | https://repo.anaconda.com/pkgs/main/linux-64/libcurl-7.71.1-h20c2e04_1.conda 30 | 
https://repo.anaconda.com/pkgs/main/linux-64/python-2.7.18-h15b4118_1.conda 31 | https://repo.anaconda.com/pkgs/main/noarch/certifi-2020.6.20-pyhd3eb1b0_3.conda 32 | https://repo.anaconda.com/pkgs/main/linux-64/curl-7.71.1-hbc83047_1.conda 33 | https://repo.anaconda.com/pkgs/main/noarch/six-1.15.0-pyhd3eb1b0_0.tar.bz2 34 | https://repo.anaconda.com/pkgs/main/noarch/wheel-0.36.2-pyhd3eb1b0_0.conda 35 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-service-2.3.0-py27he904b0f_0.conda 36 | https://conda.anaconda.org/bioconda/linux-64/pysam-0.15.3-py27hda2845c_1.tar.bz2 37 | https://repo.anaconda.com/pkgs/main/linux-64/setuptools-44.0.0-py27_0.conda 38 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-base-1.16.6-py27hde5b4d6_0.conda 39 | https://repo.anaconda.com/pkgs/main/linux-64/pip-19.3.1-py27_0.conda 40 | https://conda.anaconda.org/componc/linux-64/lumpy-sv-0.2.12-h14c3975_0.tar.bz2 41 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_fft-1.0.15-py27ha843d7b_0.conda 42 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_random-1.1.0-py27hd6b4f25_0.conda 43 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-1.16.6-py27hbc911f0_0.conda 44 | -------------------------------------------------------------------------------- /docker/breakpoint/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | ADD . /app 3 | 4 | 5 | # this does not work because bioconda package for lumpy is missing a commit w fix that's needed for bed output 6 | #RUN conda create --name lumpy python=2.7 7 | #SHELL ["conda", "run", "-n", "lumpy", "/bin/bash", "-c"] 8 | #RUN conda install --file /app/conda_lumpy_requirements.yml 9 | #RUN sed 's/usr\/bin\/env python/usr\/bin\/env python2/' /opt/conda/envs/lumpy/bin/lumpy_extractSplitReads_BwaMem > /opt/conda/envs/lumpy/bin/extractSplitReads_BwaMem && chmod 777 /opt/conda/envs/lumpy/bin/extractSplitReads_BwaMem 10 | #SHELL ["conda", "run", "-n", "base", "/bin/bash", "-c"] 11 | 12 | RUN apt-get update -y && apt install autoconf make gcc zlib1g-dev libcurl3-dev libssl-dev g++ samtools -y && rm -rf /var/lib/apt/lists/* 13 | RUN git clone --recursive https://github.com/arq5x/lumpy-sv.git && cd ./lumpy-sv && make && cp bin/* /usr/local/bin/. && cp ./scripts/extractSplitReads_BwaMem /usr/local/bin/. 
14 | 15 | 16 | 17 | RUN conda install --file /app/conda_base_requirements.yml 18 | 19 | ENV PATH="${PATH}:/opt/conda/envs/lumpy/bin" 20 | 21 | RUN pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 22 | RUN pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 23 | RUN pip install git+https://github.com/shahcompbio/biowrappers.git@master 24 | RUN pip install -e git+https://github.com/amcpherson/blossomv.git@download_link_fix#egg=blossomv 25 | RUN pip install networkx==2.1 26 | RUN pip install azure-batch azure-common azure-core azure-storage-blob azure-identity 27 | -------------------------------------------------------------------------------- /docker/cohort_qc/python_requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2021.5.30 2 | cffi==1.14.5 3 | chardet==4.0.0 4 | click==7.1.2 5 | cycler==0.10.0 6 | decorator==4.4.2 7 | dill==0.3.3 8 | idna==2.10 9 | Jinja2==3.0.0 10 | kiwisolver==1.3.1 11 | mafannotator @ git+https://github.com/shahcompbio/mafannotator.git@2d773b4fe77b1408d60916ef70f47183adbba5b0 12 | MarkupSafe==2.0.0 13 | matplotlib==3.4.1 14 | networkx==2.5.1 15 | numexpr==2.7.3 16 | numpy==1.20.2 17 | pandas==1.2.4 18 | Pillow==8.2.0 19 | pycparser==2.20 20 | pyparsing==2.4.7 21 | pypeliner @ git+https://github.com/shahcompbio/pypeliner.git@b452c14c4abc6e653ac8e8f52d3c9b9a158becd1 22 | python-dateutil==2.8.1 23 | pytz==2021.1 24 | PyYAML==5.4.1 25 | requests==2.25.1 26 | rpy2==3.4.4 27 | #scgenome @ git+https://github.com/shahcompbio/scgenome.git@179017b23b423b17c9a40450927ed6bbbd21cc7b 28 | scipy==1.6.3 29 | seaborn==0.11.1 30 | # Editable install with no version control (single-cell==0.7.6+14.gc0a7879.dirty) 31 | #-e /juno/home/abramsd/miniconda3/envs/scp_cohort_qc/lib/python3.9/site-packages/single_cell-0.7.6+14.gc0a7879.dirty-py3.9.egg 32 | six==1.15.0 33 | tables==3.6.1 34 | tzlocal==2.1 35 | urllib3==1.26.4 36 | wgs-analysis @ git+https://github.com/amcpherson/wgs_analysis.git@e86b3a158f4cbc2e43fab0e24b8c2b7dded360ad 37 | #classifycopynumber @ git+https://github.com/shahcompbio/classifycopynumber.git@1c7c81ada82b885b8da6d540cd6cd3ccf2656f1e 38 | 39 | -------------------------------------------------------------------------------- /docker/haplotypes/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_haplotypes:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_haplotypes:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/haplotypes/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | 3 | ADD . 
/app 4 | 5 | RUN rm -rf /opt/conda/lib/python2.7/site-packages/remixt* && apt-get update -y && apt install libc-dev libz-dev build-essential -y && rm -rf /var/lib/apt/lists/* && conda install -c bioconda cython 6 | 7 | RUN conda install --file /app/requirements.yml 8 | 9 | RUN pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 10 | RUN pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 11 | 12 | RUN pip install git+https://github.com/amcpherson/remixt.git@0.5.13r2 13 | RUN mkdir -p /root/.config/matplotlib && echo "backend : Agg" > /root/.config/matplotlib/matplotlibrc 14 | RUN pip install azure-batch azure-common azure-core azure-storage-blob azure-identity 15 | -------------------------------------------------------------------------------- /docker/hmmcopy/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_hmmcopy:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_hmmcopy:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/hmmcopy/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | ADD . /app 3 | RUN conda install --file /app/conda_requirements.yml 4 | 5 | RUN pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 6 | RUN pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 7 | RUN pip install azure-batch azure-common azure-core azure-storage-blob azure-identity 8 | -------------------------------------------------------------------------------- /docker/qc/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_qc:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_qc:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/qc/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | 3 | ADD . 
/app 4 | 5 | RUN apt update && apt install build-essential samtools -y 6 | 7 | RUN conda install --file /app/conda_requirements.txt 8 | RUN pip install -r /app/pip_requirements.txt 9 | 10 | RUN rm -rf /opt/conda/lib/python3.7/site-packages/pypeliner* && pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 11 | RUN rm -rf /opt/conda/lib/python3.7/site-packages/single_cell* && pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 12 | RUN rm -rf /opt/conda/lib/python3.7/site-packages/biowrappers* && pip install git+https://github.com/shahcompbio/biowrappers.git@master 13 | 14 | RUN pip install git+https://github.com/amcpherson/wgs_analysis.git@v0.0.2 15 | RUN pip install git+https://github.com/shahcompbio/scgenome.git@v0.0.1 16 | RUN pip install git+https://github.com/shahcompbio/classifycopynumber.git@v0.0.5 17 | RUN pip install git+https://github.com/shahcompbio/mafannotator.git@master 18 | 19 | RUN cp /app/oncokb-annotator/MafAnnotator.py /app/oncokb-annotator/AnnotatorCore.py /opt/conda/bin && chmod 777 /opt/conda/bin/MafAnnotator.py /opt/conda/bin/AnnotatorCore.py 20 | RUN rm -rf /opt/conda/bin/samtools -------------------------------------------------------------------------------- /docker/qc/pip_requirements.txt: -------------------------------------------------------------------------------- 1 | adal==1.2.7 2 | adjusttext==0.7.3 3 | azure-batch==12.0.0 4 | azure-common==1.1.28 5 | azure-core==1.24.0 6 | azure-identity==1.10.0 7 | azure-storage-blob==12.12.0 8 | backports-zoneinfo==0.2.1 9 | brewer2mpl==1.4.1 10 | click==8.1.3 11 | cython==0.29.30 12 | dill==0.3.5.1 13 | fonttools==4.33.3 14 | hdbscan==0.8.28 15 | importlib-metadata==4.11.4 16 | isodate==0.6.1 17 | jinja2==3.1.2 18 | kiwisolver==1.4.2 19 | lda==2.0.0 20 | markupsafe==2.1.1 21 | matplotlib==3.5.2 22 | msal==1.17.0 23 | msal-extensions==1.0.0 24 | msrest==0.6.21 25 | msrestazure==0.6.4 26 | networkx==2.6.3 27 | numexpr==2.8.1 28 | numpy==1.21.6 29 | oauthlib==3.2.0 30 | packaging==21.3 31 | pandas==1.3.5 32 | pbr==3.1.1 33 | pillow==9.1.1 34 | portalocker==2.4.0 35 | pyjwt==2.4.0 36 | pytz==2022.1 37 | pytz-deprecation-shim==0.1.0.post0 38 | pyyaml==5.4.1 39 | requests-oauthlib==1.3.1 40 | rpy2==3.5.2 41 | scikit-learn==1.0.2 42 | scipy==1.7.3 43 | seaborn==0.11.2 44 | tables==3.7.0 45 | typing-extensions==4.2.0 46 | tzdata==2022.1 47 | tzlocal==4.2 48 | umap==0.1.1 49 | zipp==3.8.0 -------------------------------------------------------------------------------- /docker/sample_qc/python_requirements.txt: -------------------------------------------------------------------------------- 1 | adjustText==0.7.3 2 | brewer2mpl==1.4.1 3 | certifi==2020.12.5 4 | cffi==1.14.5 5 | click==8.0.0a1 6 | cycler==0.10.0 7 | Cython==0.29.22 8 | decorator==4.4.2 9 | dill==0.3.3 10 | hdbscan==0.8.27 11 | hmmlearn==0.2.5 12 | Jinja2==2.11.3 13 | joblib==1.0.1 14 | kiwisolver==1.3.1 15 | lda==2.0.0 16 | MarkupSafe==1.1.1 17 | matplotlib==3.4.1 18 | networkx==2.5 19 | numexpr==2.7.3 20 | numpy==1.20.2 21 | packaging==20.9 22 | pandas==1.2.3 23 | pbr==3.1.1 24 | Pillow==8.1.2 25 | pycparser==2.20 26 | pyparsing==2.4.7 27 | pypeliner @ git+https://github.com/shahcompbio/pypeliner.git@b452c14c4abc6e653ac8e8f52d3c9b9a158becd1 28 | python-dateutil==2.8.1 29 | pytz==2021.1 30 | PyYAML==5.4.1 31 | rpy2==3.4.3 32 | scgenome @ git+https://github.com/DouglasAbrams/scgenome.git@fb2e01e16bce038367d8e45184f2d14dde200fb5 33 | scikit-learn==0.24.1 34 | scipy==1.6.2 35 | seaborn==0.11.1 36 | single-cell @ 
git+https://github.com/shahcompbio/single_cell_pipeline.git@f3ac2b7b1857a64279fe2b2b8a7ae3d9c13df45d 37 | six==1.15.0 38 | sklearn==0.0 39 | tables==3.6.1 40 | threadpoolctl==2.1.0 41 | tzlocal==2.1 42 | umap==0.1.1 43 | -e git+https://github.com/amcpherson/wgs_analysis.git@c73a9bd0268b5e6fb55a8c18a58ac28e5f918482#egg=wgs_analysis 44 | -------------------------------------------------------------------------------- /docker/variant/build.sh: -------------------------------------------------------------------------------- 1 | REGISTRY=$1 2 | ORG=$2 3 | 4 | echo "\n LOGIN \n" 5 | docker login $REGISTRY -u $3 --password $4 6 | 7 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 8 | 9 | COMMIT=`git rev-parse HEAD` 10 | 11 | cat dockerfile_template \ 12 | | sed "s/{git_commit}/$COMMIT/g" \ 13 | > dockerfile 14 | 15 | docker build -t $REGISTRY/$ORG/single_cell_pipeline_variant:$TAG . --no-cache 16 | 17 | docker push $REGISTRY/$ORG/single_cell_pipeline_variant:$TAG 18 | 19 | -------------------------------------------------------------------------------- /docker/variant/dockerfile_template: -------------------------------------------------------------------------------- 1 | FROM quay.io/singlecellpipelinetest/miniconda3:4.10.3 2 | ADD . /app 3 | 4 | 5 | RUN conda create --name museq python=2.7 6 | SHELL ["conda", "run", "-n", "museq", "/bin/bash", "-c"] 7 | RUN conda install --file /app/conda_museq_requirements.yml 8 | RUN conda install -c bioconda variantbam 9 | SHELL ["conda", "run", "-n", "base", "/bin/bash", "-c"] 10 | 11 | RUN conda install --file /app/conda_base_requirements.yml 12 | RUN apt update -y && apt install samtools -y && rm -rf /var/lib/apt/lists/* 13 | ENV PATH="${PATH}:/opt/conda/envs/museq/bin" 14 | 15 | RUN pip install git+https://github.com/shahcompbio/pypeliner.git@v0.6.3 16 | RUN pip install git+https://github.com/shahcompbio/single_cell_pipeline.git@{git_commit} 17 | RUN pip install git+https://github.com/shahcompbio/biowrappers.git@master 18 | RUN pip install pyvcf bx-python==0.8.9 numpy==1.19.5 pandas==0.25.3 --force-reinstall 19 | RUN pip install azure-batch azure-common azure-core azure-storage-blob azure-identity 20 | -------------------------------------------------------------------------------- /docker/vcf2maf/python_requirements.txt: -------------------------------------------------------------------------------- 1 | analytics-python==1.2.9 2 | azure-core==1.0.0 3 | azure-identity==1.2.0 4 | azure-keyvault-secrets==4.0.0 5 | azure-storage-blob==1.5.0 6 | azure-storage-common==1.4.2 7 | cached-property==1.4.2 8 | certifi==2020.12.5 9 | idna==2.7 10 | msal==1.0.0 11 | msal-extensions==0.1.3 12 | msrest==0.6.10 13 | munch==2.3.2 14 | numpy==1.20.2 15 | pandas==1.2.3 16 | python-dateutil==2.8.1 17 | python-slugify==1.1.2 18 | pytz==2018.4 19 | PyYAML==5.3 20 | requests==2.19.1 21 | six==1.11.0 22 | Unidecode==1.2.0 23 | urllib3==1.23 24 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | PAPER = 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/alignment_metrics.md: -------------------------------------------------------------------------------- 1 | # QC pipeline metrics 2 | 3 | |Column|Description| 4 | |------|-----------| 5 | |cell_id|label of the cell| 6 | |index_sequence|index sequence of the adaptor sequence| 7 | |column|column of the cell on the nanowell chip| 8 | |img_col|column of the cell from the perspective of the microscope| 9 | |index_i5|id of the i5 index adapter sequence| 10 | |sample_type|type of the sample| 11 | |primer_i7|id of the i5 index primer sequence| 12 | |experimental_condition|experimental treatment of the cell, includes controls| 13 | |index_i7|id of the i7 index adapter sequence| 14 | |cell_call|living/dead classification of the cell based on staining usually, C1 == living, C2 == dead| 15 | |sample_id|name of the sample| 16 | |primer_i5|id of the i5 index primer sequence| 17 | |row|row of the cell on the nanowell chip| 18 | |estimated_library_size|scaled total number of mapped reads| 19 | |total_mapped_reads|total number of mapped reads| 20 | |nohit|number of reads with no organism match| 21 | |salmon_multihit|number of reads that were classified as salmon and something else| 22 | |total_duplicate_reads|number of duplicate reads| 23 | |percent_duplicate_reads|percentage of duplicate reads| 24 | |total_properly_paired|number of properly paired reads| 25 | |mean_insert_size|mean insert size between paired reads| 26 | |coverage_breadth|percentage of genome covered by some read| 27 | |grch37|number of reads that were classified as human| 28 | |unpaired_duplicate_reads|number of unpaired duplicated reads| 29 | |unpaired_mapped_reads|number of unpaired mapped reads| 30 | |unmapped_reads|number of unmapped reads| 31 | |coverage_depth|average reads per nucleotide position in the genome| 32 | |median_insert_size|median insert size between paired reads| 33 | |salmon|number of reads that were classified as salmon| 34 | |grch37_multihit|number of reads that were classified as human and 
something else| 35 | |mm10|number of reads that were classified as mouse| 36 | |total_reads|total number of reads, regardless of mapping status| 37 | |standard_deviation_insert_size|standard deviation of the insert size between paired reads| 38 | |paired_mapped_reads|number of mapped reads that were properly paired| 39 | |mm10_multihit|number of reads classified as mouse and something else| 40 | |paired_duplicate_reads|number of paired reads that were also marked as duplicate| 41 | -------------------------------------------------------------------------------- /docs/source/gc_metrics.md: -------------------------------------------------------------------------------- 1 | ## GC metrics table 2 | 3 | 4 | 5 | For each run, the corresponding reference sequence is divided into bins or windows based on the percentage of G + C content ranging from 0 - 100%. The data is collected with the picard tools CollectGcBiasMetrics tool and you can read more about it [here](https://gatk.broadinstitute.org/hc/en-us/articles/360036801531-CollectGcBiasMetrics-Picard-) -------------------------------------------------------------------------------- /docs/source/hmmcopy_metrics.md: -------------------------------------------------------------------------------- 1 | # hmmcopy metrics 2 | 3 | |Column|Description| 4 | |------|-----------| 5 | |multiplier|during parameter searching, the set [1..6] that was chosen| 6 | |MSRSI_non_integerness|median of segment residuals from segment integer copy number states| 7 | |MBRSI_dispersion_non_integerness|median of bin residuals from segment integer copy number states| 8 | |MBRSM_dispersion|median of bin residuals from segment median copy number values| 9 | |autocorrelation_hmmcopy|hmmcopy copy autocorrelation| 10 | |cv_hmmcopy|coefficient of variation of hmmcopy copy| 11 | |empty_bins_hmmcopy|number of empty bins in hmmcopy| 12 | |mad_hmmcopy|median absolute deviation of hmmcopy copy| 13 | |mean_hmmcopy_reads_per_bin|mean reads per hmmcopy bin| 14 | |median_hmmcopy_reads_per_bin|median reads per hmmcopy bin| 15 | |std_hmmcopy_reads_per_bin|standard deviation value of reads in hmmcopy bins| 16 | |total_mapped_reads_hmmcopy|total mapped reads in all hmmcopy bins| 17 | |total_halfiness|summed halfiness penalty score of the cell| 18 | |scaled_halfiness|summed scaled halfiness penalty score of the cell| 19 | |mean_state_mads|mean value for all median absolute deviation scores for each state| 20 | |mean_state_vars|variance value for all median absolute deviation scores for each state| 21 | |mad_neutral_state|median absolute deviation score of the neutral 2 copy state| 22 | |breakpoints|number of breakpoints, as indicated by state changes not at the ends of chromosomes| 23 | |mean_copy|mean hmmcopy copy value| 24 | |state_mode|the most commonly occurring state| 25 | |log_likelihood|hmmcopy log likelihood for the cell| 26 | |true_multiplier|the exact decimal value used to scale the copy number for segmentation| 27 | |cell_id|label of the cell| 28 | |order|order of the cell in the hierarchical clustering tree| 29 | |index_sequence|index sequence of the adaptor sequence| 30 | |column|column of the cell on the nanowell chip| 31 | |img_col|column of the cell from the perspective of the microscope| 32 | |index_i5|id of the i5 index adapter sequence| 33 | |sample_type|type of the sample| 34 | |primer_i7|id of the i5 index primer sequence| 35 | |experimental_condition|experimental treatment of the cell, includes controls| 36 | |index_i7|id of the i7 index adapter sequence| 37 | |cell_call|living/dead classification of the cell based on
staining usually, C1 == living, C2 == dead| 38 | |sample_id|name of the sample| 39 | |primer_i5|id of the i5 index primer sequence| 40 | |row|row of the cell on the nanowell chip| 41 | |is_contaminated|boolean, set to True if most reads belong to a different genome| 42 | -------------------------------------------------------------------------------- /docs/source/hmmcopy_reads.md: -------------------------------------------------------------------------------- 1 | # HMMCopy Reads 2 | 3 | |Column|Description| 4 | |------|-----------| 5 | |chr|chromosome| 6 | |start|start position| 7 | |end|end position| 8 | |width|width of the genomic segment that comprises the bin| 9 | |reads|number of reads that start in the bin| 10 | |gc|average GC content of all bases in the bin, -1 if N is present| 11 | |map|average mappability value of bin| 12 | |cor_gc|gc-corrected copy number value| 13 | |copy|final output copy number value| 14 | |valid|TRUE if reads > 0 & gc > 0, else FALSE| 15 | |ideal|TRUE if bin is VALID with good mappability and non-outlier gc and read values| 16 | |modal_curve|value of the gc-correction modal curve given the bin's gc| 17 | |modal_quantile|| 18 | |cor_map|mappability-corrected gc-corrected copy number value| 19 | |multiplier|hmmcopy parameter set used [1..6]| 20 | |state|the copy number state of the bin| 21 | |cell_id|label of the cell| 22 | |is_low_mappability|bool, set to True if the segment has a low mappability score| 23 | -------------------------------------------------------------------------------- /docs/source/hmmcopy_segments.md: -------------------------------------------------------------------------------- 1 | # HMMCopy Segments 2 | |Column|Description| 3 | |------|-----------| 4 | |chr|chromosome| 5 | |start|start position| 6 | |end|end position| 7 | |state|copy number state| 8 | |median|median copy number value of segment| 9 | |multiplier|hmmcopy parameter set used [1..6]| 10 | |cell_id|label of the cell| -------------------------------------------------------------------------------- /docs/source/organism_filter.md: -------------------------------------------------------------------------------- 1 | ## Organism Filter 2 | 3 | The pipeline uses [FastqScreen](https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/) to classify and filter non-human reads. 4 | 5 | The QC pipeline runs fastq screen on each single cell fastq pair. Fastq screen takes fastq inputs and outputs fastqs with tags added to read names. Each read in a pair is classified independently. We run our classification against the human, mouse and salmon genomes. The bam files generated by the pipeline will be tagged with the fastqscreen tag to specify the species that they belong to. 6 | 7 | | Fastq Screen Flag| Explanation| 8 | |----|----| 9 | |0|Read does not map| 10 | |1|Read maps uniquely| 11 | |2|Read multi maps| 12 | 13 | #### Fastq format 14 | Flag Format: 15 | The Flag information is appended to the read id in the fastq file. The very first read will have the following format: 16 | 17 | 18 | 19 | ``` 20 | @#FQST:grch37:mm10:salmon:100 21 | ``` 22 | In this example, the read uniquely maps to the human genome and doesn't align to the Mouse or Salmon genomes at all.
23 | 24 | All subsequent reads will have the following format: 25 | ``` 26 | @#FQST:100 27 | ``` 28 | 29 | #### Bam format 30 | 31 | Each read in the bam file will contain the following tag: 32 | 33 | ``` 34 | FS:Z:mm10_0,salmon_0,grch37_1 35 | ``` 36 | 37 | 38 | ## Pipeline features: 39 | 40 | #### Metrics: 41 | 42 | ###### Detailed Metrics: 43 | 44 | The pipeline generates a csv file with detailed counts for every flag option. The counts are also split by read direction. The table columns depend on the references that we're checking against. For instance, the table will have the following columns for a run against the Human, Mouse and Salmon genomes: 45 | 46 | * cell_id: id of the cell 47 | * read_end: end 1 or 2 of read pairs 48 | * Human: The column will have values {0,1,2}. Please see the fastq screen flag table above for details 49 | * Mouse: The column will have values {0,1,2}. Please see the fastq screen flag table above for details 50 | * Salmon: The column will have values {0,1,2}. Please see the fastq screen flag table above for details 51 | * count: number of reads 52 | 53 | ###### Summary Metrics: 54 | 55 | The pipeline will also add some summary metrics to the main alignment metrics table. The column names depend on the references. For instance, the table will have the following columns for a run against the Human, Mouse and Salmon genomes: 56 | 57 | * human: count of reads that align to the human genome (uniquely or multi-map) 58 | * human_multihit: count of reads that align to the human genome (uniquely or multi-map) and also align to another genome at the same time (uniquely or multi-map) 59 | * mouse: count of reads that align to the mouse genome (uniquely or multi-map) 60 | * mouse_multihit: count of reads that align to the mouse genome (uniquely or multi-map) and also align to another genome at the same time (uniquely or multi-map) 61 | * salmon: count of reads that align to the salmon genome (uniquely or multi-map) 62 | * salmon_multihit: count of reads that align to the salmon genome (uniquely or multi-map) and also align to another genome at the same time (uniquely or multi-map) 63 | * nohit: count of reads that do not align to any genome 64 | 65 | 66 | #### Options 67 | 68 | ###### Default functionality: 69 | 70 | By default, the pipeline does not filter the files at all. The output bam files will have the information in their read tags. 71 | 72 | 73 | ###### Filter options: 74 | 75 | * filter_contaminated_reads flag in the config file. 76 | When set, keep the following read pairs (see the sketch below): 77 | 78 | * Both R1 and R2 match human only (remove reads that match multiple references) 79 | * One of the mates matches human only and the other one doesn't match anything.
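To make the filtering rule above concrete, here is a minimal sketch of how the `FS` tag could be parsed and the keep/discard decision applied to a read pair. This is illustrative only and is not the pipeline's implementation (the real logic lives under `single_cell/workflows/align/`, e.g. `fastqscreen.py`); it assumes the tag layout shown above, uses `grch37` as the human genome label, and the function names and file path are made up for this example.

```python
# Illustrative sketch only -- not the pipeline's actual filtering code.
# Assumes reads are pysam.AlignedSegment objects carrying an FS tag of the
# form "mm10_0,salmon_0,grch37_1", as described above.
import pysam


def parse_fs_tag(read):
    """Parse an FS tag like 'mm10_0,salmon_0,grch37_1' into {genome: flag}."""
    flags = {}
    for entry in read.get_tag('FS').split(','):
        genome, flag = entry.rsplit('_', 1)
        flags[genome] = int(flag)
    return flags


def matches_human_only(flags, human='grch37'):
    """Read hits the human genome (uniquely or multi-mapped) and nothing else."""
    return flags.get(human, 0) > 0 and all(
        flag == 0 for genome, flag in flags.items() if genome != human)


def matches_nothing(flags):
    """Read does not hit any of the screened genomes."""
    return all(flag == 0 for flag in flags.values())


def keep_pair(read1, read2, human='grch37'):
    """Apply the filter_contaminated_reads rule to one read pair."""
    flags1, flags2 = parse_fs_tag(read1), parse_fs_tag(read2)
    both_human = matches_human_only(flags1, human) and matches_human_only(flags2, human)
    one_human_one_unmapped = (
        (matches_human_only(flags1, human) and matches_nothing(flags2)) or
        (matches_nothing(flags1) and matches_human_only(flags2, human)))
    return both_human or one_human_one_unmapped


if __name__ == '__main__':
    # Hypothetical usage: count kept pairs in a name-sorted cell bam, assuming
    # primary alignments arrive as consecutive mate pairs.
    kept = 0
    with pysam.AlignmentFile('cell.bam', 'rb') as bam:
        primary = (r for r in bam
                   if r.is_paired and not r.is_secondary and not r.is_supplementary)
        for read1, read2 in zip(primary, primary):
            if keep_pair(read1, read2):
                kept += 1
    print('kept read pairs:', kept)
```

As described above, the default mode drops nothing; a decision like this would only be applied when `filter_contaminated_reads` is set in the config.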
-------------------------------------------------------------------------------- /docs/source/quality_classifier.md: -------------------------------------------------------------------------------- 1 | # Cell Quality Classifier 2 | 3 | 4 | |Feature name|Source|Description | 5 | | ----| ----|----| 6 | |percent_duplicate_reads|picard|percentage of reads marked as PCR duplicate by MarkDuplicates| 7 | |total_mapped_reads | samtools|number of reads mapped by the bwa mem alignment algorithm | 8 | |total_duplicate_reads | samtools|number of reads marked as PCR duplicate by MarkDuplicates | 9 | |standard_deviation_insert_size| picard| read insert size standard deviation | 10 | |MSRSI_non_integerness| hmmcopy| median of segment residuals from segment integer copy number states| 11 | |MBRSI_dispersion_non_integerness| hmmcopy| median of bin residuals from segment integer copy number states| 12 | |MBRSM_dispersion| hmmcopy | median of bin residuals from segment median copy number values| 13 | |autocorrelation_hmmcopy| hmmcopy | autocorrelation of CNV results| 14 | |cv_hmmcopy| hmmcopy| coefficient of variation of CNV results| 15 | |mad_hmmcopy| hmmcopy| median absolute deviation of CNV results| 16 | |total_halfiness|hmmcopy | halfiness score but without copy number state scaling| 17 | |scaled_halfiness| hmmcopy| a scaled metric to assess integer goodness of fit, described in text| 18 | |mean_state_mads| hmmcopy| the mean across all MADs of each copy number state| 19 | |mean_state_vars| hmmcopy| the mean across all variances of each copy number state| 20 | |breakpoints| hmmcopy| number of intrachromosomal breakpoints| 21 | |mean_copy| hmmcopy| mean copy number of all genomic bin segments| 22 | |state_mode| hmmcopy| the most commonly occurring copy number state| 23 | |log_likelihood| hmmcopy| log-likelihood of HMMcopy CNV fit| 24 | 25 | 26 | ## Percent Duplicate Reads 27 | 28 | Calculated from the output of Mark Duplicates from picard tools.
Please see [mark duplicates](http://broadinstitute.github.io/picard/picard-metric-definitions.html#DuplicationMetrics) 29 | 30 | 31 | Formula: 32 | 33 | UNPAIRED_READ_DUPLICATES + ((READ_PAIR_DUPLICATES + READ_PAIR_OPTICAL_DUPLICATES)*2) / (UNPAIRED_READS_EXAMINED + (READ_PAIRS_EXAMINED * 2)) 34 | 35 | -------------------------------------------------------------------------------- /docs/source/readme_data/alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/alignment.png -------------------------------------------------------------------------------- /docs/source/readme_data/annotation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/annotation.png -------------------------------------------------------------------------------- /docs/source/readme_data/breakpoint_calling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/breakpoint_calling.png -------------------------------------------------------------------------------- /docs/source/readme_data/breakpoint_calling.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | \begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (normal_bams) [io, text width=3cm] {Normal (WGS/cell) Bam}; 27 | \node (tumour_bams) [io, text width=3cm, xshift=6cm] {Tumour Cell Bams}; 28 | 29 | \node (destruct) [process, below of=normal_bams] {destruct}; 30 | \draw[-latex] (normal_bams) -- coordinate (ab) (destruct); 31 | \draw[-latex] (tumour_bams) -- coordinate (ab) (destruct); 32 | 33 | \node (lumpy) [process, below of=tumour_bams] {Lumpy}; 34 | \draw[-latex] (normal_bams) -- coordinate (ab) (lumpy); 35 | \draw[-latex] (tumour_bams) -- coordinate (ab) (lumpy); 36 | 37 | \node (lumpy_bed) [io, below of=lumpy, text 
width=2cm] {Breakpoints}; 38 | \draw[-latex] (lumpy) -- coordinate (ab) (lumpy_bed); 39 | 40 | \node (lumpy_csv) [io, below of=lumpy, text width=2cm, xshift=4cm] {Breakpoints csv}; 41 | \draw[-latex] (lumpy) -- coordinate (ab) (lumpy_csv); 42 | 43 | \node (lumpy_counts) [io, below of=lumpy, text width=1.5cm, xshift=8cm] {Cell counts}; 44 | \draw[-latex] (lumpy) -- coordinate (ab) (lumpy_counts); 45 | 46 | 47 | 48 | \node (destruct_breakpoints) [io, below of=destruct, text width=2cm] {Breakpoints}; 49 | \draw[-latex] (destruct) -- coordinate (ab) (destruct_breakpoints); 50 | 51 | \node (destruct_breakpoints_lib) [io, below of=destruct, text width=2cm, xshift=-4cm] {Breakpoints Library}; 52 | \draw[-latex] (destruct) -- coordinate (ab) (destruct_breakpoints_lib); 53 | 54 | \node (destruct_counts) [io, below of=destruct, text width=1.5cm, xshift=-8cm] {Cell counts}; 55 | \draw[-latex] (destruct) -- coordinate (ab) (destruct_counts); 56 | 57 | \end{tikzpicture} 58 | 59 | \end{adjustbox} 60 | \end{document} -------------------------------------------------------------------------------- /docs/source/readme_data/dlp_cohort_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/dlp_cohort_pipeline.png -------------------------------------------------------------------------------- /docs/source/readme_data/germline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/germline.png -------------------------------------------------------------------------------- /docs/source/readme_data/germline.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | \begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (normal_bams) [io, text width=3cm] {Normal Region Bams}; 27 | 28 | \node (samtools_germline) [process, below of=normal_bams] {Samtools Germline Calling}; 29 | \draw[-latex] (normal_bams) -- coordinate (ab) (samtools_germline); 30 
| 31 | \node (germline_out) [io, below of=normal_bams, text width=1.5cm, xshift=4cm] {Germline vcf}; 32 | \draw[-latex] (samtools_germline) -- coordinate (ab) (germline_out); 33 | 34 | \node (mapp) [process, below of=samtools_germline] {Annotate Mappability}; 35 | \draw[-latex] (samtools_germline) -- coordinate (ab) (mapp); 36 | 37 | \node (mapp_out) [io, below of=mapp, text width=2cm] {Mappability vcf}; 38 | \draw[-latex] (mapp) -- coordinate (ab) (mapp_out); 39 | 40 | \node (geno) [process, below of=samtools_germline, xshift=4cm] {Annotate Genotype}; 41 | \draw[-latex] (samtools_germline) -- coordinate (ab) (geno); 42 | 43 | \node (geno_out) [io, below of=geno, text width=2cm] {Genotype vcf}; 44 | \draw[-latex] (geno) -- coordinate (ab) (geno_out); 45 | 46 | \node (snpeff) [process, below of=samtools_germline, xshift=-4cm] {Snpeff}; 47 | \draw[-latex] (samtools_germline) -- coordinate (ab) (snpeff); 48 | 49 | \node (snpeff_out) [io, below of=snpeff, text width=1.5cm] {snpeff vcf}; 50 | \draw[-latex] (snpeff) -- coordinate (ab) (snpeff_out); 51 | 52 | \end{tikzpicture} 53 | 54 | \end{adjustbox} 55 | \end{document} -------------------------------------------------------------------------------- /docs/source/readme_data/hmmcopy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/hmmcopy.png -------------------------------------------------------------------------------- /docs/source/readme_data/hmmcopy.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | \begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (in1) [io] {Input}; 27 | 28 | \node (readcounter) [pprocess, below of=in1] {Readcounter}; 29 | \draw[-latex] (in1) -- coordinate (ab) (readcounter); 30 | \draw (ab) -- ++(0.5,-0.3)coordinate[pos=.3](ab1) coordinate[pos=.6](ab2); 31 | \draw[-latex] (ab1) -- ($(readcounter.north east)!(ab1)!(readcounter.north west)$); 32 | \draw[-latex] (ab2) -- ($(readcounter.north west)!(ab2)!(readcounter.north east)$); 33 | 34 | \node (hmmcopy) [pprocess, below of=readcounter] {HMMCopy}; 35 | 
\draw[-latex] (readcounter.south) -- (hmmcopy.north); 36 | \draw[-latex] ([xshift=0.2 cm]readcounter.south) -- ([xshift=0.2 cm]hmmcopy.north); 37 | \draw[-latex] ([xshift=-0.2 cm]readcounter.south) -- ([xshift=-0.2 cm]hmmcopy.north); 38 | 39 | \node (segs) [io, below of=hmmcopy, xshift = 3cm] {Segments}; 40 | \draw[-latex] (hmmcopy) -- (segs.north); 41 | 42 | \node (params) [io, below of=hmmcopy, xshift = 7cm] {Params}; 43 | \draw[-latex] (hmmcopy) -- (params.north); 44 | 45 | \node (reads) [io, below of=hmmcopy, xshift = -3cm] {Reads}; 46 | \draw[-latex] (hmmcopy) -- (reads.north); 47 | 48 | \node (metrics) [io, below of=hmmcopy, xshift = -7cm] {Metrics}; 49 | \draw[-latex] (hmmcopy) -- (metrics.north); 50 | 51 | \node (merge_metrics) [decision, below of=hmmcopy, yshift=-2cm] {Merge}; 52 | \draw[-latex] (metrics.south) -- (merge_metrics); 53 | \draw[-latex] (params.south) -- (merge_metrics); 54 | \draw[-latex] (reads.south) -- (merge_metrics); 55 | \draw[-latex] (segs.south) -- (merge_metrics); 56 | 57 | \node (plot_metrics) [process, below of=merge_metrics,xshift=7cm] {Plot metrics}; 58 | \draw[-latex] (merge_metrics) -- (plot_metrics.north); 59 | 60 | \node (plot_kernel) [process, below of=merge_metrics, xshift=3cm] {Plot kernel density}; 61 | \draw[-latex] (merge_metrics) -- (plot_kernel.north); 62 | 63 | 64 | \node (plot_hmap) [process, below of=merge_metrics,xshift=-3cm] {Plot heatmap}; 65 | \draw[-latex] (merge_metrics) -- (plot_hmap.north); 66 | 67 | \node (plot_hmm) [process, below of=merge_metrics,xshift=-7cm] {Plot hmmcopy}; 68 | \draw[-latex] (merge_metrics) -- (plot_hmm.north); 69 | 70 | \end{tikzpicture} 71 | 72 | \end{adjustbox} 73 | \end{document} -------------------------------------------------------------------------------- /docs/source/readme_data/infer_haps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/infer_haps.png -------------------------------------------------------------------------------- /docs/source/readme_data/infer_haps.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | 
\begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (normal_bams) [io, text width=3cm] {Normal (WGS/cell) Bam}; 27 | \node (tumour_bams) [io, text width=3cm, xshift=6cm] {Tumour Cell Bams}; 28 | 29 | \node (infer_haps) [process, below of=normal_bams] {Infer Haps}; 30 | \draw[-latex] (normal_bams) -- coordinate (ab) (infer_haps); 31 | 32 | \node (readcounts) [process, below of=tumour_bams] {Extract Read Counts}; 33 | \draw[-latex] (tumour_bams) -- coordinate (ab) (readcounts); 34 | \draw[-latex] (infer_haps) -- coordinate (ab) (readcounts); 35 | 36 | \node (haplotypes) [io, below of=infer_haps, text width=2cm] {Haplotypes}; 37 | \draw[-latex] (infer_haps) -- coordinate (ab) (haplotypes); 38 | 39 | \node (allele_counts) [io, below of=readcounts, text width=2cm] {Read Counts}; 40 | \draw[-latex] (readcounts) -- coordinate (ab) (allele_counts); 41 | 42 | \end{tikzpicture} 43 | 44 | \end{adjustbox} 45 | \end{document} -------------------------------------------------------------------------------- /docs/source/readme_data/merge_cell_bams.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/merge_cell_bams.png -------------------------------------------------------------------------------- /docs/source/readme_data/merge_cell_bams.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | \begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (cell_bams) [io, text width=2.5cm] {Cell Bam files}; 27 | 28 | \node (merge_bams) [process, below of=cell_bams] {merge bams}; 29 | \draw[-latex] (cell_bams) -- coordinate (ab) (merge_bams); 30 | 31 | \node (region_bams) [io, right of=merge_bams, xshift=4cm] {Region bams}; 32 | \draw[-latex] (merge_bams) -- coordinate (ab) (region_bams); 33 | 34 | 35 | \node (index_bams) [process, below of=merge_bams] {index bams}; 36 | \draw[-latex] (merge_bams) -- coordinate (ab) (index_bams); 37 | 38 | \node (region_bams_index) [io, right of=index_bams, xshift=4cm, text width=3cm] {Region bam Indexes}; 39 | \draw[-latex] 
(index_bams) -- coordinate (ab) (region_bams_index); 40 | 41 | 42 | \end{tikzpicture} 43 | 44 | \end{adjustbox} 45 | \end{document} -------------------------------------------------------------------------------- /docs/source/readme_data/pseudo_bulk_qc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/pseudo_bulk_qc.png -------------------------------------------------------------------------------- /docs/source/readme_data/split_wgs_bam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/split_wgs_bam.png -------------------------------------------------------------------------------- /docs/source/readme_data/split_wgs_bam.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | \begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (wgs_bam) [io] {WGS bam}; 27 | 28 | \node (split_bam) [process, below of=wgs_bam] {split bam}; 29 | \draw[-latex] (wgs_bam) -- coordinate (ab) (split_bam); 30 | 31 | \node (region_bams) [io, right of=split_bam, xshift=4cm] {Region bams}; 32 | \draw[-latex] (split_bam) -- coordinate (ab) (region_bams); 33 | 34 | 35 | \node (index_bams) [process, below of=split_bam] {index bams}; 36 | \draw[-latex] (split_bam) -- coordinate (ab) (index_bams); 37 | 38 | \node (region_bams_index) [io, right of=index_bams, xshift=4cm, text width=3cm] {Region bam Indexes}; 39 | \draw[-latex] (index_bams) -- coordinate (ab) (region_bams_index); 40 | 41 | 42 | \end{tikzpicture} 43 | 44 | \end{adjustbox} 45 | \end{document} -------------------------------------------------------------------------------- /docs/source/readme_data/variant_calling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/variant_calling.png 
-------------------------------------------------------------------------------- /docs/source/readme_data/variant_counting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/docs/source/readme_data/variant_counting.png -------------------------------------------------------------------------------- /docs/source/readme_data/variant_counting.tikz: -------------------------------------------------------------------------------- 1 | \documentclass[class=minimal,border=2pt]{standalone} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{tikz} 4 | \usepackage{adjustbox} 5 | \usetikzlibrary{shapes.geometric, arrows, shadows, positioning, calc} 6 | \pagenumbering{gobble} 7 | 8 | \tikzset{multiple/.style = {double copy shadow={shadow xshift=1ex,shadow 9 | yshift=-1.5ex,draw=black!30},fill=white,draw=black,thick,minimum height = 1cm,minimum 10 | width=2cm}, 11 | ordinary/.style = {rectangle,draw,thick,minimum height = 1cm,minimum width=2cm}} 12 | 13 | \tikzstyle{startstop} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm,text centered, draw=black, fill=red!30] 14 | \tikzstyle{io} = [trapezium, trapezium left angle=70, trapezium right angle=110, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=blue!30] 15 | \tikzstyle{process} = [rectangle, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 16 | \tikzstyle{pprocess} = [multiple, minimum width=3cm, minimum height=1cm, text centered, text width=3cm, draw=black, fill=orange!30] 17 | 18 | \tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1cm, text centered, draw=black, fill=green!30] 19 | \tikzstyle{arrow} = [thick,->,>=stealth] 20 | 21 | \begin{document} 22 | \begin{adjustbox}{margin=2cm} 23 | 24 | \begin{tikzpicture}[node distance=2cm] 25 | 26 | \node (museq_vcf) [io, text width=1.5cm, xshift=4cm] {Museq vcf file}; 27 | \node (strelka_vcf) [io, text width=1.5cm, xshift=8cm] {strelka vcf}; 28 | 29 | \node (merge_snv) [decision, text width=1.5cm, below of=museq_vcf, xshift=2cm] {Merge calls}; 30 | \draw[-latex] (museq_vcf) -- coordinate (ab) (merge_snv); 31 | \draw[-latex] (strelka_vcf) -- coordinate (ab) (merge_snv); 32 | 33 | \node (tumour_bams) [io, text width=2cm, below of=merge_snv, xshift=-4cm, yshift=-1cm] {Tumour Region Bams}; 34 | 35 | \node (counting) [pprocess, text width=1.5cm, below of=merge_snv, yshift=-1cm] {Generate Count}; 36 | \draw[-latex] (merge_snv) -- coordinate (ab) (counting); 37 | \draw[-latex] (tumour_bams) -- coordinate (ab) (counting); 38 | 39 | \node (counts) [io, text width=1.5cm, below of=counting] {Counts csv}; 40 | \draw[-latex] (counting) -- coordinate (ab) (counts); 41 | 42 | \end{tikzpicture} 43 | 44 | \end{adjustbox} 45 | \end{document} -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | [versioneer] 4 | VCS = git 5 | style = pep440 6 | versionfile_source = single_cell/_version.py 7 | versionfile_build = single_cell/_version.py 8 | tag_prefix = v -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import versioneer 3 | 4 | 5 | setup( 6 | 
name='single_cell', 7 | packages=find_packages(), 8 | version=versioneer.get_version(), 9 | cmdclass=versioneer.get_cmdclass(), 10 | description='Single cell pipeline', 11 | author='Andrew McPherson', 12 | author_email='andrew.mcpherson@gmail.com', 13 | entry_points={'console_scripts': ['single_cell = single_cell.run:main']}, 14 | package_data={'':['scripts/*.py', 'scripts/*.R', 'scripts/*.npz', "config/*.yaml", 'scripts/*.Rmd', 'scripts/*.sh', "data/*"]} 15 | ) 16 | -------------------------------------------------------------------------------- /single_cell/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from ._version import get_versions 3 | __version__ = get_versions()['version'] 4 | del get_versions 5 | -------------------------------------------------------------------------------- /single_cell/annotation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Feb 19, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | import os 7 | import sys 8 | 9 | import pypeliner.managed as mgd 10 | from single_cell.utils import inpututils 11 | from single_cell.workflows import qc_annotation 12 | 13 | import pypeliner 14 | 15 | 16 | def annotation_workflow(args): 17 | config = inpututils.load_config(args) 18 | 19 | annotation_infiles = inpututils.load_yaml(args['input_yaml']) 20 | 21 | lib = args["library_id"] 22 | 23 | workflow = pypeliner.workflow.Workflow() 24 | 25 | annotation_dir = args["output_prefix"] 26 | 27 | input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml') 28 | annotation_files = get_output_files(annotation_dir) 29 | annotation_meta = os.path.join(args['out_dir'], 'metadata.yaml') 30 | 31 | workflow.subworkflow( 32 | name='annotation_workflow', 33 | func=qc_annotation.create_qc_annotation_workflow, 34 | args=( 35 | mgd.InputFile(annotation_infiles['hmmcopy_metrics']), 36 | mgd.InputFile(annotation_infiles['hmmcopy_reads']), 37 | mgd.InputFile(annotation_infiles['alignment_metrics']), 38 | mgd.InputFile(annotation_infiles['gc_metrics']), 39 | mgd.InputFile(annotation_infiles['segs_pdf_tar']), 40 | mgd.OutputFile(annotation_files['merged_metrics_csvs']), 41 | mgd.OutputFile(annotation_files['qc_report']), 42 | mgd.OutputFile(annotation_files['segs_pass']), 43 | mgd.OutputFile(annotation_files['segs_fail']), 44 | mgd.OutputFile(annotation_files['heatmap_filt_pdf']), 45 | config['annotation'], 46 | ) 47 | ) 48 | 49 | workflow.transform( 50 | name='generate_meta_files_results', 51 | func='single_cell.utils.helpers.generate_and_upload_metadata', 52 | args=( 53 | sys.argv[0:], 54 | args['out_dir'], 55 | list(annotation_files.values()), 56 | mgd.OutputFile(annotation_meta) 57 | ), 58 | kwargs={ 59 | 'input_yaml_data': inpututils.load_yaml(args['input_yaml']), 60 | 'input_yaml': mgd.OutputFile(input_yaml_blob), 61 | 'metadata': { 62 | 'library_id': lib, 63 | 'type': 'annotation' 64 | } 65 | } 66 | ) 67 | 68 | return workflow 69 | 70 | 71 | def get_output_files(outdir): 72 | data = { 73 | 'merged_metrics_csvs': outdir + 'metrics.csv.gz', 74 | 'qc_report': outdir + 'QC_report.html', 75 | 'segs_pass': outdir + 'segs_pass.tar.gz', 76 | 'segs_fail': outdir + 'segs_fail.tar.gz', 77 | 'heatmap_filt_pdf': outdir + 'heatmap_by_ec_filtered.pdf', 78 | } 79 | 80 | return data 81 | 82 | 83 | def annotation_pipeline(args): 84 | pyp = pypeliner.app.Pypeline(config=args) 85 | 86 | workflow = annotation_workflow(args) 87 | 88 | pyp.run(workflow) 89 | 
-------------------------------------------------------------------------------- /single_cell/clean_sentinels.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 9, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | import fnmatch 8 | import os 9 | 10 | from pypeliner.sqlitedb import SqliteDb 11 | 12 | 13 | def clean_sentinels(args): 14 | dirname = args["pipelinedir"] 15 | 16 | rundir, pattern = args["pattern"] 17 | 18 | rundir = os.path.join(dirname, rundir) 19 | 20 | if args["mode"] == "list": 21 | list_sentinels(rundir, pattern) 22 | else: 23 | delete_sentinels(rundir, pattern) 24 | 25 | 26 | def list_sentinels(dirname, pattern): 27 | jobs_shelf = os.path.join(dirname, "jobs.db") 28 | 29 | jobs = SqliteDb(jobs_shelf) 30 | 31 | job_matches = [v for v in jobs.keys() if fnmatch.fnmatch(v, pattern)] 32 | 33 | jobs.close() 34 | 35 | matches = job_matches 36 | 37 | matches = '\n'.join(matches) 38 | 39 | print(matches) 40 | 41 | 42 | def delete_sentinels(dirname, pattern): 43 | jobs_shelf = os.path.join(dirname, "jobs.db") 44 | 45 | jobs = SqliteDb(jobs_shelf) 46 | 47 | for job in jobs.keys(): 48 | if fnmatch.fnmatch(job, pattern): 49 | jobs.delete(job) 50 | 51 | jobs.close() 52 | -------------------------------------------------------------------------------- /single_cell/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/single_cell/config/__init__.py -------------------------------------------------------------------------------- /single_cell/config/generate_batch_config.py: -------------------------------------------------------------------------------- 1 | 2 | import logging 3 | import os 4 | 5 | from single_cell.config import batch 6 | from single_cell.utils import helpers 7 | 8 | 9 | def generate_submit_config_in_temp(args): 10 | 11 | if args['which'] in ['clean_sentinels', 'generate_config']: 12 | return args 13 | 14 | if args.get("submit_config", None): 15 | return args 16 | 17 | azure_submit = ['azurebatch', 18 | 'pypeliner.contrib.azure.batchqueue.AzureJobQueue'] 19 | if not args.get("submit", None) in azure_submit: 20 | return args 21 | 22 | batch_yaml = "batch.yaml" 23 | tmpdir = args.get("tmpdir", None) 24 | pipelinedir = args.get("pipelinedir", None) 25 | 26 | # use pypeliner tmpdir to store yaml 27 | if pipelinedir: 28 | batch_yaml = os.path.join(pipelinedir, batch_yaml) 29 | elif tmpdir: 30 | batch_yaml = os.path.join(tmpdir, batch_yaml) 31 | else: 32 | logging.getLogger("single_cell.generate_batch_config").warn( 33 | "no tmpdir specified, generating configs in working dir" 34 | ) 35 | batch_yaml = os.path.join(os.getcwd(), batch_yaml) 36 | 37 | helpers.makedirs(batch_yaml, isfile=True) 38 | 39 | batch_yaml = helpers.get_incrementing_filename(batch_yaml) 40 | 41 | params_override = args["config_override"] 42 | if not params_override: 43 | params_override = {} 44 | 45 | config_params = batch.get_batch_params(override=params_override) 46 | config = batch.get_batch_config(config_params, override=params_override) 47 | batch.write_config(config, batch_yaml) 48 | 49 | args["submit_config"] = batch_yaml 50 | 51 | return args 52 | -------------------------------------------------------------------------------- /single_cell/config/generate_pipeline_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 |
from single_cell.config import pipeline_config 4 | from single_cell.utils import helpers 5 | 6 | 7 | def generate_pipeline_config_in_temp(args): 8 | 9 | if args['which'] in ['clean_sentinels', 'generate_config']: 10 | return args 11 | 12 | if args.get("config_file", None): 13 | return args 14 | 15 | config_yaml = "config.yaml" 16 | tmpdir = args.get("tmpdir", None) 17 | pipelinedir = args.get("pipelinedir", None) 18 | 19 | # use pypeliner tmpdir to store yaml 20 | if pipelinedir: 21 | config_yaml = os.path.join(pipelinedir, config_yaml) 22 | elif tmpdir: 23 | config_yaml = os.path.join(tmpdir, config_yaml) 24 | else: 25 | logging.getLogger("single_cell.generate_pipeline_config").warn( 26 | "no tmpdir specified, generating configs in working dir" 27 | ) 28 | config_yaml = os.path.join(os.getcwd(), config_yaml) 29 | 30 | config_yaml = helpers.get_incrementing_filename(config_yaml) 31 | 32 | params_override = args["config_override"] 33 | 34 | helpers.makedirs(config_yaml, isfile=True) 35 | 36 | config_params = pipeline_config.get_config_params(override=params_override) 37 | config = pipeline_config.get_singlecell_pipeline_config(config_params, override=params_override) 38 | pipeline_config.write_config(config, config_yaml) 39 | 40 | args["config_file"] = config_yaml 41 | 42 | return args 43 | -------------------------------------------------------------------------------- /single_cell/generate_config.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 9, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | from single_cell.config import pipeline_config 8 | from single_cell.config import batch 9 | 10 | 11 | def generate_config(args): 12 | config_yaml = args.get("pipeline_config") 13 | batch_yaml = args.get("batch_config") 14 | params_override = args.get("config_override") 15 | 16 | if config_yaml: 17 | config_params = pipeline_config.get_config_params(override=params_override) 18 | config = pipeline_config.get_singlecell_pipeline_config(config_params) 19 | pipeline_config.write_config(config, config_yaml) 20 | 21 | if batch_yaml: 22 | config_params = batch.get_batch_params(override=params_override) 23 | config = batch.get_batch_config(config_params) 24 | batch.write_config(config, batch_yaml) 25 | -------------------------------------------------------------------------------- /single_cell/hmmcopy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Feb 19, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | import os 7 | import sys 8 | 9 | import pypeliner 10 | import pypeliner.managed as mgd 11 | from single_cell.utils import inpututils 12 | from single_cell.workflows import hmmcopy 13 | 14 | 15 | def get_output_files(outdir): 16 | data = { 17 | 'reads_csvs': outdir + 'reads.csv.gz', 18 | 'segs_csvs': outdir + 'segments.csv.gz', 19 | 'params_csvs': outdir + 'params.csv.gz', 20 | 'metrics_csvs': outdir + 'hmmcopy_metrics.csv.gz', 21 | 'hmmcopy_data_tar': outdir + 'hmmcopy_data.tar.gz', 22 | 'igv_csvs': outdir + 'igv_segments.seg', 23 | 'segs_pdf': outdir + 'segs.tar.gz', 24 | 'bias_pdf': outdir + 'bias.tar.gz', 25 | 'heatmap_pdf': outdir + 'heatmap_by_ec.pdf', 26 | 'metrics_pdf': outdir + 'hmmcopy_metrics.pdf', 27 | 'kernel_density_pdf': outdir + 'kernel_density.pdf', 28 | } 29 | 30 | return data 31 | 32 | 33 | def hmmcopy_workflow(args): 34 | config = inpututils.load_config(args) 35 | config = config['hmmcopy'] 36 | 37 | sampleinfo = inpututils.get_sample_info(args['input_yaml']) 38 | cellids = 
inpututils.get_samples(args['input_yaml']) 39 | bam_files = inpututils.get_bams(args['input_yaml']) 40 | 41 | lib = args["library_id"] 42 | 43 | workflow = pypeliner.workflow.Workflow() 44 | 45 | hmmcopy_prefix = args["output_prefix"] 46 | 47 | hmmcopy_files = get_output_files(hmmcopy_prefix) 48 | hmmcopy_meta = os.path.join(args['out_dir'], 'metadata.yaml') 49 | input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml') 50 | 51 | workflow.setobj( 52 | obj=mgd.OutputChunks('cell_id'), 53 | value=list(bam_files.keys()), 54 | ) 55 | 56 | workflow.subworkflow( 57 | name='hmmcopy_workflow', 58 | func=hmmcopy.create_hmmcopy_workflow, 59 | args=( 60 | mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files, extensions=['.bai']), 61 | mgd.OutputFile(hmmcopy_files['reads_csvs']), 62 | mgd.OutputFile(hmmcopy_files['segs_csvs']), 63 | mgd.OutputFile(hmmcopy_files['metrics_csvs']), 64 | mgd.OutputFile(hmmcopy_files['params_csvs']), 65 | mgd.OutputFile(hmmcopy_files['igv_csvs']), 66 | mgd.OutputFile(hmmcopy_files['segs_pdf']), 67 | mgd.OutputFile(hmmcopy_files['bias_pdf']), 68 | mgd.OutputFile(hmmcopy_files['heatmap_pdf']), 69 | mgd.OutputFile(hmmcopy_files['metrics_pdf']), 70 | mgd.OutputFile(hmmcopy_files['kernel_density_pdf']), 71 | mgd.OutputFile(hmmcopy_files['hmmcopy_data_tar']), 72 | cellids, 73 | config, 74 | sampleinfo 75 | ), 76 | ) 77 | 78 | workflow.transform( 79 | name='generate_meta_files_results', 80 | func='single_cell.utils.helpers.generate_and_upload_metadata', 81 | args=( 82 | sys.argv[0:], 83 | args['out_dir'], 84 | list(hmmcopy_files.values()), 85 | mgd.OutputFile(hmmcopy_meta) 86 | ), 87 | kwargs={ 88 | 'input_yaml_data': inpututils.load_yaml(args['input_yaml']), 89 | 'input_yaml': mgd.OutputFile(input_yaml_blob), 90 | 'metadata': { 91 | 'library_id': lib, 92 | 'cell_ids': list(bam_files.keys()), 93 | 'type': 'hmmcopy', 94 | } 95 | } 96 | ) 97 | 98 | return workflow 99 | 100 | 101 | def hmmcopy_pipeline(args): 102 | pyp = pypeliner.app.Pypeline(config=args) 103 | 104 | workflow = hmmcopy_workflow(args) 105 | 106 | pyp.run(workflow) 107 | -------------------------------------------------------------------------------- /single_cell/merge_bams.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Feb 22, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | import os 7 | import sys 8 | 9 | import pypeliner 10 | import pypeliner.managed as mgd 11 | from single_cell.utils import inpututils 12 | from single_cell.workflows import merge_bams 13 | 14 | 15 | def merge_bams_workflow(args): 16 | config = inpututils.load_config(args) 17 | config = config['merge_bams'] 18 | 19 | ctx = {'mem_retry_increment': 2, 'disk_retry_increment': 50, 20 | 'ncpus': 1, 'mem': config["memory"]['low']} 21 | workflow = pypeliner.workflow.Workflow(ctx=ctx) 22 | 23 | bam_files = inpututils.load_merge_cell_bams(args['input_yaml']) 24 | 25 | merge_out_template = args['output_prefix'] + '{region}.bam' 26 | meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml') 27 | input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml') 28 | 29 | workflow.setobj( 30 | obj=mgd.OutputChunks('cell_id'), 31 | value=list(bam_files.keys()), 32 | ) 33 | 34 | workflow.transform( 35 | name="get_regions", 36 | func="single_cell.utils.pysamutils.get_regions_from_reference", 37 | ret=pypeliner.managed.OutputChunks('region'), 38 | args=( 39 | config["ref_genome"], 40 | config["split_size"], 41 | config["chromosomes"], 42 | ) 43 | ) 44 | 45 | workflow.transform( 46 | 
name="remove_softclipped_reads", 47 | func="single_cell.utils.pysamutils.remove_softclipped_reads", 48 | axes=('cell_id',), 49 | args=( 50 | mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files, extensions=['.bai']), 51 | mgd.TempOutputFile('bam_rm_softclipped.bam', 'cell_id', extensions=['.bai']), 52 | args['softclipped_reads_threshold'] 53 | ) 54 | ) 55 | 56 | workflow.subworkflow( 57 | name="wgs_merge_workflow", 58 | func=merge_bams.create_merge_bams_workflow, 59 | args=( 60 | mgd.TempInputFile('bam_rm_softclipped.bam', 'cell_id', extensions=['.bai']), 61 | mgd.OutputFile("merged.bam", "region", axes_origin=[], extensions=['.bai'], template=merge_out_template), 62 | mgd.InputChunks("region"), 63 | config, 64 | ) 65 | ) 66 | 67 | workflow.transform( 68 | name='generate_meta_files_results', 69 | func='single_cell.utils.helpers.generate_and_upload_metadata', 70 | args=( 71 | sys.argv[0:], 72 | args['out_dir'], 73 | mgd.Template('bam_filenames', 'region', template=merge_out_template), 74 | mgd.OutputFile(meta_yaml) 75 | ), 76 | kwargs={ 77 | 'input_yaml_data': inpututils.load_yaml(args['input_yaml']), 78 | 'input_yaml': mgd.OutputFile(input_yaml_blob), 79 | 'template': (mgd.InputChunks('region'), merge_out_template, 'region'), 80 | 'metadata': { 81 | 'type': 'pseudowgs_regionbams', 82 | 'cell_ids': list(bam_files.keys())} 83 | 84 | } 85 | ) 86 | 87 | return workflow 88 | 89 | 90 | def merge_bams_pipeline(args): 91 | pyp = pypeliner.app.Pypeline(config=args) 92 | 93 | workflow = merge_bams_workflow(args) 94 | 95 | pyp.run(workflow) 96 | -------------------------------------------------------------------------------- /single_cell/run.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from single_cell.alignment import alignment_pipeline 4 | from single_cell.annotation import annotation_pipeline 5 | from single_cell.breakpoint_calling import breakpoint_calling_pipeline 6 | from single_cell.clean_sentinels import clean_sentinels 7 | from single_cell.cmdline import parse_args 8 | from single_cell.generate_config import generate_config 9 | from single_cell.germline_calling import germline_calling_pipeline 10 | from single_cell.hmmcopy import hmmcopy_pipeline 11 | from single_cell.infer_haps import count_haps_pipeline 12 | from single_cell.infer_haps import infer_haps_pipeline 13 | from single_cell.merge_bams import merge_bams_pipeline 14 | from single_cell.sample_qc import sample_qc_pipeline 15 | from single_cell.snv_genotyping import snv_genotyping_pipeline 16 | from single_cell.split_bam import split_bam_pipeline 17 | from single_cell.sv_genotyping import sv_genotyping_pipeline 18 | from single_cell.variant_calling import variant_calling_pipeline 19 | from single_cell.cohort_qc import cohort_qc_pipeline 20 | 21 | 22 | def main(): 23 | args = parse_args() 24 | 25 | if args["which"] == "generate_config": 26 | generate_config(args) 27 | return 28 | 29 | if args["which"] == "clean_sentinels": 30 | clean_sentinels(args) 31 | return 32 | 33 | if args["which"] == "alignment": 34 | alignment_pipeline(args) 35 | 36 | if args["which"] == "hmmcopy": 37 | hmmcopy_pipeline(args) 38 | 39 | if args["which"] == "annotation": 40 | annotation_pipeline(args) 41 | 42 | if args["which"] == "merge_cell_bams": 43 | merge_bams_pipeline(args) 44 | 45 | if args["which"] == "split_wgs_bam": 46 | split_bam_pipeline(args) 47 | 48 | if args["which"] == "variant_calling": 49 | variant_calling_pipeline(args) 50 | 51 | if args["which"] == "germline_calling": 52 | 
germline_calling_pipeline(args) 53 | 54 | if args["which"] == "infer_haps": 55 | infer_haps_pipeline(args) 56 | 57 | if args["which"] == "count_haps": 58 | count_haps_pipeline(args) 59 | 60 | if args["which"] == "breakpoint_calling": 61 | breakpoint_calling_pipeline(args) 62 | 63 | if args["which"] == "snv_genotyping": 64 | snv_genotyping_pipeline(args) 65 | 66 | if args["which"] == "sv_genotyping": 67 | sv_genotyping_pipeline(args) 68 | 69 | if args["which"] == "sample_qc": 70 | sample_qc_pipeline(args) 71 | 72 | if args["which"] == "cohort_qc": 73 | cohort_qc_pipeline(args) 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /single_cell/snv_genotyping.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pypeliner 5 | import pypeliner.managed as mgd 6 | from single_cell.utils import inpututils 7 | 8 | 9 | def create_variant_counting_workflow(args): 10 | """ Count variant reads for multiple sets of variants across cells. 11 | """ 12 | 13 | vcf_files, tumour_cell_bams, sample_library = inpututils.load_variant_counting_input( 14 | args['input_yaml'] 15 | ) 16 | 17 | counts_template = '{sample_id}_{library_id}_counts.csv.gz' 18 | counts_output_template = args['output_prefix'] + counts_template 19 | 20 | meta_yaml = os.path.join(args["out_dir"], 'metadata.yaml') 21 | input_yaml_blob = os.path.join(args["out_dir"], 'input.yaml') 22 | 23 | config = inpututils.load_config(args) 24 | config = config['variant_calling'] 25 | 26 | workflow = pypeliner.workflow.Workflow() 27 | 28 | workflow.setobj( 29 | obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'), 30 | value=list(tumour_cell_bams.keys()), 31 | ) 32 | 33 | workflow.transform( 34 | name='merge_snvs_museq', 35 | func='single_cell.utils.vcfutils.merge_vcf', 36 | args=( 37 | [mgd.InputFile(vcf_file, extensions=['.tbi','.csi']) for vcf_file in vcf_files], 38 | mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']), 39 | mgd.TempSpace("merge_vcf_temp") 40 | ), 41 | ) 42 | 43 | workflow.subworkflow( 44 | name='count_alleles', 45 | axes=('sample_id', 'library_id'), 46 | func='single_cell.workflows.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow', 47 | args=( 48 | mgd.InputFile('tumour_cells.bam', 'sample_id', 'library_id', 'cell_id', extensions=['.bai'], 49 | fnames=tumour_cell_bams, axes_origin=[]), 50 | mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']), 51 | mgd.OutputFile('counts.csv.gz', 'sample_id', 'library_id', template=counts_output_template), 52 | mgd.Instance('sample_id'), 53 | mgd.Instance('library_id'), 54 | config['memory'], 55 | ), 56 | ) 57 | 58 | workflow.transform( 59 | name='generate_meta_files_results', 60 | func='single_cell.utils.helpers.generate_and_upload_metadata', 61 | args=( 62 | sys.argv[0:], 63 | args['out_dir'], 64 | mgd.Template('counts.csv.gz', 'sample_id', 'library_id', template=counts_output_template), 65 | mgd.OutputFile(meta_yaml) 66 | ), 67 | kwargs={ 68 | 'input_yaml_data': inpututils.load_yaml(args['input_yaml']), 69 | 'input_yaml': mgd.OutputFile(input_yaml_blob), 70 | 'metadata': { 71 | 'type': 'snv_genotyping', 72 | 'counts': { 73 | 'template': counts_template, 74 | 'instances': sample_library, 75 | } 76 | } 77 | } 78 | ) 79 | 80 | return workflow 81 | 82 | 83 | def snv_genotyping_pipeline(args): 84 | pyp = pypeliner.app.Pypeline(config=args) 85 | 86 | workflow = create_variant_counting_workflow(args) 
87 | 88 | pyp.run(workflow) 89 | -------------------------------------------------------------------------------- /single_cell/split_bam.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 6, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | import os 7 | import sys 8 | 9 | import pypeliner 10 | import pypeliner.managed as mgd 11 | from single_cell.utils import inpututils 12 | from single_cell.workflows import split_bams 13 | 14 | 15 | def split_bam_workflow(args): 16 | config = inpututils.load_config(args) 17 | config = config['split_bam'] 18 | 19 | bam_file = inpututils.load_split_wgs_input(args['input_yaml']) 20 | 21 | split_bam_template = args['output_prefix'] + '{region}.bam' 22 | 23 | meta_yaml = os.path.join(args["out_dir"], 'metadata.yaml') 24 | input_yaml_blob = os.path.join(args["out_dir"], 'input.yaml') 25 | 26 | workflow = pypeliner.workflow.Workflow() 27 | 28 | workflow.transform( 29 | name="get_regions", 30 | ctx={'mem': config['memory']['low'], 'ncpus': 1}, 31 | func="single_cell.utils.pysamutils.get_regions_from_reference", 32 | ret=pypeliner.managed.OutputChunks('region'), 33 | args=( 34 | config["ref_genome"], 35 | config["split_size"], 36 | config["chromosomes"], 37 | ) 38 | ) 39 | 40 | workflow.subworkflow( 41 | name="split_normal", 42 | func=split_bams.create_split_workflow, 43 | ctx={'mem': config['memory']['low'], 'ncpus': 1}, 44 | args=( 45 | mgd.InputFile(bam_file), 46 | mgd.OutputFile( 47 | "normal.split.bam", 'region', 48 | template=split_bam_template, axes_origin=[] 49 | ), 50 | pypeliner.managed.InputChunks('region'), 51 | config, 52 | ), 53 | ) 54 | 55 | workflow.transform( 56 | name='generate_meta_files_results', 57 | func='single_cell.utils.helpers.generate_and_upload_metadata', 58 | args=( 59 | sys.argv[0:], 60 | args['output_prefix'], 61 | mgd.Template('bam_filenames', 'region', template=split_bam_template), 62 | mgd.OutputFile(meta_yaml) 63 | ), 64 | kwargs={ 65 | 'input_yaml_data': inpututils.load_yaml(args['input_yaml']), 66 | 'input_yaml': mgd.OutputFile(input_yaml_blob), 67 | 'metadata': {'type': 'wgs_regionbams'}, 68 | 'template': (mgd.InputChunks('region'), split_bam_template, 'region'), 69 | } 70 | ) 71 | 72 | return workflow 73 | 74 | 75 | def split_bam_pipeline(args): 76 | pyp = pypeliner.app.Pypeline(config=args) 77 | 78 | workflow = split_bam_workflow(args) 79 | 80 | pyp.run(workflow) 81 | -------------------------------------------------------------------------------- /single_cell/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/single_cell/tests/__init__.py -------------------------------------------------------------------------------- /single_cell/tests/codebuild/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/single_cell/tests/codebuild/__init__.py -------------------------------------------------------------------------------- /single_cell/tests/codebuild/align/align.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p ALIGN/ref_test_data 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e 
AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/alignment ALIGN/ref_test_data --recursive --quiet 13 | 14 | 15 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 16 | $1/single_cell_pipeline_alignment:$TAG \ 17 | single_cell alignment --input_yaml single_cell/tests/codebuild/align/inputs.yaml \ 18 | --library_id A97318A --maxjobs 1 --nocleanup --sentinel_only \ 19 | --submit local --loglevel DEBUG \ 20 | --tmpdir ALIGN/temp \ 21 | --pipelinedir ALIGN/pipeline \ 22 | --submit local \ 23 | --output_prefix ALIGN/output/A97318A \ 24 | --bams_dir ALIGN/bams \ 25 | --sequencing_center TEST --trim 26 | 27 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 28 | $1/single_cell_pipeline_alignment:$TAG \ 29 | python single_cell/tests/codebuild/align/test_alignment.py ALIGN/output A97318A ALIGN/ref_test_data/refdata/bwa-mem 30 | 31 | 32 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_alignment:$TAG rm -rf ALIGN 33 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/align/inputs.yaml: -------------------------------------------------------------------------------- 1 | SA1090-A96213A-R20-C28: 2 | column: 28 3 | condition: B 4 | fastqs: 5 | HHCJ7CCXY_5.HGTJJCCXY_8.HYG5LCCXY_6.HYG5LCCXY_7.HYG5LCCXY_5: 6 | fastq_1: ALIGN/ref_test_data/testdata/SA1090-A96213A-R20-C28_1.fastq.gz 7 | fastq_2: ALIGN/ref_test_data/testdata/SA1090-A96213A-R20-C28_2.fastq.gz 8 | img_col: 45 9 | index_i5: i5-20 10 | index_i7: i7-28 11 | pick_met: C1 12 | primer_i5: GTATAG 13 | primer_i7: CTATCT 14 | row: 20 15 | sample_id: SA1090 16 | library_id: A96213A 17 | is_control: True 18 | SA1090-A96213A-R20-C62: 19 | column: 62 20 | condition: B 21 | fastqs: 22 | HHCJ7CCXY_5.HGTJJCCXY_8.HYG5LCCXY_6.HYG5LCCXY_7.HYG5LCCXY_5: 23 | fastq_1: ALIGN/ref_test_data/testdata/SA1090-A96213A-R20-C62_1.fastq.gz 24 | fastq_2: ALIGN/ref_test_data/testdata/SA1090-A96213A-R20-C62_2.fastq.gz 25 | img_col: 11 26 | index_i5: i5-20 27 | index_i7: i7-62 28 | pick_met: C1 29 | primer_i5: GTATAG 30 | primer_i7: AAGCTA 31 | row: 20 32 | sample_id: SA1090 33 | library_id: A96213A 34 | is_control: False 35 | SA1090-A96213A-R22-C43: 36 | column: 43 37 | condition: B 38 | fastqs: 39 | HHCJ7CCXY_5.HGTJJCCXY_8.HYG5LCCXY_6.HYG5LCCXY_7.HYG5LCCXY_5: 40 | fastq_1: ALIGN/ref_test_data/testdata/SA1090-A96213A-R22-C43_1.fastq.gz 41 | fastq_2: ALIGN/ref_test_data/testdata/SA1090-A96213A-R22-C43_2.fastq.gz 42 | img_col: 30 43 | index_i5: i5-22 44 | index_i7: i7-43 45 | pick_met: C2 46 | primer_i5: GCTGTA 47 | primer_i7: ATTCCG 48 | row: 22 49 | sample_id: SA1090 50 | library_id: A96213A 51 | is_control: False 52 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/align/test_alignment.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | 5 | def get_inputs(path, library_id): 6 | ''' 7 | get metrics and gc metrics given a directory and library 8 | :param path: path to metrics files 9 | :param library_id: library id associated with metrics files 10 | ''' 11 | metrics = os.path.join(path, library_id) 12 | metrics += "_alignment_metrics.csv.gz" 13 | 14 | gc_metrics = os.path.join(path, library_id) 15 | gc_metrics += "_gc_metrics.csv.gz" 16 | 17 | return metrics, gc_metrics 18 | 19 | def compare_alignment(ref_metrics, 
metrics, 20 | ref_gc_metrics, gc_metrics): 21 | 22 | compare.compare_metrics(ref_metrics, metrics) 23 | compare.compare_metrics(ref_gc_metrics, gc_metrics) 24 | 25 | if __name__ == "__main__": 26 | 27 | output_path = sys.argv[1] 28 | output_lib = sys.argv[2] 29 | 30 | ref_path = sys.argv[3] 31 | ref_lib = "A97318A" 32 | 33 | ref_metrics, ref_gc_metrics = get_inputs(ref_path, "A97318A") 34 | metrics, gc_metrics = get_inputs(output_path, output_lib) 35 | 36 | compare_alignment(ref_metrics, metrics, 37 | ref_gc_metrics, gc_metrics) 38 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/annotation/annotation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | NUMCORES=`nproc --all` 7 | 8 | mkdir -p ANNOTATION/ref_test_data 9 | 10 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 11 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/annotation ANNOTATION/ref_test_data --recursive --quiet 12 | 13 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 14 | $1/single_cell_pipeline_annotation:$TAG \ 15 | single_cell annotation --input_yaml single_cell/tests/codebuild/annotation/inputs.yaml \ 16 | --library_id A97318A --maxjobs $NUMCORES --nocleanup --sentinel_only \ 17 | --submit local --loglevel DEBUG \ 18 | --tmpdir ANNOTATION/temp \ 19 | --pipelinedir ANNOTATION/pipeline \ 20 | --submit local \ 21 | --output_prefix ANNOTATION/output/A97318A \ 22 | --config_override '{"annotation": {"chromosomes": ["6", "8", "17"]}}' \ 23 | --no_corrupt_tree 24 | 25 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 26 | $1/single_cell_pipeline_annotation:$TAG \ 27 | python single_cell/tests/codebuild/annotation/test_annotation.py ANNOTATION/output A97318A ANNOTATION/ref_test_data/refdata 28 | 29 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_annotation:$TAG rm -rf ANNOTATION 30 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/annotation/inputs.yaml: -------------------------------------------------------------------------------- 1 | hmmcopy_metrics: ANNOTATION/ref_test_data/testdata/A96213A_hmmcopy_metrics.csv.gz 2 | hmmcopy_reads: ANNOTATION/ref_test_data/testdata/A96213A_reads.csv.gz 3 | alignment_metrics: ANNOTATION/ref_test_data/testdata/A96213A_alignment_metrics.csv.gz 4 | gc_metrics: ANNOTATION/ref_test_data/testdata/A96213A_gc_metrics.csv.gz 5 | segs_pdf_tar: ANNOTATION/ref_test_data/testdata/A96213A_segs.tar.gz 6 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/annotation/test_annotation.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | 5 | 6 | def get_inputs(path, library_id): 7 | ''' 8 | get metrics and gc metrics given a directory and library 9 | :param path: path to metrics files 10 | :param library_id: library id associated with metrics files 11 | ''' 12 | metrics = os.path.join(path, library_id) 13 | metrics += "_metrics.csv.gz" 14 | 15 | return metrics 16 | 17 | if __name__ == "__main__": 18 | output_path = sys.argv[1] 19 | output_lib = sys.argv[2] 20 | 21 | ref_path = sys.argv[3] 22 | ref_lib = "A97318A" 23 | 24 | ref_metrics = 
get_inputs(ref_path, "A97318A") 25 | metrics = get_inputs(output_path, output_lib) 26 | 27 | compare.compare_annotation_metrics(ref_metrics, metrics) 28 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/breakpoint_calling/breakpoint_calling.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p BREAKPOINT_CALLING/ref_test_data 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/breakpoint-calling BREAKPOINT_CALLING/ref_test_data --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_breakpoint:$TAG \ 16 | single_cell breakpoint_calling \ 17 | --input_yaml single_cell/tests/codebuild/breakpoint_calling/inputs.yaml \ 18 | --maxjobs $NUMCORES \ 19 | --nocleanup \ 20 | --sentinel_only \ 21 | --submit local \ 22 | --loglevel DEBUG \ 23 | --tmpdir BREAKPOINT_CALLING/temp \ 24 | --pipelinedir BREAKPOINT_CALLING/pipeline \ 25 | --submit local \ 26 | --output_prefix BREAKPOINT_CALLING/output/ \ 27 | --config_override '{"variant_calling": {"chromosomes": ["6", "8", "17"]}}' 28 | 29 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 30 | $1/single_cell_pipeline_breakpoint:$TAG \ 31 | python single_cell/tests/codebuild/breakpoint_calling/test_breakpoint_calling.py BREAKPOINT_CALLING/output BREAKPOINT_CALLING/ref_test_data/refdata 32 | 33 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_breakpoint:$TAG rm -rf BREAKPOINT_CALLING 34 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/breakpoint_calling/test_breakpoint_calling.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | from single_cell.utils import csvutils 5 | 6 | def get_inputs(path): 7 | """" 8 | get metrics and gc metrics given a directory and library 9 | :param path: path to metrics files 10 | """ 11 | 12 | must_exist = ["destruct_breakpoints_library.csv.gz", 13 | "destruct_breakpoints_library.csv.gz.yaml", 14 | "destruct_cell_counts.csv.gz", 15 | "destruct_cell_counts.csv.gz.yaml", 16 | "input.yaml", 17 | "lumpy_breakpoints.bed", 18 | "lumpy_breakpoints_evidence.csv.gz", 19 | "lumpy_breakpoints_evidence.csv.gz.yaml", 20 | "metadata.yaml"] 21 | 22 | lumpy_breakpoints = os.path.join(path, "lumpy_breakpoints.csv.gz") 23 | destruct_breakpoints = os.path.join(path, "destruct_breakpoints.csv.gz") 24 | 25 | must_exist = [os.path.join(path, f) for f in must_exist] 26 | 27 | return must_exist, lumpy_breakpoints, destruct_breakpoints 28 | 29 | 30 | def test_breakpoint_calling(args): 31 | output_path = args[1] 32 | ref_path = args[2] 33 | 34 | ref_must_exist, ref_lumpy, ref_destruct = get_inputs(ref_path) 35 | must_exist, lumpy, destruct = get_inputs(output_path) 36 | 37 | assert all(map(os.path.exists, ref_must_exist)) 38 | assert all(map(os.path.exists, must_exist)) 39 | 40 | compare.compare_breakpoint_calls(ref_lumpy, lumpy) 41 | 42 | ref_destruct = csvutils.read_csv_and_yaml(ref_destruct) 43 | destruct = csvutils.read_csv_and_yaml(destruct) 44 | 45 | assert 
ref_destruct.empty and destruct.empty 46 | 47 | if __name__ == "__main__": 48 | test_breakpoint_calling(sys.argv) 49 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/cohort_qc/cohort_qc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p COHORT_QC/testdata 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/cohort-qc-2 COHORT_QC/testdata --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_qc:$TAG \ 16 | single_cell cohort_qc --input_yaml single_cell/tests/codebuild/cohort_qc/inputs.yaml \ 17 | --maxjobs $NUMCORES --nocleanup --sentinel_only \ 18 | --loglevel DEBUG \ 19 | --tmpdir COHORT_QC/temp \ 20 | --pipelinedir COHORT_QC/pipeline \ 21 | --submit local \ 22 | --output_prefix COHORT_QC/output \ 23 | --config_override '{"refdir":"/refdata"}' \ 24 | --API_key $ONCOKB_KEY 25 | 26 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_qc:$TAG rm -rf COHORT_QC 27 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/cohort_qc/inputs.yaml: -------------------------------------------------------------------------------- 1 | SIGNATURES: 2 | DG1134: 3 | libdata: 4 | A96168B: 5 | hmmcopy_reads: COHORT_QC/testdata/A96168B_reads.csv.gz 6 | hmmcopy_metrics: COHORT_QC/testdata/A96168B_hmmcopy_metrics.csv.gz 7 | museq: COHORT_QC/testdata/museq.vcf.gz 8 | strelka_snv: COHORT_QC/testdata/strelka_s.vcf.gz 9 | strelka_indel: COHORT_QC/testdata/strelka_i.vcf.gz 10 | A96168C: 11 | hmmcopy_reads: COHORT_QC/testdata/A96168B_reads.csv.gz 12 | hmmcopy_metrics: COHORT_QC/testdata/A96168B_hmmcopy_metrics.csv.gz 13 | museq: COHORT_QC/testdata/museq.vcf.gz 14 | strelka_snv: COHORT_QC/testdata/strelka_s.vcf.gz 15 | strelka_indel: COHORT_QC/testdata/strelka_i.vcf.gz 16 | germline_maf: COHORT_QC/testdata/germline_small.maf 17 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/count_haps/count_haps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | NUMCORES=`nproc --all` 7 | 8 | mkdir -p COUNT_HAPS/ref_test_data 9 | 10 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 11 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/count-haps-new COUNT_HAPS/ref_test_data --recursive --quiet 12 | 13 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 14 | $1/single_cell_pipeline_haplotypes:$TAG \ 15 | single_cell count_haps \ 16 | --input_yaml single_cell/tests/codebuild/count_haps/inputs.yaml \ 17 | --maxjobs $NUMCORES \ 18 | --nocleanup \ 19 | --sentinel_only \ 20 | --submit local \ 21 | --loglevel DEBUG \ 22 | --tmpdir COUNT_HAPS/temp \ 23 | --config_override '{"count_haps":{"chromosomes":["15"], "extract_seqdata": {"genome_fai_template": "/refdata/human/infer_haps/GRCh37-lite.fa.fai", "genome_fasta_template": "/refdata/human/infer_haps/GRCh37-lite.fa"}, "ref_data_dir": 
"/refdata/human/infer_haps/"}}' \ 24 | --pipelinedir COUNT_HAPS/pipeline \ 25 | --submit local \ 26 | --output_prefix COUNT_HAPS/output/ 27 | 28 | 29 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 30 | $1/single_cell_pipeline_haplotypes:$TAG \ 31 | python single_cell/tests/codebuild/count_haps/test_count_haps.py COUNT_HAPS/output COUNT_HAPS/ref_test_data 32 | 33 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_haplotypes:$TAG rm -rf COUNT_HAPS 34 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/count_haps/inputs.yaml: -------------------------------------------------------------------------------- 1 | haplotypes: COUNT_HAPS/ref_test_data/haps.csv.gz 2 | tumour: 3 | SA607_3X10XB02284-A108843A-R03-C03: 4 | bam: COUNT_HAPS/ref_test_data/SA607_3X10XB02284-A108843A-R03-C03.bam 5 | SA607_3X10XB02284-A108843A-R03-C10: 6 | bam: COUNT_HAPS/ref_test_data/SA607_3X10XB02284-A108843A-R03-C10.bam 7 | SA607_3X10XB02284-A108843A-R03-C08: 8 | bam: COUNT_HAPS/ref_test_data/SA607_3X10XB02284-A108843A-R03-C08.bam 9 | SA607_3X10XB02284-A108843A-R03-C09: 10 | bam: COUNT_HAPS/ref_test_data/SA607_3X10XB02284-A108843A-R03-C09.bam -------------------------------------------------------------------------------- /single_cell/tests/codebuild/count_haps/test_count_haps.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | 5 | def compare_count_haps(): 6 | output_path = sys.argv[1] 7 | ref_path = sys.argv[2] 8 | 9 | refhaps = os.path.join(ref_path, "allele_counts_ref.csv.gz") 10 | haps = os.path.join(output_path, "allele_counts.csv.gz") 11 | 12 | compare.compare_count_haps(haps, refhaps) 13 | 14 | if __name__ == "__main__": 15 | compare_count_haps() 16 | 17 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/hmmcopy/hmmcopy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p HMMCOPY/ref_test_data 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/hmmcopy HMMCOPY/ref_test_data --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_hmmcopy:$TAG \ 16 | single_cell hmmcopy \ 17 | --input_yaml single_cell/tests/codebuild/hmmcopy/inputs.yaml \ 18 | --library_id A97318A \ 19 | --maxjobs $NUMCORES \ 20 | --nocleanup \ 21 | --sentinel_only \ 22 | --submit local \ 23 | --loglevel DEBUG \ 24 | --config_override '{"hmmcopy": {"chromosomes": ["6", "8", "17"]}}' \ 25 | --tmpdir HMMCOPY/temp \ 26 | --pipelinedir HMMCOPY/pipeline \ 27 | --submit local \ 28 | --output_prefix HMMCOPY/output/A97318A 29 | 30 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 31 | $1/single_cell_pipeline_hmmcopy:$TAG \ 32 | python single_cell/tests/codebuild/hmmcopy/test_hmmcopy.py HMMCOPY/output A97318A HMMCOPY/ref_test_data/refdata 33 | 34 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_hmmcopy:$TAG rm -rf HMMCOPY 35 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/hmmcopy/inputs.yaml: 
-------------------------------------------------------------------------------- 1 | SA1090-A96213A-R20-C28: 2 | bam: HMMCOPY/ref_test_data/testdata/SA1090-A96213A-R20-C28.bam 3 | column: 28 4 | condition: B 5 | img_col: 45 6 | index_i5: i5-20 7 | index_i7: i7-28 8 | pick_met: C1 9 | primer_i5: GTATAG 10 | primer_i7: CTATCT 11 | row: 20 12 | sample_id: SA1090 13 | library_id: A96213A 14 | is_control: True 15 | SA1090-A96213A-R20-C62: 16 | bam: HMMCOPY/ref_test_data/testdata/SA1090-A96213A-R20-C62.bam 17 | column: 62 18 | condition: B 19 | img_col: 11 20 | index_i5: i5-20 21 | index_i7: i7-62 22 | pick_met: C1 23 | primer_i5: GTATAG 24 | primer_i7: AAGCTA 25 | row: 20 26 | sample_id: SA1090 27 | library_id: A96213A 28 | is_control: False 29 | SA1090-A96213A-R22-C43: 30 | bam: HMMCOPY/ref_test_data/testdata/SA1090-A96213A-R22-C43.bam 31 | column: 43 32 | condition: B 33 | img_col: 30 34 | index_i5: i5-22 35 | index_i7: i7-43 36 | pick_met: C2 37 | primer_i5: GCTGTA 38 | primer_i7: ATTCCG 39 | row: 22 40 | sample_id: SA1090 41 | library_id: A96213A 42 | is_control: False 43 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/hmmcopy/test_hmmcopy.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | 5 | 6 | def get_inputs(path, library_id): 7 | ''' 8 | get metrics and gc metrics given a directory and library 9 | :param path: path to metrics files 10 | :param library_id: library id associated with metrics files 11 | ''' 12 | metrics = os.path.join(path, library_id) 13 | metrics += "_hmmcopy_metrics.csv.gz" 14 | 15 | reads = os.path.join(path, library_id) 16 | reads += "_reads.csv.gz" 17 | 18 | return metrics, reads 19 | 20 | 21 | if __name__ == "__main__": 22 | output_path = sys.argv[1] 23 | output_lib = sys.argv[2] 24 | 25 | ref_path = sys.argv[3] 26 | ref_lib = "A97318A" 27 | 28 | ref_metrics, ref_reads = get_inputs(ref_path, "A97318A") 29 | metrics, reads = get_inputs(output_path, output_lib) 30 | 31 | compare.compare_metrics(ref_metrics, metrics) 32 | compare.compare_reads(ref_reads, reads) 33 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/infer_haps/infer_haps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p INFER_HAPS/ref_test_data 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/infer-haps INFER_HAPS/ref_test_data/ --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_haplotypes:$TAG \ 16 | single_cell infer_haps --input_yaml single_cell/tests/codebuild/infer_haps/inputs.yaml \ 17 | --maxjobs $NUMCORES --nocleanup --sentinel_only \ 18 | --submit local --loglevel DEBUG \ 19 | --tmpdir INFER_HAPS/temp \ 20 | --pipelinedir INFER_HAPS/pipeline \ 21 | --submit local \ 22 | --output_prefix INFER_HAPS/output/ \ 23 | --config_override '{"infer_haps":{"chromosomes":["15"], "ref_data_dir": "/refdata/human/infer_haps/"}}' \ 24 | 25 | 26 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 27 | 
$1/single_cell_pipeline_haplotypes:$TAG \ 28 | python single_cell/tests/codebuild/infer_haps/test_infer_haps.py INFER_HAPS/output INFER_HAPS/ref_test_data 29 | 30 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_haplotypes:$TAG rm -rf INFER_HAPS 31 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/infer_haps/inputs.yaml: -------------------------------------------------------------------------------- 1 | normal: 2 | bam: INFER_HAPS/ref_test_data/HCC1395BL_chr15.bam 3 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/infer_haps/test_infer_haps.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | 4 | from single_cell.tests.codebuild import compare 5 | 6 | 7 | def compare_infer_haps(): 8 | output_path = sys.argv[1] 9 | ref_path = sys.argv[2] 10 | 11 | refhaps = os.path.join(ref_path, "ref_haplotypes.csv.gz") 12 | haps = os.path.join(output_path, "haplotypes.csv.gz") 13 | 14 | compare.compare_infer_haps(haps, refhaps) 15 | 16 | 17 | if __name__ == "__main__": 18 | compare_infer_haps() 19 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/merge_cell_bams/merge_cell_bams.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | NUMCORES=`nproc --all` 7 | 8 | mkdir -p MERGE_CELL_BAMS/ref_test_data 9 | 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/merge-bams MERGE_CELL_BAMS/ref_test_data --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_alignment:$TAG \ 16 | single_cell merge_cell_bams \ 17 | --input_yaml single_cell/tests/codebuild/merge_cell_bams/inputs.yaml \ 18 | --maxjobs $NUMCORES --nocleanup --sentinel_only \ 19 | --submit local --loglevel DEBUG \ 20 | --tmpdir MERGE_CELL_BAMS/temp \ 21 | --pipelinedir MERGE_CELL_BAMS/pipeline \ 22 | --submit local \ 23 | --output_prefix MERGE_CELL_BAMS/output/ --config_override '{"merge_bams": {"chromosomes": ["6", "8", "17"]}}' 24 | 25 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 26 | $1/single_cell_pipeline_alignment:$TAG \ 27 | python single_cell/tests/codebuild/merge_cell_bams/test_merge_cell_bams.py MERGE_CELL_BAMS/output MERGE_CELL_BAMS/ref_test_data/refdata 28 | 29 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_alignment:$TAG rm -rf MERGE_CELL_BAMS 30 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/merge_cell_bams/test_merge_cell_bams.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | import pandas as pd 5 | import pysam 6 | 7 | def get_merged_counts(path): 8 | bam_fnames = [os.path.join(path, file) for file in os.listdir(path) if file.endswith(".bam")] 9 | bams = [pysam.AlignmentFile(bam, "rb") for bam in bam_fnames] 10 | 11 | regions = [os.path.basename(fname).split(".")[0] for fname in bam_fnames] 12 | mapped = [bam.mapped for bam in bams] 13 | unmapped = [bam.unmapped for bam in bams] 14 | return 
pd.DataFrame({"interval":regions, "mapped": mapped, "unmapped": unmapped}) 15 | 16 | def compare_merge_counts(): 17 | output_path = sys.argv[1] 18 | ref_path = sys.argv[2] 19 | 20 | refcounts = os.path.join(ref_path, "counts.csv") 21 | 22 | counts = get_merged_counts(output_path) 23 | refcounts = pd.read_csv(refcounts) 24 | 25 | counts = counts.sort_values("interval", ascending=True) 26 | counts = counts.set_index("interval") 27 | 28 | refcounts = refcounts.sort_values("interval", ascending=True) 29 | refcounts = refcounts.set_index("interval") 30 | 31 | compare.compare_tables(counts, refcounts) 32 | 33 | 34 | if __name__ == "__main__": 35 | compare_merge_counts() 36 | 37 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/preflight/preflight.sh: -------------------------------------------------------------------------------- 1 | CURR_HEAD=$(git rev-parse $(git rev-parse --abbrev-ref HEAD)) 2 | TAG=$(git describe --tags $(git rev-list --tags --max-count=1)) 3 | TAG_HEAD=$(git rev-parse $TAG^{commit}) 4 | 5 | if test $CURR_HEAD != $TAG_HEAD; then 6 | echo "Branch is not tagged" 7 | exit -1 8 | fi 9 | 10 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/pseudo_bulk_qc/inputs.yaml: -------------------------------------------------------------------------------- 1 | PT1: 2 | '271592': 3 | '11574': 4 | alignment_metrics: PSEUDO_BULK_QC/ref_test_data/11574_alignment_metrics.csv.gz 5 | annotation_metrics: PSEUDO_BULK_QC/ref_test_data/11574_metrics.csv.gz 6 | cosmic_status: PSEUDO_BULK_QC/ref_test_data/snv_cosmic_status.csv.gz 7 | counts: PSEUDO_BULK_QC/ref_test_data/271592_11574_counts.csv.gz 8 | dbsnp_status: PSEUDO_BULK_QC/ref_test_data/snv_dbsnp_status.csv.gz 9 | destruct_breakpoint_annotation: PSEUDO_BULK_QC/ref_test_data/destruct_breakpoints.csv.gz 10 | destruct_breakpoint_counts: PSEUDO_BULK_QC/ref_test_data/destruct_cell_counts.csv.gz 11 | gc_metrics: PSEUDO_BULK_QC/ref_test_data/11574_gc_metrics.csv.gz 12 | haplotype_allele_data: PSEUDO_BULK_QC/ref_test_data/allele_counts.csv.gz 13 | hmmcopy_metrics: PSEUDO_BULK_QC/ref_test_data/11574_hmmcopy_metrics.csv.gz 14 | hmmcopy_reads: PSEUDO_BULK_QC/ref_test_data/11574_reads.csv.gz 15 | hmmcopy_segs: PSEUDO_BULK_QC/ref_test_data/11574_segments.csv.gz 16 | indel_file: PSEUDO_BULK_QC/ref_test_data/strelka_indel.vcf.gz 17 | isabl_id: '271592' 18 | lumpy_breakpoint_annotation: PSEUDO_BULK_QC/ref_test_data/lumpy_breakpoints.csv.gz 19 | lumpy_breakpoint_evidence: PSEUDO_BULK_QC/ref_test_data/lumpy_breakpoints_evidence.csv.gz 20 | mappability: PSEUDO_BULK_QC/ref_test_data/snv_mappability.csv.gz 21 | museq: PSEUDO_BULK_QC/ref_test_data/snv_museq.csv.gz 22 | snpeff: PSEUDO_BULK_QC/ref_test_data/snv_snpeff.csv.gz 23 | strelka: PSEUDO_BULK_QC/ref_test_data/snv_strelka.csv.gz 24 | trinuc: PSEUDO_BULK_QC/ref_test_data/snv_trinuc.csv.gz 25 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/pseudo_bulk_qc/pseudo_bulk_qc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p PSEUDO_BULK_QC/ref_test_data 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp 
s3://singlecelltestsets/TESTDATA_CODEBUILD/sample_qc PSEUDO_BULK_QC/ref_test_data --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_qc:$TAG \ 16 | single_cell sample_qc --input_yaml single_cell/tests/codebuild/pseudo_bulk_qc/inputs.yaml \ 17 | --maxjobs $NUMCORES --nocleanup --sentinel_only \ 18 | --submit local --loglevel DEBUG \ 19 | --tmpdir PSEUDO_BULK_QC/temp \ 20 | --pipelinedir PSEUDO_BULK_QC/pipeline \ 21 | --submit local \ 22 | --output_prefix PSEUDO_BULK_QC/output \ 23 | --config_override '{"annotation": {"chromosomes": ["6", "8", "17"]}}' \ 24 | 25 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_qc:$TAG rm -rf PSEUDO_BULK_QC 26 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/refdata/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -d "/refdata" ]; then 4 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v /refdata:/refdata quay.io/singlecellpipeline/awscli:v0.0.1 aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/refdata /refdata --recursive --quiet 5 | fi 6 | 7 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/snv_genotyping/inputs.yaml: -------------------------------------------------------------------------------- 1 | vcf_files: 2 | - SNV_GENOTYPING/testdata/vcf/museq.vcf.gz 3 | - SNV_GENOTYPING/testdata/vcf/strelka_snv.vcf.gz 4 | tumour_cells: 5 | SA1090: 6 | A96213A: 7 | SA1090-A96213A-R20-C28: 8 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R20-C28.bam 9 | SA1090-A96213A-R22-C43: 10 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R22-C43.bam 11 | SA1090-A96213A-R22-C44: 12 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R22-C44.bam 13 | SA1090-A96213A-R24-C12: 14 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R24-C12.bam 15 | SA1090-A96213A-R24-C20: 16 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R24-C20.bam 17 | SA1090-A96213A-R24-C58: 18 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R24-C58.bam 19 | SA1090-A96213A-R25-C14: 20 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R25-C14.bam 21 | SA1090-A96213A-R25-C22: 22 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R25-C22.bam 23 | SA1090-A96213A-R25-C40: 24 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R25-C40.bam 25 | SA1090-A96213A-R25-C64: 26 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R25-C64.bam 27 | SA1090-A96213A-R26-C49: 28 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R26-C49.bam 29 | SA1090-A96213A-R26-C50: 30 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R26-C50.bam 31 | SA1090-A96213A-R26-C64: 32 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R26-C64.bam 33 | SA1090-A96213A-R27-C14: 34 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R27-C14.bam 35 | SA1090-A96213A-R27-C21: 36 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R27-C21.bam 37 | SA1090-A96213A-R27-C45: 38 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R27-C45.bam 39 | SA1090-A96213A-R28-C23: 40 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R28-C23.bam 41 | SA1090-A96213A-R28-C39: 42 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R28-C39.bam 43 | SA1090-A96213A-R28-C64: 44 | bam: SNV_GENOTYPING/testdata/cell_bams/SA1090-A96213A-R28-C64.bam 
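
The inputs.yaml above shows the nested sample -> library -> cell_id -> bam layout (plus a flat vcf_files list) that the snv_genotyping subcommand consumes and that validate_snv_genotyping, further down in this listing, checks. For larger test sets the file can be generated rather than hand-written; a minimal sketch, assuming a flat directory of per-cell BAMs named <cell_id>.bam (the helper name and directory layout are illustrative, not part of the pipeline):

    import os
    import yaml

    def build_snv_genotyping_inputs(bam_dir, sample_id, library_id, vcf_files, out_yaml):
        # collect every <cell_id>.bam under bam_dir into the nested cell mapping
        cells = {}
        for fname in sorted(os.listdir(bam_dir)):
            if fname.endswith('.bam'):
                cell_id = fname[:-len('.bam')]
                cells[cell_id] = {'bam': os.path.join(bam_dir, fname)}

        data = {
            'vcf_files': list(vcf_files),
            'tumour_cells': {sample_id: {library_id: cells}},
        }

        with open(out_yaml, 'wt') as handle:
            yaml.safe_dump(data, handle, default_flow_style=False)

    build_snv_genotyping_inputs(
        'SNV_GENOTYPING/testdata/cell_bams', 'SA1090', 'A96213A',
        ['SNV_GENOTYPING/testdata/vcf/museq.vcf.gz',
         'SNV_GENOTYPING/testdata/vcf/strelka_snv.vcf.gz'],
        'inputs.yaml')
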
-------------------------------------------------------------------------------- /single_cell/tests/codebuild/snv_genotyping/snv_genotyping.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p SNV_GENOTYPING/testdata 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/snv_genotyping SNV_GENOTYPING/testdata/ --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_variant:$TAG \ 16 | single_cell snv_genotyping --input_yaml single_cell/tests/codebuild/snv_genotyping/inputs.yaml \ 17 | --maxjobs $NUMCORES --nocleanup --sentinel_only \ 18 | --submit local --loglevel DEBUG \ 19 | --tmpdir SNV_GENOTYPING/temp \ 20 | --pipelinedir SNV_GENOTYPING/pipeline --submit local --output_prefix SNV_GENOTYPING/output \ 21 | --config_override '{"variant_calling": {"chromosomes": ["6", "8", "17"]}, "version": '\"$TAG\"'}' 22 | 23 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_variant:$TAG rm -rf SNV_GENOTYPING 24 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/split_wgs_bam/inputs.yaml: -------------------------------------------------------------------------------- 1 | normal: 2 | bam: SPLIT_WGS_BAM/ref_test_data/DAH370N_A41086.bam 3 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/split_wgs_bam/split_wgs_bam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | 5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)` 6 | DOCKER=`which docker` 7 | NUMCORES=`nproc --all` 8 | 9 | mkdir -p SPLIT_WGS_BAM/ref_test_data 10 | 11 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \ 12 | aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/split-bam SPLIT_WGS_BAM/ref_test_data --recursive --quiet 13 | 14 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 15 | $1/single_cell_pipeline_alignment:$TAG \ 16 | single_cell split_wgs_bam --input_yaml single_cell/tests/codebuild/split_wgs_bam/inputs.yaml \ 17 | --maxjobs $NUMCORES --nocleanup --sentinel_only \ 18 | --submit local --loglevel DEBUG \ 19 | --tmpdir SPLIT_WGS_BAM/temp \ 20 | --pipelinedir SPLIT_WGS_BAM/pipeline \ 21 | --submit local \ 22 | --output_prefix SPLIT_WGS_BAM/output/ --config_override '{"split_bam": {"chromosomes": ["6", "8", "17"]}}' 23 | 24 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \ 25 | $1/single_cell_pipeline_alignment:$TAG \ 26 | python single_cell/tests/codebuild/split_wgs_bam/test_split_wgs_bam.py SPLIT_WGS_BAM/output SPLIT_WGS_BAM/ref_test_data/refdata 27 | 28 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_alignment:$TAG rm -rf SPLIT_WGS_BAM 29 | -------------------------------------------------------------------------------- /single_cell/tests/codebuild/split_wgs_bam/test_split_wgs_bam.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from single_cell.tests.codebuild import compare 4 | import pandas as pd 5 | import pysam 6 | 
7 | def get_merged_counts(path):
8 |     bam_fnames = [os.path.join(path, file) for file in os.listdir(path) if file.endswith(".bam")]
9 |     bams = [pysam.AlignmentFile(bam, "rb") for bam in bam_fnames]
10 | 
11 |     regions = [os.path.basename(fname).split(".")[0] for fname in bam_fnames]
12 |     mapped = [bam.mapped for bam in bams]
13 |     unmapped = [bam.unmapped for bam in bams]
14 |     return pd.DataFrame({"interval":regions, "mapped": mapped, "unmapped": unmapped})
15 | 
16 | def compare_merge_counts():
17 |     output_path = sys.argv[1]
18 |     ref_path = sys.argv[2]
19 | 
20 |     refcounts = os.path.join(ref_path, "counts.csv")
21 | 
22 |     counts = get_merged_counts(output_path)
23 |     refcounts = pd.read_csv(refcounts)
24 | 
25 |     counts = counts.sort_values("interval", ascending=True)
26 |     counts = counts.set_index("interval")
27 | 
28 |     refcounts = refcounts.sort_values("interval", ascending=True)
29 |     refcounts = refcounts.set_index("interval")
30 | 
31 |     compare.compare_tables(counts, refcounts)
32 | 
33 | 
34 | if __name__ == "__main__":
35 |     compare_merge_counts()
36 | 
37 | 
--------------------------------------------------------------------------------
/single_cell/tests/codebuild/variant_calling/test_variant_calling.py:
--------------------------------------------------------------------------------
1 | import os.path
2 | import sys
3 | from single_cell.tests.codebuild import compare
4 | from single_cell.utils import csvutils
5 | 
6 | def get_inputs(path):
7 |     """
8 |     get paths to the strelka, museq and snpeff variant tables given a directory
9 |     :param path: path to the variant calling output files
10 |     """
11 |     strelka = os.path.join(path, "snv_strelka.csv.gz")
12 |     museq = os.path.join(path, "snv_museq.csv.gz")
13 |     snpeff = os.path.join(path, "snv_snpeff.csv.gz")
14 | 
15 |     return strelka, museq, snpeff
16 | 
17 | 
18 | def test_variant_calling(args):
19 |     output_path = args[1]
20 |     ref_path = args[2]
21 | 
22 |     ref_strelka, ref_museq, ref_snpeff = get_inputs(ref_path)
23 |     strelka, museq, snpeff = get_inputs(output_path)
24 | 
25 |     compare.compare_variant_calls(ref_snpeff, snpeff)
26 | 
27 |     ref_strelka = csvutils.read_csv_and_yaml(ref_strelka)
28 |     strelka = csvutils.read_csv_and_yaml(strelka)
29 | 
30 |     assert ref_strelka.empty and strelka.empty
31 | 
32 | if __name__ == "__main__":
33 |     test_variant_calling(sys.argv)
34 | 
--------------------------------------------------------------------------------
/single_cell/tests/codebuild/variant_calling/variant_calling.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -o pipefail
4 | 
5 | TAG=`git describe --tags $(git rev-list --tags --max-count=1)`
6 | NUMCORES=`nproc --all`
7 | 
8 | mkdir -p VARIANT_CALLING/ref_test_data
9 | 
10 | docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION -v $PWD:$PWD -w $PWD $1/awscli:v0.0.1 \
11 |     aws s3 cp s3://singlecelltestsets/TESTDATA_CODEBUILD/variant-calling VARIANT_CALLING/ref_test_data/ --recursive --quiet
12 | 
13 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \
14 |     $1/single_cell_pipeline_variant:$TAG \
15 |     single_cell variant_calling --input_yaml single_cell/tests/codebuild/variant_calling/inputs.yaml \
16 |     --maxjobs $NUMCORES --nocleanup --sentinel_only \
17 |     --submit local --loglevel DEBUG \
18 |     --tmpdir VARIANT_CALLING/temp \
19 |     --pipelinedir VARIANT_CALLING/pipeline --submit local --output_prefix VARIANT_CALLING/output/ \
20 |     --config_override '{"variant_calling": {"chromosomes": ["6", "8", "17"]}, "version": '\"$TAG\"'}'
21 | 
22 | docker run -w $PWD -v $PWD:$PWD -v /refdata:/refdata --rm \
23 |     $1/single_cell_pipeline_variant:$TAG \
24 |     python single_cell/tests/codebuild/variant_calling/test_variant_calling.py VARIANT_CALLING/output VARIANT_CALLING/ref_test_data/refdata
25 | 
26 | docker run -w $PWD -v $PWD:$PWD --rm $1/single_cell_pipeline_variant:$TAG rm -rf VARIANT_CALLING
27 | 
--------------------------------------------------------------------------------
/single_cell/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/single_cell/utils/gatkutils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Feb 19, 2018
3 | 
4 | @author: dgrewal
5 | '''
6 | import os
7 | import pypeliner
8 | 
9 | def generate_targets(input_bams, config, intervals, interval, **kwargs):
10 |     # generate positions
11 |     cmd = ['gatk', '-Xmx8G',
12 |            '-T', 'RealignerTargetCreator',
13 |            '-R', config['ref_genome'],
14 |            '-o', intervals, '-L', interval,
15 |            ]
16 | 
17 |     for _, bamfile in input_bams.items():
18 |         cmd.extend(['-I', bamfile])
19 | 
20 |     pypeliner.commandline.execute(*cmd, **kwargs)
21 | 
22 | 
23 | def gatk_realigner(inputs, config, targets, interval, tempdir, **kwargs):
24 | 
25 | 
26 |     targets = os.path.abspath(targets)
27 |     cmd = ['gatk', '-Xmx8G',
28 |            '-T', 'IndelRealigner',
29 |            '-R', config['ref_genome'],
30 |            '-targetIntervals', targets,
31 |            '--nWayOut', '_indel_realigned.bam', '-L', interval,
32 |            '--maxReadsForRealignment','150000'
33 |            ]
34 | 
35 |     for _, bamfile in inputs.items():
36 |         bamfile = os.path.abspath(bamfile)
37 |         cmd.extend(['-I', bamfile])
38 | 
39 | 
40 |     cwd = os.getcwd()
41 |     os.chdir(tempdir)
42 | 
43 |     pypeliner.commandline.execute(*cmd, **kwargs)
44 | 
45 |     os.chdir(cwd)
--------------------------------------------------------------------------------
/single_cell/utils/ltmutils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on July 31, 2018
3 | 
4 | @author: pwalters
5 | '''
6 | 
7 | import logging
8 | 
9 | import pandas as pd
10 | 
11 | 
12 | def read_input_file(input_file):
13 |     inputs = pd.read_csv(input_file, dtype=str)
14 | 
15 |     for column in ('timepoint', 'hmmcopy',):
16 |         if column not in inputs.columns:
17 |             raise Exception(
18 |                 'input_csv should contain {}'.format(column))
19 | 
20 |     timepoints = list(sorted(inputs['timepoint'].unique()))
21 | 
22 |     if inputs.duplicated(['timepoint']).any():
23 |         raise Exception('duplicate timepoints in input_csv')
24 | 
25 |     hmmcopy = dict()
26 |     for _, row in inputs.iterrows():
27 |         hmmcopy[row['timepoint']] = row['hmmcopy'].strip()
28 | 
29 |     return hmmcopy, timepoints
30 | 
31 | 
32 | def get_cn_matrix_from_hdf(hmmcopy_hdf_file, ploidy='0'):
33 |     df = pd.read_hdf(hmmcopy_hdf_file, '/hmmcopy/reads/' + ploidy)
34 | 
35 |     df["bin"] = list(zip(df.chr, df.start, df.end))
36 |     df = df.pivot(index='cell_id', columns='bin', values='state')
37 |     chromosomes = [str(a) for a in range(1, 23)] + ['X', 'Y']
38 |     bins = pd.DataFrame(df.columns.values.tolist(),
39 |                         columns=['chr', 'start', 'end'])
40 |     bins["chr"] = pd.Categorical(bins["chr"], chromosomes)
41 |     bins = bins.sort_values(['start', ])
42 |     bins = [tuple(v) for v in bins.values.tolist()]
43 |     df = df.sort_values(bins, axis=0).T
44 | 
45 |     dropped_cells = df.columns[df.isna().all()].tolist()
46 | 
47 |     if len(dropped_cells) != 0:
48 |         logging.getLogger("single_cell.helpers.ltmutils").warn(
49 |             'Dropping 
{} cells: {}'.format(len(dropped_cells), dropped_cells) 50 | ) 51 | 52 | df = df.loc[:, ~df.isna().all()].astype(int) 53 | df.columns = df.columns.astype(str) 54 | df = df.reset_index() 55 | 56 | chrom = [] 57 | start = [] 58 | end = [] 59 | width = [] 60 | for i, b in df['bin'].items(): 61 | chrom.append(b[0]) 62 | start.append(b[1]) 63 | end.append(b[2]) 64 | width.append(b[2] - b[1] + 1) 65 | df['chr'] = chrom 66 | df['start'] = start 67 | df['end'] = end 68 | df['width'] = width 69 | 70 | df = df.drop(columns='bin') 71 | 72 | return df, dropped_cells 73 | 74 | 75 | def get_root(cells_list, root_id_file): 76 | for cell in cells_list: 77 | if 'SA928' in cell: 78 | with open(root_id_file, 'w') as outfile: 79 | outfile.write(cell + '\n') 80 | outfile.close() 81 | return cell 82 | 83 | raise Exception('No SA928 cells in the copy number matrix.') 84 | -------------------------------------------------------------------------------- /single_cell/utils/pdfutils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Feb 20, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | import os 8 | 9 | from PyPDF2 import PdfFileMerger, PdfFileWriter, PdfFileReader 10 | 11 | from single_cell.utils import helpers 12 | 13 | 14 | def merge_pdfs(infiles, outfile): 15 | if isinstance(infiles, dict): 16 | infiles = infiles.values() 17 | 18 | merger = PdfFileMerger() 19 | 20 | for infile in infiles: 21 | # add it to list if not empty. skip empty files to avoid errors later 22 | if os.path.getsize(infile): 23 | merger.append(open(infile, 'rb')) 24 | 25 | helpers.makedirs(outfile, isfile=True) 26 | 27 | with open(outfile, 'wb') as fout: 28 | merger.write(fout) 29 | 30 | 31 | def merge_pdfs_with_scaling(infiles, outfile, width=500, height=500): 32 | if isinstance(infiles, dict): 33 | infiles = infiles.values() 34 | 35 | pdf_writer = PdfFileWriter() 36 | 37 | pagenum = 0 38 | 39 | for infile in infiles: 40 | pdf_file = PdfFileReader(open(infile, 'rb')) 41 | num_pages = pdf_file.getNumPages() 42 | 43 | for page_number in range(0, num_pages): 44 | pdf_page = pdf_file.getPage(page_number) 45 | 46 | pdf_page.scaleTo(width, height) 47 | 48 | pdf_writer.addPage(pdf_page) 49 | 50 | pdf_writer.addBookmark(title=infile, pagenum=pagenum) 51 | pagenum += 1 52 | 53 | with open(outfile, 'wb') as fout: 54 | pdf_writer.write(fout) 55 | -------------------------------------------------------------------------------- /single_cell/utils/pysamutils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jun 1, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | import shutil 7 | from collections import OrderedDict 8 | 9 | import pysam 10 | from single_cell.utils.bamutils import bam_index 11 | 12 | 13 | def load_chromosome_lengths(file_name, chromosomes=None): 14 | chromosome_lengths = OrderedDict() 15 | 16 | ref = pysam.Fastafile(file_name) 17 | 18 | for chrom, length in zip(ref.references, ref.lengths): 19 | if chromosomes and chrom not in chromosomes: 20 | continue 21 | 22 | chromosome_lengths[str(chrom)] = int(length) 23 | 24 | return chromosome_lengths 25 | 26 | 27 | def get_regions_from_reference(reference_fastq, split_size, chromosomes): 28 | chromosome_lengths = load_chromosome_lengths( 29 | reference_fastq, 30 | chromosomes=chromosomes 31 | ) 32 | return get_regions(chromosome_lengths, split_size) 33 | 34 | 35 | def get_regions(chromosome_lengths, split_size): 36 | if split_size is None: 37 | return 
dict(enumerate(chromosome_lengths.keys())) 38 | 39 | regions = [] 40 | 41 | for chrom, length in chromosome_lengths.items(): 42 | lside_interval = range(1, length + 1, split_size) 43 | rside_interval = range(split_size, length + split_size, split_size) 44 | 45 | for beg, end in zip(lside_interval, rside_interval): 46 | end = min(end, length) 47 | 48 | regions.append('{}-{}-{}'.format(chrom, beg, end)) 49 | 50 | return regions 51 | 52 | 53 | def _fraction_softclipped(x): 54 | total_softclipped = 0 55 | for a in x.cigar: 56 | if a[0] == 4: 57 | total_softclipped += a[1] 58 | return float(total_softclipped) / x.query_length 59 | 60 | 61 | def remove_softclipped_reads(infile, outfile, softclipped_reads_threshold): 62 | if softclipped_reads_threshold == 1: 63 | shutil.copyfile(infile, outfile) 64 | shutil.copyfile(infile + '.bai', outfile + '.bai') 65 | return 66 | 67 | bamfile = pysam.AlignmentFile(infile, "rb") 68 | 69 | filteredbam = pysam.AlignmentFile(outfile, "wb", template=bamfile) 70 | for read in bamfile.fetch(): 71 | if _fraction_softclipped(read) < softclipped_reads_threshold: 72 | filteredbam.write(read) 73 | filteredbam.close() 74 | 75 | bam_index(outfile, outfile + '.bai') 76 | -------------------------------------------------------------------------------- /single_cell/utils/refgenome.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | default_chromosomes = [str(a) for a in range(1, 23)] + ['X', 'Y'] 4 | 5 | 6 | def read_chromosome_lengths(genome_fasta_index, chromosomes=default_chromosomes): 7 | fai = pd.read_csv(genome_fasta_index, sep='\t', header=None, names=['chrom', 'length', 'V3', 'V4', 'V5']) 8 | fai = fai.set_index('chrom')['length'] 9 | fai = fai.reindex(chromosomes).astype(int) 10 | return fai.to_dict() 11 | 12 | 13 | def get_split_regions(split_size, refgenome, chromosomes=default_chromosomes): 14 | genome_fasta_index = refgenome + '.fai' 15 | 16 | chromosome_lengths = read_chromosome_lengths(genome_fasta_index, chromosomes=chromosomes) 17 | 18 | if split_size is None: 19 | return dict(enumerate(chromosome_lengths.keys())) 20 | 21 | regions = [] 22 | 23 | for chrom, length in chromosome_lengths.items(): 24 | lside_interval = range(1, length + 1, split_size) 25 | rside_interval = range(split_size, length + split_size, split_size) 26 | 27 | for beg, end in zip(lside_interval, rside_interval): 28 | end = min(end, length) 29 | 30 | regions.append('{}-{}-{}'.format(chrom, beg, end)) 31 | 32 | return regions 33 | -------------------------------------------------------------------------------- /single_cell/utils/singlecell_copynumber_plot_utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .plot_metrics import PlotMetrics 3 | from .plot_kernel_density import PlotKernelDensity 4 | from .plot_pcolormesh import PlotPcolor 5 | from .plot_hmmcopy import GenHmmPlots -------------------------------------------------------------------------------- /single_cell/utils/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/single_cell/utils/tests/__init__.py -------------------------------------------------------------------------------- /single_cell/utils/validator/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shahcompbio/single_cell_pipeline/f0e6a6f540aa12637fe24fc2dfc7945ab9a79fa1/single_cell/utils/validator/__init__.py -------------------------------------------------------------------------------- /single_cell/utils/validator/utils.py: -------------------------------------------------------------------------------- 1 | class DtypeException(Exception): 2 | pass 3 | 4 | 5 | class MissingFieldError(Exception): 6 | pass 7 | 8 | 9 | class InvalidBarcode(Exception): 10 | pass 11 | 12 | 13 | class InvalidIndex(Exception): 14 | pass 15 | 16 | 17 | class MissingInput(Exception): 18 | pass 19 | 20 | 21 | class InvalidInstrument(Exception): 22 | pass 23 | 24 | 25 | class DLPIndexError(Exception): 26 | pass 27 | 28 | 29 | def get(data, key): 30 | if key not in data: 31 | raise MissingFieldError('{} key missing in yaml file.'.format(key)) 32 | return data[key] 33 | 34 | 35 | def check_data_type(keys, dtype, data): 36 | for key in keys: 37 | 38 | if not isinstance(get(data, key), dtype): 39 | raise DtypeException('{} value must be {}'.format(key, dtype)) 40 | 41 | 42 | def check_barcodes(barcode_str): 43 | for val in barcode_str: 44 | if val not in ['A', 'C', 'G', 'T']: 45 | raise InvalidBarcode('{} is not a valid varcode'.format(barcode_str)) 46 | 47 | 48 | def check_genomic_regions(region, sep='-'): 49 | chroms = list(map(str, range(1, 23))) + ['X', 'Y'] 50 | 51 | chrom, start, end = region.split(sep) 52 | 53 | assert chrom in chroms, '{} is not a valid chrom'.format(chrom) 54 | 55 | 56 | def check_cells_data(data): 57 | for cell in data: 58 | check_data_type(['bam'], str, data[cell]) 59 | 60 | 61 | def check_normal_data(normal): 62 | if 'bam' in normal: 63 | check_data_type(['bam'], str, normal) 64 | else: 65 | for cell in normal: 66 | check_data_type(['bam'], str, normal[cell]) 67 | -------------------------------------------------------------------------------- /single_cell/utils/validator/validate.py: -------------------------------------------------------------------------------- 1 | from single_cell.utils.validator import utils 2 | 3 | 4 | def validate_alignment_fastqs(data): 5 | for sample, sample_data in data.items(): 6 | for lane, lane_data in sample_data['fastqs'].items(): 7 | if not utils.get(lane_data, 'fastq_1') or not utils.get(lane_data, 'fastq_2'): 8 | raise utils.MissingInput() 9 | 10 | 11 | def validate_sample_info(yamldata): 12 | for cell in yamldata: 13 | celldata = yamldata[cell] 14 | 15 | utils.check_data_type(['column', 'img_col', 'row'], int, celldata) 16 | utils.check_data_type(['condition', 'pick_met', 'index_i5', 'index_i7', 'sample_id', 'library_id'], str, celldata) 17 | utils.check_data_type(['is_control'], bool, celldata) 18 | 19 | utils.check_barcodes(utils.get(celldata, 'primer_i5')) 20 | utils.check_barcodes(utils.get(celldata, 'primer_i7')) 21 | 22 | if not utils.get(celldata, 'index_i5').startswith('i5-'): 23 | raise utils.DLPIndexError() 24 | if not utils.get(celldata, 'index_i7').startswith('i7-'): 25 | raise utils.DLPIndexError() 26 | 27 | 28 | def validate_hmmcopy_bams(yamldata): 29 | for cell, celldata in yamldata.items(): 30 | utils.check_data_type(['bam'], str, celldata) 31 | 32 | 33 | def validate_annotation(yamldata): 34 | utils.check_data_type( 35 | ['hmmcopy_metrics', 'hmmcopy_reads', 'alignment_metrics', 'gc_metrics', 'segs_pdf_tar'], 36 | str, 37 | yamldata 38 | ) 39 | 40 | 41 | def validate_merge_cell_bams(yamldata): 42 | utils.check_cells_data(utils.get(yamldata, 'cell_bams')) 43 | 44 | 45 | def 
validate_split_wgs_bam(yamldata): 46 | data = utils.get(yamldata, 'normal') 47 | utils.check_data_type(['bam'], str, data) 48 | 49 | 50 | def validate_variant_calling(yamldata): 51 | normals = yamldata['normal'] 52 | for region in normals: 53 | utils.check_data_type(['bam'], str, normals[region]) 54 | utils.check_genomic_regions(region) 55 | 56 | tumours = yamldata['tumour'] 57 | for region in tumours: 58 | utils.check_data_type(['bam'], str, tumours[region]) 59 | utils.check_genomic_regions(region) 60 | 61 | 62 | def validate_germline_calling(yamldata): 63 | utils.check_normal_data(utils.get(yamldata, 'normal')) 64 | 65 | 66 | def validate_infer_haps(yamldata): 67 | utils.check_normal_data(utils.get(yamldata, 'normal')) 68 | 69 | 70 | def validate_count_haps(yamldata): 71 | utils.check_cells_data(utils.get(yamldata, 'tumour')) 72 | utils.check_data_type(['haplotypes'], str, yamldata) 73 | 74 | 75 | def validate_breakpoint_calling(yamldata): 76 | utils.check_normal_data(utils.get(yamldata, 'normal')) 77 | utils.check_cells_data(utils.get(yamldata, 'tumour')) 78 | 79 | 80 | def validate_snv_genotyping(yamldata): 81 | tumour_cells = utils.get(yamldata, 'tumour_cells') 82 | for sample in tumour_cells: 83 | for library in tumour_cells[sample]: 84 | utils.check_cells_data(tumour_cells[sample][library]) 85 | 86 | vcf_files = utils.get(yamldata, 'vcf_files') 87 | assert isinstance(vcf_files, list) 88 | for filepath in vcf_files: 89 | assert isinstance(filepath, str) 90 | 91 | 92 | def validate_sv_genotyping(yamldata): 93 | pass 94 | -------------------------------------------------------------------------------- /single_cell/utils/vcfutils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Feb 27, 2018 3 | 4 | @author: dgrewal 5 | ''' 6 | import itertools 7 | import logging 8 | import os 9 | 10 | import biowrappers.components.io.vcf.tasks as vcf_tasks 11 | import vcf 12 | from single_cell.utils import helpers 13 | 14 | 15 | def _get_header(infile): 16 | ''' 17 | Extract header from the VCF file 18 | 19 | :param infile: input VCF file 20 | :return: header 21 | ''' 22 | 23 | header = [] 24 | for line in infile: 25 | if line.startswith('##'): 26 | header.append(line) 27 | elif line.startswith('#'): 28 | header.append(line) 29 | return header 30 | else: 31 | raise Exception('invalid header: missing #CHROM line') 32 | 33 | logging.getLogger("single_cell.helpers.vcfutils").warn( 34 | "One of the input files is empty" 35 | ) 36 | return [] 37 | 38 | 39 | def concatenate_vcf(infiles, outfile): 40 | ''' 41 | Concatenate VCF files 42 | 43 | :param infiles: dictionary of input VCF files to be concatenated 44 | :param outfile: output VCF file 45 | ''' 46 | 47 | with open(outfile, 'w') as ofile: 48 | header = None 49 | 50 | for _, ifile in infiles.items(): 51 | 52 | if os.path.getsize(ifile) == 0: 53 | logging.getLogger("single_cell.helpers.vcfutils").warn( 54 | 'input file {} is empty'.format(ifile) 55 | ) 56 | continue 57 | 58 | with open(ifile) as f: 59 | 60 | if not header: 61 | header = _get_header(f) 62 | 63 | for line in header: 64 | ofile.write(line) 65 | else: 66 | if not _get_header(f) == header: 67 | logging.getLogger("single_cell.helpers.vcfutils").warn( 68 | 'merging vcf files with mismatching headers' 69 | ) 70 | 71 | for l in f: 72 | ofile.write(l) 73 | 74 | 75 | def merge_vcf(infiles, outfile, tempdir): 76 | vcf_files = [] 77 | for infile in infiles: 78 | if isinstance(infile, str): 79 | vcf_files.append(infile) 80 | elif 
isinstance(infile, dict): 81 | vcf_files.extend(list(infile.values())) 82 | elif isinstance(infile, (list, tuple)): 83 | vcf_files.extend(list(infile)) 84 | else: 85 | raise Exception("unknown data type") 86 | 87 | helpers.makedirs(tempdir) 88 | temp_output = os.path.join(tempdir, 'merged.vcf') 89 | 90 | vcf_tasks.merge_vcfs(vcf_files, temp_output) 91 | 92 | vcf_tasks.finalise_vcf(temp_output, outfile) 93 | 94 | 95 | def split_vcf(in_file, out_files, lines_per_file): 96 | """ Split a VCF file into smaller files. 97 | 98 | :param in_file: Path of VCF file to split. 99 | 100 | :param out_files: Callback function which supplies file name given index of split. 101 | 102 | :param lines_per_file: Maximum number of lines to be written per file. 103 | 104 | """ 105 | 106 | def line_group(_, line_idx=itertools.count()): 107 | return int(next(line_idx) / lines_per_file) 108 | 109 | reader = vcf.Reader(filename=in_file) 110 | 111 | for file_idx, records in itertools.groupby(reader, key=line_group): 112 | file_name = out_files[file_idx] 113 | 114 | with open(file_name, 'w') as out_fh: 115 | writer = vcf.Writer(out_fh, reader) 116 | 117 | for record in records: 118 | writer.write_record(record) 119 | 120 | writer.close() 121 | -------------------------------------------------------------------------------- /single_cell/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 6, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | -------------------------------------------------------------------------------- /single_cell/workflows/align/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | metrics = { 3 | 'cell_id': 'str', 4 | 'total_mapped_reads': 'int', 5 | 'library_id': 'str', 6 | 'unpaired_mapped_reads': 'int', 7 | 'paired_mapped_reads': 'int', 8 | 'unpaired_duplicate_reads': 'int', 9 | 'paired_duplicate_reads': 'int', 10 | 'unmapped_reads': 'int', 11 | 'percent_duplicate_reads': 'float', 12 | 'estimated_library_size': 'int', 13 | 'total_reads': 'int', 14 | 'total_duplicate_reads': 'int', 15 | 'total_properly_paired': 'int', 16 | 'coverage_breadth': 'float', 17 | 'coverage_depth': 'float', 18 | 'median_insert_size': 'float', 19 | 'mean_insert_size': 'float', 20 | 'standard_deviation_insert_size': 'float', 21 | 'cell_call': 'str', 22 | 'column': 'int', 23 | 'experimental_condition': 'str', 24 | 'img_col': 'int', 25 | 'index_i5': 'str', 26 | 'index_i7': 'str', 27 | 'primer_i5': 'str', 28 | 'primer_i7': 'str', 29 | 'row': 'int', 30 | 'sample_type': 'str', 31 | 'is_contaminated': 'bool', 32 | 'trim': 'bool', 33 | 'sample_id': 'str', 34 | 'aligned': 'float', 35 | 'expected': 'float', 36 | 'overlap_with_all_filters': 'float', 37 | 'overlap_with_all_filters_and_qual': 'float', 38 | 'overlap_with_dups': 'float', 39 | 'overlap_without_dups': 'float', 40 | 'is_control': 'bool', 41 | } 42 | 43 | gc = {str(i): 'float' for i in range(0, 101)} 44 | gc['cell_id'] = 'str' 45 | 46 | dtypes = locals() 47 | 48 | return dtypes 49 | 50 | 51 | def fastqscreen_dtypes(genome_labels): 52 | metrics = {'fastqscreen_nohit': 'int', 'cell_id': 'str'} 53 | for label in genome_labels: 54 | metrics['fastqscreen_{}'.format(label)] = 'int' 55 | metrics['fastqscreen_{}_multihit'.format(label)] = 'int' 56 | 57 | fastqscreen_detailed = { 58 | 'cell_id': 'str', 59 | 'readend': 'str', 60 | 'count': 'int' 61 | } 62 | 63 | for label in genome_labels: 64 | fastqscreen_detailed[label] = 'int' 65 | 66 | dtypes = locals() 
67 | return dtypes 68 | -------------------------------------------------------------------------------- /single_cell/workflows/align/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | from .gen_cn_matrix import GenerateCNMatrix 8 | from .collect_metrics import CollectMetrics 9 | from .run_trimgalore import RunTrimGalore 10 | from .summary_metrics import SummaryMetrics -------------------------------------------------------------------------------- /single_cell/workflows/align/scripts/gen_cn_matrix.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 8, 2015 3 | 4 | @author: dgrewal 5 | ''' 6 | import logging 7 | import numpy as np 8 | import pandas as pd 9 | from single_cell.utils import csvutils 10 | 11 | class GenerateCNMatrix(object): 12 | ''' 13 | merges files. no overlap queries, simple concatenation 14 | since columns are different, select header and insert values at proper 15 | indices. use N/A for missing. 16 | ''' 17 | 18 | def __init__(self, infile, output, sep, colname, sample_id, typ, dtypes): 19 | self.sep = sep 20 | self.output = output 21 | self.column_name = colname 22 | self.input = infile 23 | self.sample_id = sample_id 24 | self.type = typ 25 | self.dtypes = dtypes 26 | 27 | @staticmethod 28 | def replace_missing_vals(input_df, nan_val='N/A'): 29 | ''' 30 | replace NaN values with nan_val 31 | ''' 32 | input_df = input_df.fillna(nan_val) 33 | 34 | return input_df 35 | 36 | def write(self, input_df, transpose=False): 37 | ''' 38 | write the dataframe to output file 39 | ''' 40 | if transpose: 41 | del input_df["gc"] 42 | input_df = input_df.T 43 | input_df["cell_id"] = input_df.index 44 | 45 | input_df.columns = input_df.columns.astype(str) 46 | csvutils.write_dataframe_to_csv_and_yaml(input_df, self.output, self.dtypes) 47 | 48 | 49 | def read_hmmcopy_corrected_read_file(self, sample_id): 50 | """ 51 | 52 | """ 53 | column_name = self.column_name 54 | data = pd.read_csv(self.input) 55 | if column_name in data.columns: 56 | df = data[['chr', 'start', 'end', 'width', column_name]] 57 | else: 58 | df = data[['chr', 'start', 'end', 'width']] 59 | 60 | df[column_name] = float('NaN') 61 | 62 | df = df.rename(columns = {column_name:sample_id}) 63 | 64 | return df 65 | 66 | def read_gcbias_file(self, sample_id): 67 | """ 68 | parses the gcbias data 69 | """ 70 | column_name = self.column_name 71 | 72 | data = open(self.input).readlines() 73 | skiprows = [i for i,v in enumerate(data) if v[0] == '#' or v=='\n'] 74 | 75 | #If the file is empty (only header no data) then return 0s (dummy data) 76 | try: 77 | data = pd.read_csv(self.input, sep='\t', skiprows=skiprows) 78 | except pd.io.common.EmptyDataError: 79 | logging.getLogger("single_cell.align.gcbias").warn( 80 | 'No data in the GCBias output') 81 | #If the file is empty (only header no data) then return 0s (dummy data) 82 | data = np.array([np.arange(100), [0]*100]).T 83 | data = pd.DataFrame(data, columns = ['gc', sample_id]) 84 | return data 85 | 86 | data = pd.DataFrame(data[column_name]) 87 | 88 | data['gc'] = data.index 89 | 90 | df = data.rename(columns={'NORMALIZED_COVERAGE':sample_id}) 91 | 92 | df = df[['gc',sample_id]] 93 | return df 94 | 95 | def main(self): 96 | ''' 97 | main function 98 | ''' 99 | sample_id = self.sample_id 100 | 101 | if self.type == 'hmmcopy_corrected_reads': 102 | data = 
self.read_hmmcopy_corrected_read_file(sample_id) 103 | self.write(data) 104 | else: 105 | data = self.read_gcbias_file(sample_id) 106 | self.write(data, transpose=True) 107 | 108 | -------------------------------------------------------------------------------- /single_cell/workflows/cohort_qc/scripts/oncoplot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(maftools) 4 | 5 | 6 | oncoplot = function(read_maf, oncoplot_path, genes){ 7 | png(filename=oncoplot_path, units="px", width=1600, height=1600, res=300) 8 | 9 | maftools::oncoplot(maf=read_maf,showTumorSampleBarcodes=TRUE,genes=genes) 10 | dev.off() 11 | } 12 | 13 | 14 | 15 | main = function(){ 16 | args = commandArgs(trailingOnly=TRUE) 17 | genes=c("PPM1D", "TP53", "BRCA1", "BRCA2", "MECOM", "RB1", "PTEN", "PALB2","ERBB2", "CDK12", "PIK3CA", "KRAS", "CCNE1", "MYC") 18 | 19 | maf_file = args[1] 20 | vcNames=args[2] 21 | cn=args[3] 22 | oncoplot_path = args[4] 23 | 24 | 25 | vcNames=read.table(vcNames,header=TRUE)$Variant_Classification 26 | 27 | maf = maftools::read.maf(maf=maf_file, cnTable=cn, vc_nonSyn=vcNames) 28 | 29 | oncoplot(maf, oncoplot_path, genes) 30 | 31 | 32 | } 33 | 34 | 35 | main() 36 | 37 | 38 | -------------------------------------------------------------------------------- /single_cell/workflows/cohort_qc/scripts/report.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | date: "`r format(Sys.time(), '%d %B, %Y')`" 3 | output: 4 | html_document 5 | params: 6 | cohort: "" 7 | oncoplot: "" 8 | 9 | --- 10 | --- 11 | title: `r params$cohort` 12 | --- 13 | 14 | ```{r setup, include=FALSE} 15 | knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE, fig.align = 'center') 16 | library(knitr) 17 | 18 | ``` 19 | 20 | ## oncoplot 21 | ```{r adjdist, echo = FALSE, out.width = "100%", out.height = "300",} 22 | # All defaults 23 | include_graphics(params$oncoplot) 24 | ``` 25 | -------------------------------------------------------------------------------- /single_cell/workflows/cohort_qc/scripts/vcf2maf: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | INPUTVCF=$1 4 | OUTPUTMAF=$2 5 | FASTA=$3 6 | VEPDATA=$4 7 | BUFFERSIZE=$5 8 | 9 | vcf2maf.pl --input-vcf $1 --output-maf $2 --ref-fasta $3 --vep-data $4 --vep-path $(dirname `which vep`) --buffer-size $5 10 | -------------------------------------------------------------------------------- /single_cell/workflows/cohort_qc/scripts/vcf2maf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | INPUTVCF=$1 4 | OUTPUTMAF=$2 5 | FASTA=$3 6 | VEPDATA=$4 7 | BUFFERSIZE=$5 8 | 9 | vcf2maf.pl --input-vcf $1 --output-maf $2 --ref-fasta $3 --vep-data $4 --vep-path $(dirname `which vep`) --buffer-size $5 10 | -------------------------------------------------------------------------------- /single_cell/workflows/db_annotation/__init__.py: -------------------------------------------------------------------------------- 1 | import pypeliner 2 | import pypeliner.managed as mgd 3 | 4 | 5 | def create_db_annotation_workflow( 6 | in_vcf_file, 7 | out_csv_file, 8 | db_vcf_file, 9 | split_size=1e4 10 | ): 11 | workflow = pypeliner.workflow.Workflow(ctx=dict(mem=2, num_retry=3, mem_retry_increment=2)) 12 | 13 | workflow.transform( 14 | name='split_vcf', 15 | func='single_cell.utils.vcfutils.split_vcf', 16 | args=( 17 | mgd.InputFile(in_vcf_file), 18 | 
mgd.TempOutputFile('split.vcf', 'split') 19 | ), 20 | kwargs={'lines_per_file': split_size} 21 | ) 22 | 23 | workflow.transform( 24 | name='annotate_db_status', 25 | axes=('split',), 26 | func='single_cell.workflows.db_annotation.tasks.annotate_db_status', 27 | args=( 28 | db_vcf_file, 29 | mgd.TempInputFile('split.vcf', 'split'), 30 | mgd.TempOutputFile('annotated.csv.gz', 'split', extensions=['.yaml']) 31 | ) 32 | ) 33 | 34 | workflow.transform( 35 | name='merge_tables', 36 | func='single_cell.utils.csvutils.concatenate_csv', 37 | args=( 38 | mgd.TempInputFile('annotated.csv.gz', 'split', extensions=['.yaml']), 39 | mgd.OutputFile(out_csv_file, extensions=['.yaml']) 40 | ) 41 | ) 42 | 43 | return workflow 44 | -------------------------------------------------------------------------------- /single_cell/workflows/db_annotation/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_annotate = { 3 | 'cell_id': 'str', 4 | 'chrom': 'str', 5 | 'coord': 'int', 6 | 'ref': 'str', 7 | 'alt': 'str', 8 | 'db_id': 'str', 9 | 'exact_match': 'int', 10 | 'indel': 'int', 11 | 'mappability': 'float', 12 | 'effect': 'str', 13 | 'effect_impact': 'str', 14 | 'functional_class': 'str', 15 | 'codon_change': 'str', 16 | 'amino_acid_change': 'str', 17 | 'amino_acid_length': 'str', 18 | 'gene_name': 'str', 19 | 'transcript_biotype': 'str', 20 | 'gene_coding': 'str', 21 | 'transcript_id': 'str', 22 | 'exon_rank': 'str', 23 | 'genotype': 'str', 24 | 'tri_nucleotide_context': 'str', 25 | } 26 | 27 | 28 | return snv_annotate 29 | -------------------------------------------------------------------------------- /single_cell/workflows/db_annotation/tasks.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import vcf 3 | from single_cell.utils import csvutils 4 | from single_cell.workflows.db_annotation.dtypes import dtypes 5 | 6 | 7 | def annotate_db_status(db_vcf_file, target_vcf_file, out_file): 8 | db_reader = vcf.Reader(filename=db_vcf_file) 9 | 10 | reader = vcf.Reader(filename=target_vcf_file) 11 | 12 | data = [] 13 | 14 | for record in reader: 15 | chrom = record.CHROM 16 | 17 | coord = record.POS 18 | 19 | try: 20 | db_position_records = [x for x in db_reader.fetch(chrom, coord - 1, coord)] 21 | 22 | except ValueError: 23 | db_position_records = [] 24 | 25 | for db_record in db_position_records: 26 | 27 | if (db_record.CHROM != chrom) or (db_record.POS != coord): 28 | continue 29 | 30 | if db_record.is_indel: 31 | indel = 1 32 | 33 | else: 34 | indel = 0 35 | 36 | for alt in record.ALT: 37 | 38 | if (record.REF == db_record.REF) and (alt in db_record.ALT): 39 | exact_match = 1 40 | 41 | else: 42 | exact_match = 0 43 | 44 | out_row = { 45 | 'chrom': chrom, 46 | 'coord': coord, 47 | 'ref': record.REF, 48 | 'alt': str(alt), 49 | 'db_id': db_record.ID, 50 | 'exact_match': exact_match, 51 | 'indel': indel 52 | } 53 | 54 | data.append(out_row) 55 | 56 | data = pd.DataFrame(data) 57 | 58 | csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes()) 59 | -------------------------------------------------------------------------------- /single_cell/workflows/destruct_singlecell/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | cell_counts = { 3 | "cluster_id": "int", 4 | "cell_id": "str", 5 | "read_count": "int" 6 | } 7 | library = { 8 | "prediction_id": "int", 9 | "num_reads": "int", 10 | "num_unique_reads": "int", 11 | 
"library": "str", 12 | "is_normal": "bool", 13 | "patient_id": "float" 14 | } 15 | breakpoints = { 16 | "prediction_id": "int", 17 | "chromosome_1": "str", 18 | "strand_1": "str", 19 | "position_1": "int", 20 | "chromosome_2": "str", 21 | "strand_2": "str", 22 | "position_2": "int", 23 | "homology": "int", 24 | "num_split": "int", 25 | "inserted": "str", 26 | "mate_score": "float", 27 | "template_length_1": "int", 28 | "log_cdf": "float", 29 | "template_length_2": "int", 30 | "log_likelihood": "float", 31 | "template_length_min": "int", 32 | "num_reads": "int", 33 | "num_unique_reads": "int", 34 | "type": "str", 35 | "num_inserted": "int", 36 | "sequence": "str", 37 | "gene_id_1": "str", 38 | "gene_name_1": "str", 39 | "gene_location_1": "str", 40 | "gene_id_2": "str", 41 | "gene_name_2": "str", 42 | "gene_location_2": "str", 43 | "dgv_ids": "float", 44 | "is_germline": "bool", 45 | "is_dgv": "bool", 46 | "num_patients": "int", 47 | "is_filtered": "bool", 48 | "dist_filtered": "float", 49 | "balanced": "bool", 50 | "rearrangement_type": "str" 51 | } 52 | 53 | dtypes = locals() 54 | 55 | return dtypes 56 | -------------------------------------------------------------------------------- /single_cell/workflows/extract_allele_readcounts/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | readcount = { 3 | 'chromosome': 'str', 4 | 'start': 'int', 5 | 'end': 'int', 6 | 'hap_label': 'str', 7 | 'allele_id': 'str', 8 | 'readcount': 'int', 9 | 'cell_id': 'str' 10 | } 11 | 12 | dtypes = locals() 13 | 14 | return dtypes 15 | -------------------------------------------------------------------------------- /single_cell/workflows/extract_allele_readcounts/tasks.py: -------------------------------------------------------------------------------- 1 | from single_cell.utils import csvutils 2 | 3 | 4 | def convert_csv_to_tsv(csv_infile, tsv_outfile): 5 | csvinput = csvutils.CsvInput(csv_infile) 6 | 7 | csvdata = csvinput.read_csv() 8 | 9 | csvdata.to_csv(tsv_outfile, sep='\t', index=False) 10 | -------------------------------------------------------------------------------- /single_cell/workflows/germline/__init__.py: -------------------------------------------------------------------------------- 1 | from pypeliner.workflow import Workflow 2 | 3 | import pypeliner 4 | 5 | default_chromosomes = [str(x) for x in range(1, 23)] + ['X', 'Y'] 6 | 7 | 8 | def create_samtools_germline_workflow( 9 | normal_bam_files, 10 | ref_genome_fasta_file, 11 | vcf_file, 12 | config, 13 | ): 14 | 15 | ctx = {'mem': config["memory"]['low'], 16 | 'mem_retry_increment': 2, 17 | 'disk_retry_increment': 50, 18 | 'ncpus': 1} 19 | 20 | regions = list(normal_bam_files.keys()) 21 | 22 | workflow = Workflow(ctx=ctx) 23 | 24 | workflow.setobj( 25 | obj=pypeliner.managed.OutputChunks('regions'), 26 | value=regions, 27 | ) 28 | 29 | workflow.transform( 30 | name='run_samtools_variant_calling', 31 | axes=('regions',), 32 | func="single_cell.workflows.germline.tasks.run_samtools_variant_calling", 33 | args=( 34 | pypeliner.managed.InputFile('normal.split.bam', 'regions', fnames=normal_bam_files, extensions=['.bai']), 35 | ref_genome_fasta_file, 36 | pypeliner.managed.TempOutputFile('variants.vcf.gz', 'regions'), 37 | ), 38 | kwargs={ 39 | 'region': pypeliner.managed.InputInstance('regions'), 40 | }, 41 | ) 42 | 43 | workflow.transform( 44 | name='concatenate_variants', 45 | func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf", 46 | args=( 47 | 
pypeliner.managed.TempInputFile('variants.vcf.gz', 'regions'), 48 | pypeliner.managed.OutputFile(vcf_file, extensions=['.tbi']), 49 | pypeliner.managed.TempSpace("merge_variants_germline"), 50 | ), 51 | ) 52 | 53 | return workflow 54 | -------------------------------------------------------------------------------- /single_cell/workflows/hmmcopy/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | reads = { 3 | 'chr': 'str', 4 | 'start': 'int', 5 | 'end': 'int', 6 | 'width': 'int', 7 | 'reads': 'int', 8 | 'gc': 'float', 9 | 'cor_gc': 'float', 10 | 'cor_map': 'float', 11 | 'copy': 'float', 12 | 'map': 'float', 13 | 'state': 'float', 14 | 'cell_id': 'str', 15 | 'sample_id': 'str', 16 | 'library_id': 'str', 17 | 'valid': 'bool', 18 | 'ideal': 'bool', 19 | 'modal_curve': 'float', 20 | 'modal_quantile': 'float', 21 | 'multiplier': 'int', 22 | 'is_low_mappability': 'bool' 23 | } 24 | 25 | segs = { 26 | 'chr': 'str', 27 | 'start': 'int', 28 | 'end': 'int', 29 | 'state': 'float', 30 | 'median': 'float', 31 | 'multiplier': 'int', 32 | 'cell_id': 'str', 33 | } 34 | 35 | params = { 36 | 'iteration': 'float', 37 | # 'is_final': 'bool', 38 | 'state':'float', 39 | 'parameter': 'str', 40 | 'cell_id':'str', 41 | 'value':'float', 42 | } 43 | 44 | metrics = { 45 | 'multiplier': 'int', 46 | 'cell_id': 'str', 47 | 'sample_id': 'str', 48 | 'library_id': 'str', 49 | 'MSRSI_non_integerness': 'float', 50 | 'MBRSI_dispersion_non_integerness': 'float', 51 | 'MBRSM_dispersion': 'float', 52 | 'autocorrelation_hmmcopy': 'float', 53 | 'cv_hmmcopy': 'float', 54 | 'empty_bins_hmmcopy': 'int', 55 | 'mad_hmmcopy': 'float', 56 | 'mean_hmmcopy_reads_per_bin': 'float', 57 | 'median_hmmcopy_reads_per_bin': 'float', 58 | 'std_hmmcopy_reads_per_bin': 'float', 59 | 'total_mapped_reads_hmmcopy': 'int', 60 | 'total_halfiness': 'float', 61 | 'scaled_halfiness': 'float', 62 | 'mean_state_mads': 'float', 63 | 'mean_state_vars': 'float', 64 | 'mad_neutral_state': 'float', 65 | 'breakpoints': 'int', 66 | 'mean_copy': 'float', 67 | 'state_mode': 'int', 68 | 'log_likelihood': 'float', 69 | 'true_multiplier': 'float', 70 | 'column': 'int', 71 | 'img_col': 'int', 72 | 'primer_i7': 'str', 73 | 'index_i5': 'str', 74 | 'sample_type': 'str', 75 | 'primer_i5': 'str', 76 | 'experimental_condition': 'str', 77 | 'cell_call': 'str', 78 | 'index_i7': 'str', 79 | 'order': 'int', 80 | 'row': 'int', 81 | 'trim': 'bool', 82 | 'is_control': 'bool' 83 | } 84 | 85 | dtypes = locals() 86 | 87 | return dtypes 88 | -------------------------------------------------------------------------------- /single_cell/workflows/hmmcopy/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | 8 | from .read_counter import ReadCounter 9 | from .convert_csv_to_seg import ConvertCSVToSEG 10 | from .read_counter import ReadCounter 11 | from .correct_read_count import CorrectReadCount 12 | -------------------------------------------------------------------------------- /single_cell/workflows/infer_haps/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | haplotypes = { 3 | 'chromosome': 'str', 4 | 'position': 'int', 5 | 'allele': 'str', 6 | 'hap_label': 'str', 7 | 'allele_id': 'str', 8 | 'ref': 'str', 9 | 'alt': 'str' 10 | } 11 | 12 | dtypes = locals() 13 | 14 | return dtypes 15 | 
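Usage note: the dtypes() helpers above (hmmcopy, infer_haps and the other per-workflow dtypes modules in this dump) all return plain column-name to dtype dicts that are handed to single_cell.utils.csvutils when a table is written, as in the annotate_db_status task earlier. A minimal sketch of that pattern for the infer_haps 'haplotypes' table; the record shown is made up purely for illustration:

import pandas as pd

from single_cell.utils import csvutils
from single_cell.workflows.infer_haps.dtypes import dtypes

# one made-up haplotype record whose keys match the 'haplotypes' dtype dict
df = pd.DataFrame([{
    'chromosome': '1', 'position': 10000, 'allele': '0',
    'hap_label': '1', 'allele_id': '1', 'ref': 'A', 'alt': 'G',
}])

# write the compressed csv together with its yaml dtype/metadata sidecar
csvutils.write_dataframe_to_csv_and_yaml(df, 'haplotypes.csv.gz', dtypes()['haplotypes'])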
-------------------------------------------------------------------------------- /single_cell/workflows/infer_haps/tasks.py: -------------------------------------------------------------------------------- 1 | from single_cell.utils import helpers 2 | import os 3 | 4 | def annotate_ref_alt(haps_csv, refdir, output_csv): 5 | thousand_genomes = os.path.join(refdir, 'thousand_genomes_snps.tsv') 6 | 7 | annotation_data = {} 8 | 9 | with helpers.getFileHandle(thousand_genomes, 'rt') as db: 10 | for line in db: 11 | line = line.strip().split('\t') 12 | 13 | chrom, pos, ref, alt = line 14 | 15 | annotation_data[(chrom, pos)] = (ref, alt) 16 | 17 | with helpers.getFileHandle(haps_csv, 'rt') as reader, helpers.getFileHandle(output_csv, 'wt') as writer: 18 | 19 | header = reader.readline().strip() 20 | header += '\tref\talt\n' 21 | writer.write(header) 22 | 23 | for line in reader: 24 | line = line.strip() 25 | l_split = line.split('\t') 26 | 27 | chrom = l_split[0] 28 | pos = l_split[1] 29 | 30 | if (chrom, pos) in annotation_data: 31 | ref, alt = annotation_data[(chrom, pos)] 32 | else: 33 | ref = 'NA' 34 | alt = 'NA' 35 | 36 | line += '\t{}\t{}\n'.format(ref, alt) 37 | 38 | writer.write(line) 39 | -------------------------------------------------------------------------------- /single_cell/workflows/lumpy/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | evidence = { 3 | "breakpoint_id": "int", 4 | "cell_id": "str", 5 | "count": "int" 6 | } 7 | 8 | breakpoint = { 9 | "breakpoint_id": "int", 10 | "chrom1": "str", 11 | "start1": "int", 12 | "end1": "int", 13 | "strand1": "str", 14 | "max_chr1": "str", 15 | "max_pos1": "int", 16 | "confidence_interval_chr1": "str", 17 | "confidence_interval_start1": "int", 18 | "confidence_interval_end1": "int", 19 | "chrom2": "str", 20 | "start2": "int", 21 | "end2": "int", 22 | "strand2": "str", 23 | "max_chr2": "str", 24 | "max_pos2": "int", 25 | "confidence_interval_chr2": "str", 26 | "confidence_interval_start2": "int", 27 | "confidence_interval_end2": "int", 28 | "type": "str", 29 | "score": "float", 30 | "strands": "str", 31 | "normal_PE": "float", 32 | "tumour_PE": "float", 33 | "tumour_SR": "float", 34 | "normal_SR": "float", 35 | } 36 | dtypes = locals() 37 | 38 | return dtypes 39 | -------------------------------------------------------------------------------- /single_cell/workflows/lumpy/merge_histograms.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | 4 | def parse_histogram(infile): 5 | data = [] 6 | 7 | with open(infile) as inputdata: 8 | for line in inputdata: 9 | if line.startswith('#'): 10 | line = line.strip().split(':') 11 | if line[0] == "#numreads": 12 | numreads = int(line[1]) 13 | elif line[0] == "#mean": 14 | mean = float(line[1]) 15 | elif line[0] == "#stdev": 16 | stdev = float(line[1]) 17 | else: 18 | raise Exception() 19 | continue 20 | 21 | line = line.strip().split(',') 22 | i = int(line[0]) 23 | val = float(line[1]) 24 | data.append((i, val)) 25 | 26 | return data, mean, stdev, numreads 27 | 28 | 29 | def merge_histo(indata, merged_data, numreads): 30 | for (i, val) in indata: 31 | if not i in merged_data: 32 | merged_data[i] = 0 33 | merged_data[i] += (val * numreads) 34 | return merged_data 35 | 36 | 37 | def normalize_histo(merged_data, total_reads): 38 | data = [] 39 | indices = sorted(merged_data.keys()) 40 | for idx in indices: 41 | value = merged_data[idx] 42 | value = value / total_reads 43 | 
data.append((idx, value)) 44 | return data 45 | 46 | 47 | def prune_histogram(histogram): 48 | # towards the tail end, most cells will be 0 49 | # dividing by total reads will make most of these almost 0 50 | # remove these 51 | if not histogram: 52 | return histogram 53 | for idx in range(len(histogram) - 1, -1, -1): 54 | if float(histogram[idx][1]) >= 0.0001: 55 | break 56 | 57 | histogram = histogram[:idx] 58 | 59 | return histogram 60 | 61 | 62 | def write_histo_file(data, outfile): 63 | with open(outfile, 'w') as histo_file: 64 | for i, val in data: 65 | histo_file.write("{}\t{}\n".format(i, val)) 66 | 67 | 68 | def write_metadata(mean, stdev, outfile): 69 | with open(outfile, 'w') as fileoutput: 70 | yaml.safe_dump({'mean': mean, 'stdev': stdev}, fileoutput) 71 | 72 | 73 | def merge_histograms(infiles, outfile, metadata): 74 | merged_data = {} 75 | total_reads = 0 76 | 77 | means = 0 78 | stdevs = 0 79 | 80 | if isinstance(infiles, dict): 81 | infiles = infiles.values() 82 | 83 | # if input is a single file 84 | if isinstance(infiles, str): 85 | infiles = [infiles] 86 | 87 | for infile in infiles: 88 | data, mean, stdev, numreads = parse_histogram(infile) 89 | 90 | merged_data = merge_histo(data, merged_data, numreads) 91 | 92 | total_reads += numreads 93 | 94 | means += (mean * numreads) 95 | stdevs += (stdev * numreads) 96 | 97 | final_histo = normalize_histo(merged_data, total_reads) 98 | final_histo = prune_histogram(final_histo) 99 | 100 | mean = means / total_reads 101 | stdev = stdevs / total_reads 102 | 103 | write_histo_file(final_histo, outfile) 104 | 105 | write_metadata(mean, stdev, metadata) 106 | -------------------------------------------------------------------------------- /single_cell/workflows/mappability_annotation/__init__.py: -------------------------------------------------------------------------------- 1 | import pypeliner 2 | import pypeliner.managed as mgd 3 | 4 | 5 | def create_mappability_annotation_workflow( 6 | in_vcf_file, 7 | out_csv_file, 8 | mappability_file, 9 | split_size=1e4 10 | ): 11 | workflow = pypeliner.workflow.Workflow( 12 | ctx={'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2} 13 | ) 14 | 15 | workflow.transform( 16 | name="get_regions", 17 | func="single_cell.workflows.mappability_annotation.tasks.get_vcf_regions", 18 | ret=mgd.TempOutputObj('regions_obj', 'regions'), 19 | args=( 20 | mgd.InputFile(in_vcf_file, extensions=['.tbi']), 21 | int(split_size), 22 | ), 23 | ) 24 | 25 | workflow.transform( 26 | name='annotate_db_status', 27 | axes=('regions',), 28 | func='single_cell.workflows.mappability_annotation.tasks.get_mappability', 29 | args=( 30 | mappability_file, 31 | mgd.InputFile(in_vcf_file, extensions=['.tbi']), 32 | mgd.TempOutputFile('mappability.csv.gz', 'regions', extensions=['.yaml']) 33 | ), 34 | kwargs={ 35 | 'region': mgd.TempInputObj('regions_obj', 'regions'), 36 | }, 37 | ) 38 | 39 | workflow.transform( 40 | name='merge_tables', 41 | func='single_cell.utils.csvutils.concatenate_csv', 42 | args=( 43 | mgd.TempInputFile('mappability.csv.gz', 'regions', extensions=['.yaml']), 44 | mgd.OutputFile(out_csv_file, extensions=['.yaml']) 45 | ) 46 | ) 47 | 48 | return workflow 49 | -------------------------------------------------------------------------------- /single_cell/workflows/mappability_annotation/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_annotate = { 3 | 'cell_id': 'str', 4 | 'chrom': 'str', 5 | 'coord': 'int', 6 | 'ref': 'str', 7 | 'alt': 
'str', 8 | 'db_id': 'str', 9 | 'exact_match': 'int', 10 | 'indel': 'int', 11 | 'mappability': 'float', 12 | 'effect': 'str', 13 | 'effect_impact': 'str', 14 | 'functional_class': 'str', 15 | 'codon_change': 'str', 16 | 'amino_acid_change': 'str', 17 | 'amino_acid_length': 'str', 18 | 'gene_name': 'str', 19 | 'transcript_biotype': 'str', 20 | 'gene_coding': 'str', 21 | 'transcript_id': 'str', 22 | 'exon_rank': 'str', 23 | 'genotype': 'str', 24 | 'tri_nucleotide_context': 'str', 25 | } 26 | 27 | 28 | return snv_annotate 29 | -------------------------------------------------------------------------------- /single_cell/workflows/merge_bams/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 11, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | import pypeliner.managed as mgd 8 | 9 | import pypeliner 10 | 11 | 12 | def create_merge_bams_workflow( 13 | input_bams, 14 | merged_bams, 15 | regions, 16 | config, 17 | ): 18 | merged_bams = dict([(region, merged_bams[region]) 19 | for region in regions]) 20 | 21 | 22 | workflow = pypeliner.workflow.Workflow() 23 | 24 | workflow.setobj( 25 | obj=mgd.OutputChunks('cell_id'), 26 | value=list(input_bams.keys()), 27 | ) 28 | 29 | workflow.setobj( 30 | obj=mgd.OutputChunks('region'), 31 | value=regions, 32 | ) 33 | 34 | one_split_job = config["one_split_job"] 35 | 36 | if one_split_job: 37 | workflow.transform( 38 | name='merge_bams', 39 | ctx={'mem': config['memory']['med'], 'ncpus': config['max_cores']}, 40 | func="single_cell.workflows.merge_bams.tasks.merge_bams", 41 | args=( 42 | mgd.InputFile('bam', 'cell_id', fnames=input_bams, extensions=['.bai']), 43 | mgd.OutputFile('merged.bam', "region", fnames=merged_bams, axes_origin=[], extensions=['.bai']), 44 | regions, 45 | mgd.TempSpace("merge_bams_tempdir") 46 | ), 47 | kwargs={"ncores": config["max_cores"]} 48 | ) 49 | else: 50 | workflow.transform( 51 | name='split_merge_tumour', 52 | func='single_cell.workflows.merge_bams.tasks.cell_region_merge_bams', 53 | axes=('region',), 54 | args=( 55 | mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=input_bams), 56 | mgd.OutputFile( 57 | 'tumour_regions.bam', 'region', axes_origin=[], extensions=['.bai'], fnames=merged_bams), 58 | mgd.Instance('region'), 59 | ), 60 | ) 61 | 62 | return workflow 63 | -------------------------------------------------------------------------------- /single_cell/workflows/merge_bams/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | 8 | from .collect_metrics import CollectMetrics -------------------------------------------------------------------------------- /single_cell/workflows/merge_bams/tasks.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | import os 7 | 8 | from single_cell.utils import bamutils 9 | from single_cell.utils import helpers 10 | 11 | 12 | def cell_region_merge_bams(cell_bams, region_bam, region): 13 | cell_bams = cell_bams.values() 14 | region = '{}:{}-{}'.format(*region.split('-')) 15 | 16 | bamutils.bam_merge( 17 | cell_bams, region_bam, 18 | region=region) 19 | 20 | bamutils.bam_index( 21 | region_bam, region_bam + '.bai', 22 | ) 23 | 24 | 25 | def merge_bams(bams, outputs, regions, tempdir, ncores=None): 26 | merge_tempdir = os.path.join(tempdir, "merge") 27 | commands = [] 28 | for 
region in regions: 29 | output = outputs[region] 30 | region = '{}:{}-{}'.format(*region.split('-')) 31 | cmd = list(['samtools', 'merge', '-f', '-R', region]) 32 | cmd.append(output) 33 | cmd.extend(bams.values()) 34 | commands.append(cmd) 35 | helpers.run_in_gnu_parallel(commands, merge_tempdir, ncores=ncores) 36 | 37 | index_tempdir = os.path.join(tempdir, "index") 38 | commands = [] 39 | for region in regions: 40 | output = outputs[region] 41 | commands.append(['samtools', 'index', output, output + ".bai"]) 42 | 43 | helpers.run_in_gnu_parallel(commands, index_tempdir, ncores=ncores) 44 | -------------------------------------------------------------------------------- /single_cell/workflows/mutationseq/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_museq = { 3 | "chrom": "str", 4 | "coord": "int", 5 | "ref": "str", 6 | "alt": "str", 7 | "score": "float" 8 | } 9 | 10 | dtypes = locals() 11 | 12 | return dtypes 13 | -------------------------------------------------------------------------------- /single_cell/workflows/mutationseq/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | from .parse_museq import ParseMuseq -------------------------------------------------------------------------------- /single_cell/workflows/mutationseq/scripts/parse_museq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: dgrewal 4 | 5 | Last updated: Diljot Grewal Jun 3 2015 6 | 7 | reads vcf files, filters and write the output in tsv format 8 | """ 9 | 10 | #!/usr/bin/env python 11 | 12 | from vizutils import Utils as pau 13 | from vizutils import Vcf 14 | 15 | class ParseMuseq(object): 16 | ''' 17 | parse, filter and print museq vcf in tsv format 18 | ''' 19 | 20 | def __init__(self, **kwargs): 21 | 22 | self.infiles = pau.get_inputs(kwargs.get('tid'), 23 | kwargs.get('nid'), 24 | kwargs.get('case'), 25 | kwargs.get('infile'), 26 | kwargs.get('all_files'), 27 | fh_names='infile') 28 | 29 | self.output = kwargs.get('output') 30 | self.project = kwargs.get('project') 31 | 32 | self.genes = pau.read_file_to_list(kwargs.get('genes')) 33 | self.snpeff_keywords = kwargs.get('snpeff_keywords') 34 | self.chromosomes = kwargs.get('chromosomes') 35 | self.remove_duplicates = kwargs.get('rm_dups') 36 | self.pr_threshold = kwargs.get('pr_thres') 37 | 38 | self.keep_dbsnp = kwargs.get('keep_dbsnp') 39 | self.keep_1000gen = kwargs.get('keep_1000gen') 40 | 41 | 42 | def main(self): 43 | ''' 44 | loop through files, load, filter and print 45 | ''' 46 | header = False 47 | with open(self.output, 'w') as outfile: 48 | for (case, tum, norm), fname in self.infiles.items(): 49 | 50 | museq = Vcf(tumour_id = tum, 51 | normal_id = norm, 52 | case_id = case, 53 | infile = fname, 54 | snpeff_keywords = self.snpeff_keywords, 55 | keep_dbsnp = self.keep_dbsnp, 56 | keep_1000gen = self.keep_1000gen, 57 | chromosomes = self.chromosomes, 58 | genes = self.genes, 59 | rmdups = self.remove_duplicates, 60 | pr_threshold = self.pr_threshold, 61 | mode = 'museq' 62 | ) 63 | #write header 64 | if not header: 65 | colnames = museq.get_info_header() 66 | pau.write_list(outfile, colnames, sep=",") 67 | header=True 68 | 69 | infos = museq.get_data() 70 | 71 | for info in infos: 72 | pau.write_list(outfile, info, sep=',') 73 | 
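Note on region keys: the merge_bams tasks above and the mutationseq run_museq task further below both expect region identifiers of the form 'chrom-start-end' and rewrite them into the 'chrom:start-end' syntax that samtools and museq take on the command line. A small illustration with a made-up region:

region = '1-1000000-2000000'                        # pipeline-style region key
cli_region = '{}:{}-{}'.format(*region.split('-'))  # -> '1:1000000-2000000'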
-------------------------------------------------------------------------------- /single_cell/workflows/mutationseq/scripts/vizutils/__init__.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | 4 | from parseutils import ParseUtils 5 | from vcf import Vcf 6 | from utils import Utils 7 | -------------------------------------------------------------------------------- /single_cell/workflows/mutationseq/tasks.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | import pypeliner 7 | from single_cell.utils import vcfutils 8 | 9 | 10 | def subsample(input_bam, output_bam, max_coverage=10000): 11 | cmd = ['variant', input_bam, '-m', max_coverage, '-v', '-b', '-o', output_bam] 12 | pypeliner.commandline.execute(*cmd) 13 | 14 | cmd = ['samtools', 'index', output_bam] 15 | pypeliner.commandline.execute(*cmd) 16 | 17 | 18 | def run_museq(tumour, normal, out, log, region, config): 19 | ''' 20 | Run museq script for each chromosome 21 | 22 | :param tumour: path to tumour bam 23 | :param normal: path to normal bam 24 | :param out: path to temporary output VCF file for the chromosome 25 | :param log: path to the log file 26 | :param config: path to the config YAML file 27 | :param chrom: chromosome number 28 | ''' 29 | 30 | reference = config['ref_genome'] 31 | 32 | region = '{}:{}-{}'.format(*region.split('-')) 33 | 34 | cmd = ['museq', 'normal:' + normal, 'tumour:' + tumour, 35 | 'reference:' + reference, '--out', out, 36 | '--log', log, '--interval', region] 37 | 38 | museq_params = config.get('museq_params', {}) 39 | for key, val in museq_params.items(): 40 | if isinstance(val, bool): 41 | if val: 42 | cmd.append('--{}'.format(key)) 43 | else: 44 | cmd.append('--{}'.format(key)) 45 | if isinstance(val, list): 46 | cmd.extend(val) 47 | else: 48 | cmd.append(val) 49 | 50 | pypeliner.commandline.execute(*cmd) 51 | 52 | 53 | def concatenate_vcfs(inputs, output): 54 | vcfutils.concatenate_vcf(inputs, output) 55 | -------------------------------------------------------------------------------- /single_cell/workflows/pseudo_bulk_qc/scripts/mergemafs.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(tidyverse) 4 | library(data.table) 5 | args <- commandArgs(TRUE) 6 | 7 | input = args[1] 8 | output = args[2] 9 | maf = data.table::fread(input) 10 | 11 | filtmaf <- filter(maf, str_detect(Consequence, "frameshift|stop") | IMPACT == "HIGH") %>% 12 | group_by_at(vars(-contains("depth"), -contains("count"))) %>% 13 | summarise(t_depth = sum(t_depth), 14 | t_ref_count = sum(t_ref_count), 15 | t_alt_count = sum(t_alt_count), 16 | n_depth = sum(n_depth), 17 | n_ref_count = sum(n_ref_count), 18 | n_alt_count = sum(n_alt_count), 19 | nlibrary = n() 20 | ) %>% 21 | ungroup() %>% 22 | mutate(tVAF = t_alt_count / t_depth, nVAF = n_alt_count / n_depth) %>% 23 | dplyr::select(id, Hugo_Symbol, Chromosome, Start_Position, 24 | Reference_Allele, Variant_Type, Tumor_Seq_Allele1, 25 | Tumor_Seq_Allele2, Consequence, IMPACT, tVAF, nVAF, nlibrary) %>% 26 | dplyr::arrange(id, Chromosome, Start_Position) 27 | 28 | write_delim(filtmaf, output, delim = "\t") 29 | -------------------------------------------------------------------------------- /single_cell/workflows/pseudo_bulk_qc/scripts/mergesnvs.R: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(tidyverse) 4 | 5 | args <- commandArgs(TRUE) 6 | input = data.table::fread(args[1]) 7 | print(input) 8 | output = args[2] 9 | 10 | 11 | filtsnvs <- input %>% 12 | group_by_at(vars(-contains("counts"), -num_cells)) %>% 13 | summarise(alt_counts = sum(alt_counts), 14 | ref_counts = sum(ref_counts), 15 | total_counts = sum(total_counts), 16 | num_cells = sum(num_cells), 17 | nlibrary = n() 18 | ) %>% 19 | ungroup() %>% 20 | mutate(tVAF = alt_counts / total_counts) %>% 21 | dplyr::select(chrom,coord,ref,alt,gene_name,effect,effect_impact,is_cosmic, 22 | amino_acid_change,num_cells,alt_counts,ref_counts,total_counts, 23 | id, tVAF, nlibrary) %>% 24 | dplyr::arrange(id, chrom, coord) 25 | 26 | write_delim(filtsnvs, output, delim = "\t") 27 | -------------------------------------------------------------------------------- /single_cell/workflows/pseudo_bulk_qc/scripts/vcf2maf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | INPUTVCF=$1 4 | OUTPUTMAF=$2 5 | FASTA=$3 6 | VEPDATA=$4 7 | BUFFERSIZE=$5 8 | 9 | vcf2maf.pl --input-vcf $1 --output-maf $2 --ref-fasta $3 --vep-data $4 --vep-path $(dirname `which vep`) --buffer-size $5 10 | -------------------------------------------------------------------------------- /single_cell/workflows/qc_annotation/dtypes.py: -------------------------------------------------------------------------------- 1 | def metrics_dtypes(): 2 | metrics = { 3 | 'cell_id': 'str', 4 | 'sample_id': 'str', 5 | 'library_id': 'str', 6 | 'multiplier': 'Int64', 7 | 'MSRSI_non_integerness': 'float64', 8 | 'MBRSI_dispersion_non_integerness': 'float64', 9 | 'MBRSM_dispersion': 'float64', 10 | 'autocorrelation_hmmcopy': 'float64', 11 | 'cv_hmmcopy': 'float64', 12 | 'empty_bins_hmmcopy': 'Int64', 13 | 'mad_hmmcopy': 'float64', 14 | 'mean_hmmcopy_reads_per_bin': 'float64', 15 | 'median_hmmcopy_reads_per_bin': 'float64', 16 | 'std_hmmcopy_reads_per_bin': 'float64', 17 | 'total_mapped_reads_hmmcopy': 'Int64', 18 | 'total_halfiness': 'float64', 19 | 'scaled_halfiness': 'float64', 20 | 'mean_state_mads': 'float64', 21 | 'mean_state_vars': 'float64', 22 | 'mad_neutral_state': 'float64', 23 | 'breakpoints': 'Int64', 24 | 'mean_copy': 'float64', 25 | 'state_mode': 'Int64', 26 | 'log_likelihood': 'float64', 27 | 'true_multiplier': 'float64', 28 | 'column': 'Int64', 29 | 'img_col': 'Int64', 30 | 'primer_i7': 'str', 31 | 'index_i5': 'str', 32 | 'sample_type': 'str', 33 | 'primer_i5': 'str', 34 | 'experimental_condition': 'str', 35 | 'cell_call': 'str', 36 | 'index_i7': 'str', 37 | 'order': 'Int64', 38 | 'row': 'Int64', 39 | 'is_s_phase': 'bool', 40 | 'is_s_phase_prob': 'float64', 41 | 'quality': 'float64', 42 | 'coverage_depth': 'float64', 43 | 'paired_duplicate_reads': 'Int64', 44 | 'total_reads': 'Int64', 45 | 'unpaired_duplicate_reads': 'Int64', 46 | 'percent_duplicate_reads': 'float64', 47 | 'coverage_breadth': 'float64', 48 | 'mean_insert_size': 'float64', 49 | 'unpaired_mapped_reads': 'Int64', 50 | 'median_insert_size': 'float64', 51 | 'total_duplicate_reads': 'Int64', 52 | 'is_contaminated': 'bool', 53 | 'is_control': 'bool', 54 | 'estimated_library_size': 'Int64', 55 | 'standard_deviation_insert_size': 'float64', 56 | 'unmapped_reads': 'Int64', 57 | 'total_mapped_reads': 'Int64', 58 | 'total_properly_paired': 'Int64', 59 | 'paired_mapped_reads': 'Int64', 60 | 'order_corrupt_tree': 'Int64', 61 | 'species': 'str', 
62 | 'trim': 'bool', 63 | 'aligned': 'float', 64 | 'expected': 'float', 65 | 'overlap_with_all_filters': 'float', 66 | 'overlap_with_all_filters_and_qual': 'float', 67 | 'overlap_with_dups': 'float', 68 | 'overlap_without_dups': 'float', 69 | } 70 | 71 | return metrics 72 | 73 | 74 | def fastqscreen_dtypes(genome_labels): 75 | metrics = { 76 | 'fastqscreen_nohit': 'int', 77 | 'fastqscreen_nohit_ratio': 'float', 78 | 'cell_id': 'str' 79 | } 80 | for label in genome_labels: 81 | metrics['fastqscreen_{}'.format(label)] = 'int' 82 | metrics['fastqscreen_{}_multihit'.format(label)] = 'int' 83 | metrics['fastqscreen_{}_ratio'.format(label)] = 'float' 84 | 85 | return metrics 86 | 87 | 88 | def dtypes(genome_labels): 89 | return {**metrics_dtypes(), **fastqscreen_dtypes(genome_labels)} 90 | -------------------------------------------------------------------------------- /single_cell/workflows/qc_annotation/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | 7 | from . import generate_qc -------------------------------------------------------------------------------- /single_cell/workflows/qc_annotation/scripts/fastqscreen_classify.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from single_cell.utils import csvutils 3 | from sklearn.ensemble import RandomForestClassifier 4 | from sklearn.preprocessing import * 5 | import numpy as np 6 | 7 | def train(training_data_path): 8 | ''' 9 | Train the model using the provided training data. 10 | Return a feature scaler and a classifier. 11 | ''' 12 | data = pd.read_csv(training_data_path) 13 | species_list = ["salmon", "grch37", "mm10"] 14 | labels = data["species"] 15 | features = data.drop('species', axis=1) 16 | 17 | le = LabelEncoder() 18 | le.fit(species_list) 19 | # convert the labels 20 | labels = le.transform(labels) 21 | # train a feature scaler 22 | transformer = RobustScaler().fit(features) 23 | features = transformer.transform(features) 24 | # train the random forest model 25 | rf = RandomForestClassifier(n_estimators=10, random_state=42) 26 | rf.fit(features, labels) 27 | 28 | return features, transformer, rf 29 | 30 | 31 | def classify_fastqscreen(training_data_path, metrics_path, metrics_output, dtypes): 32 | df = csvutils.read_csv_and_yaml(metrics_path) 33 | features_train, feature_transformer, model = train(training_data_path) 34 | 35 | features = ["fastqscreen_nohit_ratio", "fastqscreen_grch37_ratio", "fastqscreen_mm10_ratio", 36 | "fastqscreen_salmon_ratio"] 37 | label_to_species = {0: "grch37", 1: "mm10", 2: "salmon"} 38 | # check if all the features exists, if yes, make predictions, else create an empty species column. 
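    # Each entry of `features` ends with the 6-character suffix "_ratio", so
    # feature[:-6] recovers the raw fastqscreen count column it is derived from
    # (e.g. "fastqscreen_grch37_ratio" -> "fastqscreen_grch37"). The check on
    # the next line only derives the ratio features (count / total_reads) when
    # every one of those raw count columns is present in the metrics table.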
39 | exist = all([feature[:-6] in df for feature in features]) 40 | if exist: 41 | # make the feature columns 42 | for feature in features: 43 | df[feature] = df[feature[:-6]].divide(df["total_reads"]) 44 | # check if there's any missing value 45 | feature_test = df[features] 46 | feature_test = feature_test.replace([np.inf, -np.inf], np.nan) 47 | feature_test.fillna(features_train.mean(), inplace=True) 48 | # scale the features 49 | scaled_features = feature_transformer.transform(feature_test) 50 | df["species"] = model.predict(scaled_features) 51 | df["species"].replace(label_to_species, inplace=True) 52 | csvutils.write_dataframe_to_csv_and_yaml(df, metrics_output, dtypes) 53 | -------------------------------------------------------------------------------- /single_cell/workflows/qc_annotation/tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | from single_cell.utils import csvutils 5 | from single_cell.workflows.qc_annotation import tasks 6 | 7 | 8 | def test_contamination(tmpdir): 9 | data = {} 10 | 11 | cols = [ 12 | 'fastqscreen_nohit', 13 | 'fastqscreen_grch37', 14 | 'fastqscreen_grch37_multihit', 15 | 'fastqscreen_mm10', 16 | 'fastqscreen_mm10_multihit', 17 | 'fastqscreen_salmon', 18 | 'fastqscreen_salmon_multihit' 19 | ] 20 | 21 | for i in range(5): 22 | data[i] = {'cell_id': 'SA123_A123_R{0}_C{0}'.format(i)} 23 | for col in cols: 24 | data[i][col] = i * 10 25 | data[i]['fastqscreen_grch37'] = i * 1000 26 | data[i]['fastqscreen_mm10'] = i * 100 27 | 28 | for i in range(5, 10): 29 | data[i] = {'cell_id': 'SA123_A123_R{0}_C{0}'.format(i)} 30 | for col in cols: 31 | data[i][col] = (i * 10) 32 | data[i]['fastqscreen_grch37'] = i * 1000 33 | 34 | data = pd.DataFrame.from_dict(data, orient='index') 35 | data['total_reads'] = data[cols].sum(axis=1) 36 | 37 | dtypes = {col: 'int' for col in cols} 38 | dtypes['cell_id'] = 'str' 39 | dtypes['total_reads'] = 'int' 40 | 41 | infile = os.path.join(tmpdir, 'input.csv.gz') 42 | outfile = os.path.join(tmpdir, 'output.csv.gz') 43 | 44 | csvutils.write_dataframe_to_csv_and_yaml(data, infile, dtypes) 45 | 46 | config = {'genomes': [{'name': 'grch37'}, {'name': 'mm10'}, {'name': 'salmon'}]} 47 | 48 | tasks.add_contamination_status(infile, outfile, config) 49 | 50 | output = csvutils.read_csv_and_yaml(outfile) 51 | 52 | assert output['is_contaminated'].tolist() == [False] + [True] * 4 + [False] * 5 53 | -------------------------------------------------------------------------------- /single_cell/workflows/snpeff_annotation/__init__.py: -------------------------------------------------------------------------------- 1 | import pypeliner 2 | 3 | import pypeliner.managed as mgd 4 | 5 | 6 | 7 | def create_snpeff_annotation_workflow( 8 | in_vcf_file, 9 | out_csv_file, 10 | db, 11 | data_dir, 12 | split_size=int(1e3) 13 | ): 14 | workflow = pypeliner.workflow.Workflow( 15 | ctx={'num_retry': 3, 'mem_retry_increment': 2} 16 | ) 17 | 18 | workflow.transform( 19 | name='split_vcf', 20 | func='single_cell.utils.vcfutils.split_vcf', 21 | args=( 22 | mgd.InputFile(in_vcf_file), 23 | mgd.TempOutputFile('split.vcf', 'split') 24 | ), 25 | kwargs={'lines_per_file': split_size} 26 | ) 27 | 28 | workflow.transform( 29 | name='run_snpeff', 30 | axes=('split',), 31 | func='single_cell.workflows.snpeff_annotation.tasks.run_snpeff', 32 | args=( 33 | db, 34 | data_dir, 35 | mgd.TempInputFile('split.vcf', 'split'), 36 | mgd.TempOutputFile('snpeff.vcf', 'split') 37 | ), 38 | kwargs={ 39 | 
'classic_mode': True 40 | } 41 | ) 42 | 43 | workflow.transform( 44 | name='convert_vcf_to_csv', 45 | axes=('split',), 46 | func='single_cell.workflows.snpeff_annotation.tasks.convert_vcf_to_table', 47 | args=( 48 | mgd.TempInputFile('snpeff.vcf', 'split'), 49 | mgd.TempOutputFile('snpeff.csv.gz', 'split', extensions=['.yaml']), 50 | ) 51 | ) 52 | 53 | workflow.transform( 54 | name='concatenate_tables', 55 | func='single_cell.utils.csvutils.concatenate_csv', 56 | args=( 57 | mgd.TempInputFile('snpeff.csv.gz', 'split', extensions=['.yaml']), 58 | mgd.OutputFile(out_csv_file, extensions=['.yaml']) 59 | ) 60 | ) 61 | 62 | return workflow 63 | -------------------------------------------------------------------------------- /single_cell/workflows/snpeff_annotation/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_annotate = { 3 | 'cell_id': 'str', 4 | 'chrom': 'str', 5 | 'coord': 'int', 6 | 'ref': 'str', 7 | 'alt': 'str', 8 | 'db_id': 'str', 9 | 'exact_match': 'int', 10 | 'indel': 'int', 11 | 'mappability': 'float', 12 | 'effect': 'str', 13 | 'effect_impact': 'str', 14 | 'functional_class': 'str', 15 | 'codon_change': 'str', 16 | 'amino_acid_change': 'str', 17 | 'amino_acid_length': 'str', 18 | 'gene_name': 'str', 19 | 'transcript_biotype': 'str', 20 | 'gene_coding': 'str', 21 | 'transcript_id': 'str', 22 | 'exon_rank': 'str', 23 | 'genotype': 'str', 24 | 'tri_nucleotide_context': 'str', 25 | } 26 | 27 | 28 | return snv_annotate 29 | -------------------------------------------------------------------------------- /single_cell/workflows/snpeff_annotation/tasks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from collections import OrderedDict 4 | 5 | import pandas as pd 6 | import pypeliner 7 | import vcf 8 | from single_cell.utils import csvutils 9 | from single_cell.workflows.snpeff_annotation.dtypes import dtypes 10 | 11 | 12 | def run_snpeff(db, data_dir, in_vcf_file, out_file, classic_mode=True): 13 | os.environ['MALLOC_ARENA_MAX'] = '2' 14 | data_dir = os.path.abspath(data_dir) 15 | 16 | cmd = [ 17 | 'snpEff', 18 | '-noStats', 19 | '-noLog', 20 | '-Xms2g', 21 | '-Xmx5g', 22 | '-hgvs1LetterAa', 23 | '-dataDir', 24 | data_dir, 25 | ] 26 | 27 | if classic_mode: 28 | cmd.append('-classic') 29 | 30 | cmd.extend([ 31 | db, 32 | in_vcf_file, 33 | '>', 34 | out_file 35 | ]) 36 | 37 | pypeliner.commandline.execute(*cmd) 38 | 39 | 40 | class ClassicSnpEffParser(object): 41 | 42 | def __init__(self, file_name): 43 | self._reader = vcf.Reader(filename=file_name) 44 | 45 | self.fields = self._get_field_names() 46 | 47 | self._buffer = [] 48 | 49 | self._effect_matcher = re.compile(r'(.*)\(') 50 | 51 | self._fields_matcher = re.compile(r'\((.*)\)') 52 | 53 | def __iter__(self): 54 | while True: 55 | try: 56 | yield self.next() 57 | except StopIteration: 58 | break 59 | 60 | def next(self): 61 | while len(self._buffer) == 0: 62 | record = next(self._reader) 63 | 64 | if 'EFF' not in record.INFO: 65 | continue 66 | 67 | for row in self._parse_record(record): 68 | self._buffer.append(row) 69 | 70 | return self._buffer.pop(0) 71 | 72 | def _get_field_names(self): 73 | fields = [] 74 | 75 | match = re.search(r'\((.*)\[', self._reader.infos['EFF'].desc) 76 | 77 | for x in match.groups()[0].split('|'): 78 | fields.append(x.strip().lower()) 79 | 80 | return fields 81 | 82 | def _parse_record(self, record): 83 | for annotation in record.INFO['EFF']: 84 | effect = 
self._effect_matcher.search(annotation).groups()[0] 85 | 86 | out_row = OrderedDict(( 87 | ('chrom', record.CHROM), 88 | ('coord', record.POS), 89 | ('ref', record.REF), 90 | ('alt', ','.join([str(x) for x in record.ALT])), 91 | ('effect', effect), 92 | )) 93 | 94 | fields = self._fields_matcher.search(annotation).groups()[0].split('|') 95 | 96 | for i, key in enumerate(self.fields): 97 | out_row[key] = fields[i] 98 | 99 | yield out_row 100 | 101 | 102 | def convert_vcf_to_table(in_file, out_file): 103 | data = [] 104 | 105 | parser = ClassicSnpEffParser(in_file) 106 | 107 | for row in parser: 108 | data.append(row) 109 | 110 | data = pd.DataFrame(data) 111 | 112 | csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes()) 113 | -------------------------------------------------------------------------------- /single_cell/workflows/snv_allele_counts/__init__.py: -------------------------------------------------------------------------------- 1 | import pypeliner 2 | import pypeliner.managed as mgd 3 | from single_cell.workflows.snv_allele_counts.dtypes import dtypes 4 | 5 | 6 | def create_snv_allele_counts_for_vcf_targets_workflow( 7 | bam_files, 8 | vcf_file, 9 | out_file, 10 | sample_id, 11 | library_id, 12 | memory_cfg, 13 | count_duplicates=False, 14 | min_bqual=0, 15 | min_mqual=0, 16 | vcf_to_bam_chrom_map=None, 17 | ): 18 | ctx = { 19 | 'mem': memory_cfg['low'], 'num_retry': 3, 'mem_retry_increment': 2, 'ncpus': 1, 20 | 'disk_retry_increment': 50, 21 | } 22 | workflow = pypeliner.workflow.Workflow(ctx=ctx) 23 | 24 | workflow.setobj( 25 | obj=mgd.OutputChunks('cell_id'), 26 | value=list(bam_files.keys()), 27 | ) 28 | 29 | workflow.transform( 30 | name='get_snv_allele_counts_for_vcf_targets', 31 | axes=('cell_id',), 32 | func="biowrappers.components.variant_calling.snv_allele_counts.tasks.get_snv_allele_counts_for_vcf_targets", 33 | args=( 34 | mgd.InputFile('tumour.bam', 'cell_id', fnames=bam_files, extensions=['.bai']), 35 | mgd.InputFile(vcf_file), 36 | mgd.TempOutputFile('counts.csv.gz', 'cell_id', extensions=['.yaml']), 37 | ), 38 | kwargs={ 39 | 'count_duplicates': count_duplicates, 40 | 'min_bqual': min_bqual, 41 | 'min_mqual': min_mqual, 42 | 'vcf_to_bam_chrom_map': vcf_to_bam_chrom_map, 43 | 'cell_id': mgd.Instance('cell_id'), 44 | 'sample_id': sample_id, 45 | 'library_id': library_id, 46 | 'report_zero_count_positions': False, 47 | 'dtypes': dtypes()['snv_allele_counts'], 48 | 'write_header': False 49 | } 50 | ) 51 | 52 | workflow.transform( 53 | name='merge_snv_allele_counts', 54 | ctx={'mem': memory_cfg['high'], 'disk': 20}, 55 | func="single_cell.utils.csvutils.concatenate_csv", 56 | args=( 57 | mgd.TempInputFile('counts.csv.gz', 'cell_id', extensions=['.yaml']), 58 | mgd.OutputFile(out_file, extensions=['.yaml']), 59 | ), 60 | kwargs={ 61 | 'write_header': True, 62 | } 63 | ) 64 | 65 | return workflow 66 | -------------------------------------------------------------------------------- /single_cell/workflows/snv_allele_counts/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_allele_counts = { 3 | 'chrom': 'str', 4 | 'coord': 'int', 5 | 'ref': 'str', 6 | 'alt': 'str', 7 | 'ref_counts': 'int', 8 | 'alt_counts': 'int', 9 | 'cell_id': 'str', 10 | 'sample_id': 'str', 11 | 'library_id': 'str', 12 | } 13 | 14 | dtypes = locals() 15 | 16 | return dtypes 17 | -------------------------------------------------------------------------------- /single_cell/workflows/snv_annotate/__init__.py: 
-------------------------------------------------------------------------------- 1 | import pypeliner 2 | import pypeliner.managed as mgd 3 | 4 | 5 | def create_snv_annotate_workflow( 6 | config, 7 | museq_vcf, 8 | strelka_vcf, 9 | mappability_csv, 10 | snpeff_csv, 11 | trinuc_csv, 12 | additional_csv, 13 | memory_config, 14 | ): 15 | ctx = { 16 | 'mem': memory_config['low'], 'num_retry': 3, 'mem_retry_increment': 2, 'ncpus': 1, 17 | 'disk_retry_increment': 50, 18 | } 19 | split_size = config['split_size'] 20 | 21 | workflow = pypeliner.workflow.Workflow(ctx=ctx) 22 | 23 | workflow.transform( 24 | name='merge_snvs', 25 | func='biowrappers.components.io.vcf.tasks.merge_vcfs', 26 | ctx=ctx, 27 | args=( 28 | [ 29 | mgd.InputFile(museq_vcf, extensions=['.tbi', '.csi']), 30 | mgd.InputFile(strelka_vcf, extensions=['.tbi', '.csi']), 31 | ], 32 | mgd.TempOutputFile('all.snv.vcf') 33 | ), 34 | ) 35 | 36 | workflow.transform( 37 | name='finalise_snvs', 38 | func="biowrappers.components.io.vcf.tasks.finalise_vcf", 39 | ctx=ctx, 40 | args=( 41 | mgd.TempInputFile('all.snv.vcf'), 42 | mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']) 43 | ), 44 | ) 45 | 46 | workflow.subworkflow( 47 | name='snpeff_annotation', 48 | func="single_cell.workflows.snpeff_annotation.create_snpeff_annotation_workflow", 49 | args=( 50 | mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']), 51 | mgd.OutputFile(snpeff_csv, extensions=['.yaml']), 52 | config['databases']['snpeff']['db'], 53 | config['databases']['snpeff']['path'], 54 | ) 55 | ) 56 | 57 | workflow.subworkflow( 58 | name='trinuc_annotation', 59 | func="single_cell.workflows.trinuc_annotation.create_trinuc_annotation_workflow", 60 | args=( 61 | mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']), 62 | mgd.OutputFile(trinuc_csv, extensions=['.yaml']), 63 | config['ref_genome'], 64 | ), 65 | kwargs={'split_size': split_size} 66 | ) 67 | 68 | workflow.subworkflow( 69 | name='mappability_annotation', 70 | func="single_cell.workflows.mappability_annotation.create_mappability_annotation_workflow", 71 | args=( 72 | mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']), 73 | mgd.OutputFile(mappability_csv, extensions=['.yaml']), 74 | config['databases']['mappability']['path'], 75 | ), 76 | kwargs={'split_size': split_size} 77 | ) 78 | 79 | for k, v in config['databases']['additional_databases'].items(): 80 | workflow.subworkflow( 81 | name='{}_status'.format(k), 82 | func='single_cell.workflows.db_annotation.create_db_annotation_workflow', 83 | ctx=dict(mem=4, mem_retry_increment=2), 84 | args=( 85 | mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']), 86 | mgd.OutputFile(additional_csv[k], extensions=['.yaml']), 87 | v['path'], 88 | ), 89 | kwargs={'split_size': split_size} 90 | ) 91 | 92 | return workflow 93 | -------------------------------------------------------------------------------- /single_cell/workflows/split_bams/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 21, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | import pypeliner.managed as mgd 7 | 8 | import pypeliner 9 | 10 | 11 | def create_split_workflow( 12 | normal_bam, normal_split_bam, 13 | regions, config, by_reads=False 14 | ): 15 | 16 | normal_split_bam = dict([(ival, normal_split_bam[ival]) 17 | for ival in regions]) 18 | 19 | one_split_job = config["one_split_job"] 20 | 21 | workflow = pypeliner.workflow.Workflow() 22 | 23 | workflow.setobj( 24 | 
obj=mgd.OutputChunks('region'), 25 | value=regions, 26 | ) 27 | 28 | # split by reads always runs no a single node 29 | if by_reads: 30 | workflow.transform( 31 | name='split_normal_bam', 32 | ctx={'mem': config['memory']['low'], 'ncpus': config['max_cores']}, 33 | func="single_cell.workflows.split_bams.tasks.split_bam_file_by_reads", 34 | args=( 35 | mgd.InputFile(normal_bam, extensions=['.bai']), 36 | mgd.OutputFile( 37 | "normal.split.bam", "region", 38 | fnames=normal_split_bam, axes_origin=[], 39 | extensions=['.bai'] 40 | ), 41 | mgd.TempSpace("bam_split_by_reads"), 42 | regions, 43 | ), 44 | ) 45 | 46 | elif one_split_job: 47 | workflow.transform( 48 | name='split_normal_bam', 49 | ctx={'mem': config['memory']['low'], 'ncpus': config['max_cores']}, 50 | func="single_cell.workflows.split_bams.tasks.split_bam_file_one_job", 51 | args=( 52 | mgd.InputFile(normal_bam, extensions=['.bai']), 53 | mgd.OutputFile( 54 | "normal.split.bam", "region", 55 | fnames=normal_split_bam, axes_origin=[], 56 | extensions=['.bai'], 57 | ), 58 | regions, 59 | mgd.TempSpace("one_job_split_tempdir") 60 | ), 61 | kwargs={"ncores": config["max_cores"]} 62 | ) 63 | 64 | else: 65 | workflow.transform( 66 | name='split_normal_bam', 67 | ctx={'mem': config['memory']['low'], 'ncpus': config['max_cores']}, 68 | axes=('region',), 69 | func="single_cell.workflows.split_bams.tasks.split_bam_file", 70 | args=( 71 | mgd.InputFile(normal_bam, extensions=['.bai']), 72 | mgd.OutputFile( 73 | "normal.split.bam", "region", fnames=normal_split_bam, 74 | extensions=['.bai'] 75 | ), 76 | mgd.InputInstance('region') 77 | ) 78 | ) 79 | 80 | return workflow 81 | -------------------------------------------------------------------------------- /single_cell/workflows/strelka/components_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 21, 2015 3 | 4 | @author: Andrew Roth 5 | ''' 6 | import errno 7 | import os 8 | import random 9 | import time 10 | 11 | 12 | def find(name, path): 13 | for root, _, files in os.walk(path): 14 | if name in files: 15 | return os.path.join(root, name) 16 | 17 | 18 | def get_ancestor_directory(path, level=1): 19 | ''' 20 | Get the path of the directory a specified number of levels above the given path. 21 | 22 | >>> get_ancestor_directory('/foo/bar/some/where/my_file.txt', level=2) 23 | '/foo/bar/some' 24 | ''' 25 | ancestor_dir = path 26 | 27 | for _ in range(level): 28 | ancestor_dir = os.path.dirname(ancestor_dir) 29 | 30 | return ancestor_dir 31 | 32 | 33 | def make_directory(target_dir, mode=775): 34 | ''' 35 | Check if a directory exists and make it if not. 36 | 37 | For example, given /some/where make the folder /some/where. If /some does not exist, it will also be made. 38 | ''' 39 | i = 0 40 | 41 | try: 42 | old_umask = os.umask(0000) 43 | 44 | while not os.path.exists(target_dir): 45 | # Randomly sleep for a short random time so multiple simultaneous calls don't try to create the directory. 46 | time.sleep(random.random() * 2) 47 | 48 | try: 49 | os.makedirs(target_dir, mode) 50 | 51 | except OSError: 52 | i += 1 53 | 54 | if i > 10: 55 | raise 56 | 57 | finally: 58 | os.umask(old_umask) 59 | 60 | 61 | def make_parent_directory(file_name, mode=775): 62 | ''' 63 | Given a file name, make the parent directory if it does not exist using make_directory. 64 | 65 | For example, given /some/where/foo.bar make the folder /some/where. 
66 | ''' 67 | parent_dir = os.path.dirname(file_name) 68 | 69 | make_directory(parent_dir, mode=mode) 70 | 71 | 72 | def flatten_input(files): 73 | if type(files) == dict: 74 | parsed_files = [files[x] for x in sorted(files)] 75 | elif type(files) == str: 76 | parsed_files = [files, ] 77 | else: 78 | parsed_files = [] 79 | for x in files: 80 | if type(x) == dict: 81 | parsed_files.extend([x[y] for y in sorted(x)]) 82 | else: 83 | parsed_files.append(x) 84 | return parsed_files 85 | 86 | 87 | def remove(filename): 88 | ''' 89 | Remove a file that may not exist 90 | ''' 91 | try: 92 | os.remove(filename) 93 | except OSError as e: 94 | if e.errno != errno.ENOENT: 95 | raise 96 | 97 | 98 | def symlink(filename, link_name=None, link_directory=None): 99 | ''' 100 | Create a symlink, with additional options for flexibility, 101 | 102 | Args: 103 | filename (str): file to link to 104 | 105 | KwArgs: 106 | link_name (str): base name of the link, defaults to same as link to 107 | link_directory (str): directory of the, defaults to directory of link to 108 | 109 | ''' 110 | if link_name is None: 111 | link_name = os.path.basename(filename) 112 | if link_directory is None: 113 | link_directory = os.getcwd() 114 | link_filename = os.path.join(link_directory, link_name) 115 | remove(link_filename) 116 | filename = os.path.abspath(filename) 117 | os.symlink(filename, link_filename) 118 | return link_filename 119 | 120 | 121 | if __name__ == '__main__': 122 | import doctest 123 | 124 | doctest.testmod() 125 | -------------------------------------------------------------------------------- /single_cell/workflows/strelka/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_strelka = { 3 | "chrom": "str", 4 | "coord": "int", 5 | "ref": "str", 6 | "alt": "str", 7 | "score": "int" 8 | } 9 | 10 | dtypes = locals() 11 | 12 | return dtypes 13 | -------------------------------------------------------------------------------- /single_cell/workflows/strelka/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 24, 2017 3 | 4 | @author: dgrewal 5 | ''' 6 | from .parse_strelka import ParseStrelka -------------------------------------------------------------------------------- /single_cell/workflows/strelka/scripts/vizutils/__init__.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | 4 | from parseutils import ParseUtils 5 | from vcf import Vcf 6 | from utils import Utils 7 | -------------------------------------------------------------------------------- /single_cell/workflows/trinuc_annotation/__init__.py: -------------------------------------------------------------------------------- 1 | import pypeliner 2 | import pypeliner.managed as mgd 3 | 4 | 5 | def create_trinuc_annotation_workflow( 6 | in_vcf_file, 7 | out_csv_file, 8 | ref_genome, 9 | split_size=int(1e4), 10 | ): 11 | workflow = pypeliner.workflow.Workflow( 12 | ctx={'num_retry': 3, 'mem_retry_increment': 2} 13 | ) 14 | 15 | workflow.transform( 16 | name='split_vcf', 17 | func='single_cell.utils.vcfutils.split_vcf', 18 | args=( 19 | mgd.InputFile(in_vcf_file), 20 | mgd.TempOutputFile('split.vcf', 'split') 21 | ), 22 | kwargs={'lines_per_file': split_size} 23 | ) 24 | 25 | workflow.transform( 26 | name='annotate_db_status', 27 | axes=('split',), 28 | func='single_cell.workflows.trinuc_annotation.tasks.get_tri_nucelotide_context', 29 | 
args=( 30 | ref_genome, 31 | mgd.TempInputFile('split.vcf', 'split'), 32 | mgd.TempOutputFile('tri_nucleotide_context.csv.gz', 'split', extensions=['.yaml']), 33 | ) 34 | ) 35 | 36 | workflow.transform( 37 | name='merge_tables', 38 | func='single_cell.utils.csvutils.concatenate_csv', 39 | args=( 40 | mgd.TempInputFile('tri_nucleotide_context.csv.gz', 'split', extensions=['.yaml']), 41 | mgd.OutputFile(out_csv_file, extensions=['.yaml']) 42 | ) 43 | ) 44 | 45 | return workflow 46 | -------------------------------------------------------------------------------- /single_cell/workflows/trinuc_annotation/dtypes.py: -------------------------------------------------------------------------------- 1 | def dtypes(): 2 | snv_annotate = { 3 | 'cell_id': 'str', 4 | 'chrom': 'str', 5 | 'coord': 'int', 6 | 'ref': 'str', 7 | 'alt': 'str', 8 | 'db_id': 'str', 9 | 'exact_match': 'int', 10 | 'indel': 'int', 11 | 'mappability': 'float', 12 | 'effect': 'str', 13 | 'effect_impact': 'str', 14 | 'functional_class': 'str', 15 | 'codon_change': 'str', 16 | 'amino_acid_change': 'str', 17 | 'amino_acid_length': 'str', 18 | 'gene_name': 'str', 19 | 'transcript_biotype': 'str', 20 | 'gene_coding': 'str', 21 | 'transcript_id': 'str', 22 | 'exon_rank': 'str', 23 | 'genotype': 'str', 24 | 'tri_nucleotide_context': 'str', 25 | } 26 | 27 | 28 | return snv_annotate 29 | -------------------------------------------------------------------------------- /single_cell/workflows/trinuc_annotation/tasks.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pysam 3 | import vcf 4 | 5 | from single_cell.utils import csvutils 6 | from single_cell.workflows.trinuc_annotation.dtypes import dtypes 7 | 8 | 9 | def get_tri_nucelotide_context(ref_genome_fasta_file, vcf_file, out_file): 10 | vcf_reader = vcf.Reader(filename=vcf_file) 11 | 12 | fasta_reader = pysam.Fastafile(ref_genome_fasta_file) 13 | 14 | data = [] 15 | 16 | for record in vcf_reader: 17 | chrom = record.CHROM 18 | 19 | coord = record.POS 20 | 21 | tri_nucleotide_context = fasta_reader.fetch(chrom, coord - 2, coord + 1) 22 | 23 | data.append({'chrom': record.CHROM, 'coord': record.POS, 'tri_nucleotide_context': tri_nucleotide_context}) 24 | 25 | data = pd.DataFrame(data) 26 | 27 | csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes()) 28 | --------------------------------------------------------------------------------
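Coordinate note on get_tri_nucelotide_context above: VCF records are 1-based while pysam.Fastafile.fetch takes 0-based, half-open coordinates, so fetch(chrom, coord - 2, coord + 1) returns exactly three bases: the one before the variant, the variant base itself and the one after. A minimal sketch with a made-up reference path and position:

import pysam

fasta = pysam.Fastafile('ref.fa')   # hypothetical reference FASTA
coord = 100                         # 1-based position taken from a VCF record
context = fasta.fetch('1', coord - 2, coord + 1)
# fetch spans 0-based indices 98..100, i.e. 1-based positions 99, 100 and 101,
# which is the trinucleotide context centred on the variant.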