├── .gitignore
├── Dockerfiles
├── akt.0.3.2.dockerfile
├── alientrimmer.0.4.0.dockerfile
├── bwa.0.7.17_samtools.dockerfile
├── canvas.1.35.dockerfile
├── jsm.0.7.5-1.dockerfile
├── lofreq.2.1.3.1-1.dockerfile
├── manta.1.4.0.dockerfile
├── muse_1.0rc_c.dockerfile
├── nirvana.2.0.9.dockerfile
├── picard.2.27.5.dockerfile
├── sambamba.1.0.0.dockerfile
├── samtools.1.19.2.dockerfile
├── scalpel.0.5.4.dockerfile
├── somaticseq.base-1.7.dockerfile
├── somaticseq.master.dockerfile
├── somaticseq.release.dockerfile
├── somaticsniper.1.0.5.0-2.dockerfile
├── strelka.2.9.5.dockerfile
├── tabix.1.19.dockerfile
├── trimmomatic.0.39.dockerfile
├── vardictjava.dockerfile
└── vcftools.0.1.14-2.dockerfile
├── LICENSE.txt
├── MODULES.md
├── README.md
├── docs
├── Manual.pdf
├── Manual.tex
├── Manual.tex.backup
├── Refs.bib
├── SomaticSeqYoutube.png
├── heatmap400.png
├── precisionfda.png
├── seqc2.md
├── train_for_classifiers.md
└── workflow400.png
├── pyproject.toml
├── r_scripts
├── __init__.py
├── ada_cross_validation.R
├── ada_model_builder.R
├── ada_model_builder_ntChange.R
└── ada_model_predictor.R
├── setup.cfg
├── setup.py
├── somaticseq
├── __init__.py
├── _version.py
├── annotate_caller.py
├── bam_features.py
├── combine_callers.py
├── defaults.py
├── genomic_file_parsers
│ ├── __init__.py
│ ├── concat.py
│ ├── genomic_file_handlers.py
│ ├── pileup_reader.py
│ └── read_info_extractor.py
├── ntchange_type.py
├── run_somaticseq.py
├── sequencing_features.py
├── single_sample_vcf2tsv.py
├── somatic_tsv2vcf.py
├── somatic_vcf2tsv.py
├── somatic_xgboost.py
├── somaticseq_parallel.py
├── tsv2vcf.py
├── utilities
│ ├── BAM_filter.py
│ ├── README.md
│ ├── __init__.py
│ ├── attach_pileupVAF.py
│ ├── bamQC.py
│ ├── bedFileHandler.py
│ ├── combo_callers_evaluator.py
│ ├── dockered_pipelines
│ │ ├── QC
│ │ │ ├── extract_callableRegions.sh
│ │ │ └── extract_coverageDepth.sh
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── alignments
│ │ │ ├── BQSR.sh
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── align.py
│ │ │ ├── jointIndelRealign.sh
│ │ │ ├── markdup.py
│ │ │ ├── mergeBams.py
│ │ │ ├── mergeFastqs.py
│ │ │ ├── singleIndelRealign.sh
│ │ │ ├── spreadFastq.py
│ │ │ └── trim.py
│ │ ├── bamSimulator
│ │ │ ├── BamSimulator_multiThreads.sh
│ │ │ ├── BamSimulator_singleThread.sh
│ │ │ ├── README.md
│ │ │ ├── bamSurgeon
│ │ │ │ ├── IndelRealign.sh
│ │ │ │ ├── MergeTN.sh
│ │ │ │ ├── Reheader_SM.sh
│ │ │ │ ├── SortByCoordinate.sh
│ │ │ │ ├── SortByReadName.sh
│ │ │ │ ├── bamsurgeon_addindels.sh
│ │ │ │ ├── bamsurgeon_addsnvs.sh
│ │ │ │ ├── bamsurgeon_addsvs.sh
│ │ │ │ ├── bamsurgeon_random_sites.sh
│ │ │ │ ├── bamsurgeon_split_BAM.sh
│ │ │ │ ├── cleanBam.sh
│ │ │ │ ├── concatVcfFiles.sh
│ │ │ │ ├── convert_nonStandardBasesInVcfs.py
│ │ │ │ ├── mergeBamFiles.sh
│ │ │ │ └── split_BAM_by_BED.sh
│ │ │ ├── dream_sim.jpg
│ │ │ ├── onkoinsight_sim.png
│ │ │ └── replicate_sim.jpg
│ │ ├── container_option.py
│ │ ├── germline_variants
│ │ │ ├── Canvas.sh
│ │ │ ├── Manta.sh
│ │ │ ├── Nirvana.sh
│ │ │ ├── README.md
│ │ │ ├── VQSR.sh
│ │ │ ├── bam2vcf.sh
│ │ │ └── haplotypeCaller.sh
│ │ ├── makeAlignmentScripts.py
│ │ ├── makeSomaticScripts.py
│ │ ├── run_workflows.py
│ │ ├── somatic_mutations
│ │ │ ├── JointSNVMix2.py
│ │ │ ├── LoFreq.py
│ │ │ ├── MuSE.py
│ │ │ ├── MuTect2.py
│ │ │ ├── Scalpel.py
│ │ │ ├── SomaticSniper.py
│ │ │ ├── Strelka2.py
│ │ │ ├── VarDict.py
│ │ │ ├── VarScan2.py
│ │ │ └── __init__.py
│ │ ├── tumor_normal_run.py
│ │ └── tumor_only_run.py
│ ├── filter_SomaticSeq_VCF.py
│ ├── linguistic_sequence_complexity.py
│ ├── lociCounterWithLabels.py
│ ├── lociCounters.py
│ ├── multi-nucleotide_phaser.py
│ ├── paired_end_bam2fastq.py
│ ├── plot_TPvsFP.py
│ ├── reformat_VCF2SEQC2.py
│ ├── remove_callers_from_somaticseq_tsv.py
│ ├── singularities
│ │ ├── QC
│ │ │ ├── extract_callableRegions.sh
│ │ │ └── extract_coverageDepth.sh
│ │ ├── README.md
│ │ ├── bamSimulator
│ │ │ ├── BamSimulator_multiThreads.sh
│ │ │ ├── BamSimulator_singleThread.sh
│ │ │ ├── README.md
│ │ │ └── bamSurgeon
│ │ │ │ ├── IndelRealign.sh
│ │ │ │ ├── MergeTN.sh
│ │ │ │ ├── Reheader_SM.sh
│ │ │ │ ├── SortByCoordinate.sh
│ │ │ │ ├── SortByReadName.sh
│ │ │ │ ├── bamsurgeon_addindels.sh
│ │ │ │ ├── bamsurgeon_addsnvs.sh
│ │ │ │ ├── bamsurgeon_addsvs.sh
│ │ │ │ ├── bamsurgeon_random_sites.sh
│ │ │ │ ├── bamsurgeon_split_BAM.sh
│ │ │ │ ├── cleanBam.sh
│ │ │ │ ├── concatVcfFiles.sh
│ │ │ │ ├── mergeBamFiles.sh
│ │ │ │ └── split_BAM_by_BED.sh
│ │ └── germline_variants
│ │ │ ├── Nirvana.sh
│ │ │ ├── VQSR.sh
│ │ │ ├── bam2vcf.sh
│ │ │ └── haplotypeCaller.sh
│ ├── snakemake
│ │ ├── README.md
│ │ ├── Snakefile
│ │ └── config.yaml
│ ├── split_bed_into_equal_regions.py
│ ├── split_mergedBed.py
│ ├── tally_MyVCF_vs_Truth.py
│ ├── tally_variants_from_multiple_vcfs.py
│ ├── trimSoftClippedReads.py
│ ├── variant_annotation.py
│ └── vcfsorter.pl
└── vcf_modifier
│ ├── __init__.py
│ ├── bed_util.py
│ ├── complex2indel.py
│ ├── copy_TextFile.py
│ ├── getUniqueVcfPositions.py
│ ├── leftAlign.py
│ ├── modify_JointSNVMix2.py
│ ├── modify_MuTect.py
│ ├── modify_MuTect2.py
│ ├── modify_SomaticSniper.py
│ ├── modify_Strelka.py
│ ├── modify_VarDict.py
│ ├── modify_VarScan2.py
│ ├── modify_ssMuTect2.py
│ ├── modify_ssStrelka.py
│ └── split_vcf.py
└── tests
├── conftest.py
├── example
├── README.md
├── Varsim.somatic.truth.vcf
├── invoke_dockerized_tumor_normal_callers.sh
├── invoke_dockerized_tumor_only_callers.sh
├── normal.markdup.bam
├── normal.markdup.bam.bai
├── paired_example
│ ├── Consensus.sINDEL.vcf.gz
│ ├── Consensus.sSNV.vcf.gz
│ ├── LoFreq.indel.vcf.gz
│ ├── LoFreq.snv.vcf.gz
│ ├── MuSE.vcf.gz
│ ├── MuTect2.vcf.gz
│ ├── Scalpel.vcf.gz
│ ├── SomaticSniper.vcf.gz
│ ├── Strelka.indel.vcf.gz
│ ├── Strelka.snv.vcf.gz
│ └── VarDict.vcf.gz
├── paired_somaticseq_example.sh
├── results_check.sh
├── single_somaticseq_example.sh
├── tiny.dict
├── tiny.fa
├── tiny.fa.fai
├── tiny_dbsnp.vcf
├── tiny_dbsnp.vcf.gz
├── tiny_dbsnp.vcf.gz.tbi
├── tumor.markdup.bam
├── tumor.markdup.bam.bai
└── tumor_only_example
│ ├── Consensus.sINDEL.vcf.gz
│ ├── Consensus.sSNV.vcf.gz
│ ├── MuTect2.vcf.gz
│ ├── Strelka.vcf.gz
│ └── VarDict.vcf.gz
├── functional
└── test_somaticseq.py
└── unit
├── genomic_file_parsers
└── test_read_info_extractor.py
├── utilities
└── test_split_bed_into_equal_regions.py
└── vcf_modifier
├── test_bed_utils.py
└── test_split_vcf.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .DS_Store
3 | __pycache__
4 | Manual.aux
5 | Manual.bbl
6 | Manual.blg
7 | Manual.log
8 | Manual.out
9 | Manual.synctex.gz
10 | *.egg-info
11 | dist
12 | build
13 | .vscode/
14 | .made
15 | .ipynb_checkpoints/
16 | poetry.lock
17 |
--------------------------------------------------------------------------------
/Dockerfiles/akt.0.3.2.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:18.04
2 |
3 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y -q install wget libeigen3-dev tar zlib1g-dev libbz2-dev liblzma-dev bcftools r-base imagemagick && apt clean
4 | RUN cd /opt && wget https://github.com/samtools/htslib/releases/download/1.8/htslib-1.8.tar.bz2 && tar -xjvf htslib-1.8.tar.bz2 && cd htslib-1.8 && ./configure && make && make install
5 | RUN cd /opt && wget https://github.com/Illumina/akt/archive/v0.3.2.tar.gz && tar -xvf v0.3.2.tar.gz && cd akt-0.3.2 && make && cd .. && ln -s akt-0.3.2 akt
6 |
--------------------------------------------------------------------------------
/Dockerfiles/alientrimmer.0.4.0.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && apt-get install -y wget gcj-jdk make unzip && apt-get clean
4 | RUN cd /opt && wget ftp://ftp.pasteur.fr/pub/gensoft/projects/AlienTrimmer/AlienTrimmer_0.4.0.tar.gz && tar -xvf AlienTrimmer_0.4.0.tar.gz && cd AlienTrimmer_0.4.0/src && make && cp -p AlienTrimmer AlienTrimmer.java /usr/local/bin/
5 | RUN cd /opt && wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.36.zip && unzip Trimmomatic-0.36.zip && ln -s Trimmomatic-0.36 Trimmomatic
6 |
--------------------------------------------------------------------------------
/Dockerfiles/bwa.0.7.17_samtools.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:24.04
2 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install bwa samtools && apt-get clean
3 |
--------------------------------------------------------------------------------
/Dockerfiles/canvas.1.35.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && apt-get install -y wget mono-runtime mono-complete tar libunwind-dev && apt-get clean
4 | RUN cd /opt && wget https://download.microsoft.com/download/D/7/A/D7A9E4E9-5D25-4F0C-B071-210CB8267943/dotnet-ubuntu.16.04-x64.1.1.2.tar.gz && tar -xvf dotnet-ubuntu.16.04-x64.1.1.2.tar.gz && ln -s /opt/shared/Microsoft.NETCore.App/1.1.2/dotnet /usr/bin/dotnet
5 | RUN cd /opt && wget https://github.com/Illumina/canvas/releases/download/1.35.1.1316%2Bmaster/Canvas-1.35.1.1316.master_x64.tar.gz && tar -xvf Canvas-1.35.1.1316.master_x64.tar.gz && ln -s 'Canvas-1.35.1.1316+master_x64/' Canvas && chmod a+x Canvas/tabix
6 |
--------------------------------------------------------------------------------
/Dockerfiles/jsm.0.7.5-1.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:14.04
2 |
3 | RUN apt-get update && apt-get install -y python python-dev wget build-essential samtools zlib1g-dev cython && apt-get clean
4 | RUN cd /opt && wget https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/pysam/pysam-0.5.tar.gz && tar -xvf pysam-0.5.tar.gz
5 | RUN cd /opt/pysam-0.5 && wget https://pypi.python.org/packages/2.7/s/setuptools/setuptools-0.6c11-py2.7.egg && python setup.py build && python setup.py install
6 |
7 | RUN cd /opt && wget https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/joint-snv-mix/JointSNVMix-0.7.5.tar.gz && tar -xvf JointSNVMix-0.7.5.tar.gz && cd JointSNVMix-0.7.5 && python setup.py install
8 | RUN cd /opt && wget https://www.dropbox.com/s/rbegan3opz2fc4k/vcfsorter.pl && chmod a+x vcfsorter.pl
9 |
--------------------------------------------------------------------------------
/Dockerfiles/lofreq.2.1.3.1-1.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && apt-get install -y wget zlib1g-dev bzip2 libncurses5-dev build-essential python automake libtool git && apt-get clean
4 | RUN cd /opt/ && wget https://downloads.sourceforge.net/project/samtools/samtools/1.1/samtools-1.1.tar.bz2 && tar -xvf samtools-1.1.tar.bz2 && cd samtools-1.1 && make && make install && cd /opt/samtools-1.1/htslib-1.1 && make && make install
5 | RUN cd /opt && git clone https://github.com/CSB5/lofreq.git && cd lofreq && libtoolize && ./bootstrap && ./configure SAMTOOLS=/opt/samtools-1.1 HTSLIB=/opt/samtools-1.1/htslib-1.1 && make && make install
6 |
--------------------------------------------------------------------------------
/Dockerfiles/manta.1.4.0.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:18.04
2 |
3 | RUN apt update && apt -y install wget tar bzip2 python && apt clean
4 | RUN cd /opt && wget https://github.com/Illumina/manta/releases/download/v1.4.0/manta-1.4.0.centos6_x86_64.tar.bz2 && tar -xvf manta-1.4.0.centos6_x86_64.tar.bz2 && ln -s manta-1.4.0.centos6_x86_64 manta
5 |
--------------------------------------------------------------------------------
/Dockerfiles/muse_1.0rc_c.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:20.04
2 |
3 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install wget
4 | RUN cd /usr/local/bin/ && wget http://bioinformatics.mdanderson.org/Software/MuSE/MuSEv1.0rc_submission_c039ffa && chmod a+x MuSEv1.0rc_submission_c039ffa
5 |
--------------------------------------------------------------------------------
/Dockerfiles/nirvana.2.0.9.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && apt-get -y install wget tar mono-runtime mono-complete libunwind-dev libcurl3 libssl1.0.0 libssl-dev && apt-get clean
4 | RUN cd /opt && wget https://download.microsoft.com/download/2/E/C/2EC018A0-A0FC-40A2-849D-AA692F68349E/dotnet-sdk-2.1.105-linux-x64.tar.gz && tar -xvf dotnet-sdk-2.1.105-linux-x64.tar.gz && ln -s /opt/dotnet /usr/local/bin/dotnet
5 | RUN cd /opt && wget https://github.com/Illumina/Nirvana/archive/v2.0.9.tar.gz && tar -xvf v2.0.9.tar.gz && ln -s Nirvana-2.0.9 Nirvana && cd Nirvana-2.0.9 && /opt/dotnet build -c Release
6 | RUN chmod a+rx /root/
7 |
--------------------------------------------------------------------------------
/Dockerfiles/picard.2.27.5.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:22.04
2 |
3 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install openjdk-8-jdk wget picard-tools && apt clean
4 | RUN ln -s /usr/bin/PicardCommandLine /usr/bin/picard
5 | RUN cd /opt && wget https://github.com/broadinstitute/picard/releases/download/2.27.5/picard.jar
6 | RUN apt -y autoremove wget
7 |
--------------------------------------------------------------------------------
/Dockerfiles/sambamba.1.0.0.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:23.04
2 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install sambamba
3 |
--------------------------------------------------------------------------------
/Dockerfiles/samtools.1.19.2.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:24.04
2 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install samtools && apt clean
3 |
--------------------------------------------------------------------------------
/Dockerfiles/scalpel.0.5.4.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && apt-get install -y wget perl make cmake build-essential zlib1g-dev libncurses5-dev libncursesw5-dev cpanminus && apt-get clean
4 | RUN cpanm -f Term::ReadKey && cpanm -f Term::ReadLine && cpanm -f FindBin
5 | RUN cd /opt && wget https://downloads.sourceforge.net/project/scalpel/scalpel-0.5.4.tar.gz && tar -xvf scalpel-0.5.4.tar.gz && ln -s scalpel-0.5.4 scalpel && cd scalpel-0.5.4 && make
6 | RUN cd /opt/scalpel-0.5.4/samtools-1.1 && make && make install
7 | RUN cd /opt && wget https://www.dropbox.com/s/rbegan3opz2fc4k/vcfsorter.pl && chmod a+x vcfsorter.pl
8 |
--------------------------------------------------------------------------------
/Dockerfiles/somaticseq.base-1.7.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:24.10
2 |
3 | RUN export DEBIAN_FRONTEND=noninteractive && \
4 | apt update && \
5 | apt -y install r-base python3 python3-pip git wget default-jre bedtools && \
6 | apt-get clean
7 | RUN R -e "install.packages('ada', repos = 'http://cran.rstudio.com/')"
8 |
--------------------------------------------------------------------------------
/Dockerfiles/somaticseq.master.dockerfile:
--------------------------------------------------------------------------------
1 | FROM lethalfang/somaticseq:base-1.7
2 |
3 | RUN cd /opt && \
4 | git clone https://github.com/bioinform/somaticseq && \
5 | cd somaticseq && \
6 | pip install --no-cache-dir --break-system-packages .
7 |
--------------------------------------------------------------------------------
/Dockerfiles/somaticseq.release.dockerfile:
--------------------------------------------------------------------------------
1 | # Ex: docker build --build-arg VERSION='3.10.0' -f somaticseq.release.dockerfile .
2 | FROM lethalfang/somaticseq:base-1.7
3 |
4 | ARG VERSION
5 | RUN cd /opt && \
6 | wget https://github.com/bioinform/somaticseq/archive/refs/tags/v${VERSION}.tar.gz && \
7 | tar -xvf v${VERSION}.tar.gz && \
8 | mv somaticseq-${VERSION} somaticseq && \
9 | cd somaticseq && \
10 | pip install --no-cache-dir --break-system-packages .
11 |
--------------------------------------------------------------------------------
/Dockerfiles/somaticsniper.1.0.5.0-2.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && apt-get install -y build-essential git-core cmake zlib1g-dev libncurses-dev
4 | RUN cd /opt/ && git clone https://github.com/genome/somatic-sniper.git && mkdir -p /opt/somatic-sniper/build && cd /opt/somatic-sniper/build && cmake ../ && make deps && make -j
5 |
--------------------------------------------------------------------------------
/Dockerfiles/strelka.2.9.5.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && apt-get install -y wget bzip2 python && apt-get clean
4 | RUN cd /opt && wget https://github.com/Illumina/strelka/releases/download/untagged-839da48539154c23a780/strelka-2.9.5.centos6_x86_64.tar.bz2 && tar -xvf strelka-2.9.5.centos6_x86_64.tar.bz2 && rm strelka-2.9.5.centos6_x86_64.tar.bz2 && ln -s strelka-2.9.5.centos6_x86_64 strelka
5 |
--------------------------------------------------------------------------------
/Dockerfiles/tabix.1.19.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:24.04
2 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install tabix && apt clean
3 |
--------------------------------------------------------------------------------
/Dockerfiles/trimmomatic.0.39.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:20.04
2 |
3 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install wget unzip default-jdk && apt clean
4 | RUN cd /opt && wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.39.zip && unzip Trimmomatic-0.39.zip && ln -s Trimmomatic-0.39 Trimmomatic && cd Trimmomatic && ln -s trimmomatic-0.39.jar trimmomatic.jar
5 |
--------------------------------------------------------------------------------
/Dockerfiles/vardictjava.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && apt-get install -y wget default-jre r-base samtools git
4 | ENV JAVA_HOME=''
5 | RUN cd /opt && wget https://github.com/AstraZeneca-NGS/VarDictJava/releases/download/1.7.0/VarDict-1.7.0.tar && tar -xvf VarDict-1.7.0.tar && ln -s VarDict-1.7.0 VarDictJava && git clone https://github.com/AstraZeneca-NGS/VarDict.git && wget https://www.dropbox.com/s/rbegan3opz2fc4k/vcfsorter.pl && chmod a+x vcfsorter.pl
6 |
--------------------------------------------------------------------------------
/Dockerfiles/vcftools.0.1.14-2.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:18.04
2 |
3 | RUN apt-get update && apt-get install -y vcftools wget && apt-get clean
4 | RUN cd /opt && wget https://www.dropbox.com/s/rbegan3opz2fc4k/vcfsorter.pl && chmod a+x vcfsorter.pl
5 | RUN cd /opt && wget https://www.dropbox.com/s/bpv098m36j8ljk4/vcftools.script.sh && chmod a+x vcftools.script.sh
6 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015, Bina Technologies inc.
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification,
5 | are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 |
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
12 |
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
14 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
15 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
16 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
17 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
18 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
19 | THE POSSIBILITY OF SUCH DAMAGE.
20 |
21 |
--------------------------------------------------------------------------------
/MODULES.md:
--------------------------------------------------------------------------------
1 | # SomaticSeq Modules
2 |
3 | `somaticseq` is the overarching command that takes VCF outputs from individual
4 | callers all the way to the end. For customized or debugging purposes, a number
5 | of modules can be run independently.
6 |
7 | ### Extract features from tumor and normal BAM files for any VCF file
8 |
9 | After all the VCF files are combined, `somaticseq_paired_vcf2tsv` or
10 | `somaticseq_single_vcf2tsv` were invoked to extract genomic and sequencing
11 | features from BAM and VCF files. These modules can be used independently to
12 | extract BAM features with _any_ sorted VCF files, e.g.,
13 |
14 | ```
15 | somaticseq_paired_vcf2tsv -myvcf Variants_Of_Interest.vcf -nbam normal.bam -tbam tumor.bam -ref human.fasta -mincaller 0 -outfile Variants_with_BAM_Features.tsv
16 | ```
17 |
18 | Notice the `-mincaller 0` option above, which tells the module to extract
19 | features if at least 0 callers have called the variant as somatic. In other
20 | words, `-mincaller 0` tells the module to extract features for every input
21 | candidate. Default in SomaticSeq is `-mincaller 0.5` which means it will keep
22 | variants that are LowQual in some callers, but REJECT calls can be excluded.
23 |
24 | Run `somaticseq_paired_vcf2tsv -h` or `somaticseq_single_vcf2tsv -h` to see
25 | command line options.
26 |
27 | ### Convert SomaticSeq TSV file to SomaticSeq VCF file
28 |
29 | Run `somaticseq_tsv2vcf -h` to see all the command line options. The VCF file
30 | (`-vcf/--vcf-out`) is the output file, e.g.,
31 |
32 | ```
33 | somaticseq_tsv2vcf --tsv-in predicted_snvs.tsv --vcf-out predicted_snvs.vcf --pass-threshold 0.7 --lowqual-threshold 0.1 --individual-mutation-tools MuTect2 VarDict Strelka --emit-all --phred-scale --paired-samples
34 | ```
35 |
36 | It can only work on SomaticSeq generated TSV files.
37 |
38 | ### Train XGBoost model
39 |
40 | Run `somaticseq_xgboost train -h` to see all the options.
41 |
42 | You can combine multiple TSV files to create one single model, and try different
43 | parameters, e.g.,
44 |
45 | ```
46 | somaticseq_xgboost train -tsvs SAMPLE-01_SNVs.tsv SAMPLE-02_SNVs.tsv .... SAMPLE-NN_SNVs.tsv -out SNV.xgboost.classifier -threads 8 -depth 12 -seed 1234 -method hist -iter 250 --extra-params grow_policy:lossguide max_leaves:24
47 | ```
48 |
49 | ### Train AdaBoost model
50 |
51 | You can only input one TSV file, or combine them manually, e.g.,
52 | `somaticseq_concat -infiles */Ensemble.sSNV.tsv -outfile Ensemble.sSNVs.tsv`.
53 |
54 | ```
55 | ada_model_builder_ntChange.R Ensemble.sSNVs.tsv
56 | ```
57 |
58 | ### Predict using an XGBoost model
59 |
60 | Run `somaticseq_xgboost predict -h` to see all the options. Be absolutely sure
61 | the training and prediction data match.
62 |
63 | ```
64 | somaticseq_xgboost predict -model SNV.xgboost.classifier -tsv variant_candidates.tsv -out predicted_variant_set.tsv -ntrees 50
65 | ```
66 |
67 | ### Predict using an AdaBoost model
68 |
69 | ```
70 | ada_model_predictor.R snv.classifier.RData snv_candidates.tsv predicted_snvs.tsv
71 | ```
72 |
73 | ### To remove caller from a super set
74 |
75 | If you have previously created a classifier with MuTect2, MuSE, VarDict, and
76 | Strelka2, but now you want to create another classifier with only MuTect2 and
77 | Strelka2 (maybe you decided you don't want to run MuSE and VarDict anymore), you
78 | don't have to re-run the whole pipeline. You can take the original TSV file, and
79 | create another TSV file as if only MuTect2 and Strelka2 were run, i.e., it will
80 | remove variants that were only called by MuSE and/or VarDict, and then replace
81 | values extracted from those callers as nan.
82 |
83 | ```
84 | remove_callers_from_somaticseq_tsv.py -infile Merged_from_4_callers.tsv -outfile With_only_MuTect2_Strelka2.tsv -subtract if_VarDict MuSE_Tier
85 | ```
86 |
--------------------------------------------------------------------------------
/docs/Manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/docs/Manual.pdf
--------------------------------------------------------------------------------
/docs/SomaticSeqYoutube.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/docs/SomaticSeqYoutube.png
--------------------------------------------------------------------------------
/docs/heatmap400.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/docs/heatmap400.png
--------------------------------------------------------------------------------
/docs/precisionfda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/docs/precisionfda.png
--------------------------------------------------------------------------------
/docs/workflow400.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/docs/workflow400.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "somaticseq"
7 | keywords = ["somatic mutations", "bioinformatics", "genomics", "ngs"]
8 | authors = [
9 | {name = "Li Tai Fang"},
10 | {name = "Pegah Tootoonchi Afshar"},
11 | {name = "Aparna Chhibber"},
12 | {name = "Marghoob Mohiyuddin"},
13 | {name = "John C. Mu"},
14 | {name = "Greg Gibeling"},
15 | {name = "Sharon Barr"},
16 | {name = "Narges Bani Asadi"},
17 | {name = "Hugo Y.K. Lam"},
18 | ]
19 | maintainers = [
20 | {name = "Li Tai Fang", email = "ltfang@gmail.com"},
21 | ]
22 | description = "SomaticSeq: An ensemble approach to accurately detect somatic mutations using SomaticSeq"
23 | requires-python = ">=3.11.0"
24 | license = {text = "BSD-2-Clause"}
25 | dependencies = [
26 | "pysam",
27 | "numpy",
28 | "scipy",
29 | "pandas",
30 | "pybedtools>=0.12.0",
31 | "xgboost>=1.4",
32 | "pydantic>=2.0.0,<3.0",
33 | ]
34 | dynamic = ["version", "readme"]
35 | classifiers = [
36 | "Development Status :: 5 - Production/Stable",
37 | "Topic :: Scientific/Engineering :: Bio-Informatics",
38 | "Intended Audience :: Science/Research",
39 | "Intended Audience :: Healthcare Industry",
40 | "Programming Language :: Python :: 3",
41 | ]
42 |
43 | [project.urls]
44 | Homepage = "https://github.com/bioinform/somaticseq"
45 |
46 | [project.scripts]
47 | somaticseq = "somaticseq.somaticseq_parallel:main"
48 | somaticseq_parallel = "somaticseq.somaticseq_parallel:main"
49 | somaticseq_xgboost = "somaticseq.somatic_xgboost:main"
50 | somaticseq_tsv2vcf = "somaticseq.somatic_tsv2vcf:main"
51 | somaticseq_single_vcf2tsv = "somaticseq.single_sample_vcf2tsv:main"
52 | somaticseq_paired_vcf2tsv = "somaticseq.somatic_vcf2tsv:main"
53 | somaticseq_concat = "somaticseq.genomic_file_parsers.concat:main"
54 | somaticseq_linguistic_sequence_complexity = "somaticseq.utilities.linguistic_sequence_complexity:main"
55 | somaticseq_loci_counter = "somaticseq.utilities.lociCounterWithLabels:main"
56 | somaticseq_paired_end_bam2fastq = "somaticseq.utilities.paired_end_bam2fastq:main"
57 | somaticseq_split_bed_into_equal_regions = "somaticseq.utilities.split_bed_into_equal_regions:main"
58 | somaticseq_make_alignment_scripts = "somaticseq.utilities.dockered_pipelines.makeAlignmentScripts:main"
59 | somaticseq_make_somatic_scripts = "somaticseq.utilities.dockered_pipelines.makeSomaticScripts:main"
60 | somaticseq_run_workflows = "somaticseq.utilities.dockered_pipelines.run_workflows:main"
61 | somaticseq_split_vcf = "somaticseq.vcf_modifier.split_vcf:main"
62 |
63 | [project.optional-dependencies]
64 | dev = [
65 | "black",
66 | "flake8",
67 | "mypy",
68 | "pytest",
69 | "pytest-mock",
70 | "twine",
71 | ]
72 |
73 | [tool.pytest.ini_options]
74 | testpaths = ["tests"]
75 | addopts = "--import-mode=importlib"
76 |
--------------------------------------------------------------------------------
/r_scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/r_scripts/__init__.py
--------------------------------------------------------------------------------
/r_scripts/ada_cross_validation.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | require("ada")
4 |
5 | args <- commandArgs(TRUE)
6 |
7 | training_data_filename = args[1]
8 |
9 | ##### Main (entry point)
10 | train_filename = paste(training_data_filename)
11 | train_data = read.table(train_filename, header=TRUE)
12 |
13 | if (!(1 %in% train_data$TrueVariant_or_False && 0 %in% train_data$TrueVariant_or_False)) {
14 | stop("In training mode, there must be both true positives and false positives in the call set.")
15 | }
16 |
17 | # Use substitution identity for training
18 | train_data$GC2CG = 0
19 | train_data$GC2TA = 0
20 | train_data$GC2AT = 0
21 | train_data$TA2AT = 0
22 | train_data$TA2GC = 0
23 | train_data$TA2CG = 0
24 |
25 | train_data$GC2CG[ (train_data$REF=='G' & train_data$ALT=='C') | (train_data$REF=='C' & train_data$ALT=='G') ] = 1
26 | train_data$GC2TA[ (train_data$REF=='G' & train_data$ALT=='T') | (train_data$REF=='C' & train_data$ALT=='A') ] = 1
27 | train_data$GC2AT[ (train_data$REF=='G' & train_data$ALT=='A') | (train_data$REF=='C' & train_data$ALT=='T') ] = 1
28 | train_data$TA2AT[ (train_data$REF=='T' & train_data$ALT=='A') | (train_data$REF=='A' & train_data$ALT=='T') ] = 1
29 | train_data$TA2GC[ (train_data$REF=='T' & train_data$ALT=='G') | (train_data$REF=='A' & train_data$ALT=='C') ] = 1
30 | train_data$TA2CG[ (train_data$REF=='T' & train_data$ALT=='C') | (train_data$REF=='A' & train_data$ALT=='G') ] = 1
31 |
32 | # Do not use these for training
33 | train_data$CHROM <- NULL
34 | train_data$POS <- NULL
35 | train_data$ID <- NULL
36 | train_data$REF <- NULL
37 | train_data$ALT <- NULL
38 | train_data$if_COSMIC <- NULL
39 | train_data$COSMIC_CNT <- NULL
40 | train_data$T_VAF_REV <- NULL
41 | train_data$T_VAF_FOR <- NULL
42 |
43 | for (var_i in tail(args, -1) ) {
44 | train_data[, var_i] <- NULL
45 | cat("Remove feature:", var_i, "\n")
46 | }
47 |
48 |
49 | model_formula <- as.formula(TrueVariant_or_False ~ .)
50 |
51 | # Cross validation:
52 |
53 | for (ith_try in 1:10)
54 |
55 | {
56 | # split test/train 50-50
57 | sample <- sample.int(n = nrow(train_data), size = floor(.5*nrow(train_data)), replace = F)
58 | train <- train_data[sample, ]
59 | test <- train_data[-sample, ]
60 |
61 | # do model
62 | ada.model <- ada(model_formula, data = train, iter = 500)
63 | # print(ada.model)
64 |
65 | ada.pred <- predict(ada.model, newdata = test, type="both", n.iter=350)
66 |
67 | # probability > 0.5
68 | pass_calls <- ada.pred$prob[,2] > 0.5
69 | reject_calls <- ada.pred$prob[,2] < 0.1
70 |
71 | # Counting
72 | num_pass_calls <- sum( pass_calls )
73 | num_reject_calls <- sum( reject_calls )
74 | num_pass_true_positives <- sum( pass_calls[pass_calls == test$TrueVariant_or_False] )
75 | num_true_positives <- sum(test$TrueVariant_or_False)
76 |
77 | # Calculate results
78 | precision <- num_pass_true_positives/num_pass_calls
79 | sensitivity <- num_pass_true_positives/num_true_positives
80 | F1_score <- 2 * num_pass_true_positives / ( num_true_positives + num_pass_calls )
81 |
82 | # Print out
83 | cat (ith_try, 'th_try', '\n')
84 |
85 | cat("PASS_Calls =", num_pass_calls, "\n")
86 | cat("REJECT_Calls =", num_reject_calls, "\n")
87 |
88 | cat("PASS_TruePositives =", num_pass_true_positives, "\n")
89 | cat("PASS_FalsePositives =", num_pass_calls - num_pass_true_positives, "\n")
90 |
91 | cat("Sensitivity =", sensitivity, "\n")
92 | cat("Precision =", precision, "\n")
93 | cat("F1 =", F1_score, "\n")
94 |
95 | }
96 |
--------------------------------------------------------------------------------
/r_scripts/ada_model_builder.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# Train an AdaBoost ("ada") classifier on a SomaticSeq training table and
# save it next to the input as <input>.Classifier.RData.
# Usage: ada_model_builder.R <training_table>

require("ada")

args <- commandArgs(TRUE)

train_filename = args[1]

##### Main (entry point)
train_data = read.table(train_filename, header=TRUE)

# A usable training set must contain both labels (1 and 0).
if (!(1 %in% train_data$TrueVariant_or_False && 0 %in% train_data$TrueVariant_or_False)) {
    stop("In training mode, there must be both true positives and false positives in the call set.")
}

# Identifier and annotation columns are excluded from the feature set.
excluded_columns <- c("CHROM", "POS", "ID", "REF", "ALT", "if_COSMIC", "COSMIC_CNT")
for (column_i in excluded_columns) {
    train_data[[column_i]] <- NULL
}

# Force SOR to numeric before fitting (kept from the original behavior).
train_data$SOR <- as.numeric(train_data$SOR)

model_formula <- as.formula(TrueVariant_or_False ~ .)

print("Fitting model...")
ada.model <- ada(model_formula, data = train_data, iter = 500)

save(ada.model, file = paste(train_filename, ".Classifier.RData", sep="") )

print(ada.model)
--------------------------------------------------------------------------------
/r_scripts/ada_model_builder_ntChange.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# Train an AdaBoost ("ada") classifier with one-hot base-substitution
# features, using deep rpart weak learners, and save the model as
# <input>.ada.Classifier.RData.
# Usage: ada_model_builder_ntChange.R <training_table> [feature_to_remove ...]

require("ada")

args <- commandArgs(TRUE)

training_data_filename = args[1]

##### Main (entry point)
train_filename = paste(training_data_filename)
train_data = read.table(train_filename, header=TRUE)

# Both labels (1 and 0) must be present for training to make sense.
if (!(1 %in% train_data$TrueVariant_or_False && 0 %in% train_data$TrueVariant_or_False)) {
    stop("In training mode, there must be both true positives and false positives in the call set.")
}

# Use substitution identity for training: one column per strand-collapsed
# substitution class, set to 1 on rows whose REF>ALT pair matches.
train_data$GC2CG = 0
train_data$GC2TA = 0
train_data$GC2AT = 0
train_data$TA2AT = 0
train_data$TA2GC = 0
train_data$TA2CG = 0

train_data$GC2CG[ (train_data$REF=='G' & train_data$ALT=='C') | (train_data$REF=='C' & train_data$ALT=='G') ] = 1
train_data$GC2TA[ (train_data$REF=='G' & train_data$ALT=='T') | (train_data$REF=='C' & train_data$ALT=='A') ] = 1
train_data$GC2AT[ (train_data$REF=='G' & train_data$ALT=='A') | (train_data$REF=='C' & train_data$ALT=='T') ] = 1
train_data$TA2AT[ (train_data$REF=='T' & train_data$ALT=='A') | (train_data$REF=='A' & train_data$ALT=='T') ] = 1
train_data$TA2GC[ (train_data$REF=='T' & train_data$ALT=='G') | (train_data$REF=='A' & train_data$ALT=='C') ] = 1
train_data$TA2CG[ (train_data$REF=='T' & train_data$ALT=='C') | (train_data$REF=='A' & train_data$ALT=='G') ] = 1

# Do not use these for training: identifiers, annotations, and
# strand-specific tumor VAFs are removed from the feature set.
train_data$CHROM <- NULL
train_data$POS <- NULL
train_data$ID <- NULL
train_data$REF <- NULL
train_data$ALT <- NULL
train_data$if_COSMIC <- NULL
train_data$COSMIC_CNT <- NULL
train_data$T_VAF_REV <- NULL
train_data$T_VAF_FOR <- NULL

# Any additional command-line arguments name features to exclude.
for (var_i in tail(args, -1) ) {
    train_data[, var_i] <- NULL
    cat("Remove", var_i, "\n")
}

model_formula <- as.formula(TrueVariant_or_False ~ .)

print("Fitting model...")

boosting_iters = 500

# Draw and print a random seed so a training run can be reproduced.
seed_value = floor(runif(1, min=100, max=50000))
print( paste("Seed =", seed_value) )
set.seed(seed_value)

# cp=-1 and minsplit=0 let each weak learner grow up to maxdepth=16; xval=0
# disables rpart's internal cross-validation.
# NOTE(review): rpart.control is assumed to be available via ada's package
# dependencies -- confirm rpart is attached when ada is loaded.
ada.model <- ada(model_formula, data = train_data, iter = boosting_iters, control=rpart.control(cp=-1, maxdepth=16, minsplit=0, xval=0))
save(ada.model, file = paste(training_data_filename, ".ada.Classifier.RData", sep="") )

print(ada.model)
--------------------------------------------------------------------------------
/r_scripts/ada_model_predictor.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# Score variant candidates with a previously trained ada classifier:
#   ada_model_predictor.R <model.RData> <test_table> <output_table>
# Writes the input table plus a SCORE column (classifier probability).

require("ada")

args <- commandArgs(TRUE)

trained_model = args[1]
test_filename = args[2]
output_filename = args[3]

# Keep an untouched copy of the input: the engineered columns added below
# exist only for prediction and must not appear in the output file.
test_data_ = read.table(test_filename, header=TRUE)
test_data <- test_data_

# One-hot base-substitution features, in case the model was trained with
# them. Cheap to compute even when unused.
ref_base <- test_data$REF
alt_base <- test_data$ALT
test_data$GC2CG <- as.numeric((ref_base == 'G' & alt_base == 'C') | (ref_base == 'C' & alt_base == 'G'))
test_data$GC2TA <- as.numeric((ref_base == 'G' & alt_base == 'T') | (ref_base == 'C' & alt_base == 'A'))
test_data$GC2AT <- as.numeric((ref_base == 'G' & alt_base == 'A') | (ref_base == 'C' & alt_base == 'T'))
test_data$TA2AT <- as.numeric((ref_base == 'T' & alt_base == 'A') | (ref_base == 'A' & alt_base == 'T'))
test_data$TA2GC <- as.numeric((ref_base == 'T' & alt_base == 'G') | (ref_base == 'A' & alt_base == 'C'))
test_data$TA2CG <- as.numeric((ref_base == 'T' & alt_base == 'C') | (ref_base == 'A' & alt_base == 'G'))


# Handle empty input data: pass it through unscored.
if ( nrow(test_data)>=1 ) {
    load( trained_model )  # defines ada.model
    ada.pred <- predict(ada.model, newdata = test_data, type="both", n.iter=300)
    test_data_output <- cbind(test_data_, SCORE = ada.pred$prob[,2])

} else {

    test_data_output <- test_data_
}

write.table(test_data_output, row.names = FALSE, sep="\t", na = "nan", file = output_filename, quote=FALSE)
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
# Files providing the package's long description and license text.
description_file=README.md
license_files=LICENSE.txt

[flake8]
# Codes ignored project-wide; max-line-length matches the formatter setting.
ignore = E203, W503, E262, E266
select = B,C,E,F,W,T4,B9
max-line-length = 88
exclude = build

[mypy]
ignore_missing_imports = True
# Do not type-check generated build/dist artifacts.
exclude = ^build|dist$
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""Package setup for somaticseq.

Rewrites relative links and image sources in README.md into absolute
GitHub URLs so the long description renders correctly on third-party
sites such as PyPI.
"""

import os
import re

from setuptools import find_packages, setup

# Markdown links [text](target) whose target is NOT an absolute http(s) URL.
# BUG FIX: this previously used the character class `[^http]`, which only
# excludes a single first character among {h, t, p} -- e.g. a relative link
# starting with "t" was wrongly skipped, while "xhttp..." was wrongly
# matched. A negative lookahead expresses the actual intent.
LINK_PATTERN = r"(\[.*?\]\()(?!http)([^)]*)\)"
# The src="..." attribute of an <img ...> tag: three groups so the
# replacement can keep the text before and after the URL.
IMAGE_SRC_PATTERN = r'(<img[^>]*src=")([^"]*)(")'
BASE_URL = "https://github.com/bioinform/somaticseq"


# Read __version__ from the _version.py file
version_file = os.path.join("somaticseq", "_version.py")
with open(version_file) as f:
    exec(f.read())  # This will define __version__


def modify_markdown_for_3rd_party(base_markdown: str) -> str:
    """Return base_markdown with relative links/images made absolute.

    Relative markdown links become {BASE_URL}/blob/v<version>/<link>;
    relative <img> sources become {BASE_URL}/raw/v<version>/<src>.
    """

    def _replace_link(match: re.Match) -> str:
        # Replace relative links in .md from [text](RELATIVE/LINK) into
        # [text]({BASE_URL}/blob/TAG/RELATIVE/LINK)
        text = match.group(1)
        url = match.group(2)
        return f"{text}{BASE_URL}/blob/v{__version__}/{url})"  # type: ignore[name-defined] # noqa

    def _replace_src(match: re.Match) -> str:
        # Replace relative image links
        prefix = match.group(1)  # part before the url
        url = match.group(2)  # original url
        suffix = match.group(3)  # part after the url
        return f"{prefix}{BASE_URL}/raw/v{__version__}/{url}{suffix}"  # type: ignore[name-defined] # noqa

    with_abs_url = re.sub(LINK_PATTERN, _replace_link, base_markdown)
    with_abs_img_src = re.sub(IMAGE_SRC_PATTERN, _replace_src, with_abs_url)
    return with_abs_img_src


with open("README.md") as fn:
    long_description = fn.read()
    description_for_3rd_party = modify_markdown_for_3rd_party(long_description)


setup(
    name="somaticseq",
    description=(
        "SomaticSeq: "
        "An ensemble approach to accurately detect somatic mutations using SomaticSeq"
    ),
    version=__version__,  # type: ignore[name-defined] # noqa
    long_description=description_for_3rd_party,
    long_description_content_type="text/markdown",
    author="Li Tai Fang",
    author_email="ltfang@gmail.com",
    url="https://github.com/bioinform/somaticseq",
    packages=find_packages(),
    package_data={"": ["*.R"]},
    python_requires=">=3.11.0",
    setup_requires=["setuptools"],
    install_requires=[  # pyproject.toml overrides them
        "pysam",
        "numpy",
        "scipy",
        "pandas",
        "xgboost>=1.4",
        "pybedtools>=0.12.0",
        "pydantic>=2.0.0,<3.0",
    ],
    scripts=[
        "somaticseq/somaticseq_parallel.py",
        "somaticseq/run_somaticseq.py",
        "somaticseq/single_sample_vcf2tsv.py",
        "somaticseq/somatic_vcf2tsv.py",
        "somaticseq/somatic_xgboost.py",
        "somaticseq/somatic_tsv2vcf.py",
        "somaticseq/genomic_file_parsers/concat.py",
        "somaticseq/utilities/linguistic_sequence_complexity.py",
        "somaticseq/utilities/lociCounterWithLabels.py",
        "somaticseq/utilities/paired_end_bam2fastq.py",
        "somaticseq/utilities/remove_callers_from_somaticseq_tsv.py",
        "somaticseq/utilities/split_bed_into_equal_regions.py",
        "somaticseq/utilities/tally_variants_from_multiple_vcfs.py",
        "somaticseq/utilities/variant_annotation.py",
        "somaticseq/utilities/vcfsorter.pl",
        "somaticseq/utilities/dockered_pipelines/makeAlignmentScripts.py",
        "somaticseq/utilities/dockered_pipelines/makeSomaticScripts.py",
        "somaticseq/utilities/dockered_pipelines/run_workflows.py",
        "somaticseq/vcf_modifier/split_vcf.py",
        "r_scripts/ada_model_builder_ntChange.R",
        "r_scripts/ada_model_predictor.R",
    ],
)
--------------------------------------------------------------------------------
/somaticseq/__init__.py:
--------------------------------------------------------------------------------
1 | from somaticseq._version import __version__ # noqa
2 |
--------------------------------------------------------------------------------
/somaticseq/_version.py:
--------------------------------------------------------------------------------
# Single source of truth for the package version (exec'd by setup.py).
__version__ = "3.11.1"
# VCF meta-information line carrying the SomaticSeq version.
vcf_header = f"##SomaticSeq=v{__version__}"
3 |
--------------------------------------------------------------------------------
/somaticseq/defaults.py:
--------------------------------------------------------------------------------
from typing import Literal

# Read-level quality floors.
MIN_MAPPING_QUALITY: int = 1
MIN_BASE_QUALITY: int = 5
# Score thresholds. NOTE(review): PASS_SCORE/LOWQUAL_SCORE appear to mirror
# the 0.5 / 0.1 probability cutoffs used by the R classifier scripts --
# confirm at the call sites.
MIN_CALLER: float = 0.5
PASS_SCORE: float = 0.5
LOWQUAL_SCORE: float = 0.1
# Presumably VAF cutoffs for genotype labeling -- confirm at call sites.
HOMOZYGOUS_FRAC: float = 0.85
HETEROZYGOUS_FRAC: float = 0.01

# Canonical file-name suffixes/prefixes for SomaticSeq outputs.
SNV_TSV_SUFFIX: str = "sSNV.tsv"
INDEL_TSV_SUFFIX: str = "sINDEL.tsv"
SNV_VCF_SUFFIX: str = "sSNV.vcf"
INDEL_VCF_SUFFIX: str = "sINDEL.vcf"
ENSEMBLE_PREFIX: str = "Ensemble."
CONSENSUS_PREFIX: str = "Consensus."
CLASSIFIED_PREFIX: str = "SSeq.Classified."
# Sample column names used in paired tumor/normal output.
TUMOR_NAME: str = "TUMOR"
NORMAL_NAME: str = "NORMAL"

# Classifier defaults: which algorithm to use, and xgboost round/tree counts.
ALGORITHM: Literal["xgboost", "ada"] = "xgboost"
DEFAULT_XGB_BOOST_ROUNDS: int = 500
DEFAULT_NUM_TREES_PREDICT: int = 100
--------------------------------------------------------------------------------
/somaticseq/genomic_file_parsers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/somaticseq/genomic_file_parsers/__init__.py
--------------------------------------------------------------------------------
/somaticseq/ntchange_type.py:
--------------------------------------------------------------------------------
def ntchange(variant_frame):
    """Append six one-hot base-substitution columns to a variant frame.

    For each row's (REF, ALT) pair (case-insensitive), exactly one of the
    strand-collapsed substitution-class columns GC2CG, GC2TA, GC2AT, TA2AT,
    TA2GC, TA2CG is set to 1; all six are 0 when the pair matches no class
    (e.g. indels or identical bases).

    Args:
        variant_frame: pandas-like DataFrame with "REF" and "ALT" string
            columns (single-base values are classified; others get all-zero).

    Returns:
        A new frame (via .assign) with the six columns added; the input
        frame is not modified.
    """
    # Strand-collapsed substitution classes: each class covers a change and
    # its reverse complement (e.g. G>C and C>G are both GC2CG).
    pair_to_class = {
        ("G", "C"): "GC2CG", ("C", "G"): "GC2CG",
        ("G", "T"): "GC2TA", ("C", "A"): "GC2TA",
        ("G", "A"): "GC2AT", ("C", "T"): "GC2AT",
        ("T", "A"): "TA2AT", ("A", "T"): "TA2AT",
        ("T", "G"): "TA2GC", ("A", "C"): "TA2GC",
        ("T", "C"): "TA2CG", ("A", "G"): "TA2CG",
    }
    class_names = ("GC2CG", "GC2TA", "GC2AT", "TA2AT", "TA2GC", "TA2CG")
    columns = {name: [] for name in class_names}

    for ref, alt in zip(variant_frame["REF"], variant_frame["ALT"]):
        hit = pair_to_class.get((ref.upper(), alt.upper()))
        for name in class_names:
            columns[name].append(1 if name == hit else 0)

    # BUG FIX: the original passed GC2CG for every keyword
    # (assign(GC2CG=GC2CG, GC2TA=GC2CG, ...)), so all six output columns
    # were copies of the GC2CG indicator. Each column now gets its own list.
    return variant_frame.assign(**columns)
--------------------------------------------------------------------------------
/somaticseq/utilities/BAM_filter.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""Filter a BAM file, keeping reads that pass mapping-quality and
edit-distance (NM tag) thresholds, and optionally requiring proper
pairing and/or absence of soft-clipping."""

import argparse

import pysam

parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    "-bamin",
    "--bam-file-in",
    type=str,
    help="Input BAM file",
    required=True,
    default=None,
)
parser.add_argument(
    "-bamout",
    "--bam-file-out",
    type=str,
    help="Output BAM file",
    required=True,
    default=None,
)

parser.add_argument(
    "-maxNM",
    "--max-NM",
    type=int,
    help="filter out high edit distance reads",
    required=False,
    default=8,
)
parser.add_argument(
    "-minMQ",
    "--min-MQ",
    type=float,
    help="filter out low MQ reads",
    required=False,
    default=20,
)
parser.add_argument(
    "-nodisc",
    "--no-discordant",
    action="store_true",
    help="filter out discordant reads",
    required=False,
    default=False,
)
parser.add_argument(
    "-noclip",
    "--no-clipping",
    action="store_true",
    help="filter out soft-clipped reads",
    required=False,
    default=False,
)

args = parser.parse_args()
bam_file = args.bam_file_in
bam_out = args.bam_file_out
maxNM = args.max_NM
minMQ = args.min_MQ
filter_discordant = args.no_discordant
filter_clip = args.no_clipping

# Output BAM inherits the input's header (template=bam).
with (
    pysam.AlignmentFile(bam_file) as bam,
    pysam.AlignmentFile(bam_out, "wb", template=bam) as bamout,
):
    reads = bam.fetch()

    for read_i in reads:
        # NOTE(review): reads without a CIGAR string (e.g. unmapped reads)
        # will trip this assert and abort the run -- confirm inputs are
        # mapped, or consider skipping such reads instead.
        assert read_i.cigarstring is not None
        # Keep the read only if it passes every enabled criterion:
        # MQ >= minMQ; has an NM tag with NM <= maxNM; properly paired
        # (unless discordant filtering is off); and no soft-clipping
        # ("S" in CIGAR) unless clip filtering is off.
        if (
            read_i.mapping_quality >= minMQ
            and (read_i.has_tag("NM") and read_i.get_tag("NM") <= maxNM)
            and (read_i.is_proper_pair or not filter_discordant)
            and ("S" not in read_i.cigarstring or not filter_clip)
        ):
            bamout.write(read_i)
--------------------------------------------------------------------------------
/somaticseq/utilities/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/somaticseq/utilities/__init__.py
--------------------------------------------------------------------------------
/somaticseq/utilities/bedFileHandler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 |
class BedFile:
    """In-memory index of a BED file for point-in-region lookups."""

    def __init__(self, BedFile):
        """Read the BED file at path *BedFile* and index regions by contig.

        Each region is stored as an (start, end) int tuple in file order.
        """
        self.BedFile = BedFile

        regions_by_contig = {}
        with open(self.BedFile) as bed_handle:
            for raw_line in bed_handle:
                stripped = raw_line.rstrip()
                if not stripped:
                    # The original parser stopped at the first blank line;
                    # keep that behavior.
                    break
                fields = stripped.split("\t")
                start_end = (int(fields[1]), int(fields[2]))
                regions_by_contig.setdefault(fields[0], []).append(start_end)

        self.bedRegions = regions_by_contig

    def inRegion(self, contig_i, position_i, ordered=True):
        """Return True if position_i on contig_i falls inside any region.

        Containment uses start < position_i <= end. With ordered=True the
        scan stops early once a region starts past position_i, assuming the
        BED regions are sorted by start coordinate; with ordered=False the
        whole region list is scanned.
        """
        regions = self.bedRegions.get(contig_i)
        if not regions:
            return False

        for region_start, region_end in regions:
            if region_start < position_i <= region_end:
                return True
            if ordered and region_start > position_i:
                # Sorted input: no later region can contain position_i.
                break

        return False
--------------------------------------------------------------------------------
/somaticseq/utilities/combo_callers_evaluator.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# mypy: ignore-errors
"""For every non-empty subset of callers in the combo code, count how many
VCF calls (and how many labeled TruePositive) were made by at least one
caller in the subset, reading the per-caller 0/1 vector from the VCF's
INFO field."""

import argparse
import itertools

import somaticseq.genomic_file_parsers.genomic_file_handlers as genome

# argparse Stuff
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    "-vcf",
    "--input-vcf",
    type=str,
    help="SomaticSeq VCF file",
    required=True,
    default=None,
)
parser.add_argument(
    "-combo",
    "--combo-code",
    type=str,
    help="E.g., MVJSDULK",
    required=True,
    default="MVJSDULK",
)

args = parser.parse_args()
vcf = args.input_vcf
combo = args.combo_code

# One letter per caller; the INFO tag named by the combo code holds a
# comma-separated 0/1 vector in the same letter order.
tool_code = list(combo)

# Every non-empty subset of callers, mapped to [total_calls, true_positives].
all_combos = {}
for i in range(1, len(tool_code) + 1):
    combo_gen = itertools.combinations(tool_code, i)
    for j in combo_gen:
        all_combos[j] = [0, 0]


with open(vcf) as vcf:  # NOTE(review): rebinds `vcf` from path to file handle
    line_i = vcf.readline().rstrip()

    # Skip the VCF header lines.
    while line_i.startswith("#"):
        line_i = vcf.readline().rstrip()

    print("#ToolCombo\tTruePositiveCalls\tAllCalls")

    while line_i:
        vcf_i = genome.VCFVariantRecord.from_vcf_line(line_i)
        combo_i = vcf_i.get_info_value(combo)
        tool_i = combo_i.split(",")
        tool_i = [int(i) for i in tool_i]

        # The set of caller letters that called this variant.
        current_call_set = set()
        for tool_code_j, tool_j in zip(tool_code, tool_i):
            if tool_j == 1:
                current_call_set.add(tool_code_j)

        # Credit every subset sharing at least one caller with this call.
        for combo_j in all_combos:
            if set.intersection(set(combo_j), current_call_set):
                all_combos[combo_j][0] += 1

                if "TruePositive" in vcf_i.identifier:
                    all_combos[combo_j][1] += 1

        line_i = vcf.readline().rstrip()


# Output: combo, true-positive count ([1]), total call count ([0]).
for i in sorted(all_combos):
    print("".join(i) + "\t" + str(all_combos[i][1]) + "\t" + str(all_combos[i][0]))
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/QC/extract_coverageDepth.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Generate (into a .cmd file) the docker command that runs GATK3
# DepthOfCoverage on a BAM file; with --standalone the generated file gets
# an SGE-ready header.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long output-dir:,bam:,genome-reference:,selector:,minBaseQuality:,minMappingQuality:,extra-arguments:,out-script:,standalone, -n 'coverageDepth.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

# Timestamp makes generated script names unique per run.
timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# Defaults: no base/mapping quality filtering.
minBaseQuality=0
minMappingQuality=0


# Collect long-option values into shell variables.
while true; do
    case "$1" in
    -o | --output-dir )
        case "$2" in
            "") shift 2 ;;
            *) outdir=$2 ; shift 2 ;;
        esac ;;

    --bam )
        case "$2" in
            "") shift 2 ;;
            *) bamFile=$2 ; shift 2 ;;
        esac ;;

    --genome-reference )
        case "$2" in
            "") shift 2 ;;
            *) HUMAN_REFERENCE=$2 ; shift 2 ;;
        esac ;;

    --minBaseQuality )
        case "$2" in
            "") shift 2 ;;
            *) minBaseQuality=$2 ; shift 2 ;;
        esac ;;

    --minMappingQuality )
        case "$2" in
            "") shift 2 ;;
            *) minMappingQuality=$2 ; shift 2 ;;
        esac ;;

    --selector )
        case "$2" in
            "") shift 2 ;;
            *) SELECTOR=$2 ; shift 2 ;;
        esac ;;

    --extra-arguments )
        case "$2" in
            "") shift 2 ;;
            *) extra_arguments=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
            "") shift 2 ;;
            *) out_script_name=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/coverageDepth.${timestamp}.cmd"
fi


# With --standalone, start the generated file with an SGE header.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script
echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
echo "" >> $out_script

if [[ ${SELECTOR} ]]
then
    selector_text="-L /mnt/${SELECTOR}"
fi

bamFileName=`basename ${bamFile}`

# Emit the dockered GATK3 DepthOfCoverage command; host paths are mounted
# under /mnt inside the container.
echo "docker run --rm -v /:/mnt -u $UID broadinstitute/gatk3:3.8-0 \\" >> $out_script
echo "java -Xmx8g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script
echo "-T DepthOfCoverage \\" >> $out_script
echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "-I /mnt/${bamFile} \\" >> $out_script
echo "${selector_text} \\" >> $out_script
echo "--minBaseQuality ${minBaseQuality} \\" >> $out_script
echo "--minMappingQuality ${minMappingQuality} \\" >> $out_script
echo "${extra_arguments} \\" >> $out_script
echo "-o /mnt/${outdir}/${bamFileName}.depth" >> $out_script

echo "" >> $out_script

echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/somaticseq/utilities/dockered_pipelines/__init__.py
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/alignments/README.md:
--------------------------------------------------------------------------------
1 | ## Requirement
2 |
3 | - Have internet connection, and able to pull and run docker images from Docker
4 | Hub.
5 |
6 | ### Alignment with bwa mem
7 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/alignments/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/somaticseq/utilities/dockered_pipelines/alignments/__init__.py
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/alignments/mergeFastqs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import subprocess
5 | from datetime import datetime
6 |
7 | from somaticseq.utilities.dockered_pipelines.container_option import (
8 | DOCKER_IMAGES,
9 | container_params,
10 | )
11 |
# Timestamp makes generated script names unique per process.
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S%f")


# Fallback settings for gz(): any key missing from the caller's
# input_parameters is filled in from here.
DEFAULT_PARAMS = {
    "tabix_image": DOCKER_IMAGES.tabix,
    "MEM": 4,  # h_vmem (GB) written into the generated SGE header
    "output_directory": os.curdir,
    "action": "echo",  # command used to launch the generated script
    "extra_docker_options": "",
    "script": f"mergeFastqs.{timestamp}.cmd",
    "threads": 1,
}
24 |
25 |
def gz(
    infiles, outfq, tech="docker", input_parameters=None, remove_infiles=False
):
    """Generate and launch a script that merges gzipped FASTQ files.

    Writes a shell script that zcats *infiles* and re-compresses them with
    bgzip into *outfq* inside a tabix container, then invokes the script
    with the configured "action" (e.g. "echo", "bash", "qsub").

    Args:
        infiles: iterable of input fastq.gz paths.
        outfq: output fastq.gz path.
        tech: container technology passed to container_params.
        input_parameters: optional overrides for DEFAULT_PARAMS; missing
            keys are filled in from DEFAULT_PARAMS (a caller-supplied dict
            is updated in place, as before).
        remove_infiles: if True, the generated script also deletes infiles.

    Returns:
        Path of the generated script file.
    """
    # BUG FIX: the default argument used to be the shared DEFAULT_PARAMS
    # dict itself, which the fill-in loop below then mutated (the classic
    # mutable-default pitfall). Default to None and start from a fresh dict.
    if input_parameters is None:
        input_parameters = {}
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    logdir = os.path.join(input_parameters["output_directory"], "logs")
    outfile = os.path.join(logdir, input_parameters["script"])
    all_paths = list(infiles) + [outfq]
    # Translate host paths into their in-container mount points.
    tabix_line, file_dictionary = container_params(
        input_parameters["tabix_image"],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters["extra_docker_options"],
    )
    mounted_outfile = file_dictionary[outfq]["mount_path"]
    infile_string = " ".join(
        file_dictionary[file_i]["mount_path"] for file_i in infiles
    )

    with open(outfile, "w") as out:
        out.write("#!/bin/bash\n\n")
        out.write(f"#$ -o {logdir}\n")
        out.write(f"#$ -e {logdir}\n")
        out.write("#$ -S /bin/bash\n")
        out.write("#$ -l h_vmem={}G\n".format(input_parameters["MEM"]))
        out.write("set -e\n\n")
        out.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n'
        )  # Do not change this: picard_fractional uses this to end the copying.
        out.write(f"{tabix_line} bash -c \\\n")
        out.write(
            '"zcat {} | bgzip -@{} > {}"\n'.format(
                infile_string, input_parameters["threads"], mounted_outfile
            )
        )
        if remove_infiles:
            out.write("rm {}\n\n".format(" ".join(infiles)))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = "{} {}".format(input_parameters["action"], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
75 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/alignments/singleIndelRealign.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Generate (into a .cmd file) the docker commands for GATK3 indel
# realignment on a single BAM: RealignerTargetCreator followed by
# IndelRealigner. With --standalone the generated file gets an SGE header.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long output-dir:,tumor-bam:,genome-reference:,selector:,threads:,extra-arguments:,out-script:,standalone, -n 'singleIndelRealign.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

# Timestamp makes generated script and interval file names unique per run.
timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )
threads=1

# Collect long-option values into shell variables.
while true; do
    case "$1" in
    -o | --output-dir )
        case "$2" in
            "") shift 2 ;;
            *) outdir=$2 ; shift 2 ;;
        esac ;;

    --tumor-bam )
        case "$2" in
            "") shift 2 ;;
            *) tumorBam=$2 ; shift 2 ;;
        esac ;;

    --genome-reference )
        case "$2" in
            "") shift 2 ;;
            *) HUMAN_REFERENCE=$2 ; shift 2 ;;
        esac ;;

    --selector )
        case "$2" in
            "") shift 2 ;;
            *) SELECTOR=$2 ; shift 2 ;;
        esac ;;

    --threads )
        case "$2" in
            "") shift 2 ;;
            *) threads=$2 ; shift 2 ;;
        esac ;;

    --extra-arguments )
        case "$2" in
            "") shift 2 ;;
            *) extra_arguments=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
            "") shift 2 ;;
            *) out_script_name=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/singleIndelRealign.${timestamp}.cmd"
fi


# With --standalone, start the generated file with an SGE header.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script


if [[ ${SELECTOR} ]]
then
    selector_text="-L /mnt/${SELECTOR}"
fi


# Step 1: create the realignment target intervals.
echo "docker run --rm -v /:/mnt -u $UID broadinstitute/gatk3:3.8-1 \\" >> $out_script
echo "java -Xmx8g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script
echo "-T RealignerTargetCreator \\" >> $out_script
echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "-I /mnt/${tumorBam} \\" >> $out_script
echo "-nt ${threads} \\" >> $out_script
echo "${selector_text} \\" >> $out_script
echo "-o /mnt/${outdir}/indelRealign.${timestamp}.intervals" >> $out_script

echo "" >> $out_script

tumorBamFileName=`basename ${tumorBam}`
tumorOut=${tumorBamFileName%.bam}.indelRealigned.bam

# Step 2: realign around the intervals produced in step 1.
echo "docker run --rm -v /:/mnt -u $UID broadinstitute/gatk3:3.8-1 \\" >> $out_script
echo "java -Xmx8g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script
echo "-T IndelRealigner \\" >> $out_script
echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "-I /mnt/${tumorBam} \\" >> $out_script
echo "-targetIntervals /mnt/${outdir}/indelRealign.${timestamp}.intervals \\" >> $out_script
echo "${selector_text} \\" >> $out_script
echo "${extra_arguments} \\" >> $out_script
echo "-o /mnt/${outdir}/${tumorOut}" >> $out_script


echo "" >> $out_script

# Rename the index so it follows the <name>.bam.bai convention.
echo "mv ${outdir}/${tumorOut%.bam}.bai ${outdir}/${tumorOut}.bai" >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/alignments/spreadFastq.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | from datetime import datetime
4 |
5 | from somaticseq.utilities.dockered_pipelines.container_option import (
6 | DOCKER_IMAGES,
7 | container_params,
8 | )
9 |
# Timestamp used to give generated scripts unique, sortable file names.
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S%f")


# Fallback values for any setting the caller does not supply to spread();
# missing keys of input_parameters are filled in from this dict.
DEFAULT_PARAMS = {
    "somaticseq_image": DOCKER_IMAGES.somaticseq,  # container image that provides concat.py
    "MEM": 2,  # h_vmem (in GB) requested in the generated SGE header
    "output_directory": os.curdir,  # where the logs/ dir and generated script go
    "extra_docker_options": "",
    "script": f"spreadFastq.{timestamp}.cmd",  # file name of the generated script
    "action": "echo",  # command applied to the generated script; default "echo" just prints its path
    "threads": 1,  # -nt value passed to concat.py
}
22 |
23 |
def spread(
    in_fastqs, out_fastqs, tech="docker", input_parameters=None, remove_infiles=False
):
    """Write and dispatch a script that spreads input FASTQs across output FASTQs.

    The generated script runs ``concat.py -spread`` inside the somaticseq
    container, distributing reads from ``in_fastqs`` into ``out_fastqs``.

    Args:
        in_fastqs: iterable of input FASTQ paths on the host.
        out_fastqs: iterable of output FASTQ paths on the host.
        tech: container technology handed to ``container_params`` (e.g. "docker").
        input_parameters: optional dict overriding DEFAULT_PARAMS; missing keys
            are filled in from DEFAULT_PARAMS. (Bug fix: the previous mutable
            default ``{}`` was mutated below and therefore shared across calls.)
        remove_infiles: if True, the generated script deletes the input FASTQs
            after spreading.

    Returns:
        Path of the generated script.
    """
    if input_parameters is None:
        input_parameters = {}
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    logdir = os.path.join(input_parameters["output_directory"], "logs")
    outfile = os.path.join(logdir, input_parameters["script"])
    # The script is written into the log directory; make sure it exists so the
    # open() below cannot fail with FileNotFoundError.
    os.makedirs(logdir, exist_ok=True)

    all_paths = list(in_fastqs) + list(out_fastqs)
    spread_line, file_dictionary = container_params(
        input_parameters["somaticseq_image"],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters["extra_docker_options"],
    )

    # Translate host paths into their in-container mount paths.
    infastq_string = " ".join(
        [file_dictionary[file_i]["mount_path"] for file_i in in_fastqs]
    )
    outfastq_string = " ".join(
        [file_dictionary[file_i]["mount_path"] for file_i in out_fastqs]
    )

    with open(outfile, "w") as out:
        out.write("#!/bin/bash\n\n")

        # SGE directives for qsub-based execution.
        out.write(f"#$ -o {logdir}\n")
        out.write(f"#$ -e {logdir}\n")
        out.write("#$ -S /bin/bash\n")
        out.write("#$ -l h_vmem={}G\n".format(input_parameters["MEM"]))
        out.write("set -e\n\n")

        out.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n'
        )  # Do not change this: picard_fractional uses this to end the copying.

        out.write(f"{spread_line} \\\n")
        out.write(
            "concat.py -spread -bgzip -nt {} -infiles {} -outfiles {} \n".format(
                input_parameters["threads"], infastq_string, outfastq_string
            )
        )

        if remove_infiles:
            # NOTE: removes the original host paths, not the mount paths.
            out.write("rm {}\n\n".format(" ".join(in_fastqs)))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the generated script; the default action "echo" only prints its path.
    command_line = "{} {}".format(input_parameters["action"], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
79 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/IndelRealign.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Use getopt instead of getopts for long options
#
# Generates (and, when --standalone, prefixes with SGE headers) a .cmd script
# that jointly indel-realigns a tumor/normal BAM pair with GATK3:
#   1) RealignerTargetCreator over both BAMs -> ${outdir}/T.N.intervals
#   2) IndelRealigner with -nWayOut, tagging outputs with ${out_tag}
# All file paths are given relative to /, which is mounted as /mnt in docker.

set -e

# BUG FIX: removed the trailing comma after "standalone", which declared an
# extra empty long option to getopt.
OPTS=`getopt -o o: --long output-dir:,tumor-bam:,normal-bam:,genome-reference:,selector:,out-tag:,extra-arguments:,out-script:,standalone -n 'IndelRealign.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

out_tag='JointRealigned'
#extra_arguments='-dt NONE --maxReadsForConsensuses 150000 --maxReadsInMemory 500000 --maxReadsForRealignment 2000000'

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --tumor-bam )
            case "$2" in
                "") shift 2 ;;
                *) tbam=$2 ; shift 2 ;;
            esac ;;

        --normal-bam )
            case "$2" in
                "") shift 2 ;;
                *) nbam=$2 ; shift 2 ;;
            esac ;;

        --genome-reference )
            case "$2" in
                "") shift 2 ;;
                *) HUMAN_REFERENCE=$2 ; shift 2 ;;
            esac ;;

        --selector )
            case "$2" in
                "") shift 2 ;;
                *) SELECTOR=$2 ; shift 2 ;;
            esac ;;

        --out-tag )
            case "$2" in
                "") shift 2 ;;
                *) out_tag=$2 ; shift 2 ;;
            esac ;;

        --extra-arguments )
            case "$2" in
                "") shift 2 ;;
                *) extra_arguments=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

# Use the caller-supplied script path, or a timestamped default in the log dir.
if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/indelRealign.${timestamp}.cmd"
fi

# Standalone scripts begin with SGE directives and strict mode.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=10G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

if [[ $SELECTOR ]]
then
    selector_input="-L /mnt/${SELECTOR}"
fi

# BUG FIX: the jar was invoked with a relative path ("-jar GenomeAnalysisTK.jar")
# but no working directory was set for this container, so the jar could not be
# found. Use the absolute path, consistent with the IndelRealigner step below.
echo "docker run --rm -v /:/mnt -u $UID --memory 15g broadinstitute/gatk3:3.8-1 java -Xmx14g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script
echo "-T RealignerTargetCreator \\" >> $out_script
echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "-I /mnt/${tbam} \\" >> $out_script
echo "-I /mnt/${nbam} \\" >> $out_script
echo "$selector_input \\" >> $out_script
echo "-o /mnt/${outdir}/T.N.intervals" >> $out_script
echo "" >> $out_script

echo "docker run --rm -v /:/mnt -u $UID --memory 15g -w /mnt/${outdir} broadinstitute/gatk3:3.8-1 \\" >> $out_script
echo "java -Xmx14g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script
echo "-T IndelRealigner \\" >> $out_script
echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "-I /mnt/${tbam} \\" >> $out_script
echo "-I /mnt/${nbam} \\" >> $out_script
echo "-targetIntervals /mnt/${outdir}/T.N.intervals \\" >> $out_script
echo "${extra_arguments} \\" >> $out_script
echo "-nWayOut .${out_tag}.bam" >> $out_script
echo "" >> $out_script

realigned_normal=${nbam%.bam}.${out_tag}.bam
realigned_tumor=${tbam%.bam}.${out_tag}.bam

# Rename the "<name>.bai" indexes to "<name>.bam.bai" for downstream tools.
# NOTE(review): these paths assume -nWayOut writes the realigned BAMs next to
# the input BAMs — confirm against GATK3 behavior with -w set in the container.
echo "mv ${realigned_normal%.bam}.bai ${realigned_normal}.bai" >> $out_script
echo "mv ${realigned_tumor%.bam}.bai ${realigned_tumor}.bai" >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/MergeTN.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Use getopt instead of getopts for long options
#
# Generates (and, when --standalone, prefixes with SGE headers) a .cmd script
# that merges a tumor and a normal BAM into one BAM with Picard MergeSamFiles.
# All file paths are given relative to /, which is mounted as /mnt in docker.

set -e

OPTS=`getopt -o o: --long output-dir:,tumor-bam:,normal-bam:,bam-out:,out-script:,standalone -n 'MergeTN.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# (Removed unused variables keep_intermediates and outSM, copied over from a
# sibling script.)

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-out )
            case "$2" in
                "") shift 2 ;;
                *) outbam=$2 ; shift 2 ;;
            esac ;;

        --tumor-bam )
            case "$2" in
                "") shift 2 ;;
                *) tbam=$2 ; shift 2 ;;
            esac ;;

        --normal-bam )
            case "$2" in
                "") shift 2 ;;
                *) nbam=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

# Use the caller-supplied script path, or a timestamped default in the log dir.
if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/mergeBams.${timestamp}.cmd"
fi

# Standalone scripts begin with SGE directives and strict mode.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# Merge the 2 BAM files
echo "docker run -v /:/mnt -u $UID --memory 6g --rm lethalfang/bamsurgeon:1.1-3 \\" >> $out_script
echo "java -Xmx6g -jar /usr/local/bin/picard.jar MergeSamFiles \\" >> $out_script
echo "I=/mnt/${nbam} \\" >> $out_script
echo "I=/mnt/${tbam} \\" >> $out_script
echo "ASSUME_SORTED=true \\" >> $out_script
echo "CREATE_INDEX=true \\" >> $out_script
echo "O=/mnt/${outdir}/${outbam}" >> $out_script
echo "" >> $out_script

# Picard's CREATE_INDEX writes "<name>.bai"; rename it to "<name>.bam.bai" so
# downstream tools that expect that convention can find the index.
# (The previous comment said "Remove temp files", which was misleading.)
echo "mv ${outdir}/${outbam%.bam}.bai ${outdir}/${outbam}.bai" >> $out_script
echo "" >> $out_script
96 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/Reheader_SM.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Use getopt instead of getopts for long options
#
# Generates (and, when --standalone, prefixes with SGE headers) a .cmd script
# that rewrites a BAM's read groups with Picard AddOrReplaceReadGroups, so a
# merged file carries one uniform sample name (--out-SM, default TN_Merged).
# All file paths are given relative to /, which is mounted as /mnt in docker.

set -e

OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,out-SM:,out-script:,standalone -n 'Reheader_SM.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# Default sample name written into the RGSM field.
# (Removed unused variable keep_intermediates.)
outSM='TN_Merged'

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-out )
            case "$2" in
                "") shift 2 ;;
                *) outbam=$2 ; shift 2 ;;
            esac ;;

        --bam-in )
            case "$2" in
                "") shift 2 ;;
                *) inbam=$2 ; shift 2 ;;
            esac ;;

        --out-SM )
            case "$2" in
                "") shift 2 ;;
                *) outSM=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

# Use the caller-supplied script path, or a timestamped default in the log dir.
if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/reheader.${timestamp}.cmd"
fi

# Standalone scripts begin with SGE directives and strict mode.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# Uniform sample and read group names in the merged file
echo "docker run -v /:/mnt -u $UID --rm lethalfang/bamsurgeon:1.1-3 \\" >> $out_script
echo "java -Xmx6g -jar /usr/local/bin/picard.jar AddOrReplaceReadGroups \\" >> $out_script
echo "I=/mnt/${outdir}/${inbam} \\" >> $out_script
echo "RGID=BAMSurgeon \\" >> $out_script
echo "RGLB=TNMerged \\" >> $out_script
echo "RGPL=illumina \\" >> $out_script
echo "RGPU=BAMSurgeon \\" >> $out_script
echo "RGSM=${outSM} \\" >> $out_script
echo "CREATE_INDEX=true \\" >> $out_script
echo "O=/mnt/${outdir}/${outbam}" >> $out_script
echo "" >> $out_script

# Picard's CREATE_INDEX writes "<name>.bai"; rename it to "<name>.bam.bai" so
# downstream tools that expect that convention can find the index.
# (The previous comment said "Remove temp files", which was misleading.)
echo "mv ${outdir}/${outbam%.bam}.bai ${outdir}/${outbam}.bai" >> $out_script
echo "" >> $out_script
99 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/SortByCoordinate.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Use getopt instead of getopts for long options
#
# Generates (and, when --standalone, prefixes with SGE headers) a .cmd script
# that coordinate-sorts a BAM with samtools sort, then indexes the result.
# All file paths are given relative to /, which is mounted as /mnt in docker.

set -e

OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,genome-reference:,out-script:,standalone -n 'SortByCoordinate.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# (Removed unused variables seed and hg_dict.)

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-in )
            case "$2" in
                "") shift 2 ;;
                *) inbam=$2 ; shift 2 ;;
            esac ;;

        --bam-out )
            case "$2" in
                "") shift 2 ;;
                *) outbam=$2 ; shift 2 ;;
            esac ;;

        --genome-reference )
            case "$2" in
                "") shift 2 ;;
                *) HUMAN_REFERENCE=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

# Use the caller-supplied script path, or a timestamped default in the log dir.
if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/sort.coordinates.${timestamp}.cmd"
fi

# Standalone scripts begin with SGE directives and strict mode.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# Coordinate-sort, then index the sorted BAM.
echo "docker run -v /:/mnt -u $UID --rm lethalfang/samtools:1.7 \\" >> $out_script
echo "samtools sort -m 4G --reference /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "-o /mnt/${outdir}/${outbam} /mnt/${inbam}" >> $out_script
echo "" >> $out_script

echo "docker run -v /:/mnt -u $UID --rm lethalfang/samtools:1.7 \\" >> $out_script
echo "samtools index /mnt/${outdir}/${outbam}" >> $out_script
93 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/SortByReadName.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Use getopt instead of getopts for long options
#
# Generates (and, when --standalone, prefixes with SGE headers) a .cmd script
# that sorts a BAM by read (query) name with samtools sort -n.
# All file paths are given relative to /, which is mounted as /mnt in docker.

set -e

OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,out-script:,standalone -n 'SortByReadName.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# (Removed unused variable seed, and hg_dict, which referenced
# HUMAN_REFERENCE even though this script never parses --genome-reference.)

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-in )
            case "$2" in
                "") shift 2 ;;
                *) inbam=$2 ; shift 2 ;;
            esac ;;

        --bam-out )
            case "$2" in
                "") shift 2 ;;
                *) outbam=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

# Use the caller-supplied script path, or a timestamped default in the log dir.
if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/sort.qname.${timestamp}.cmd"
fi

# Standalone scripts begin with SGE directives and strict mode.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# Sort by query name. BUG FIX: the final command line used to end with a stray
# "\" continuation into a blank line; terminate the command cleanly instead.
echo "docker run -v /:/mnt -u $UID --rm lethalfang/samtools:1.7 \\" >> $out_script
echo "samtools sort -n -m 4G \\" >> $out_script
echo "-o /mnt/${outdir}/${outbam} \\" >> $out_script
echo "/mnt/${inbam}" >> $out_script
echo "" >> $out_script
85 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/bamsurgeon_split_BAM.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Use getopt instead of getopts for long options
#
# Generates (and, when --standalone, prefixes with SGE headers) a .cmd script
# that randomly splits a sorted BAM into two BAMs with bamsurgeon's
# sortedBamSplit.py (--split-proportion into pick1, the rest into pick2,
# optionally down-sampled), then indexes both outputs.
# All file paths are given relative to /, which is mounted as /mnt in docker.

set -e

OPTS=`getopt -o o: --long output-dir:,genome-reference:,bam-out1:,bam-out2:,bam-in:,split-proportion:,down-sample:,seed:,out-script:,clean-bam,standalone -n 'bamsurgeon_split_BAM.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )
# Default RNG seed is the current year so splits are reproducible within a year.
seed=$( date +"%Y" )
proportion=0.5
down_sample=1

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-in )
            case "$2" in
                "") shift 2 ;;
                *) inbam=$2 ; shift 2 ;;
            esac ;;

        --bam-out1 )
            case "$2" in
                "") shift 2 ;;
                *) outbam1=$2 ; shift 2 ;;
            esac ;;

        --bam-out2 )
            case "$2" in
                "") shift 2 ;;
                *) outbam2=$2 ; shift 2 ;;
            esac ;;

        --genome-reference )
            case "$2" in
                "") shift 2 ;;
                *) HUMAN_REFERENCE=$2 ; shift 2 ;;
            esac ;;

        --split-proportion )
            case "$2" in
                "") shift 2 ;;
                *) proportion=$2 ; shift 2 ;;
            esac ;;

        --down-sample )
            case "$2" in
                "") shift 2 ;;
                *) down_sample=$2 ; shift 2 ;;
            esac ;;

        --seed )
            case "$2" in
                "") shift 2 ;;
                *) seed=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        # NOTE(review): --clean-bam is accepted for CLI compatibility but
        # clean_bam is never used below — confirm whether cleaning was meant
        # to happen here or is handled by cleanBam.sh upstream.
        --clean-bam )
            clean_bam=1 ; shift ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

# (Removed unused variable hg_dict.)

logdir=${outdir}/logs
mkdir -p ${logdir}

# Use the caller-supplied script path, or a timestamped default in the log dir.
if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/splitBams.${timestamp}.cmd"
fi

# Standalone scripts begin with SGE directives and strict mode.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# Then you can split
echo "docker run -v /:/mnt -u $UID --rm --memory 8g lethalfang/bamsurgeon:1.1-3 \\" >> $out_script
echo "/usr/local/bamsurgeon/scripts/sortedBamSplit.py \\" >> $out_script
echo "--bam /mnt/${inbam} \\" >> $out_script
echo "--proportion ${proportion} \\" >> $out_script
echo "--downsample ${down_sample} \\" >> $out_script
echo "--pick1 /mnt/${outdir}/${outbam1} \\" >> $out_script
echo "--pick2 /mnt/${outdir}/${outbam2} \\" >> $out_script
echo "--seed ${seed}" >> $out_script
echo "" >> $out_script

# Index both halves.
echo "docker run -v /:/mnt -u $UID --rm --memory 8g lethalfang/samtools:1.7 samtools index /mnt/${outdir}/${outbam1}" >> $out_script
echo "docker run -v /:/mnt -u $UID --rm --memory 8g lethalfang/samtools:1.7 samtools index /mnt/${outdir}/${outbam2}" >> $out_script
echo "" >> $out_script
128 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/cleanBam.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Use getopt instead of getopts for long options
#
# Generates (and, when --standalone, prefixes with SGE headers) a .cmd script
# that cleans a BAM by dropping reads with over-duplicated query names or bad
# CIGAR strings, using a bamsurgeon helper script.
# All file paths are given relative to /, which is mounted as /mnt in docker.

set -e

# BUG FIX: getopt's error-reporting name said 'SortByReadName.sh' (copy/paste).
OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,out-script:,standalone -n 'cleanBam.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# (Removed unused variable seed.)

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-in )
            case "$2" in
                "") shift 2 ;;
                *) inbam=$2 ; shift 2 ;;
            esac ;;

        --bam-out )
            case "$2" in
                "") shift 2 ;;
                *) outbam=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

# BUG FIX: ${logdir} was used below but never defined (and the directory was
# never created), so the default out_script path resolved to
# "/cleanBam.<timestamp>.cmd". Define and create it like the sibling scripts.
logdir=${outdir}/logs
mkdir -p ${logdir}

# Use the caller-supplied script path, or a timestamped default in the log dir.
if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/cleanBam.${timestamp}.cmd"
fi

# Standalone scripts begin with SGE directives and strict mode.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=4G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# Remove problematic reads (over-duplicated qnames or bad CIGARs).
# (The previous "sort by name" comment was copied from another script.)
echo "docker run -v /:/mnt -u $UID --rm lethalfang/bamsurgeon:1.1-3 \\" >> $out_script
echo "/usr/local/bamsurgeon/scripts/remove_reads_with_many_qnames_or_bad_CIGAR.py \\" >> $out_script
echo "-bamin /mnt/${inbam} \\" >> $out_script
echo "-bamout /mnt/${outdir}/${outbam}" >> $out_script
echo "" >> $out_script
79 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/concatVcfFiles.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Use getopt instead of getopts for long options
#
# Generates (and, when --standalone, prefixes with SGE headers) a .cmd script
# that concatenates the VCF files listed in the space-separated --vcf-string
# into one VCF using vcf-concat (vcftools docker image).
# All file paths are given relative to /, which is mounted as /mnt in docker.

set -e

# BUG FIX: getopt's error-reporting name said 'MergeTN.sh' (copy/paste).
OPTS=`getopt -o o: --long output-dir:,vcf-string:,vcf-out:,out-script:,standalone -n 'concatVcfFiles.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# (Removed unused variable keep_intermediates.)

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --vcf-out )
            case "$2" in
                "") shift 2 ;;
                *) outvcf=$2 ; shift 2 ;;
            esac ;;

        --vcf-string )
            case "$2" in
                "") shift 2 ;;
                *) vcf_string=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

# Use the caller-supplied script path, or a timestamped default in the log dir.
if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/concatVcfFiles.${timestamp}.cmd"
fi

# Standalone scripts begin with SGE directives and strict mode.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=2G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script


# Prefix each VCF path with the /mnt mount point.
for file in ${vcf_string}
do
    input_file_string="/mnt/${file} ${input_file_string}"
done

# Concatenate the VCF files.
# (The previous comment said "Merge the BAM files", which was misleading.)
echo "docker run -v /:/mnt -u $UID --memory 2g --rm lethalfang/vcftools:0.1.15 bash -c \\" >> $out_script
echo "\"vcf-concat \\" >> $out_script
echo "${input_file_string} \\" >> $out_script
echo "> /mnt/${outdir}/${outvcf}\"" >> $out_script
echo "" >> $out_script
88 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/convert_nonStandardBasesInVcfs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import re
4 | import sys
5 |
6 | for line_i in sys.stdin:
7 | if line_i.startswith("#"):
8 | print(line_i, end="")
9 |
10 | else:
11 | item = line_i.rstrip().split("\t")
12 | item[3] = re.sub(r"[^gctanGCTAN,0-9]", "N", item[3])
13 | item[4] = re.sub(r"[^gctanGCTAN,0-9]", "N", item[4])
14 | line_out = "\t".join(item)
15 |
16 | print(line_out)
17 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/mergeBamFiles.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Use getopt instead of getopts for long options
3 |
4 | set -e
5 |
6 | OPTS=`getopt -o o: --long output-dir:,bam-string:,bam-out:,out-script:,standalone -n 'MergeTN.sh' -- "$@"`
7 |
8 | if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi
9 |
10 | #echo "$OPTS"
11 | eval set -- "$OPTS"
12 |
13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )"
14 |
15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )
16 |
17 | keep_intermediates=0
18 |
19 | while true; do
20 | case "$1" in
21 | -o | --output-dir )
22 | case "$2" in
23 | "") shift 2 ;;
24 | *) outdir=$2 ; shift 2 ;;
25 | esac ;;
26 |
27 | --bam-out )
28 | case "$2" in
29 | "") shift 2 ;;
30 | *) outbam=$2 ; shift 2 ;;
31 | esac ;;
32 |
33 | --bam-string )
34 | case "$2" in
35 | "") shift 2 ;;
36 | *) bam_string=$2 ; shift 2 ;;
37 | esac ;;
38 |
39 | --out-script )
40 | case "$2" in
41 | "") shift 2 ;;
42 | *) out_script_name=$2 ; shift 2 ;;
43 | esac ;;
44 |
45 | --standalone )
46 | standalone=1 ; shift ;;
47 |
48 | -- ) shift; break ;;
49 | * ) break ;;
50 | esac
51 | done
52 |
53 | logdir=${outdir}/logs
54 | mkdir -p ${logdir}
55 |
56 | if [[ ${out_script_name} ]]
57 | then
58 | out_script="${out_script_name}"
59 | else
60 | out_script="${logdir}/mergeBams.${timestamp}.cmd"
61 | fi
62 |
63 | if [[ $standalone ]]
64 | then
65 | echo "#!/bin/bash" > $out_script
66 | echo "" >> $out_script
67 | echo "#$ -o ${logdir}" >> $out_script
68 | echo "#$ -e ${logdir}" >> $out_script
69 | echo "#$ -S /bin/bash" >> $out_script
70 | echo '#$ -l h_vmem=8G' >> $out_script
71 | echo 'set -e' >> $out_script
72 | fi
73 |
74 | echo "" >> $out_script
75 |
76 |
77 | for file in ${bam_string}
78 | do
79 | input_file_string="I=/mnt/${file} ${input_file_string}"
80 | done
81 |
82 | # Merge the BAM files
83 | echo "docker run -v /:/mnt -u $UID --memory 8g --rm lethalfang/bamsurgeon:1.1-3 \\" >> $out_script
84 | echo "java -Xmx8g -jar /usr/local/bin/picard.jar MergeSamFiles \\" >> $out_script
85 | echo "${input_file_string} \\" >> $out_script
86 | echo "ASSUME_SORTED=true \\" >> $out_script
87 | echo "CREATE_INDEX=true \\" >> $out_script
88 | echo "O=/mnt/${outdir}/${outbam}" >> $out_script
89 | echo "" >> $out_script
90 |
91 | # Remove temp files
92 | echo "mv ${outdir}/${outbam%.bam}.bai ${outdir}/${outbam}.bai" >> $out_script
93 | echo "" >> $out_script
94 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/split_BAM_by_BED.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Subset a BAM to the regions of a BED file (samtools view -L) and index the
# result. Emits a .cmd script into ${outdir}/logs (or --out-script); with
# --standalone the script starts with SGE directives and strict mode.
# All file paths are given relative to /, which is mounted as /mnt in docker.

set -e

OPTS=`getopt -o o: --long output-dir:,bam-in:,bam-out:,selector:,out-script:,standalone -n 'split_BAM_by_BED.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# Empty option values are ignored, matching getopt's "" convention.
while true; do
    case "$1" in
        -o | --output-dir )
            [[ -n "$2" ]] && outdir=$2
            shift 2 ;;

        --bam-in )
            [[ -n "$2" ]] && inbam=$2
            shift 2 ;;

        --bam-out )
            [[ -n "$2" ]] && outbam=$2
            shift 2 ;;

        --selector )
            [[ -n "$2" ]] && SELECTOR=$2
            shift 2 ;;

        --out-script )
            [[ -n "$2" ]] && out_script_name=$2
            shift 2 ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done


logdir=${outdir}/logs
mkdir -p ${logdir}

# Caller-supplied script path, or a timestamped default in the log directory.
out_script=${out_script_name:-${logdir}/splitByBed.${timestamp}.cmd}

# Standalone scripts begin with SGE directives and strict mode.
if [[ $standalone ]]
then
    cat > $out_script <<EOF
#!/bin/bash

#\$ -o ${logdir}
#\$ -e ${logdir}
#\$ -S /bin/bash
#\$ -l h_vmem=4G
set -e
EOF
fi

# Body of the generated script: subset with samtools view -L, then index.
cat >> $out_script <<EOF

docker run --rm -v /:/mnt -u $UID --memory 4g lethalfang/samtools:1.7 bash -c \\
"samtools view /mnt/${inbam} -L /mnt/${SELECTOR} -Sbh \\
> /mnt/${outdir}/${outbam}"

docker run --rm -v /:/mnt -u $UID --memory 4g lethalfang/samtools:1.7 \\
samtools index /mnt/${outdir}/${outbam}
EOF
90 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/dream_sim.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/somaticseq/utilities/dockered_pipelines/bamSimulator/dream_sim.jpg
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/onkoinsight_sim.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/somaticseq/utilities/dockered_pipelines/bamSimulator/onkoinsight_sim.png
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/bamSimulator/replicate_sim.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/somaticseq/utilities/dockered_pipelines/bamSimulator/replicate_sim.jpg
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/container_option.py:
--------------------------------------------------------------------------------
1 | import os
2 | import uuid
3 | from dataclasses import dataclass
4 | from pathlib import Path
5 | from typing import Literal
6 |
7 | from somaticseq._version import __version__ as VERSION
8 |
9 |
@dataclass
class DockerImages:
    """Default Docker Hub image (name:tag) for each tool used by the pipelines.

    Values are plain strings so they can be interpolated directly into
    ``docker run`` / ``singularity exec`` command lines by
    :func:`container_params`.  The ``somaticseq`` image tag tracks this
    package's own version.
    """

    alientrimmer: str = "lethalfang/alientrimmer:0.4.0"
    bedtools: str = "lethalfang/bedtools:2.26.0"
    bwa: str = "lethalfang/bwa:0.7.17_samtools_1.19"
    jsm2: str = "lethalfang/jointsnvmix2:0.7.5"
    lofreq: str = "lethalfang/lofreq:2.1.3.1-1"
    muse: str = "marghoob/muse:1.0rc_c"
    mutect2: str = "broadinstitute/gatk:4.0.5.2"
    picard: str = "lethalfang/picard:2.22.7"
    sambamba: str = "lethalfang/sambamba:0.7.1"
    samtools: str = "lethalfang/samtools:1.19.2"
    scalpel: str = "lethalfang/scalpel:0.5.4"
    # Tracks the installed somaticseq package version.
    somaticseq: str = f"lethalfang/somaticseq:{VERSION}"
    somaticsniper: str = "lethalfang/somaticsniper:1.0.5.0-2"
    strelka2: str = "lethalfang/strelka:2.9.5"
    tabix: str = "lethalfang/tabix:1.10"
    trimmomatic: str = "lethalfang/trimmomatic:0.39"
    vardict: str = "lethalfang/vardictjava:1.7.0"
    varscan2: str = "djordjeklisic/sbg-varscan2:v1"
30 |
31 |
@dataclass
class MountedFileProperty:
    """Host- and container-side locations for one file mounted into a container.

    NOTE(review): ``container_params`` currently returns plain dicts carrying
    the same information; this dataclass appears to be the typed equivalent —
    confirm intended usage before relying on it.
    """

    file: str  # original path string as supplied by the caller
    filepath: Path  # Path(file)
    filename: str  # base name of the file
    directory: Path  # parent directory as given (possibly relative)
    absolute_directory: Path  # absolute parent directory on the host
    mount_directory: str  # randomized directory inside the container
    mount_path: str  # mount_directory/filename: the file's in-container path
41 |
42 |
43 | DOCKER_IMAGES = DockerImages()
44 |
45 |
46 | def container_params(
47 | container_image: str,
48 | tech: Literal["docker", "singularity"] = "docker",
49 | files: list[str] = [],
50 | extra_args: str = "",
51 | singularity_image_loc: str = "docker://",
52 | ) -> tuple[str, dict[str, dict[str, str]]]:
53 |
54 | file_paths = [Path(i) for i in files]
55 | file_names = [i.name for i in file_paths]
56 | file_dirs = [i.parent for i in file_paths]
57 | file_abs_dirs = [i.absolute().parent for i in file_paths]
58 | random_dirs = ["/" + uuid.uuid4().hex for _ in files]
59 |
60 | file_dictionary = {}
61 | for file_i, path_i, filename_i, dir_i, abs_dir_i, random_dir_i in zip(
62 | files, file_paths, file_names, file_dirs, file_abs_dirs, random_dirs
63 | ):
64 | file_dictionary[file_i] = {
65 | "filepath": path_i,
66 | "filename": filename_i,
67 | "dir": dir_i,
68 | "abs_dir": abs_dir_i,
69 | "mount_dir": random_dir_i,
70 | "mount_path": os.path.join(random_dir_i, filename_i),
71 | }
72 |
73 | if tech == "docker":
74 | MOUNT_STRING = ""
75 | for file_i in file_dictionary:
76 | sys_dir = file_dictionary[file_i]["abs_dir"]
77 | container_dir = file_dictionary[file_i]["mount_dir"]
78 | MOUNT_STRING = MOUNT_STRING + f" -v {sys_dir}:{container_dir}"
79 |
80 | container_string = (
81 | f"docker run {MOUNT_STRING} -u $(id -u):$(id -g) "
82 | f"--rm {extra_args} {container_image}"
83 | )
84 |
85 | elif tech == "singularity":
86 | MOUNT_STRING = ""
87 | for file_i in file_dictionary:
88 | sys_dir = file_dictionary[file_i]["abs_dir"]
89 | container_dir = file_dictionary[file_i]["mount_dir"]
90 | MOUNT_STRING = MOUNT_STRING + f" --bind {sys_dir}:{container_dir}"
91 |
92 | container_string = (
93 | "singularity exec --cleanenv "
94 | f"{MOUNT_STRING} {extra_args} {singularity_image_loc}{container_image}"
95 | )
96 |
97 | else:
98 | raise NotImplementedError("Only supports docker and singularity.")
99 |
100 | return container_string, file_dictionary
101 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/germline_variants/Canvas.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Write a runnable script (${out_script}) that executes Illumina Canvas
# Germline-WGS CNV calling inside a docker container.
# Use getopt instead of getopts for long options

set -e

# Fix: the --long list was missing genome-reference-dir: and filter-bed:,
# so getopt rejected two options that the case statement below handles.
OPTS=`getopt -o o: --long out-dir:,bam:,in-vcf:,sample-name:,canvas-reference:,genome-reference-dir:,filter-bed:,extra-arguments:,action:,MEM:,threads:,out-script:,standalone -n 'canvas.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )
action=echo
MEM=8
threads=12
SAMPLE_NAME='Canvas'

while true; do
    case "$1" in

    -o | --out-dir )
        case "$2" in
            "") shift 2 ;;
            *) outdir=$2 ; shift 2 ;;
        esac ;;

    --bam )
        case "$2" in
            "") shift 2 ;;
            *) bamFile=$2 ; shift 2 ;;
        esac ;;

    --in-vcf )
        case "$2" in
            "") shift 2 ;;
            *) inVcf=$2 ; shift 2 ;;
        esac ;;

    --sample-name )
        case "$2" in
            "") shift 2 ;;
            *) SAMPLE_NAME=$2 ; shift 2 ;;
        esac ;;

    --canvas-reference )
        case "$2" in
            "") shift 2 ;;
            *) CANVAS_REFERENCE=$2 ; shift 2 ;;
        esac ;;

    --genome-reference-dir )
        case "$2" in
            "") shift 2 ;;
            *) GENOMIC_REFERENCE_DIR=$2 ; shift 2 ;;
        esac ;;

    --filter-bed )
        case "$2" in
            "") shift 2 ;;
            *) filterBed=$2 ; shift 2 ;;
        esac ;;

    --extra-arguments )
        case "$2" in
            "") shift 2 ;;
            *) extra_arguments=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
            "") shift 2 ;;
            *) out_script_name=$2 ; shift 2 ;;
        esac ;;

    --action )
        case "$2" in
            "") shift 2 ;;
            *) action=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac

done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/canvas.${timestamp}.cmd"
fi


# With --standalone, emit a self-contained SGE-submittable script header.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=12G' >> $out_script
    echo "#$ -pe smp ${threads}" >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script
echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
echo "" >> $out_script


# Cannot yet control number of threads to invoke
# First, create Canvas-ready fasta
# docker run --rm -v /:/mnt -u $UID lethalfang/canvas:1.35.1 bash -c 'export COMPlus_gcAllowVeryLargeObjects=1 && dotnet /opt/Canvas/Tools/FlagUniqueKmers/FlagUniqueKmers.dll /mnt/sc1/groups/bfx-red/data/datainsights/SEQC2_Resources/GRCh38.d1.vd1.fa /mnt/sc1/groups/bfx-red/data/datainsights/SEQC2_Resources/GRCh38.d1.vd1.Canvas-ready.fasta'

# Run Canvas
echo "docker run -u $UID --rm -v /:/mnt lethalfang/canvas:1.35.1 \\" >> $out_script
echo "dotnet /opt/Canvas/Canvas.dll Germline-WGS \\" >> $out_script
echo "--bam /mnt/${bamFile} \\" >> $out_script
echo "--sample-b-allele-vcf /mnt/${inVcf} \\" >> $out_script
echo "--sample-name ${SAMPLE_NAME} \\" >> $out_script
echo "--reference /mnt/${CANVAS_REFERENCE} \\" >> $out_script
echo "--genome-folder /mnt/${GENOMIC_REFERENCE_DIR} \\" >> $out_script
echo "--filter-bed /mnt/${filterBed} \\" >> $out_script
# Fix: this line previously interpolated ${out_dir}, an undefined variable;
# the parsed option is stored in ${outdir}.
echo "--output /mnt/${outdir} " >> $out_script

echo "" >> $out_script
echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/germline_variants/Manta.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Write a runnable script (${out_script}) that configures and runs the
# Manta structural-variant caller inside a docker container.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long out-dir:,bam:,genome-reference:,extra-arguments:,action:,MEM:,threads:,out-script:,standalone -n 'manta.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )
action=echo
MEM=8
threads=12

while true; do
    case "$1" in

    -o | --out-dir )
        case "$2" in
            "") shift 2 ;;
            *) outdir=$2 ; shift 2 ;;
        esac ;;

    --bam )
        case "$2" in
            "") shift 2 ;;
            *) bamFile=$2 ; shift 2 ;;
        esac ;;

    --genome-reference )
        case "$2" in
            "") shift 2 ;;
            *) GENOME_REFERENCE=$2 ; shift 2 ;;
        esac ;;

    --extra-arguments )
        case "$2" in
            "") shift 2 ;;
            *) extra_arguments=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
            "") shift 2 ;;
            *) out_script_name=$2 ; shift 2 ;;
        esac ;;

    --threads )
        case "$2" in
            "") shift 2 ;;
            *) threads=$2 ; shift 2 ;;
        esac ;;

    --action )
        case "$2" in
            "") shift 2 ;;
            *) action=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac

done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    # Fix: default script name said "canvas" (copy-paste from Canvas.sh).
    out_script="${logdir}/manta.${timestamp}.cmd"
fi


# With --standalone, emit a self-contained SGE-submittable script header.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=4G' >> $out_script
    echo "#$ -pe smp ${threads}" >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script
echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
echo "" >> $out_script


echo "docker run -v /:/mnt -u $UID --rm lethalfang/manta:1.4.0 \\" >> $out_script
echo "/opt/manta/bin/configManta.py \\" >> $out_script
echo "--bam /mnt/${bamFile} \\" >> $out_script
echo "--referenceFasta /mnt/${GENOME_REFERENCE} \\" >> $out_script
echo "--runDir /mnt/${outdir}" >> $out_script

echo "" >> $out_script

# Fix: previously interpolated $thread (undefined); the option is $threads,
# so runWorkflow.py was invoked with an empty "-j" argument.
echo "docker run -v /:/mnt -u $UID --rm lethalfang/manta:1.4.0 /mnt/${outdir}/runWorkflow.py -m local -j $threads" >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/germline_variants/Nirvana.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Write a runnable script (${out_script}) that annotates a VCF with the
# Illumina Nirvana annotator inside a docker container.
# Use getopt instead of getopts for long options

set -e

# Fix: getopt's error-reporting name said 'manta.sh' (copy-paste).
OPTS=`getopt -o o: --long out-dir:,in-vcf:,nirvana-resources-dir:,sample:,extra-arguments:,action:,MEM:,threads:,out-script:,standalone -n 'nirvana.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )
action=echo
MEM=8
threads=12
sampleID='Nirvana'

while true; do
    case "$1" in

    -o | --out-dir )
        case "$2" in
            "") shift 2 ;;
            *) outdir=$2 ; shift 2 ;;
        esac ;;

    --in-vcf )
        case "$2" in
            "") shift 2 ;;
            *) inVcf=$2 ; shift 2 ;;
        esac ;;

    --nirvana-resources-dir )
        case "$2" in
            "") shift 2 ;;
            *) NIRVANA_RESOURCES_DIR=$2 ; shift 2 ;;
        esac ;;

    --sample )
        case "$2" in
            "") shift 2 ;;
            *) sampleID=$2 ; shift 2 ;;
        esac ;;

    --extra-arguments )
        case "$2" in
            "") shift 2 ;;
            *) extra_arguments=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
            "") shift 2 ;;
            *) out_script_name=$2 ; shift 2 ;;
        esac ;;

    --threads )
        case "$2" in
            "") shift 2 ;;
            *) threads=$2 ; shift 2 ;;
        esac ;;

    --action )
        case "$2" in
            "") shift 2 ;;
            *) action=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac

done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    # Fix: default script name said "canvas" (copy-paste from Canvas.sh).
    out_script="${logdir}/nirvana.${timestamp}.cmd"
fi


# With --standalone, emit a self-contained SGE-submittable script header.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=4G' >> $out_script
    echo "#$ -pe smp ${threads}" >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script
echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
echo "" >> $out_script


echo "docker run --rm -u $UID -v /:/mnt lethalfang/nirvana:2.0.9 \\" >> $out_script
echo "dotnet /opt/Nirvana/bin/Release/netcoreapp2.0/Nirvana.dll \\" >> $out_script
echo "-c /mnt/${NIRVANA_RESOURCES_DIR}/Cache/26/GRCh38/Ensembl \\" >> $out_script
echo "--sd /mnt/${NIRVANA_RESOURCES_DIR}/GRCh38 \\" >> $out_script
echo "-r /mnt/${NIRVANA_RESOURCES_DIR}/References/5/Homo_sapiens.GRCh38.Nirvana.dat \\" >> $out_script
echo "-i /mnt/${inVcf} \\" >> $out_script
echo "-o /mnt/${outdir}/${sampleID}" >> $out_script

echo "" >> $out_script
echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/germline_variants/README.md:
--------------------------------------------------------------------------------
1 | **Requirement**
2 |
3 | - Have internet connection, and able to pull and run docker images from Docker
4 | Hub.
5 | - **Recommended**: Have cluster management system with valid "qsub" command,
6 | such as Sun Grid Engine (SGE).
7 |
- These scripts perform germline variant-calling tasks, starting from a BAM
  file.
9 |
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/germline_variants/haplotypeCaller.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Write a runnable script (${out_script}) that executes GATK4 HaplotypeCaller
# inside a docker container.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long out-dir:,out-vcf:,bam:,human-reference:,selector:,dbsnp:,extra-arguments:,action:,MEM:,threads:,out-script:,standalone -n 'gatk_haplotypecaller.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )
action=echo
MEM=8
threads=12

while true; do
    case "$1" in

    -o | --out-dir )
        case "$2" in
            "") shift 2 ;;
            *) outdir=$2 ; shift 2 ;;
        esac ;;

    --out-vcf )
        case "$2" in
            "") shift 2 ;;
            *) outVcfName=$2 ; shift 2 ;;
        esac ;;

    --bam )
        case "$2" in
            "") shift 2 ;;
            *) bamFile=$2 ; shift 2 ;;
        esac ;;

    --human-reference )
        case "$2" in
            "") shift 2 ;;
            *) HUMAN_REFERENCE=$2 ; shift 2 ;;
        esac ;;

    --selector )
        case "$2" in
            "") shift 2 ;;
            *) SELECTOR=$2 ; shift 2 ;;
        esac ;;

    --dbsnp )
        case "$2" in
            "") shift 2 ;;
            *) dbsnp=$2 ; shift 2 ;;
        esac ;;

    --MEM )
        case "$2" in
            "") shift 2 ;;
            *) MEM=$2 ; shift 2 ;;
        esac ;;

    --threads )
        case "$2" in
            "") shift 2 ;;
            *) threads=$2 ; shift 2 ;;
        esac ;;

    --extra-arguments )
        case "$2" in
            "") shift 2 ;;
            *) extra_arguments=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
            "") shift 2 ;;
            *) out_script_name=$2 ; shift 2 ;;
        esac ;;

    --action )
        case "$2" in
            "") shift 2 ;;
            *) action=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac

done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/HaplotypeCaller.${timestamp}.cmd"
fi


# With --standalone, emit a self-contained SGE-submittable script header.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=12G' >> $out_script
    echo "#$ -pe smp ${threads}" >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script
echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
echo "" >> $out_script


# Fix: initialize selector_text (like dbsnp_text below) so a stray value
# inherited from the calling environment cannot leak into the command line.
selector_text=''
if [[ ${SELECTOR} ]]
then
    selector_text="-L /mnt/${SELECTOR}"
fi

dbsnp_text=''
if [[ ${dbsnp} ]]; then
    dbsnp_text="--dbsnp /mnt/${dbsnp}"
fi


echo "docker run --rm -v /:/mnt -u $UID broadinstitute/gatk:4.0.5.2 \\" >> $out_script
echo "java -Xmx${MEM}g -jar /gatk/gatk.jar \\" >> $out_script
echo "HaplotypeCaller \\" >> $out_script
echo "--reference /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "--input /mnt/${bamFile} \\" >> $out_script
echo "--native-pair-hmm-threads ${threads} \\" >> $out_script
echo "$selector_text \\" >> $out_script
echo "$dbsnp_text \\" >> $out_script
echo "${extra_arguments} \\" >> $out_script
echo "--output /mnt/${outdir}/${outVcfName}" >> $out_script

echo "" >> $out_script

echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/somatic_mutations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/somaticseq/utilities/dockered_pipelines/somatic_mutations/__init__.py
--------------------------------------------------------------------------------
/somaticseq/utilities/linguistic_sequence_complexity.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | from copy import copy
5 | from sys import float_info
6 |
7 | import somaticseq.sequencing_features as seq_features
8 |
9 | eps = float_info.epsilon
10 |
11 |
def all_possible_dna_sequences(seq_length):
    """Return the set of every DNA string of length ``seq_length`` over GCTA."""
    # Grow all candidates one base at a time: 4 -> 16 -> ... -> 4**seq_length.
    sequences = ["G", "C", "T", "A"]
    for _ in range(seq_length - 1):
        sequences = [prefix + base for prefix in sequences for base in "TCGA"]
    return set(sequences)
24 |
25 |
def max_vocabularies(seq_length):
    """Theoretical maximum number of distinct substrings of a sequence.

    Per https://doi.org/10.1093/bioinformatics/18.5.679, assuming a 4-letter
    alphabet: for each substring length k there are at most
    min(4**k, seq_length - k + 1) distinct substrings.
    """
    total = 0
    for k in range(1, seq_length + 1):
        n_windows = seq_length - k + 1  # number of length-k windows
        if 4**k < n_windows:
            total += 4**k
        else:
            # From here on the window count dominates for every remaining k;
            # the remaining terms form an arithmetic series summed in one shot.
            total += (n_windows + 1) * n_windows / 2
            break
    return total
44 |
45 |
def LC(sequence):
    """Linguistic complexity of a DNA sequence.

    Per https://doi.org/10.1093/bioinformatics/18.5.679 (4-letter alphabet):
    the number of distinct substrings observed, divided by the theoretical
    maximum from max_vocabularies(). Returns NaN when the sequence contains
    an ambiguous "N" base.
    """
    sequence = sequence.upper()
    if "N" in sequence:
        return float("nan")

    seq_length = len(sequence)
    distinct_subseq_count = 0
    for width in range(1, seq_length + 1):
        # All distinct substrings of this width, via a sliding window.
        windows = {
            sequence[start : start + width]
            for start in range(seq_length - width + 1)
        }
        distinct_subseq_count += len(windows)

    return distinct_subseq_count / max_vocabularies(seq_length)
76 |
77 |
def main() -> None:
    """CLI entry point: print the linguistic complexity of a GCTA sequence."""
    parser = argparse.ArgumentParser(
        description=(
            "Calculate linguistic sequence complexity according to "
            "DOI:10.1093/bioinformatics/18.5.679"
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("-seq", "--sequence", type=str, help="GCTA sequences")
    parser.add_argument(
        "-len",
        "--substring-length",
        type=int,
        help=(
            "Default is the whole length of the sequence. "
            "If specified, then it will calculate sub-length up to this value."
        ),
    )
    cli_args = parser.parse_args()

    # Default to the full sequence length when no cap was given.
    max_length = (
        cli_args.substring_length
        if cli_args.substring_length
        else len(cli_args.sequence)
    )
    assert max_length <= len(cli_args.sequence)
    # This one adds up sub-strings up to a length
    print(
        seq_features.ling_seq_complexity_with_max_vocab_length(
            cli_args.sequence, max_length
        )
    )
106 |
107 |
108 | if __name__ == "__main__":
109 | main()
110 |
--------------------------------------------------------------------------------
/somaticseq/utilities/paired_end_bam2fastq.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import gzip
5 |
6 | import pysam
7 |
# Watson-Crick complement for each base, upper- and lower-case, with the
# ambiguity code N mapping to itself.
NT_PAIRS = {base: partner for base, partner in zip("GTCANgtcan", "CAGTNcagtn")}


def reverse_complement(seq):
    """Return the reverse complement of a DNA string, preserving case."""
    return "".join(NT_PAIRS[base] for base in reversed(seq))
25 |
26 |
def text_open_write(filename):
    """Open *filename* for text-mode writing, gzip-compressing for ".gz" names."""
    is_gzipped = str(filename).endswith(".gz")
    opener = gzip.open if is_gzipped else open
    return opener(filename, "wt")
31 |
32 |
def bam2fq(bam_file, fastq1, fastq2):
    """Convert a paired-end BAM into two FASTQ files (R1 and R2).

    Reads are buffered per query name until both mates have been seen, so
    records in the two FASTQ files stay in matching pair order. Secondary
    alignments are skipped; reverse-strand reads are flipped back to their
    original sequencing orientation (reverse-complemented sequence, reversed
    qualities).

    NOTE(review): supplementary alignments are not filtered out here, and any
    read whose mate never appears is silently dropped at end of file —
    confirm both behaviors are acceptable for the intended inputs.
    """
    with (
        pysam.AlignmentFile(bam_file) as bam,
        text_open_write(fastq1) as fq1,
        text_open_write(fastq2) as fq2,
    ):
        # Mate buffers keyed by query name: a pair is written only once both
        # mates have been seen, keeping fq1/fq2 synchronized line-for-line.
        reads1 = {}
        reads2 = {}
        reads = bam.fetch()
        for read_i in reads:
            if not read_i.is_secondary:
                # Restore the read to its original (sequencer) orientation.
                seq_i = (
                    reverse_complement(read_i.query_sequence)
                    if read_i.is_reverse
                    else read_i.query_sequence
                )
                qual_i = read_i.qual[::-1] if read_i.is_reverse else read_i.qual
                if read_i.is_read1:
                    if read_i.query_name in reads2:
                        # Mate already buffered: emit this R1, then its R2.
                        fq1.write(f"@{read_i.query_name}/1\n")
                        fq1.write(seq_i + "\n")
                        fq1.write("+\n")
                        fq1.write(qual_i + "\n")

                        read_2 = reads2.pop(read_i.query_name)
                        fq2.write("@{}/2\n".format(read_2["qname"]))
                        fq2.write(read_2["seq"] + "\n")
                        fq2.write("+\n")
                        fq2.write(read_2["bq"] + "\n")

                    else:
                        # Mate not seen yet: buffer this R1 until it shows up.
                        reads1[read_i.query_name] = {}
                        reads1[read_i.query_name]["qname"] = read_i.query_name
                        reads1[read_i.query_name]["seq"] = seq_i
                        reads1[read_i.query_name]["bq"] = qual_i

                elif read_i.is_read2:
                    if read_i.query_name in reads1:
                        # Mate already buffered: emit the buffered R1, then this R2.
                        read_1 = reads1.pop(read_i.query_name)
                        fq1.write("@{}/1\n".format(read_1["qname"]))
                        fq1.write(read_1["seq"] + "\n")
                        fq1.write("+\n")
                        fq1.write(read_1["bq"] + "\n")

                        fq2.write(f"@{read_i.query_name}/2\n")
                        fq2.write(seq_i + "\n")
                        fq2.write("+\n")
                        fq2.write(qual_i + "\n")
                    else:
                        # Mate not seen yet: buffer this R2 until it shows up.
                        reads2[read_i.query_name] = {}
                        reads2[read_i.query_name]["qname"] = read_i.query_name
                        reads2[read_i.query_name]["seq"] = seq_i
                        reads2[read_i.query_name]["bq"] = qual_i

    return True
88 |
89 |
def main() -> None:
    """Parse CLI arguments and convert a paired-end BAM into two FASTQs."""
    parser = argparse.ArgumentParser(
        description="Convert paired-end BAM to FASTQ1 and 2",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("-bam", "--bam", type=str, help="bam file in")
    parser.add_argument("-fq1", "--fastq1", type=str, help="fastq1 out")
    parser.add_argument("-fq2", "--fastq2", type=str, help="fastq2 out")
    cli_args = parser.parse_args()
    bam2fq(cli_args.bam, cli_args.fastq1, cli_args.fastq2)
100 |
101 |
102 | if __name__ == "__main__":
103 | main()
104 |
--------------------------------------------------------------------------------
/somaticseq/utilities/remove_callers_from_somaticseq_tsv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import gzip
5 | from math import isnan
6 |
# Column headers in a SomaticSeq TSV that record whether each individual
# caller made a call at the locus (MuSE reports a tier rather than a 0/1 flag).
all_possible_callers = (
    "if_MuTect",
    "if_VarScan2",
    "if_JointSNVMix2",
    "if_SomaticSniper",
    "if_VarDict",
    "MuSE_Tier",
    "if_LoFreq",
    "if_Scalpel",
    "if_Strelka",
    "if_TNscope",
    "if_Platypus",
)
20 |
21 |
# Command-line interface. This executes at import time because the file is a
# stand-alone script.
parser = argparse.ArgumentParser(
    description=(
        "In SomaticSeq TSV files, replace certain callers with nan and remove lines "
        "where they are only called by these. "
        "To mimic a TSV where only a subset of the callers were used."
    ),
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("-infile", "--infile", type=str, help="input file", required=True)
# Fix: help text for --outfile said "input file" (copy-paste error).
parser.add_argument("-outfile", "--outfile", type=str, help="output file", required=True)
parser.add_argument(
    "-subtract",
    "--subtract-callers",
    type=str,
    nargs="+",
    help="columns to make nan",
    required=True,
    choices=all_possible_callers,
)

args = parser.parse_args()

# Redundant with choices= above; kept as a cheap sanity check.
for caller_i in args.subtract_callers:
    assert caller_i in all_possible_callers
46 |
47 |
def open_textfile(file_name):
    """Open a file for text reading, transparently decompressing ".gz" files."""
    opener = gzip.open if file_name.lower().endswith(".gz") else open
    return opener(file_name, "rt")
53 |
54 |
def items_to_make_nan(callers_to_subtract):
    """Map each subtracted caller to the extra TSV columns that must become nan.

    Some callers contribute caller-specific feature columns beyond their
    on/off flag; those columns must be blanked out together with the flag.
    Callers with no extra columns contribute nothing.
    """
    extra_columns_by_caller = {
        "if_MuTect": ("M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT"),
        "if_JointSNVMix2": ("SNVMix2_Score",),
        "if_SomaticSniper": ("Sniper_Score",),
        "if_VarDict": ("VarDict_Score", "MSI", "MSILEN", "SHIFT3"),
        "if_Strelka": ("Strelka_Score", "Strelka_QSS", "Strelka_TQSS"),
    }
    out_items = []
    for caller in callers_to_subtract:
        out_items.extend(extra_columns_by_caller.get(caller, ()))
    return out_items
78 |
79 |
# Stream the TSV: blank out the subtracted callers' columns on lines where at
# least one remaining caller still made a call.
with open_textfile(args.infile) as infile, open(args.outfile, "w") as outfile:
    line_in = infile.readline().rstrip()
    item_in = line_in.split("\t")
    out_indices = [item_in.index(i) for i in args.subtract_callers]
    remaining_indices = [
        item_in.index(i) for i in all_possible_callers if i not in args.subtract_callers
    ]
    extra_nan_items = items_to_make_nan(args.subtract_callers)
    extra_nan_indices = [item_in.index(i) for i in extra_nan_items]
    outfile.write(line_in + "\n")
    line_in = infile.readline().rstrip()
    while line_in:
        item_in = line_in.split("\t")

        # Total the calls made by the callers we keep. Fix: float() replaces
        # eval(), which would execute arbitrary file content; float() handles
        # both numeric strings and the literal "nan".
        other_callers = 0.0
        for other_i in remaining_indices:
            classification_i = float(item_in[other_i])
            if not isnan(classification_i):
                other_callers += classification_i

        # NOTE(review): lines with no remaining caller support are still
        # written out (only un-nan'ed), despite the description saying they
        # are removed — confirm intended behavior before changing it.
        if other_callers > 0:
            for out_i in out_indices + extra_nan_indices:
                item_in[out_i] = "nan"

        line_out = "\t".join(item_in)
        outfile.write(line_out + "\n")

        line_in = infile.readline().rstrip()
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/QC/extract_coverageDepth.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Write a runnable script (${out_script}) that computes per-base coverage
# depth with GATK3 DepthOfCoverage via singularity.
# Use getopt instead of getopts for long options

set -e

# Fix: removed the trailing comma after "standalone", which declared an
# empty long-option name to getopt.
OPTS=`getopt -o o: --long output-dir:,bam:,genome-reference:,selector:,minBaseQuality:,minMappingQuality:,extra-arguments:,out-script:,standalone -n 'coverageDepth.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

minBaseQuality=0
minMappingQuality=0


while true; do
    case "$1" in
    -o | --output-dir )
        case "$2" in
            "") shift 2 ;;
            *) outdir=$2 ; shift 2 ;;
        esac ;;

    --bam )
        case "$2" in
            "") shift 2 ;;
            *) bamFile=$2 ; shift 2 ;;
        esac ;;

    --genome-reference )
        case "$2" in
            "") shift 2 ;;
            *) HUMAN_REFERENCE=$2 ; shift 2 ;;
        esac ;;

    --minBaseQuality )
        case "$2" in
            "") shift 2 ;;
            *) minBaseQuality=$2 ; shift 2 ;;
        esac ;;

    --minMappingQuality )
        case "$2" in
            "") shift 2 ;;
            *) minMappingQuality=$2 ; shift 2 ;;
        esac ;;

    --selector )
        case "$2" in
            "") shift 2 ;;
            *) SELECTOR=$2 ; shift 2 ;;
        esac ;;

    --extra-arguments )
        case "$2" in
            "") shift 2 ;;
            *) extra_arguments=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
            "") shift 2 ;;
            *) out_script_name=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/coverageDepth.${timestamp}.cmd"
fi


# With --standalone, emit a self-contained SGE-submittable script header.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script
echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
echo "" >> $out_script

# Fix: initialize selector_text so a stray value inherited from the calling
# environment cannot leak into the generated command line.
selector_text=''
if [[ ${SELECTOR} ]]
then
    selector_text="-L /mnt/${SELECTOR}"
fi

bamFileName=`basename ${bamFile}`

echo "singularity exec --bind /:/mnt docker://broadinstitute/gatk3:3.8-0 \\" >> $out_script
echo "java -Xmx8g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script
echo "-T DepthOfCoverage \\" >> $out_script
echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "-I /mnt/${bamFile} \\" >> $out_script
echo "${selector_text} \\" >> $out_script
echo "--minBaseQuality ${minBaseQuality} \\" >> $out_script
echo "--minMappingQuality ${minMappingQuality} \\" >> $out_script
echo "${extra_arguments} \\" >> $out_script
echo "-o /mnt/${outdir}/${bamFileName}.depth" >> $out_script

echo "" >> $out_script

echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/README.md:
--------------------------------------------------------------------------------
1 | These scripts are deprecated. See [dockered pipeline](../dockered_pipelines/)
2 | instead with `-tech singularity` to invoke singularity scripts instead of
3 | docker.
4 |
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/bamSimulator/README.md:
--------------------------------------------------------------------------------
1 | Mutation Simulation Pipeline in Singularities
2 |
3 | **Requirement**
4 |
5 | - Have internet connection and Singularity. Be able to pull docker images from
6 | Docker Hub.
7 | - **Highly recommended**: Have cluster management system with valid "qsub"
8 | command, such as Sun Grid Engine (SGE).
9 |
10 | This is ported from the
11 | [dockered pipeline](../../dockered_pipelines/bamSimulator). Commands are
12 | identical, except these scripts will run on singularities instead of docker
13 | daemon. Use the same set of commands, but substitute "singularities" for
14 | "dockered_pipelines" in the command path.
15 |
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/bamSimulator/bamSurgeon/IndelRealign.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Emit (optionally as a standalone SGE-submittable script) the commands to
# jointly realign a tumor/normal BAM pair around indels with GATK3
# (RealignerTargetCreator, then IndelRealigner), run via singularity.
# Use getopt instead of getopts for long options

set -e

# Fixed: removed the trailing comma after "standalone", which made getopt
# see an empty long-option name.
OPTS=`getopt -o o: --long output-dir:,tumor-bam:,normal-bam:,genome-reference:,selector:,out-tag:,extra-arguments:,out-script:,standalone -n 'IndelRealign.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# Suffix that GATK -nWayOut appends to the realigned output BAM names.
out_tag='JointRealigned'
#extra_arguments='-dt NONE --maxReadsForConsensuses 150000 --maxReadsInMemory 500000 --maxReadsForRealignment 2000000'

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --tumor-bam )
            case "$2" in
                "") shift 2 ;;
                *) tbam=$2 ; shift 2 ;;
            esac ;;

        --normal-bam )
            case "$2" in
                "") shift 2 ;;
                *) nbam=$2 ; shift 2 ;;
            esac ;;

        --genome-reference )
            case "$2" in
                "") shift 2 ;;
                *) HUMAN_REFERENCE=$2 ; shift 2 ;;
            esac ;;

        --selector )
            case "$2" in
                "") shift 2 ;;
                *) SELECTOR=$2 ; shift 2 ;;
            esac ;;

        --out-tag )
            case "$2" in
                "") shift 2 ;;
                *) out_tag=$2 ; shift 2 ;;
            esac ;;

        --extra-arguments )
            case "$2" in
                "") shift 2 ;;
                *) extra_arguments=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/indelRealign.${timestamp}.cmd"
fi

# Standalone mode: start the generated file with a shebang plus SGE directives.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=10G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# Optional BED selector restricts realignment to the given regions.
if [[ $SELECTOR ]]
then
    selector_input="-L /mnt/${SELECTOR}"
fi

echo "singularity exec --bind /:/mnt docker://broadinstitute/gatk3:3.8-0 java -Xmx9g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script
echo "-T RealignerTargetCreator \\" >> $out_script
echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "-I /mnt/${tbam} \\" >> $out_script
echo "-I /mnt/${nbam} \\" >> $out_script
echo "$selector_input \\" >> $out_script
echo "-o /mnt/${outdir}/T.N.intervals" >> $out_script
echo "" >> $out_script

# --pwd makes -nWayOut write the realigned BAMs into the output directory.
echo "singularity exec --bind /:/mnt --pwd /mnt/${outdir} docker://broadinstitute/gatk3:3.8-0 \\" >> $out_script
echo "java -Xmx9g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script
echo "-T IndelRealigner \\" >> $out_script
echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "-I /mnt/${tbam} \\" >> $out_script
echo "-I /mnt/${nbam} \\" >> $out_script
echo "-targetIntervals /mnt/${outdir}/T.N.intervals \\" >> $out_script
echo "${extra_arguments} \\" >> $out_script
echo "-nWayOut .${out_tag}.bam" >> $out_script
echo "" >> $out_script

realigned_normal=${nbam%.bam}.${out_tag}.bam
realigned_tumor=${tbam%.bam}.${out_tag}.bam

# GATK writes the index as X.bai; rename to X.bam.bai so samtools finds it.
echo "mv ${realigned_normal%.bam}.bai ${realigned_normal}.bai" >> $out_script
echo "mv ${realigned_tumor%.bam}.bai ${realigned_tumor}.bai" >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/bamSimulator/bamSurgeon/MergeTN.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Emit (optionally as a standalone SGE-submittable script) the picard
# MergeSamFiles command that merges a tumor and a normal BAM into one file,
# run via singularity.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long output-dir:,tumor-bam:,normal-bam:,bam-out:,out-script:,standalone -n 'MergeTN.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# (Removed unused copy-paste locals keep_intermediates and outSM.)

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-out )
            case "$2" in
                "") shift 2 ;;
                *) outbam=$2 ; shift 2 ;;
            esac ;;

        --tumor-bam )
            case "$2" in
                "") shift 2 ;;
                *) tbam=$2 ; shift 2 ;;
            esac ;;

        --normal-bam )
            case "$2" in
                "") shift 2 ;;
                *) nbam=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/mergeBams.${timestamp}.cmd"
fi

# Standalone mode: start the generated file with a shebang plus SGE directives.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# Merge the 2 BAM files
echo "singularity exec --bind /:/mnt docker://lethalfang/bamsurgeon:1.1-3 \\" >> $out_script
echo "java -Xmx6g -jar /usr/local/bin/picard.jar MergeSamFiles \\" >> $out_script
echo "I=/mnt/${nbam} \\" >> $out_script
echo "I=/mnt/${tbam} \\" >> $out_script
echo "ASSUME_SORTED=true \\" >> $out_script
echo "CREATE_INDEX=true \\" >> $out_script
echo "O=/mnt/${outdir}/${outbam}" >> $out_script
echo "" >> $out_script

# Picard writes the index as X.bai; rename to X.bam.bai so samtools finds it.
echo "mv ${outdir}/${outbam%.bam}.bai ${outdir}/${outbam}.bai" >> $out_script
echo "" >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/bamSimulator/bamSurgeon/Reheader_SM.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Emit (optionally as a standalone SGE-submittable script) a picard
# AddOrReplaceReadGroups command that rewrites a BAM with a uniform read
# group and the requested sample name, run via singularity.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,out-SM:,out-script:,standalone -n 'Reheader_SM.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# Default sample name (SM tag) if --out-SM is not given.
# (Removed unused copy-paste local keep_intermediates.)
outSM='TN_Merged'

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-out )
            case "$2" in
                "") shift 2 ;;
                *) outbam=$2 ; shift 2 ;;
            esac ;;

        --bam-in )
            case "$2" in
                "") shift 2 ;;
                *) inbam=$2 ; shift 2 ;;
            esac ;;

        --out-SM )
            case "$2" in
                "") shift 2 ;;
                *) outSM=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/reheader.${timestamp}.cmd"
fi

# Standalone mode: start the generated file with a shebang plus SGE directives.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# Uniform sample and read group names in the merged file
echo "singularity exec --bind /:/mnt docker://lethalfang/bamsurgeon:1.1-3 \\" >> $out_script
echo "java -Xmx6g -jar /usr/local/bin/picard.jar AddOrReplaceReadGroups \\" >> $out_script
echo "I=/mnt/${outdir}/${inbam} \\" >> $out_script
echo "RGID=BAMSurgeon \\" >> $out_script
echo "RGLB=TNMerged \\" >> $out_script
echo "RGPL=illumina \\" >> $out_script
echo "RGPU=BAMSurgeon \\" >> $out_script
echo "RGSM=${outSM} \\" >> $out_script
echo "CREATE_INDEX=true \\" >> $out_script
echo "O=/mnt/${outdir}/${outbam}" >> $out_script
echo "" >> $out_script

# Picard writes the index as X.bai; rename to X.bam.bai so samtools finds it.
echo "mv ${outdir}/${outbam%.bam}.bai ${outdir}/${outbam}.bai" >> $out_script
echo "" >> $out_script
99 |
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/bamSimulator/bamSurgeon/SortByCoordinate.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Emit (optionally as a standalone SGE-submittable script) samtools commands
# that coordinate-sort a BAM and index the result, run via singularity.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,genome-reference:,out-script:,standalone -n 'SortByCoordinate.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# (Removed unused copy-paste locals seed and hg_dict.)

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-in )
            case "$2" in
                "") shift 2 ;;
                *) inbam=$2 ; shift 2 ;;
            esac ;;

        --bam-out )
            case "$2" in
                "") shift 2 ;;
                *) outbam=$2 ; shift 2 ;;
            esac ;;

        --genome-reference )
            case "$2" in
                "") shift 2 ;;
                *) HUMAN_REFERENCE=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/sort.coordinates.${timestamp}.cmd"
fi

# Standalone mode: start the generated file with a shebang plus SGE directives.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 \\" >> $out_script
echo "samtools sort -m 4G --reference /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "-o /mnt/${outdir}/${outbam} /mnt/${inbam}" >> $out_script
echo "" >> $out_script

echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 \\" >> $out_script
echo "samtools index /mnt/${outdir}/${outbam}" >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/bamSimulator/bamSurgeon/SortByReadName.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Emit (optionally as a standalone SGE-submittable script) a samtools command
# that sorts a BAM by read name (qname), run via singularity.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,out-script:,standalone -n 'SortByReadName.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# (Removed unused seed and hg_dict; hg_dict referenced HUMAN_REFERENCE,
# which this script never parses -- dead copy-paste.)

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-in )
            case "$2" in
                "") shift 2 ;;
                *) inbam=$2 ; shift 2 ;;
            esac ;;

        --bam-out )
            case "$2" in
                "") shift 2 ;;
                *) outbam=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/sort.qname.${timestamp}.cmd"
fi

# Standalone mode: start the generated file with a shebang plus SGE directives.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 \\" >> $out_script
echo "samtools sort -n -m 4G \\" >> $out_script
echo "-o /mnt/${outdir}/${outbam} \\" >> $out_script
# Fixed: the last argument used to end with a continuation backslash,
# leaving a dangling line continuation in the generated script.
echo "/mnt/${inbam}" >> $out_script
echo "" >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/bamSimulator/bamSurgeon/bamsurgeon_split_BAM.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Emit (optionally as a standalone SGE-submittable script) a bamsurgeon
# sortedBamSplit command that randomly splits one BAM into two (optionally
# downsampled), then indexes both outputs, run via singularity.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long output-dir:,genome-reference:,bam-out1:,bam-out2:,bam-in:,split-proportion:,down-sample:,seed:,out-script:,clean-bam,standalone -n 'bamsurgeon_split_BAM.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )
# Defaults: current year as RNG seed; even split; no downsampling.
seed=$( date +"%Y" )
proportion=0.5
down_sample=1

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-in )
            case "$2" in
                "") shift 2 ;;
                *) inbam=$2 ; shift 2 ;;
            esac ;;

        --bam-out1 )
            case "$2" in
                "") shift 2 ;;
                *) outbam1=$2 ; shift 2 ;;
            esac ;;

        --bam-out2 )
            case "$2" in
                "") shift 2 ;;
                *) outbam2=$2 ; shift 2 ;;
            esac ;;

        --genome-reference )
            case "$2" in
                "") shift 2 ;;
                *) HUMAN_REFERENCE=$2 ; shift 2 ;;
            esac ;;

        --split-proportion )
            case "$2" in
                "") shift 2 ;;
                *) proportion=$2 ; shift 2 ;;
            esac ;;

        --down-sample )
            case "$2" in
                "") shift 2 ;;
                *) down_sample=$2 ; shift 2 ;;
            esac ;;

        --seed )
            case "$2" in
                "") shift 2 ;;
                *) seed=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        # NOTE(review): --clean-bam is accepted but currently has no effect
        # in this script; kept for interface compatibility.
        --clean-bam )
            clean_bam=1 ; shift ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

# (Removed unused hg_dict -- dead copy-paste.)

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/splitBams.${timestamp}.cmd"
fi

# Standalone mode: start the generated file with a shebang plus SGE directives.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# Then you can split
echo "singularity exec --bind /:/mnt docker://lethalfang/bamsurgeon:1.1-3 \\" >> $out_script
echo "/usr/local/bamsurgeon/scripts/sortedBamSplit.py \\" >> $out_script
echo "--bam /mnt/${inbam} \\" >> $out_script
echo "--proportion ${proportion} \\" >> $out_script
echo "--downsample ${down_sample} \\" >> $out_script
echo "--pick1 /mnt/${outdir}/${outbam1} \\" >> $out_script
echo "--pick2 /mnt/${outdir}/${outbam2} \\" >> $out_script
echo "--seed ${seed}" >> $out_script
echo "" >> $out_script

echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 samtools index /mnt/${outdir}/${outbam1}" >> $out_script
echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 samtools index /mnt/${outdir}/${outbam2}" >> $out_script
echo "" >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/bamSimulator/bamSurgeon/cleanBam.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Emit (optionally as a standalone SGE-submittable script) a bamsurgeon
# command that removes reads with duplicated qnames or malformed CIGARs,
# run via singularity.
# Use getopt instead of getopts for long options

set -e

# Fixed: getopt error-message name was 'SortByReadName.sh' (copy-paste).
OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,out-script:,standalone -n 'cleanBam.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# (Removed unused seed -- dead copy-paste.)

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-in )
            case "$2" in
                "") shift 2 ;;
                *) inbam=$2 ; shift 2 ;;
            esac ;;

        --bam-out )
            case "$2" in
                "") shift 2 ;;
                *) outbam=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

# Fixed: logdir was referenced below but never defined nor created, unlike
# every sibling script; the default out_script path and the SGE -o/-e
# directives pointed at an empty/nonexistent directory.
logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/cleanBam.${timestamp}.cmd"
fi

# Standalone mode: start the generated file with a shebang plus SGE directives.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=4G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# To split a BAM file, first you must sort by name:
echo "singularity exec --bind /:/mnt docker://lethalfang/bamsurgeon:1.1-3 \\" >> $out_script
echo "/usr/local/bamsurgeon/scripts/remove_reads_with_many_qnames_or_bad_CIGAR.py \\" >> $out_script
echo "-bamin /mnt/${inbam} \\" >> $out_script
echo "-bamout /mnt/${outdir}/${outbam}" >> $out_script
echo "" >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/bamSimulator/bamSurgeon/concatVcfFiles.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Emit (optionally as a standalone SGE-submittable script) a vcf-concat
# command that concatenates a whitespace-separated list of VCF files,
# run via singularity.
# Use getopt instead of getopts for long options

set -e

# Fixed: getopt error-message name was 'MergeTN.sh' (copy-paste).
OPTS=`getopt -o o: --long output-dir:,vcf-string:,vcf-out:,out-script:,standalone -n 'concatVcfFiles.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# (Removed unused copy-paste local keep_intermediates.)

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --vcf-out )
            case "$2" in
                "") shift 2 ;;
                *) outvcf=$2 ; shift 2 ;;
            esac ;;

        --vcf-string )
            case "$2" in
                "") shift 2 ;;
                *) vcf_string=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/concatVcfFiles.${timestamp}.cmd"
fi

# Standalone mode: start the generated file with a shebang plus SGE directives.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=2G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# Prefix each VCF with the /mnt bind point. Fixed: the list used to be
# built by prepending, which reversed the user-given order -- vcf-concat
# concatenates its inputs in the order listed.
for file in ${vcf_string}
do
    input_file_string="${input_file_string} /mnt/${file}"
done

# Concatenate the VCF files
echo "singularity exec --bind /:/mnt docker://lethalfang/vcftools:0.1.15 bash -c \\" >> $out_script
echo "\"vcf-concat \\" >> $out_script
echo "${input_file_string} \\" >> $out_script
echo "> /mnt/${outdir}/${outvcf}\"" >> $out_script
echo "" >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/bamSimulator/bamSurgeon/mergeBamFiles.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Emit (optionally as a standalone SGE-submittable script) a picard
# MergeSamFiles command that merges a whitespace-separated list of BAM
# files into one, run via singularity.
# Use getopt instead of getopts for long options

set -e

# Fixed: getopt error-message name was 'MergeTN.sh' (copy-paste).
OPTS=`getopt -o o: --long output-dir:,bam-string:,bam-out:,out-script:,standalone -n 'mergeBamFiles.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# (Removed unused copy-paste local keep_intermediates.)

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-out )
            case "$2" in
                "") shift 2 ;;
                *) outbam=$2 ; shift 2 ;;
            esac ;;

        --bam-string )
            case "$2" in
                "") shift 2 ;;
                *) bam_string=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/mergeBams.${timestamp}.cmd"
fi

# Standalone mode: start the generated file with a shebang plus SGE directives.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# Build one I=... argument per input BAM. Fixed: the list used to be built
# by prepending, which reversed the user-given order; keep the inputs in
# the order the caller supplied them.
for file in ${bam_string}
do
    input_file_string="${input_file_string} I=/mnt/${file}"
done

# Merge the BAM files
echo "singularity exec --bind /:/mnt docker://lethalfang/bamsurgeon:1.1-3 \\" >> $out_script
echo "java -Xmx8g -jar /usr/local/bin/picard.jar MergeSamFiles \\" >> $out_script
echo "${input_file_string} \\" >> $out_script
echo "ASSUME_SORTED=true \\" >> $out_script
echo "CREATE_INDEX=true \\" >> $out_script
echo "O=/mnt/${outdir}/${outbam}" >> $out_script
echo "" >> $out_script

# Picard writes the index as X.bai; rename to X.bam.bai so samtools finds it.
echo "mv ${outdir}/${outbam%.bam}.bai ${outdir}/${outbam}.bai" >> $out_script
echo "" >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/bamSimulator/bamSurgeon/split_BAM_by_BED.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Emit (optionally as a standalone SGE-submittable script) samtools commands
# that subset a BAM to the regions of a BED file and index the result,
# run via singularity.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long output-dir:,bam-in:,bam-out:,selector:,out-script:,standalone -n 'split_BAM_by_BED.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

while true; do
    case "$1" in
        -o | --output-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --bam-in )
            case "$2" in
                "") shift 2 ;;
                *) inbam=$2 ; shift 2 ;;
            esac ;;

        --bam-out )
            case "$2" in
                "") shift 2 ;;
                *) outbam=$2 ; shift 2 ;;
            esac ;;

        --selector )
            case "$2" in
                "") shift 2 ;;
                *) SELECTOR=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

# Default command-file path; an explicit --out-script overrides it.
out_script="${logdir}/splitByBed.${timestamp}.cmd"
if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
fi

# Standalone mode: start the generated file with a shebang plus SGE directives.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=4G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

# samtools view -L keeps only reads overlapping the BED selector; the
# redirection is inside bash -c so it happens within the container.
echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 bash -c \\" >> $out_script
echo "\"samtools view /mnt/${inbam} -L /mnt/${SELECTOR} -Sbh \\" >> $out_script
echo "> /mnt/${outdir}/${outbam}\"" >> $out_script

echo "" >> $out_script

echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 \\" >> $out_script
echo "samtools index /mnt/${outdir}/${outbam}" >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/germline_variants/Nirvana.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Emit (optionally as a standalone SGE-submittable script) a Nirvana
# annotation command for an input VCF, run via singularity.
# Use getopt instead of getopts for long options

set -e

# Fixed: getopt error-message name was 'manta.sh' (copy-paste).
OPTS=`getopt -o o: --long out-dir:,in-vcf:,nirvana-resources-dir:,sample:,extra-arguments:,action:,MEM:,threads:,out-script:,standalone -n 'Nirvana.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )
action=echo
MEM=8
threads=12
sampleID='Nirvana'

while true; do
    case "$1" in

        -o | --out-dir )
            case "$2" in
                "") shift 2 ;;
                *) outdir=$2 ; shift 2 ;;
            esac ;;

        --in-vcf )
            case "$2" in
                "") shift 2 ;;
                *) inVcf=$2 ; shift 2 ;;
            esac ;;

        --nirvana-resources-dir )
            case "$2" in
                "") shift 2 ;;
                *) NIRVANA_RESOURCES_DIR=$2 ; shift 2 ;;
            esac ;;

        --sample )
            case "$2" in
                "") shift 2 ;;
                *) sampleID=$2 ; shift 2 ;;
            esac ;;

        --extra-arguments )
            case "$2" in
                "") shift 2 ;;
                *) extra_arguments=$2 ; shift 2 ;;
            esac ;;

        --out-script )
            case "$2" in
                "") shift 2 ;;
                *) out_script_name=$2 ; shift 2 ;;
            esac ;;

        # Fixed: MEM: was declared in the getopt list but had no handler,
        # so passing --MEM hit the "*)" branch and aborted option parsing.
        # NOTE(review): MEM is currently not used in the generated command.
        --MEM )
            case "$2" in
                "") shift 2 ;;
                *) MEM=$2 ; shift 2 ;;
            esac ;;

        --threads )
            case "$2" in
                "") shift 2 ;;
                *) threads=$2 ; shift 2 ;;
            esac ;;

        --action )
            case "$2" in
                "") shift 2 ;;
                *) action=$2 ; shift 2 ;;
            esac ;;

        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac

done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    # Fixed: default was "canvas.${timestamp}.cmd", a copy-paste from the
    # Canvas script.
    out_script="${logdir}/nirvana.${timestamp}.cmd"
fi

# Standalone mode: start the generated file with a shebang plus SGE directives.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=4G' >> $out_script
    echo "#$ -pe smp ${threads}" >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script
echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
echo "" >> $out_script

echo "singularity exec --bind /:/mnt docker://lethalfang/nirvana:2.0.9 \\" >> $out_script
echo "dotnet /opt/Nirvana/bin/Release/netcoreapp2.0/Nirvana.dll \\" >> $out_script
echo "-c /mnt/${NIRVANA_RESOURCES_DIR}/Cache/26/GRCh38/Ensembl \\" >> $out_script
echo "--sd /mnt/${NIRVANA_RESOURCES_DIR}/GRCh38 \\" >> $out_script
echo "-r /mnt/${NIRVANA_RESOURCES_DIR}/References/5/Homo_sapiens.GRCh38.Nirvana.dat \\" >> $out_script
echo "-i /mnt/${inVcf} \\" >> $out_script
# Fixed: --extra-arguments was parsed but never emitted into the command.
echo "${extra_arguments} \\" >> $out_script
echo "-o /mnt/${outdir}/${sampleID}" >> $out_script

echo "" >> $out_script
echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/germline_variants/bam2vcf.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Generate (and leave in the log dir) a germline bam-to-vcf script for one BAM:
# GATK4 HaplotypeCaller (via haplotypeCaller.sh) followed by VQSR (via VQSR.sh),
# both run through singularity. The generated script carries SGE (qsub) headers.
# Use getopt instead of getopts for long options

set -e

# BUG FIX: "MEM:" was missing from the --long list (and a stray trailing comma
# declared an empty option name), so "--MEM" was rejected by getopt even though
# the case statement below has always handled it.
OPTS=`getopt -o o: --long output-dir:,bam:,out-vcf:,genome-reference:,dbsnp:,hapmap:,omni:,thousandG:,mills:,out-script:,action:,threads:,MEM: -n 'bam2vcf.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

tumor_bam_header='@RG\tID:myPipeline\tLB:myLibrary\tPL:illumina\tSM:TUMOR'
normal_bam_header='@RG\tID:myPipeline\tLB:myLibrary\tPL:illumina\tSM:NORMAL'

# Defaults; each can be overridden on the command line.
MEM=16
threads=24
action=echo

while true; do
    case "$1" in

    -o | --output-dir )
        case "$2" in
            "") shift 2 ;;
            *)  outdir=$2 ; shift 2 ;;
        esac ;;

    --bam )
        case "$2" in
            "") shift 2 ;;
            *)  bam=$2 ; shift 2 ;;
        esac ;;

    --out-vcf )
        case "$2" in
            "") shift 2 ;;
            *)  outVcf=$2 ; shift 2 ;;
        esac ;;

    --genome-reference )
        case "$2" in
            "") shift 2 ;;
            *)  GENOME_REFERENCE=$2 ; shift 2 ;;
        esac ;;

    --dbsnp )
        case "$2" in
            "") shift 2 ;;
            *)  dbsnp=$2 ; shift 2 ;;
        esac ;;

    --hapmap )
        case "$2" in
            "") shift 2 ;;
            *)  hapmapFile=$2 ; shift 2 ;;
        esac ;;

    --thousandG )
        case "$2" in
            "") shift 2 ;;
            *)  thousandGFile=$2 ; shift 2 ;;
        esac ;;

    --omni )
        case "$2" in
            "") shift 2 ;;
            *)  omniFile=$2 ; shift 2 ;;
        esac ;;

    --mills )
        case "$2" in
            "") shift 2 ;;
            *)  millsFile=$2 ; shift 2 ;;
        esac ;;

    --threads )
        case "$2" in
            "") shift 2 ;;
            *)  threads=$2 ; shift 2 ;;
        esac ;;

    --MEM )
        case "$2" in
            "") shift 2 ;;
            *)  MEM=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
            "") shift 2 ;;
            *)  out_script_name=$2 ; shift 2 ;;
        esac ;;

    --action )
        case "$2" in
            "") shift 2 ;;
            *)  action=$2 ; shift 2 ;;
        esac ;;

    -- ) shift; break ;;
    * ) break ;;

    esac
done


logdir=${outdir}/logs
mkdir -p ${logdir}


# Honor an explicit script name (placed inside the log dir); otherwise use a
# timestamped default so repeated runs never clobber each other.
if [[ ${out_script_name} ]]
then
    out_script="${logdir}/${out_script_name}"
else
    out_script="${logdir}/bam2vcf.${timestamp}.cmd"
fi


# SGE (qsub) headers for the generated script.
echo "#!/bin/bash" > $out_script
echo "" >> $out_script

echo "#$ -o ${logdir}" >> $out_script
echo "#$ -e ${logdir}" >> $out_script
echo "#$ -S /bin/bash" >> $out_script
echo "#$ -l h_vmem=6G" >> $out_script
echo "#$ -pe smp ${threads}" >> $out_script

echo 'set -e' >> $out_script
echo "" >> $out_script

files_to_delete=''

echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
echo "" >> $out_script


# Append the HaplotypeCaller command; it writes preVQSR.<outVcf> ...
$MYDIR/haplotypeCaller.sh \
--out-dir ${outdir} \
--bam ${bam} \
--human-reference ${GENOME_REFERENCE} \
--dbsnp ${dbsnp} \
--out-vcf preVQSR.${outVcf} \
--threads ${threads} \
--MEM ${MEM} \
--out-script ${out_script}

# ... then the VQSR command that refines it into the final VCF.
$MYDIR/VQSR.sh \
--out-dir ${outdir} \
--in-vcf ${outdir}/preVQSR.${outVcf} \
--human-reference ${GENOME_REFERENCE} \
--dbsnp ${dbsnp} \
--hapmap ${hapmapFile} \
--omni ${omniFile} \
--thousandG ${thousandGFile} \
--mills ${millsFile} \
--out-vcf ${outVcf} \
--out-script ${out_script}
--------------------------------------------------------------------------------
/somaticseq/utilities/singularities/germline_variants/haplotypeCaller.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Append (or, with --standalone, create) a script that runs GATK4
# HaplotypeCaller inside a singularity container, binding / to /mnt.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long out-dir:,out-vcf:,bam:,human-reference:,selector:,dbsnp:,extra-arguments:,action:,MEM:,threads:,out-script:,standalone -n 'gatk_haplotypecaller.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# Defaults; each can be overridden on the command line.
action=echo
MEM=8
threads=12

while true; do
    case "$1" in

    -o | --out-dir )
        case "$2" in
            "") shift 2 ;;
            *)  outdir=$2 ; shift 2 ;;
        esac ;;

    --out-vcf )
        case "$2" in
            "") shift 2 ;;
            *)  outVcfName=$2 ; shift 2 ;;
        esac ;;

    --bam )
        case "$2" in
            "") shift 2 ;;
            *)  bamFile=$2 ; shift 2 ;;
        esac ;;

    --human-reference )
        case "$2" in
            "") shift 2 ;;
            *)  HUMAN_REFERENCE=$2 ; shift 2 ;;
        esac ;;

    --selector )
        case "$2" in
            "") shift 2 ;;
            *)  SELECTOR=$2 ; shift 2 ;;
        esac ;;

    --dbsnp )
        case "$2" in
            "") shift 2 ;;
            *)  dbsnp=$2 ; shift 2 ;;
        esac ;;

    --MEM )
        case "$2" in
            "") shift 2 ;;
            *)  MEM=$2 ; shift 2 ;;
        esac ;;

    --threads )
        case "$2" in
            "") shift 2 ;;
            *)  threads=$2 ; shift 2 ;;
        esac ;;

    --extra-arguments )
        case "$2" in
            "") shift 2 ;;
            *)  extra_arguments=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
            "") shift 2 ;;
            *)  out_script_name=$2 ; shift 2 ;;
        esac ;;

    --action )
        case "$2" in
            "") shift 2 ;;
            *)  action=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac

done

logdir=${outdir}/logs
mkdir -p ${logdir}

# An explicit --out-script is treated as a full path (callers such as
# bam2vcf.sh pass a script they have already created and want appended to).
if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/HaplotypeCaller.${timestamp}.cmd"
fi


# Only a standalone script gets the shebang and SGE headers; otherwise we
# append to an existing script.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=12G' >> $out_script
    echo "#$ -pe smp ${threads}" >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script
echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
echo "" >> $out_script


# BUG FIX: initialize selector_text (as was already done for dbsnp_text) so a
# same-named variable exported in the caller's environment cannot leak into
# the generated command when --selector is not given.
selector_text=''
if [[ ${SELECTOR} ]]
then
    selector_text="-L /mnt/${SELECTOR}"
fi

dbsnp_text=''
if [[ ${dbsnp} ]]; then
    dbsnp_text="--dbsnp /mnt/${dbsnp}"
fi


echo "singularity exec --bind /:/mnt docker://broadinstitute/gatk:4.0.5.2 \\" >> $out_script
echo "java -Xmx${MEM}g -jar /gatk/gatk.jar \\" >> $out_script
echo "HaplotypeCaller \\" >> $out_script
echo "--reference /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "--input /mnt/${bamFile} \\" >> $out_script
echo "--native-pair-hmm-threads ${threads} \\" >> $out_script
echo "$selector_text \\" >> $out_script
echo "$dbsnp_text \\" >> $out_script
echo "${extra_arguments} \\" >> $out_script
echo "--output /mnt/${outdir}/${outVcfName}" >> $out_script

echo "" >> $out_script

echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/snakemake/README.md:
--------------------------------------------------------------------------------
1 | # Automated Pipeline based on [Snakemake](https://snakemake.readthedocs.io/en/latest)
2 |
3 | This workflow expects SomaticSeq and the desired suite of variant callers to be
4 | installed. That is, their executables should be found via the `PATH` environment
5 | variable.
6 |
7 | For SomaticSeq, it is sufficient to include the top-level directory and the
8 | utilities directory:
9 |
10 | ```
11 | SOMATICSEQ_HOME=/path/to/somaticseq
12 | export PATH=$SOMATICSEQ_HOME:$SOMATICSEQ_HOME/utilities:$PATH
13 | ```
14 |
15 | Example usage:
16 |
17 | ```
18 | snakemake \
19 | -j \
20 | --config \
21 | tumor=/ABSOLUTE/PATH/TO/tumor.bam \
22 | normal=/ABSOLUTE/PATH/TO/normal.bam \
23 | reference=/ABSOLUTE/PATH/TO/GRCh38.fa \
24 | dbsnp=/ABSOLUTE/PATH/TO/dbSNP.GRCh38.vcf \
25 | gatk=/ABSOLUTE/PATH/TO/GATK.jar \
26 | varscan=/ABSOLUTE/PATH/TO/VarScan.jar \
27 | caller_threads=36 \
28 | somaticseq
29 | ```
30 |
31 | **caller_threads** is the number of threads to be used for each of the variant
32 | callers that support parallelization.
33 |
34 | The `config.yaml` file specifies default options, mostly for specifying which
35 | variant callers' results you'd like to feed into SomaticSeq. You may pass those
36 | options on the command line, as is done for `caller_threads` above, and whatever
37 | is passed on the command line will override what is specified in the
38 | configuration file.
39 |
--------------------------------------------------------------------------------
/somaticseq/utilities/snakemake/config.yaml:
--------------------------------------------------------------------------------
1 | lofreq: True
2 | muse: True
3 | mutect2: True
4 | scalpel: True
5 | sniper: True
6 | strelka: True
7 | vardict: True
8 | varscan: True
9 | # The number of threads to use for each variant caller that supports
10 | # parallelization.
11 | caller_threads: 36
12 |
--------------------------------------------------------------------------------
/somaticseq/utilities/variant_annotation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import logging
5 | import os
6 | import subprocess
7 | import tempfile
8 | import uuid
9 |
10 | import pysam
11 |
# INFO fields copied from the COSMIC VCF during SnpSift annotation.
COSMIC_STRING = "GENE,CDS,AA,CNT"
# INFO fields copied from the dbSNP VCF during SnpSift annotation.
DBSNP_STRING = (
    "RSPOS,GENEINFO,dbSNPBuildID,SAO,SSR,VC,PM,MUT,KGPhase1,KGPhase3,OM,CDA,CAF,COMMON"
)
16 |
17 |
def snpsift_snp(snpsift_jar, input_vcf, dbsnp_vcf, output_vcf, info_string):
    """
    Annotate input_vcf with the given dbSNP INFO fields using SnpSift.

    Shells out to `java -jar SnpSift.jar annotate`; the annotated VCF is
    written to output_vcf, whose path is returned.
    """
    logger = logging.getLogger(snpsift_snp.__name__)
    sift_command = (
        f"java -Xmx8g -jar {snpsift_jar} annotate "
        f"-info {info_string} {dbsnp_vcf} {input_vcf} > {output_vcf}"
    )
    logger.info(sift_command)
    subprocess.check_call(sift_command, shell=True)
    return output_vcf
26 |
27 |
def snpsift_cosmic(snpsift_jar, input_vcf, cosmic_vcf, output_vcf, info_string):
    """
    Annotate input_vcf with the given COSMIC INFO fields using SnpSift.

    Shells out to `java -jar SnpSift.jar annotate`; the annotated VCF is
    written to output_vcf, whose path is returned.
    """
    logger = logging.getLogger(snpsift_cosmic.__name__)
    sift_command = (
        f"java -Xmx8g -jar {snpsift_jar} annotate "
        f"-info {info_string} {cosmic_vcf} {input_vcf} > {output_vcf}"
    )
    logger.info(sift_command)
    subprocess.check_call(sift_command, shell=True)
    return output_vcf
36 |
37 |
def snpeff_annotate(snpeff_jar, input_vcf, output_vcf, db):
    """
    Run snpEff effect prediction (with -noStats) on input_vcf against the
    snpEff database `db`; write to output_vcf and return its path.
    """
    logger = logging.getLogger(snpeff_annotate.__name__)
    eff_command = f"java -Xmx8g -jar {snpeff_jar} -noStats {db} {input_vcf} > {output_vcf}"
    logger.info(eff_command)
    subprocess.check_call(eff_command, shell=True)
    return output_vcf
46 |
47 |
def annotate_small_variants(
    snpsift_jar,
    snpeff_jar,
    input_vcf,
    dbsnp_vcf,
    cosmic_vcf,
    output_vcf,
    snp_string,
    cosmic_string,
    eff_db,
):
    """
    Chain dbSNP annotation, COSMIC annotation, and snpEff effect prediction,
    then bgzip+tabix the final VCF. Returns the path of the ".gz" file.

    Intermediate VCFs live in the system temp directory under random names
    and are deleted once snpEff has consumed them.
    """
    scratch_dir = tempfile.gettempdir()

    def _scratch_vcf() -> str:
        # Collision-safe name for an intermediate VCF.
        return os.path.join(scratch_dir, uuid.uuid4().hex + ".vcf")

    with_dbsnp = snpsift_snp(
        snpsift_jar, input_vcf, dbsnp_vcf, _scratch_vcf(), snp_string
    )
    with_cosmic = snpsift_cosmic(
        snpsift_jar, with_dbsnp, cosmic_vcf, _scratch_vcf(), cosmic_string
    )
    output_vcf = snpeff_annotate(snpeff_jar, with_cosmic, output_vcf, eff_db)
    os.remove(with_dbsnp)
    os.remove(with_cosmic)
    # tabix_index compresses output_vcf to output_vcf.gz and indexes it.
    pysam.tabix_index(output_vcf, force=True, preset="vcf")
    return output_vcf + ".gz"
79 |
80 |
def main() -> None:
    """Command-line entry point for dbSNP/COSMIC/snpEff annotation."""
    log_format = "%(levelname)s %(asctime)-15s %(name)-20s %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    parser = argparse.ArgumentParser(
        description="Annotate with snpSift and snpEff with dbSNP and COSMIC",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("-infile", "--infile", help="input vcf file")
    parser.add_argument("-outfile", "--outfile", help="output vcf file")
    parser.add_argument(
        "-dbsnp", "--dbsnp", help="dbsnp vcf file to feed into GATK4 HaplotypeCaller"
    )
    parser.add_argument(
        "-cosmic", "--cosmic", help="cosmic vcf file to feed into GATK4 HaplotypeCaller"
    )
    parser.add_argument("-snpsift", "--snpsift", help="SnpSift JAR")
    parser.add_argument("-snpeff", "--snpeff", help="snpEff JAR")
    parser.add_argument("-db", "--snpeff-db", help="snpEff db", default="GRCh38.86")
    args = parser.parse_args()
    # The module-level INFO-field strings select which annotations to copy.
    annotate_small_variants(
        args.snpsift,
        args.snpeff,
        args.infile,
        args.dbsnp,
        args.cosmic,
        args.outfile,
        DBSNP_STRING,
        COSMIC_STRING,
        args.snpeff_db,
    )
111 |
112 |
# Allow the annotator to be run directly as a script.
if __name__ == "__main__":
    main()
115 |
--------------------------------------------------------------------------------
/somaticseq/utilities/vcfsorter.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
use strict;
use warnings;

# LICENSE: This file licensed under the GNU GPL v3

# Retrieved from: https://code.google.com/p/vcfsorter/ linking to
# https://drive.google.com/file/d/0B7jV6rjPCUApR1ZnWTMzakZfN3M/view?usp=sharing

######################################################
# vcfsorter.pl
#
# Copyright (C) 2011 German Gaston Leparc
#
# sorts VCF by reference genome
#
# usage:
#
# vcfsorter.pl genome.dict myvcf.file > mynewvcf.file
#
######################################################

# NOTE(review): the usage heredoc and the <DICT>/<VCF> readline operators had
# their angle-bracketed parts stripped in the checked-in copy (an extraction
# artifact that left the script unparsable); they are restored below.
my $usage = <<EOF;

usage: vcfsorter.pl genome.dict myvcf.file > mynewvcf.file 2>STDERR
EOF


my $dict_file = $ARGV[0];
my $vcf_file = $ARGV[1];

die "\nERROR: missing an argument!\n\n$usage" if (@ARGV < 2);


#---------------------------------------- LOAD IN FASTA DICT INTO MEMORY
open(DICT,$dict_file) or die "Can't open $dict_file!\n";
my @contig_order;
my $c=0;
while(<DICT>)
{
if($_=~ /\@SQ/)
	{
	my ($contig) = $_ =~ /SN:(\S+)/;
	$contig_order[$c]=$contig;
	++$c;
	#print $contig,"\n";
	}
}
close(DICT);

#---------------------------------------- PARSE VCF FILE & OUTPUT SORTED VCF

open(VCF,$vcf_file) or die "Can't open $vcf_file!\n";

my %vcf_hash;
my $header;

while(<VCF>)
{
if($_=~/^#/){ $header .= $_; next; } # store header and comment fields
chomp($_);

my @data = split(/\t/,$_);
my $contig = $data[0]; #CHROM
my $start = $data[1]; #POS
my $variant = $data[3]."to".$data[4]; #REF and ALT
my $line = $_;

$vcf_hash{$contig}{$start}{$variant}=$line;

}
close(VCF);

#------------------ print out the VCF in the order of the reference genome

#print standard VCF header
print $header;


foreach my $contig (@contig_order) # sort by contig order
{
foreach my $start (sort {$a <=> $b} keys %{$vcf_hash{$contig}}) # sort numerically by coordinates
	{
	foreach my $variant (keys %{$vcf_hash{$contig}{$start}}) # if overlapping mutation, print each variant
		{
		print $vcf_hash{$contig}{$start}{$variant},"\n";
		}
	}

}
98 |
--------------------------------------------------------------------------------
/somaticseq/vcf_modifier/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/somaticseq/vcf_modifier/__init__.py
--------------------------------------------------------------------------------
/somaticseq/vcf_modifier/complex2indel.py:
--------------------------------------------------------------------------------
def resolve_complex_variants_into_snvs_and_indels(
    refbases: str, altbases: str
) -> list[dict]:
    """
    Decompose a complex REF/ALT pair into simple snvs plus at most one indel.

    A plain snv or a simple left-anchored indel is returned unchanged as a
    one-record list. Otherwise, the shared leading positions are compared
    base by base to emit snvs, and any length difference becomes a single
    trailing deletion or insertion anchored at the last compared position.
    """
    # Simple cases pass straight through.
    if len(refbases) == len(altbases) == 1:  # snv
        return [{"OFFSET": 0, "REF": refbases, "ALT": altbases}]
    if (len(refbases) == 1 or len(altbases) == 1) and refbases[0] == altbases[0]:
        # Already a left-anchored indel.
        return [{"OFFSET": 0, "REF": refbases, "ALT": altbases}]

    decomposed: list[dict] = []

    # Compare the shared prefix, emitting an snv wherever the bases disagree.
    shared_len = min(len(refbases), len(altbases))
    for offset in range(shared_len):
        if refbases[offset] != altbases[offset]:
            decomposed.append(
                {"OFFSET": offset, "REF": refbases[offset], "ALT": altbases[offset]}
            )

    anchor = shared_len - 1
    if len(refbases) > len(altbases):
        # Surplus reference bases: one deletion anchored at the last shared base.
        decomposed.append(
            {"OFFSET": anchor, "REF": refbases[anchor:], "ALT": refbases[anchor]}
        )
    elif len(altbases) > len(refbases):
        # Surplus alternate bases: one insertion anchored at the last shared base.
        decomposed.append(
            {
                "OFFSET": anchor,
                "REF": refbases[anchor],
                "ALT": refbases[anchor] + altbases[anchor + 1 :],
            }
        )
    return decomposed
38 |
--------------------------------------------------------------------------------
/somaticseq/vcf_modifier/copy_TextFile.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 |
5 | import somaticseq.genomic_file_parsers.genomic_file_handlers as genome
6 |
7 |
def run():
    """Parse command-line arguments; return (input_path, output_path)."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-infile", "--input-file", type=str, help="Input VCF file", required=True
    )
    parser.add_argument(
        "-outfile", "--output-file", type=str, help="Output VCF file", required=True
    )
    args = parser.parse_args()
    return args.input_file, args.output_file
27 |
28 |
def copy(infile, outfile):
    """
    Copy a (possibly gzipped) text file's decoded content to a plain file.

    genome.open_textfile transparently decompresses ".gz" inputs, so a
    gzipped input is written out as uncompressed text.
    """
    import shutil

    with genome.open_textfile(infile) as filein, open(outfile, "w") as fileout:
        # Stream in chunks via copyfileobj instead of a Python-level
        # readline loop: same output, simpler and faster.
        shutil.copyfileobj(filein, fileout)
35 |
36 |
# Script entry point: copy the input text file to the output path.
if __name__ == "__main__":
    infile, outfile = run()
    copy(infile, outfile)
40 |
--------------------------------------------------------------------------------
/somaticseq/vcf_modifier/getUniqueVcfPositions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # A simple and quick way to replace GATK3 CombineVariants
4 |
5 | import argparse
6 | import gzip
7 | import re
8 |
9 |
def open_textfile(file_name):
    """Open a file for text reading, transparently handling ".gz" inputs."""
    if not file_name.lower().endswith(".gz"):
        return open(file_name)
    # Gzipped input: decode to text on the fly.
    return gzip.open(file_name, "rt")
17 |
18 |
def run():
    """Parse command-line arguments; return (list_of_input_vcfs, output_vcf)."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-vcfs",
        "--input-vcfs",
        nargs="*",
        type=str,
        help="Input VCF file",
        required=True,
        default=None,
    )
    parser.add_argument(
        "-out", "--output-vcf", type=str, help="Output VCF file", required=True
    )
    args = parser.parse_args()
    return args.input_vcfs, args.output_vcf
42 |
43 |
def combine(infiles, outfile):
    """
    Collect the unique (chrom, pos, ref, alt) records across input VCFs and
    write them, sorted, to a minimal VCF -- a quick replacement for GATK3
    CombineVariants. Chromosomes sort lexicographically.
    """
    unique_records = set()

    for vcf_path in infiles:
        with open_textfile(vcf_path) as vcf:
            line = vcf.readline().rstrip()
            # Skip the header block at the top of the file.
            while line.startswith("#"):
                line = vcf.readline().rstrip()
            # Read data records until EOF (or a blank line).
            while line:
                fields = line.split("\t")
                chrom, pos, ref = fields[0], int(fields[1]), fields[3]
                # Multiple ALT alleles may be separated by "," or "/".
                for alt in re.split(r"[,/]", fields[4]):
                    unique_records.add((chrom, pos, ref, alt))
                line = vcf.readline().rstrip()

    with open(outfile, "w") as vcf_out:
        vcf_out.write("##fileformat=VCFv4.1\n")
        vcf_out.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
        for chrom, pos, ref, alt in sorted(unique_records):
            vcf_out.write(f"{chrom}\t{pos}\t.\t{ref}\t{alt}\t.\tPASS\t.\n")
84 |
85 |
# Script entry point: merge unique variant positions from the input VCFs.
if __name__ == "__main__":
    infiles, outfile = run()
    combine(infiles, outfile)
89 |
--------------------------------------------------------------------------------
/somaticseq/vcf_modifier/leftAlign.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # flake8: noqa: E501
3 |
4 | import os
5 |
6 |
def leftAlign(infile, outfile, ref, gatk3):
    """
    Left-align and trim variants with GATK3's LeftAlignAndTrimVariants.

    Runs GATK3 (jar at `gatk3`) against reference `ref`, filtering GATK's
    progress chatter out of the stream before writing `outfile`. Returns
    `outfile`. Raises AssertionError if the shell pipeline fails.
    """
    assert infile != outfile
    exit_code = os.system(
        """java -jar {} -T LeftAlignAndTrimVariants -R {} --variant {} | egrep -v '^[0-9]+ variants|^INFO' > {}""".format(
            gatk3, ref, infile, outfile
        )
    )
    # BUG FIX: os.system returns 0 on success, so the original
    # `assert exit_code` passed only when the command FAILED. Require a
    # zero exit status instead. (Note: the pipeline's status is egrep's.)
    assert exit_code == 0
    return outfile
16 |
--------------------------------------------------------------------------------
/somaticseq/vcf_modifier/modify_JointSNVMix2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # flake8: noqa: E501
3 |
4 | import argparse
5 |
6 | import somaticseq.genomic_file_parsers.genomic_file_handlers as genome
7 |
8 |
def run():
    """Parse command-line arguments; return (input_vcf, output_vcf)."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-infile", "--input-vcf", type=str, help="Input VCF file", required=True
    )
    parser.add_argument(
        "-outfile", "--output-vcf", type=str, help="Output VCF file", required=True
    )
    args = parser.parse_args()
    return args.input_vcf, args.output_vcf
28 |
29 |
def convert(infile, outfile):
    """
    Rewrite a JointSNVMix2 VCF so that both samples carry a GT field and an
    AD field of the "ref,alt" form.

    NOTE(review): this copy of the function appears corrupted by extraction —
    the header-handling condition below is syntactically broken and the
    original statements between the FORMAT check and the genotype assignment
    are missing. Restore from upstream before relying on the logic here.
    """
    # Column indices of FORMAT, first sample (NORMAL), second sample (TUMOR).
    idx_format, idx_SM1, idx_SM2 = 8, 9, 10
    with genome.open_textfile(infile) as vcf, open(outfile, "w") as vcfout:
        line_i = vcf.readline().rstrip()

        # VCF header
        while line_i.startswith("#"):
            if line_i.startswith("##FORMAT= 0.8:
                normal_gt = "1/1"
            elif vaf > 0.25:
                normal_gt = "0/1"
            else:
                normal_gt = "0/0"

            # AD becomes "refDepth,altDepth"; the separate RD column is dropped.
            item_normal[idx_ad] = "{},{}".format(
                item_normal[idx_rd], item_normal[idx_ad]
            )
            item_normal.pop(idx_rd)
            item_normal = [normal_gt] + item_normal

            # TUMOR
            item_tumor = item[idx_SM2].split(":")
            tumor_ad = int(item_tumor[idx_ad])
            tumor_rd = int(item_tumor[idx_rd])

            try:
                vaf = tumor_ad / (tumor_ad + tumor_rd)
            except ZeroDivisionError:
                vaf = 0

            # Tumor genotype: homozygous-alt above 0.8 VAF, else heterozygous.
            if vaf > 0.8:
                tumor_gt = "1/1"
            else:
                tumor_gt = "0/1"

            item_tumor[idx_ad] = "{},{}".format(
                item_tumor[idx_rd], item_tumor[idx_ad]
            )
            item_tumor.pop(idx_rd)
            item_tumor = [tumor_gt] + item_tumor

            # Rewrite
            item[idx_format] = "GT:" + ":".join(format_items)
            item[idx_SM1] = ":".join(item_normal)
            item[idx_SM2] = ":".join(item_tumor)

            line_i = "\t".join(item)
            vcfout.write(line_i + "\n")
            line_i = vcf.readline().rstrip()
101 |
102 |
# Script entry point: convert a JointSNVMix2 VCF in place of GT handling.
if __name__ == "__main__":
    infile, outfile = run()
    convert(infile, outfile)
106 |
--------------------------------------------------------------------------------
/somaticseq/vcf_modifier/modify_SomaticSniper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import re
5 |
6 | import somaticseq.genomic_file_parsers.genomic_file_handlers as genome
7 |
8 |
def run():
    """Parse command-line arguments; return (input_vcf, output_vcf)."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-infile", "--input-vcf", type=str, help="Input VCF file", required=True
    )
    parser.add_argument(
        "-outfile", "--output-vcf", type=str, help="Output VCF file", required=True
    )
    args = parser.parse_args()
    return args.input_vcf, args.output_vcf
26 |
27 |
def convert(infile, outfile):
    """
    Copy a SomaticSniper VCF, replacing every non-GCTA character in the REF
    column with "N" so the output conforms to the VCF standard.
    """
    REF_COLUMN = 3
    with genome.open_textfile(infile) as vcf, open(outfile, "w") as vcfout:
        line = vcf.readline().rstrip()
        # Header lines pass through untouched.
        while line.startswith("#"):
            vcfout.write(line + "\n")
            line = vcf.readline().rstrip()
        while line:
            fields = line.split("\t")
            # Non-GCTA characters (case-insensitive) are not valid REF bases.
            fields[REF_COLUMN] = re.sub(
                r"[^GCTA]", "N", fields[REF_COLUMN], flags=re.I
            )
            vcfout.write("\t".join(fields) + "\n")
            line = vcf.readline().rstrip()
47 |
48 |
# Script entry point: sanitize the REF column of a SomaticSniper VCF.
if __name__ == "__main__":
    infile, outfile = run()
    convert(infile, outfile)
52 |
--------------------------------------------------------------------------------
/somaticseq/vcf_modifier/modify_Strelka.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Add GT to Strelka's samples to make compatible with GATK CombineVariants, so
4 | # don't care about the content. Just 0/1 for everyone.
5 |
6 | import argparse
7 |
8 | import somaticseq.genomic_file_parsers.genomic_file_handlers as genome
9 |
10 |
def run():
    """Parse command-line arguments; return (input_vcf, output_vcf)."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-infile", "--input-vcf", type=str, help="Input VCF file", required=True
    )
    parser.add_argument(
        "-outfile", "--output-vcf", type=str, help="Output VCF file", required=True
    )
    args = parser.parse_args()
    return args.input_vcf, args.output_vcf
30 |
31 |
def convert(infile, outfile):
    """
    Add a dummy 0/1 GT to every sample column of a Strelka VCF so the file is
    compatible with GATK CombineVariants (the GT content itself is irrelevant).
    """
    with genome.open_textfile(infile) as vcf_in, open(outfile, "w") as vcf_out:
        line = vcf_in.readline().rstrip()

        # "##" meta-information lines pass through untouched.
        while line.startswith("##"):
            vcf_out.write(line + "\n")
            line = vcf_in.readline().rstrip()

        # The #CHROM line tells us how many sample columns follow FORMAT.
        column_count = len(line.split("\t"))
        vcf_out.write(line + "\n")

        line = vcf_in.readline().rstrip()
        while line:
            fields = line.split("\t")
            fields[8] = "GT:" + fields[8]  # FORMAT column
            for sample_idx in range(9, column_count):
                fields[sample_idx] = "0/1:" + fields[sample_idx]
            vcf_out.write("\t".join(fields) + "\n")
            line = vcf_in.readline().rstrip()
58 |
59 |
# Script entry point: add dummy GT fields to a Strelka VCF.
if __name__ == "__main__":
    infile, outfile = run()
    convert(infile, outfile)
63 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from typing import Final
4 |
5 | import pytest
6 |
# Absolute directory containing this conftest; anchors all test-data paths.
TEST_ROOT_DIR: Final = Path(__file__).resolve().parent
8 |
9 |
@pytest.fixture(scope="session")
def test_rootdir() -> Path:
    """Directory containing this conftest (the tests root)."""
    return TEST_ROOT_DIR
13 |
14 |
@pytest.fixture(scope="session")
def test_datadir(test_rootdir: Path) -> Path:
    """Directory holding the example test data."""
    return test_rootdir.joinpath("example")
18 |
19 |
@pytest.fixture(scope="session")
def tiny_tumor_bam(test_datadir: Path) -> str:
    """Path to the tiny example tumor BAM."""
    return str(test_datadir / "tumor.markdup.bam")
23 |
24 |
@pytest.fixture(scope="session")
def tiny_normal_bam(test_datadir: Path) -> str:
    """Path to the tiny example normal BAM."""
    return str(test_datadir / "normal.markdup.bam")
28 |
29 |
@pytest.fixture(scope="session")
def tiny_fasta(test_datadir: Path) -> str:
    """Path to the tiny example reference FASTA."""
    return str(test_datadir / "tiny.fa")
33 |
34 |
@pytest.fixture(scope="session")
def tiny_dbsnp_vcf(test_datadir: Path) -> str:
    """Path to the tiny example dbSNP VCF."""
    return str(test_datadir / "tiny_dbsnp.vcf")
38 |
39 |
@pytest.fixture(scope="session")
def tiny_truth_vcf(test_datadir: Path) -> str:
    """Path to the Varsim somatic truth-set VCF."""
    return str(test_datadir / "Varsim.somatic.truth.vcf")
43 |
44 |
@pytest.fixture
def tiny_paired_mutect2_vcf(test_datadir: Path) -> str:
    """MuTect2 output VCF for the paired (tumor-normal) example."""
    return str(test_datadir / "paired_example" / "MuTect2.vcf.gz")
48 |
49 |
@pytest.fixture
def tiny_paired_somaticsniper_vcf(test_datadir: Path) -> str:
    """SomaticSniper output VCF for the paired example."""
    return str(test_datadir / "paired_example" / "SomaticSniper.vcf.gz")
53 |
54 |
@pytest.fixture
def tiny_paired_vardict_vcf(test_datadir: Path) -> str:
    """VarDict output VCF for the paired example."""
    return str(test_datadir / "paired_example" / "VarDict.vcf.gz")
58 |
59 |
@pytest.fixture
def tiny_paired_muse_vcf(test_datadir: Path) -> str:
    """MuSE output VCF for the paired example."""
    return str(test_datadir / "paired_example" / "MuSE.vcf.gz")
63 |
64 |
@pytest.fixture
def tiny_paired_lofreq_snv_vcf(test_datadir: Path) -> str:
    """LoFreq SNV output VCF for the paired example."""
    return str(test_datadir / "paired_example" / "LoFreq.snv.vcf.gz")
68 |
69 |
@pytest.fixture
def tiny_paired_lofreq_indel_vcf(test_datadir: Path) -> str:
    """LoFreq indel output VCF for the paired example."""
    return str(test_datadir / "paired_example" / "LoFreq.indel.vcf.gz")
73 |
74 |
@pytest.fixture
def tiny_paired_scalpel_vcf(test_datadir: Path) -> str:
    """Scalpel output VCF for the paired example."""
    return str(test_datadir / "paired_example" / "Scalpel.vcf.gz")
78 |
79 |
@pytest.fixture
def tiny_paired_strelka_snv_vcf(test_datadir: Path) -> str:
    """Strelka SNV output VCF for the paired example."""
    return str(test_datadir / "paired_example" / "Strelka.snv.vcf.gz")
83 |
84 |
@pytest.fixture
def tiny_paired_strelka_indel_vcf(test_datadir: Path) -> str:
    """Strelka indel output VCF for the paired example."""
    return str(test_datadir / "paired_example" / "Strelka.indel.vcf.gz")
88 |
89 |
@pytest.fixture
def tiny_single_mutect2_vcf(test_datadir: Path) -> str:
    """MuTect2 output VCF for the tumor-only example."""
    return str(test_datadir / "tumor_only_example" / "MuTect2.vcf.gz")
93 |
94 |
@pytest.fixture
def tiny_single_vardict_vcf(test_datadir: Path) -> str:
    """VarDict output VCF for the tumor-only example."""
    return str(test_datadir / "tumor_only_example" / "VarDict.vcf.gz")
98 |
99 |
@pytest.fixture
def tiny_single_strelka_vcf(test_datadir: Path) -> str:
    """Strelka output VCF for the tumor-only example."""
    return str(test_datadir / "tumor_only_example" / "Strelka.vcf.gz")
103 |
104 |
@pytest.fixture(scope="session")
def reference_output(
    test_datadir: Path,
) -> dict[str, str]:
    """Expected consensus VCF outputs, keyed by mode and variant type."""
    expected_files = {
        "paired_consensus_snv_vcf": ("paired_example", "Consensus.sSNV.vcf.gz"),
        "paired_consensus_indel_vcf": ("paired_example", "Consensus.sINDEL.vcf.gz"),
        "single_consensus_snv_vcf": ("tumor_only_example", "Consensus.sSNV.vcf.gz"),
        "single_consensus_indel_vcf": ("tumor_only_example", "Consensus.sINDEL.vcf.gz"),
    }
    return {
        key: str(test_datadir / subdir / filename)
        for key, (subdir, filename) in expected_files.items()
    }
123 |
--------------------------------------------------------------------------------
/tests/example/README.md:
--------------------------------------------------------------------------------
1 | # Examples with tiny unrealistic demo data after SomaticSeq is properly installed
2 |
3 | ## Run SomaticSeq for tumor-normal mode
4 |
5 | ```
6 | /PATH/TO/somaticseq/tests/example/paired_somaticseq_example.sh
7 | ```
8 |
9 | This example uses the outputs from MuTect2, VarDict, and Strelka2 as input. This
10 | is just an example. There are additional callers that are officially supported
11 | besides those three. If this command is run successfully, the directory
12 | `paired_somaticseq` will be created in your current directory. In it, SomaticSeq
13 | TSV files, VCF files, and ada classifiers will be created. Do _not_ use those
14 | classifiers for anything other than demo purposes; they are trained on tiny,
15 | unrealistic demo data and are useless for real analyses.
16 |
17 | ## Run SomaticSeq for tumor-only mode
18 |
19 | Similar to above
20 |
21 | ```
22 | cd example
23 | /PATH/TO/somaticseq/tests/example/single_somaticseq_example.sh
24 | ```
25 |
26 | The directory will be `single_somaticseq`.
27 |
28 | ### To check the results of `paired_somaticseq_example.sh` and/or `single_somaticseq_example.sh`, run `results_check.sh`.
29 |
30 | ## Run dockerized workflow with MuTect2, VarDict, and Strelka2 in tumor-normal mode
31 |
32 | If you are able to run docker, you may test the following workflow:
33 |
34 | ```
35 | cd example
36 | /PATH/TO/somaticseq/tests/example/invoke_dockerized_tumor_normal_callers.sh
37 | ```
38 |
39 | Then, the following scripts will be created and executed:
40 |
41 | ```
42 | paired_example/{1,2}/logs/mutect2.year.month.date.timestamp.cmd
43 | paired_example/{1,2}/logs/strelka.year.month.date.timestamp.cmd
44 | paired_example/{1,2}/logs/vardict.year.month.date.timestamp.cmd
45 | paired_example/{1,2}/SomaticSeq/logs/somaticSeq.year.month.date.timestamp.cmd
46 | paired_example/logs/mergeResults.year.month.date.timestamp.cmd
47 | ```
48 |
49 | Directories 1 and 2 are created because the script invokes two parallel
50 | processes using `-nt 2`. The caller scripts (i.e., mutect2, strelka, and
51 | vardict) will be executed first by two parallel processes (`-nt 2`). Then, the
52 | somaticSeq scripts will be executed. Finally, the mergeResults script will be
53 | executed.
54 |
55 | ## Run dockerized workflow in tumor-only mode
56 |
57 | Same as above, but run the
58 | `/PATH/TO/somaticseq/tests/example/invoke_dockerized_tumor_only_callers.sh`.
59 |
--------------------------------------------------------------------------------
/tests/example/invoke_dockerized_tumor_normal_callers.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Generate and execute the dockerized tumor-normal somatic-calling workflow
# (7 callers plus SomaticSeq training) on the bundled tiny demo data.

# Abort on the first failing command, consistent with the other example scripts.
set -e

# Directory containing this script, so the demo data resolves regardless of CWD.
MYDIR="$( cd "$( dirname "$0" )" && pwd )"

mkdir -p paired_example

# Quoted expansions guard against paths containing spaces (shellcheck SC2086).
somaticseq_make_somatic_scripts \
    paired \
    --output-directory "$(pwd -P)/paired_example" \
    --tumor-bam "${MYDIR}/tumor.markdup.bam" \
    --normal-bam "${MYDIR}/normal.markdup.bam" \
    --genome-reference "${MYDIR}/tiny.fa" \
    --truth-snv "${MYDIR}/Varsim.somatic.truth.vcf" \
    --truth-indel "${MYDIR}/Varsim.somatic.truth.vcf" \
    --dbsnp-vcf "${MYDIR}/tiny_dbsnp.vcf" \
    --run-mutect2 --run-somaticsniper --run-vardict --run-muse --run-lofreq --run-scalpel --run-strelka2 \
    --run-somaticseq --train-somaticseq \
    --threads 2 --run-workflow
19 |
--------------------------------------------------------------------------------
/tests/example/invoke_dockerized_tumor_only_callers.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Generate and execute the dockerized tumor-only somatic-calling workflow
# (MuTect2, VarDict, Strelka2 plus SomaticSeq training) on the tiny demo data.

# Abort on the first failing command, consistent with the other example scripts.
set -e

# Directory containing this script, so the demo data resolves regardless of CWD.
MYDIR="$( cd "$( dirname "$0" )" && pwd )"

mkdir -p tumor_only_example

# Quoted expansions guard against paths containing spaces (shellcheck SC2086).
somaticseq_make_somatic_scripts \
    single \
    --output-directory "$(pwd -P)/tumor_only_example" \
    --bam "${MYDIR}/tumor.markdup.bam" \
    --genome-reference "${MYDIR}/tiny.fa" \
    --truth-snv "${MYDIR}/Varsim.somatic.truth.vcf" \
    --truth-indel "${MYDIR}/Varsim.somatic.truth.vcf" \
    --dbsnp-vcf "${MYDIR}/tiny_dbsnp.vcf" \
    --run-mutect2 --run-vardict --run-strelka2 --run-somaticseq --train-somaticseq -nt 2 --run-workflow
16 |
--------------------------------------------------------------------------------
/tests/example/normal.markdup.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/normal.markdup.bam
--------------------------------------------------------------------------------
/tests/example/normal.markdup.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/normal.markdup.bam.bai
--------------------------------------------------------------------------------
/tests/example/paired_example/Consensus.sINDEL.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/Consensus.sINDEL.vcf.gz
--------------------------------------------------------------------------------
/tests/example/paired_example/Consensus.sSNV.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/Consensus.sSNV.vcf.gz
--------------------------------------------------------------------------------
/tests/example/paired_example/LoFreq.indel.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/LoFreq.indel.vcf.gz
--------------------------------------------------------------------------------
/tests/example/paired_example/LoFreq.snv.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/LoFreq.snv.vcf.gz
--------------------------------------------------------------------------------
/tests/example/paired_example/MuSE.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/MuSE.vcf.gz
--------------------------------------------------------------------------------
/tests/example/paired_example/MuTect2.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/MuTect2.vcf.gz
--------------------------------------------------------------------------------
/tests/example/paired_example/Scalpel.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/Scalpel.vcf.gz
--------------------------------------------------------------------------------
/tests/example/paired_example/SomaticSniper.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/SomaticSniper.vcf.gz
--------------------------------------------------------------------------------
/tests/example/paired_example/Strelka.indel.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/Strelka.indel.vcf.gz
--------------------------------------------------------------------------------
/tests/example/paired_example/Strelka.snv.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/Strelka.snv.vcf.gz
--------------------------------------------------------------------------------
/tests/example/paired_example/VarDict.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/VarDict.vcf.gz
--------------------------------------------------------------------------------
/tests/example/paired_somaticseq_example.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Train an xgboost SomaticSeq classifier from the paired (tumor-normal) demo
# call sets, then run classification with the freshly trained classifiers.

set -e

# Directory containing this script, so the demo data resolves regardless of CWD.
MYDIR="$( cd "$( dirname "$0" )" && pwd )"
# Package version from the first line of _version.py, stripped of quotes and
# whitespace, e.g. `__version__ = "3.10.0"` -> `3.10.0`.
VERSION=$(head -n 1 "${MYDIR}/../../somaticseq/_version.py" | awk -F "=" '{print $2}' | tr -d '[[:space:]]"')

# Training run: --somaticseq-train builds classifiers against the truth VCFs.
somaticseq \
    --somaticseq-train \
    --algorithm xgboost \
    --extra-hyperparameters scale_pos_weight:0.1 seed:100 \
    --output-directory paired_somaticseq/training \
    --genome-reference "${MYDIR}/tiny.fa" \
    --dbsnp-vcf "${MYDIR}/tiny_dbsnp.vcf" \
    --truth-snv "${MYDIR}/Varsim.somatic.truth.vcf" \
    --truth-indel "${MYDIR}/Varsim.somatic.truth.vcf" \
    --threads 3 \
    paired \
    --tumor-bam-file "${MYDIR}/tumor.markdup.bam" \
    --normal-bam-file "${MYDIR}/normal.markdup.bam" \
    --mutect2-vcf "${MYDIR}/paired_example/MuTect2.vcf.gz" \
    --somaticsniper-vcf "${MYDIR}/paired_example/SomaticSniper.vcf.gz" \
    --vardict-vcf "${MYDIR}/paired_example/VarDict.vcf.gz" \
    --muse-vcf "${MYDIR}/paired_example/MuSE.vcf.gz" \
    --lofreq-snv "${MYDIR}/paired_example/LoFreq.snv.vcf.gz" \
    --lofreq-indel "${MYDIR}/paired_example/LoFreq.indel.vcf.gz" \
    --scalpel-vcf "${MYDIR}/paired_example/Scalpel.vcf.gz" \
    --strelka-snv "${MYDIR}/paired_example/Strelka.snv.vcf.gz" \
    --strelka-indel "${MYDIR}/paired_example/Strelka.indel.vcf.gz"


# Classification run: apply the classifiers trained above to the same inputs.
somaticseq \
    --algorithm xgboost \
    --classifier-snv "paired_somaticseq/training/Ensemble.sSNV.tsv.xgb.v${VERSION}.classifier" \
    --classifier-indel "paired_somaticseq/training/Ensemble.sINDEL.tsv.xgb.v${VERSION}.classifier" \
    --output-directory paired_somaticseq/classification \
    --genome-reference "${MYDIR}/tiny.fa" \
    --dbsnp-vcf "${MYDIR}/tiny_dbsnp.vcf" \
    --threads 3 \
    paired \
    --tumor-bam-file "${MYDIR}/tumor.markdup.bam" \
    --normal-bam-file "${MYDIR}/normal.markdup.bam" \
    --mutect2-vcf "${MYDIR}/paired_example/MuTect2.vcf.gz" \
    --somaticsniper-vcf "${MYDIR}/paired_example/SomaticSniper.vcf.gz" \
    --vardict-vcf "${MYDIR}/paired_example/VarDict.vcf.gz" \
    --muse-vcf "${MYDIR}/paired_example/MuSE.vcf.gz" \
    --lofreq-snv "${MYDIR}/paired_example/LoFreq.snv.vcf.gz" \
    --lofreq-indel "${MYDIR}/paired_example/LoFreq.indel.vcf.gz" \
    --scalpel-vcf "${MYDIR}/paired_example/Scalpel.vcf.gz" \
    --strelka-snv "${MYDIR}/paired_example/Strelka.snv.vcf.gz" \
    --strelka-indel "${MYDIR}/paired_example/Strelka.indel.vcf.gz"
52 |
--------------------------------------------------------------------------------
/tests/example/results_check.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Compare SomaticSeq demo outputs against the Varsim ground truth and report
# how many true SNVs/indels were recovered in each run mode.

mkdir -p result_check

### Split snv and indel from the ground truth file
split_vcf.py -infile Varsim.somatic.truth.vcf -snv result_check/true.snv.vcf -indel result_check/true.indel.vcf


### Check the results for paired somaticseq with the ground truth
if [[ -r paired_somaticseq/Consensus.sSNV.vcf && -r paired_somaticseq/Consensus.sINDEL.vcf ]]
then
    # Count consensus calls whose CHROM<TAB>POS pair matches a true variant.
    # (grep -E replaces the obsolescent egrep; $() replaces backticks.)
    true_snv_positives=$(grep -E -wf <(grep -Ev '^#' result_check/true.snv.vcf | awk -F '\t' '{print $1"\t"$2}') paired_somaticseq/Consensus.sSNV.vcf | wc -l)

    true_indel_positives=$(grep -E -wf <(grep -Ev '^#' result_check/true.indel.vcf | awk -F '\t' '{print $1"\t"$2}') paired_somaticseq/Consensus.sINDEL.vcf | wc -l)

    echo -e "For paired SomaticSeq run, out of a total of 73 true SNVs in ground truth, ${true_snv_positives} were collected by SomaticSeq. In our own testing, the number was 70. Did you get identical results?
The 3 true SNVs not collected by SomaticSeq were 1:14062, 1:24700, and 1:223356. They were in the VarDict call set, but none was considered Somatic or LikelySomatic. Two of them were in the Strelka2 call set, but none was considered a PASS. They were not in the MuTect2 call set. Hence, they were not included in the SomaticSeq output.
Out of a total of 51 true indels in ground truth, ${true_indel_positives} were collected by SomaticSeq. In our own testing, the number was 51. Did you get identical results?\n"

else
    echo 'You did not run paired_somaticseq_example.sh'

fi


### Check the results for single (e.g., tumor-only) somaticseq with the ground truth
if [[ -r single_somaticseq/Consensus.sSNV.vcf && -r single_somaticseq/Consensus.sINDEL.vcf ]]
then
    true_single_snv_positives=$(grep -E -wf <(grep -Ev '^#' result_check/true.snv.vcf | awk -F '\t' '{print $1"\t"$2}') single_somaticseq/Consensus.sSNV.vcf | wc -l)

    true_single_indel_positives=$(grep -E -wf <(grep -Ev '^#' result_check/true.indel.vcf | awk -F '\t' '{print $1"\t"$2}') single_somaticseq/Consensus.sINDEL.vcf | wc -l)

    echo -e "For single-sample SomaticSeq run, out of a total of 73 true SNVs in ground truth, ${true_single_snv_positives} were collected by SomaticSeq. In our own testing, the number was 73. Did you get identical results?
Out of a total of 51 true indels in ground truth, ${true_single_indel_positives} were collected by SomaticSeq. In our own testing, the number was 51. Did you get identical results?"

else
    echo 'You did not run single_somaticseq_example.sh'

fi
40 |
--------------------------------------------------------------------------------
/tests/example/single_somaticseq_example.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Train an xgboost SomaticSeq classifier from the tumor-only demo call sets,
# then run classification with the freshly trained classifiers.

set -e

# Directory containing this script, so the demo data resolves regardless of CWD.
MYDIR="$( cd "$( dirname "$0" )" && pwd )"
# Package version from the first line of _version.py, stripped of quotes and
# whitespace, e.g. `__version__ = "3.10.0"` -> `3.10.0`.
VERSION=$(head -n 1 "${MYDIR}/../../somaticseq/_version.py" | awk -F "=" '{print $2}' | tr -d '[[:space:]]"')

# Training run: --somaticseq-train builds classifiers against the truth VCFs.
somaticseq \
    --somaticseq-train \
    --algorithm xgboost \
    --extra-hyperparameters scale_pos_weight:0.1 seed:100 \
    --output-directory single_somaticseq/training \
    --genome-reference "${MYDIR}/tiny.fa" \
    --dbsnp-vcf "${MYDIR}/tiny_dbsnp.vcf" \
    --truth-snv "${MYDIR}/Varsim.somatic.truth.vcf" \
    --truth-indel "${MYDIR}/Varsim.somatic.truth.vcf" \
    --threads 3 \
    single \
    --bam-file "${MYDIR}/tumor.markdup.bam" \
    --mutect2-vcf "${MYDIR}/tumor_only_example/MuTect2.vcf.gz" \
    --vardict-vcf "${MYDIR}/tumor_only_example/VarDict.vcf.gz" \
    --strelka-vcf "${MYDIR}/tumor_only_example/Strelka.vcf.gz"


# Classification run: apply the classifiers trained above to the same inputs.
somaticseq \
    --algorithm xgboost \
    --classifier-snv "single_somaticseq/training/Ensemble.sSNV.tsv.xgb.v${VERSION}.classifier" \
    --classifier-indel "single_somaticseq/training/Ensemble.sINDEL.tsv.xgb.v${VERSION}.classifier" \
    --output-directory single_somaticseq/classification \
    --genome-reference "${MYDIR}/tiny.fa" \
    --dbsnp-vcf "${MYDIR}/tiny_dbsnp.vcf" \
    --threads 3 \
    single \
    --bam-file "${MYDIR}/tumor.markdup.bam" \
    --mutect2-vcf "${MYDIR}/tumor_only_example/MuTect2.vcf.gz" \
    --vardict-vcf "${MYDIR}/tumor_only_example/VarDict.vcf.gz" \
    --strelka-vcf "${MYDIR}/tumor_only_example/Strelka.vcf.gz"
38 |
--------------------------------------------------------------------------------
/tests/example/tiny.dict:
--------------------------------------------------------------------------------
1 | @HD VN:1.5
2 | @SQ SN:1 LN:300000 M5:08f4e39926679c06ed248cf2f51be5d1 UR:file:/sc1/groups/bfx-red/analysis/datainsights/projects/Somatic_Benchmarks/smallDemo/tiny.fa
3 |
--------------------------------------------------------------------------------
/tests/example/tiny.fa.fai:
--------------------------------------------------------------------------------
1 | 1 300000 3 300000 300001
2 |
--------------------------------------------------------------------------------
/tests/example/tiny_dbsnp.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.1
2 | ##phasing=none
3 | ##INDIVIDUAL=TRUTH
4 | ##SAMPLE=
5 | ##INFO=
6 | ##INFO=
7 | ##INFO=
8 | ##INFO=
9 | ##INFO=
10 | ##INFO=
11 | ##INFO=
12 | ##INFO=
13 | ##ALT=
14 | ##ALT=
15 | ##ALT=
16 | ##ALT=
17 | ##FORMAT=
18 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SPIKEIN
19 | 1 14062 rs000 T A 100 PASS SOMATIC;VAF=0.333333333333;DPR=5.0 GT 0/1
20 | 1 24700 rs001 T C 100 PASS SOMATIC;VAF=0.333333333333;DPR=6.0 GT 0/1
21 | 1 223356 rs002 T C 100 PASS SOMATIC;VAF=0.4;DPR=5.0 GT 0/1
22 |
--------------------------------------------------------------------------------
/tests/example/tiny_dbsnp.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tiny_dbsnp.vcf.gz
--------------------------------------------------------------------------------
/tests/example/tiny_dbsnp.vcf.gz.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tiny_dbsnp.vcf.gz.tbi
--------------------------------------------------------------------------------
/tests/example/tumor.markdup.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tumor.markdup.bam
--------------------------------------------------------------------------------
/tests/example/tumor.markdup.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tumor.markdup.bam.bai
--------------------------------------------------------------------------------
/tests/example/tumor_only_example/Consensus.sINDEL.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tumor_only_example/Consensus.sINDEL.vcf.gz
--------------------------------------------------------------------------------
/tests/example/tumor_only_example/Consensus.sSNV.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tumor_only_example/Consensus.sSNV.vcf.gz
--------------------------------------------------------------------------------
/tests/example/tumor_only_example/MuTect2.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tumor_only_example/MuTect2.vcf.gz
--------------------------------------------------------------------------------
/tests/example/tumor_only_example/Strelka.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tumor_only_example/Strelka.vcf.gz
--------------------------------------------------------------------------------
/tests/example/tumor_only_example/VarDict.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tumor_only_example/VarDict.vcf.gz
--------------------------------------------------------------------------------
/tests/unit/genomic_file_parsers/test_read_info_extractor.py:
--------------------------------------------------------------------------------
1 | import pysam
2 |
3 | from somaticseq.genomic_file_parsers.read_info_extractor import (
4 | AlignmentType,
5 | get_alignment_in_read,
6 | )
7 |
8 |
def test_get_alignment() -> None:
    """
    Test the following aligned read:
    Coordinates: 100 200 210 220 230 235 240 245 250
    ^ ^ ^ ^ ^ ^ ^ ^ ^
    Reference: ================================================
    |||||||||||||||||||||||||||||||||||||||||||
    Read: -------- -------I------I----- ---------->
    CIGAR: 100M 10D 10M 10 10M 5 5M 5D 5M 5S

    The CIGAR consumes reference positions 100-244 (0-based): 100M at
    100-199, 10D at 200-209, 10M at 210-219, 10I, 10M at 220-229, 5I,
    5M at 230-234, 5D at 235-239, 5M at 240-244, then 5S (no reference).
    """

    read_dict = {
        "name": "query_name",
        "flag": "97",  # top strand read1
        "ref_name": "chr1",
        "ref_pos": "101",  # 1-based coordinate
        "map_quality": "60",
        "cigar": "100M10D10M10I10M5I5M5D5M5S",
        "next_ref_name": "=",
        "next_ref_pos": "251",
        "length": "300",  # template_length
        "seq": "A" * 150,
        "qual": "J" * 150,
    }
    header_dict = {"SQ": [{"LN": 1, "SN": contig} for contig in ["chr1", "chr2"]]}
    header = pysam.AlignmentHeader.from_dict(header_dict)
    read = pysam.AlignedSegment.from_dict(read_dict, header)
    # matches that are more than 3 bps from the nearest indel
    simple_matches = set(list(range(100, 195 + 1)) + [213, 214, 215, 223, 224, 225])
    for coordinate in range(300):
        seq_call = get_alignment_in_read(read, coordinate)
        if coordinate in simple_matches:
            assert seq_call.call_type == AlignmentType.match
            assert seq_call.nearest_indel == float("inf")
        elif coordinate in (196, 197, 198):
            assert seq_call.call_type == AlignmentType.match
            assert seq_call.nearest_indel == 199 - coordinate
        elif coordinate == 199:
            assert seq_call.call_type == AlignmentType.deletion
            assert seq_call.indel_length == -10
            assert seq_call.nearest_indel == float("inf")
        elif coordinate in range(200, 210):
            assert seq_call.call_type == AlignmentType.unknown
        elif coordinate in (210, 211, 212):
            assert seq_call.call_type == AlignmentType.match
            assert seq_call.nearest_indel == coordinate - 209
        elif coordinate in (216, 217, 218):
            assert seq_call.call_type == AlignmentType.match
            assert seq_call.nearest_indel == 219 - coordinate
        elif coordinate == 219:
            assert seq_call.call_type == AlignmentType.insertion
            assert seq_call.indel_length == 10
            assert seq_call.nearest_indel == float("inf")
        elif coordinate == 229:
            assert seq_call.call_type == AlignmentType.insertion
            assert seq_call.indel_length == 5
            assert seq_call.nearest_indel == float("inf")
        elif coordinate == 234:
            assert seq_call.call_type == AlignmentType.deletion
            assert seq_call.indel_length == -5
            assert seq_call.nearest_indel == float("inf")
        elif coordinate < 100 or coordinate >= 245:
            # Coordinates outside the read's aligned reference span should
            # yield no call. BUG FIX: the original condition was
            # `100 > coordinate >= 245`, which can never be True, so this
            # assertion was unreachable dead code.
            assert seq_call.call_type is None
72 |
--------------------------------------------------------------------------------
/tests/unit/utilities/test_split_bed_into_equal_regions.py:
--------------------------------------------------------------------------------
1 | import os
2 | from collections.abc import Generator
3 | from unittest.mock import MagicMock
4 |
5 | import pytest
6 | from _pytest.tmpdir import TempPathFactory
7 | from pytest_mock import MockerFixture
8 |
9 | from somaticseq.utilities.split_bed_into_equal_regions import split
10 |
11 |
@pytest.mark.parametrize(
    "expected_inlines,expected_outlines",
    [
        (
            ["chr1\t0\t100\n", ""],
            [["chr1\t0\t34\n"], ["chr1\t34\t68\n"], ["chr1\t68\t100\n"]],
        ),
        (
            ["chr1\t0\t90\n", "chr2\t0\t10\n", ""],
            [
                ["chr1\t0\t34\n"],
                ["chr1\t34\t68\n"],
                ["chr1\t68\t90\n", "chr2\t0\t10\n"],
            ],
        ),
    ],
)
def test_split(
    expected_inlines: list[str],
    expected_outlines: list[list[str]],
    mocker: MockerFixture,
    tmp_path_factory: TempPathFactory,
) -> None:
    """
    Test case where input bed file is split into 3 output bed files of equal
    region lengths
    """
    # One mock handle for the input bed plus one per output bed, keyed by
    # the filename fragment that identifies each output file.
    mock_reader = mocker.MagicMock()
    mock_writers = {f"{index}.x.bed": mocker.MagicMock() for index in (1, 2, 3)}

    # Every handle acts as its own context manager and suppresses nothing.
    for handle in (mock_reader, *mock_writers.values()):
        handle.__enter__.return_value = handle
        handle.__exit__.return_value = False

    # Serve the input lines one readline() call at a time.
    mock_reader.readline.side_effect = iter(expected_inlines)

    def _fake_open(file: str, mode: str = "r") -> MagicMock:
        """Route open() to the matching mock writer, else the mock reader."""
        for fragment, writer in mock_writers.items():
            if fragment in file:
                return writer
        return mock_reader

    mocker.patch("builtins.open", side_effect=_fake_open)
    outdir = tmp_path_factory.mktemp("split_bed")
    out_files = split(
        infile="region.bed", outfiles=os.path.join(outdir, "x.bed"), num=3
    )
    assert out_files == [os.path.join(outdir, f"{i}.x.bed") for i in (1, 2, 3)]

    # Each writer must have received exactly its expected region lines.
    for expected_lines, writer in zip(expected_outlines, mock_writers.values()):
        assert writer.write.call_count == len(expected_lines)
        for region_line in expected_lines:
            writer.write.assert_any_call(region_line)
91 |
--------------------------------------------------------------------------------
/tests/unit/vcf_modifier/test_bed_utils.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from _pytest.tmpdir import TempPathFactory
3 |
4 | import somaticseq.vcf_modifier.bed_util as bed_util
5 |
6 |
@pytest.fixture
def dummy_vcf(tmp_path_factory: TempPathFactory) -> str:
    """Create a VCF containing one SNV at each of chr1:1-99; return its path."""
    vcf_path = tmp_path_factory.mktemp("vcf") / "dummy.vcf"
    lines = [
        "##fileformat=VCFv4.1\n",
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
    ]
    lines += [f"chr1\t{pos}\tid_{pos}\tG\tT\t.\t.\t.\n" for pos in range(1, 100)]
    with open(vcf_path, "w") as handle:
        handle.writelines(lines)
    return str(vcf_path)
17 |
18 |
@pytest.fixture
def inclusion_bed(tmp_path_factory: TempPathFactory) -> str:
    """Create a bed file covering chr1:20-40 and chr1:60-80; return its path."""
    bed_path = tmp_path_factory.mktemp("bed") / "inclusion.bed"
    with open(bed_path, "w") as handle:
        handle.writelines(["chr1\t20\t40\n", "chr1\t60\t80\n"])
    return str(bed_path)
26 |
27 |
@pytest.fixture
def exclusion_bed(tmp_path_factory: TempPathFactory) -> str:
    """Create a bed file covering chr1:30-70; return its path."""
    bed_path = tmp_path_factory.mktemp("bed") / "exclusion.bed"
    with open(bed_path, "w") as handle:
        handle.write("chr1\t30\t70\n")
    return str(bed_path)
34 |
35 |
def test_bed_intersector(
    dummy_vcf: str,
    inclusion_bed: str,
    exclusion_bed: str,
    tmp_path_factory: TempPathFactory,
) -> None:
    """Inclusion minus exclusion regions keeps only chr1 21-30 and 71-80."""
    outdir = tmp_path_factory.mktemp("test")
    out_vcf = str(outdir / "x.vcf")
    result = bed_util.bed_intersector(dummy_vcf, out_vcf, inclusion_bed, exclusion_bed)

    # Collect positions of every non-header record in the output VCF.
    positions = []
    with open(result) as handle:
        for vcf_line in handle:
            if vcf_line.startswith("#"):
                continue
            fields = vcf_line.split("\t")
            assert fields[0] == "chr1"
            positions.append(int(fields[1]))

    assert result == out_vcf
    assert positions == list(range(21, 31)) + list(range(71, 81))
59 |
--------------------------------------------------------------------------------
/tests/unit/vcf_modifier/test_split_vcf.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from _pytest.tmpdir import TempPathFactory
3 |
4 | from somaticseq.genomic_file_parsers.genomic_file_handlers import (
5 | VCFVariantRecord,
6 | )
7 | from somaticseq.vcf_modifier.split_vcf import split_into_snv_and_indel
8 |
# Synthetic VCF body rows for the complex_vcf fixture. Columns are CHROM,
# POS, ID, REF, ALT, then the remaining VCF fields collapsed into one
# tab-joined string. The cases cover a plain SNV, a deletion, a
# multi-allelic SNV+insertion, and two complex substitutions.
COMPLEX_VCF = [
    ["1", "10", ".", "A", "C", "10\t.\t.\tGT\t0/0\t0/1"],  # snv
    ["1", "10", ".", "ATGAG", "A", "10\t.\t.\tGT\t0/0\t0/1"],  # deletion
    ["1", "11", ".", "T", "A,TC", "10\t.\t.\tGT\t0/0\t0/1"],  # snv and insertion
    ["1", "12", ".", "GAGGTCAGGA", "AAAA", "10\t.\t.\tGT\t0/0\t0/1"],  # complex
    ["1", "14", ".", "GGTC", "AAAAAA", "10\t.\t.\tGT\t0/0\t0/1"],  # complex
]
16 |
17 |
@pytest.fixture
def complex_vcf(tmp_path_factory: TempPathFactory) -> str:
    """Write COMPLEX_VCF records to a temporary VCF file; return its path."""
    vcf_path = tmp_path_factory.mktemp("vcf") / "complex_variants.vcf"
    lines = ["\t".join(record) + "\n" for record in COMPLEX_VCF]
    with open(vcf_path, "w") as handle:
        handle.writelines(lines)
    return str(vcf_path)
25 |
26 |
def test_split_into_snv_and_indel(
    complex_vcf: str, tiny_fasta: str, tmp_path_factory: TempPathFactory
) -> None:
    """Complex variants are decomposed into the expected SNV and indel records."""

    def _read_records(vcf_path: str) -> list:
        """Collect [chrom, pos, ref, alt] for every record in a VCF file."""
        records = []
        with open(vcf_path) as handle:
            for line in handle:
                variant = VCFVariantRecord.from_vcf_line(line)
                records.append(
                    [
                        variant.chromosome,
                        variant.position,
                        variant.refbase,
                        variant.altbase,
                    ]
                )
        return records

    outdir = tmp_path_factory.mktemp("test")
    snv_out = str(outdir / "snv.vcf")
    indel_out = str(outdir / "indel.vcf")
    split_into_snv_and_indel(
        infile=complex_vcf,
        out_snv_vcf=snv_out,
        out_indel_vcf=indel_out,
        genome_reference=tiny_fasta,
    )

    assert _read_records(snv_out) == [
        ["1", 10, "A", "C"],
        ["1", 11, "T", "A"],
        ["1", 12, "G", "A"],
        ["1", 14, "G", "A"],
        ["1", 15, "G", "A"],
        ["1", 16, "T", "A"],
        ["1", 17, "C", "A"],
    ]
    assert _read_records(indel_out) == [
        ["1", 10, "ATGAG", "A"],
        ["1", 11, "T", "TC"],
        ["1", 15, "GTCAGGA", "G"],
        ["1", 17, "C", "CAA"],
    ]
66 |
--------------------------------------------------------------------------------