├── .gitignore ├── Dockerfiles ├── akt.0.3.2.dockerfile ├── alientrimmer.0.4.0.dockerfile ├── bwa.0.7.17_samtools.dockerfile ├── canvas.1.35.dockerfile ├── jsm.0.7.5-1.dockerfile ├── lofreq.2.1.3.1-1.dockerfile ├── manta.1.4.0.dockerfile ├── muse_1.0rc_c.dockerfile ├── nirvana.2.0.9.dockerfile ├── picard.2.27.5.dockerfile ├── sambamba.1.0.0.dockerfile ├── samtools.1.19.2.dockerfile ├── scalpel.0.5.4.dockerfile ├── somaticseq.base-1.7.dockerfile ├── somaticseq.master.dockerfile ├── somaticseq.release.dockerfile ├── somaticsniper.1.0.5.0-2.dockerfile ├── strelka.2.9.5.dockerfile ├── tabix.1.19.dockerfile ├── trimmomatic.0.39.dockerfile ├── vardictjava.dockerfile └── vcftools.0.1.14-2.dockerfile ├── LICENSE.txt ├── MODULES.md ├── README.md ├── docs ├── Manual.pdf ├── Manual.tex ├── Manual.tex.backup ├── Refs.bib ├── SomaticSeqYoutube.png ├── heatmap400.png ├── precisionfda.png ├── seqc2.md ├── train_for_classifiers.md └── workflow400.png ├── pyproject.toml ├── r_scripts ├── __init__.py ├── ada_cross_validation.R ├── ada_model_builder.R ├── ada_model_builder_ntChange.R └── ada_model_predictor.R ├── setup.cfg ├── setup.py ├── somaticseq ├── __init__.py ├── _version.py ├── annotate_caller.py ├── bam_features.py ├── combine_callers.py ├── defaults.py ├── genomic_file_parsers │ ├── __init__.py │ ├── concat.py │ ├── genomic_file_handlers.py │ ├── pileup_reader.py │ └── read_info_extractor.py ├── ntchange_type.py ├── run_somaticseq.py ├── sequencing_features.py ├── single_sample_vcf2tsv.py ├── somatic_tsv2vcf.py ├── somatic_vcf2tsv.py ├── somatic_xgboost.py ├── somaticseq_parallel.py ├── tsv2vcf.py ├── utilities │ ├── BAM_filter.py │ ├── README.md │ ├── __init__.py │ ├── attach_pileupVAF.py │ ├── bamQC.py │ ├── bedFileHandler.py │ ├── combo_callers_evaluator.py │ ├── dockered_pipelines │ │ ├── QC │ │ │ ├── extract_callableRegions.sh │ │ │ └── extract_coverageDepth.sh │ │ ├── README.md │ │ ├── __init__.py │ │ ├── alignments │ │ │ ├── BQSR.sh │ │ │ ├── README.md │ │ │ ├── 
__init__.py │ │ │ ├── align.py │ │ │ ├── jointIndelRealign.sh │ │ │ ├── markdup.py │ │ │ ├── mergeBams.py │ │ │ ├── mergeFastqs.py │ │ │ ├── singleIndelRealign.sh │ │ │ ├── spreadFastq.py │ │ │ └── trim.py │ │ ├── bamSimulator │ │ │ ├── BamSimulator_multiThreads.sh │ │ │ ├── BamSimulator_singleThread.sh │ │ │ ├── README.md │ │ │ ├── bamSurgeon │ │ │ │ ├── IndelRealign.sh │ │ │ │ ├── MergeTN.sh │ │ │ │ ├── Reheader_SM.sh │ │ │ │ ├── SortByCoordinate.sh │ │ │ │ ├── SortByReadName.sh │ │ │ │ ├── bamsurgeon_addindels.sh │ │ │ │ ├── bamsurgeon_addsnvs.sh │ │ │ │ ├── bamsurgeon_addsvs.sh │ │ │ │ ├── bamsurgeon_random_sites.sh │ │ │ │ ├── bamsurgeon_split_BAM.sh │ │ │ │ ├── cleanBam.sh │ │ │ │ ├── concatVcfFiles.sh │ │ │ │ ├── convert_nonStandardBasesInVcfs.py │ │ │ │ ├── mergeBamFiles.sh │ │ │ │ └── split_BAM_by_BED.sh │ │ │ ├── dream_sim.jpg │ │ │ ├── onkoinsight_sim.png │ │ │ └── replicate_sim.jpg │ │ ├── container_option.py │ │ ├── germline_variants │ │ │ ├── Canvas.sh │ │ │ ├── Manta.sh │ │ │ ├── Nirvana.sh │ │ │ ├── README.md │ │ │ ├── VQSR.sh │ │ │ ├── bam2vcf.sh │ │ │ └── haplotypeCaller.sh │ │ ├── makeAlignmentScripts.py │ │ ├── makeSomaticScripts.py │ │ ├── run_workflows.py │ │ ├── somatic_mutations │ │ │ ├── JointSNVMix2.py │ │ │ ├── LoFreq.py │ │ │ ├── MuSE.py │ │ │ ├── MuTect2.py │ │ │ ├── Scalpel.py │ │ │ ├── SomaticSniper.py │ │ │ ├── Strelka2.py │ │ │ ├── VarDict.py │ │ │ ├── VarScan2.py │ │ │ └── __init__.py │ │ ├── tumor_normal_run.py │ │ └── tumor_only_run.py │ ├── filter_SomaticSeq_VCF.py │ ├── linguistic_sequence_complexity.py │ ├── lociCounterWithLabels.py │ ├── lociCounters.py │ ├── multi-nucleotide_phaser.py │ ├── paired_end_bam2fastq.py │ ├── plot_TPvsFP.py │ ├── reformat_VCF2SEQC2.py │ ├── remove_callers_from_somaticseq_tsv.py │ ├── singularities │ │ ├── QC │ │ │ ├── extract_callableRegions.sh │ │ │ └── extract_coverageDepth.sh │ │ ├── README.md │ │ ├── bamSimulator │ │ │ ├── BamSimulator_multiThreads.sh │ │ │ ├── BamSimulator_singleThread.sh │ │ 
│ ├── README.md │ │ │ └── bamSurgeon │ │ │ │ ├── IndelRealign.sh │ │ │ │ ├── MergeTN.sh │ │ │ │ ├── Reheader_SM.sh │ │ │ │ ├── SortByCoordinate.sh │ │ │ │ ├── SortByReadName.sh │ │ │ │ ├── bamsurgeon_addindels.sh │ │ │ │ ├── bamsurgeon_addsnvs.sh │ │ │ │ ├── bamsurgeon_addsvs.sh │ │ │ │ ├── bamsurgeon_random_sites.sh │ │ │ │ ├── bamsurgeon_split_BAM.sh │ │ │ │ ├── cleanBam.sh │ │ │ │ ├── concatVcfFiles.sh │ │ │ │ ├── mergeBamFiles.sh │ │ │ │ └── split_BAM_by_BED.sh │ │ └── germline_variants │ │ │ ├── Nirvana.sh │ │ │ ├── VQSR.sh │ │ │ ├── bam2vcf.sh │ │ │ └── haplotypeCaller.sh │ ├── snakemake │ │ ├── README.md │ │ ├── Snakefile │ │ └── config.yaml │ ├── split_bed_into_equal_regions.py │ ├── split_mergedBed.py │ ├── tally_MyVCF_vs_Truth.py │ ├── tally_variants_from_multiple_vcfs.py │ ├── trimSoftClippedReads.py │ ├── variant_annotation.py │ └── vcfsorter.pl └── vcf_modifier │ ├── __init__.py │ ├── bed_util.py │ ├── complex2indel.py │ ├── copy_TextFile.py │ ├── getUniqueVcfPositions.py │ ├── leftAlign.py │ ├── modify_JointSNVMix2.py │ ├── modify_MuTect.py │ ├── modify_MuTect2.py │ ├── modify_SomaticSniper.py │ ├── modify_Strelka.py │ ├── modify_VarDict.py │ ├── modify_VarScan2.py │ ├── modify_ssMuTect2.py │ ├── modify_ssStrelka.py │ └── split_vcf.py └── tests ├── conftest.py ├── example ├── README.md ├── Varsim.somatic.truth.vcf ├── invoke_dockerized_tumor_normal_callers.sh ├── invoke_dockerized_tumor_only_callers.sh ├── normal.markdup.bam ├── normal.markdup.bam.bai ├── paired_example │ ├── Consensus.sINDEL.vcf.gz │ ├── Consensus.sSNV.vcf.gz │ ├── LoFreq.indel.vcf.gz │ ├── LoFreq.snv.vcf.gz │ ├── MuSE.vcf.gz │ ├── MuTect2.vcf.gz │ ├── Scalpel.vcf.gz │ ├── SomaticSniper.vcf.gz │ ├── Strelka.indel.vcf.gz │ ├── Strelka.snv.vcf.gz │ └── VarDict.vcf.gz ├── paired_somaticseq_example.sh ├── results_check.sh ├── single_somaticseq_example.sh ├── tiny.dict ├── tiny.fa ├── tiny.fa.fai ├── tiny_dbsnp.vcf ├── tiny_dbsnp.vcf.gz ├── tiny_dbsnp.vcf.gz.tbi ├── tumor.markdup.bam ├── 
tumor.markdup.bam.bai └── tumor_only_example │ ├── Consensus.sINDEL.vcf.gz │ ├── Consensus.sSNV.vcf.gz │ ├── MuTect2.vcf.gz │ ├── Strelka.vcf.gz │ └── VarDict.vcf.gz ├── functional └── test_somaticseq.py └── unit ├── genomic_file_parsers └── test_read_info_extractor.py ├── utilities └── test_split_bed_into_equal_regions.py └── vcf_modifier ├── test_bed_utils.py └── test_split_vcf.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .DS_Store 3 | __pycache__ 4 | Manual.aux 5 | Manual.bbl 6 | Manual.blg 7 | Manual.log 8 | Manual.out 9 | Manual.synctex.gz 10 | *.egg-info 11 | dist 12 | build 13 | .vscode/ 14 | .made 15 | .ipynb_checkpoints/ 16 | poetry.lock 17 | -------------------------------------------------------------------------------- /Dockerfiles/akt.0.3.2.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y -q install wget libeigen3-dev tar zlib1g-dev libbz2-dev liblzma-dev bcftools r-base imagemagick && apt clean 4 | RUN cd /opt && wget https://github.com/samtools/htslib/releases/download/1.8/htslib-1.8.tar.bz2 && tar -xjvf htslib-1.8.tar.bz2 && cd htslib-1.8 && ./configure && make && make install 5 | RUN cd /opt && wget https://github.com/Illumina/akt/archive/v0.3.2.tar.gz && tar -xvf v0.3.2.tar.gz && cd akt-0.3.2 && make && cd .. 
&& ln -s akt-0.3.2 akt 6 | -------------------------------------------------------------------------------- /Dockerfiles/alientrimmer.0.4.0.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get update && apt-get install -y wget gcj-jdk make unzip && apt-get clean 4 | RUN cd /opt && wget ftp://ftp.pasteur.fr/pub/gensoft/projects/AlienTrimmer/AlienTrimmer_0.4.0.tar.gz && tar -xvf AlienTrimmer_0.4.0.tar.gz && cd AlienTrimmer_0.4.0/src && make && cp -p AlienTrimmer AlienTrimmer.java /usr/local/bin/ 5 | RUN cd /opt && wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.36.zip && unzip Trimmomatic-0.36.zip && ln -s Trimmomatic-0.36 Trimmomatic 6 | -------------------------------------------------------------------------------- /Dockerfiles/bwa.0.7.17_samtools.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install bwa samtools && apt-get clean 3 | -------------------------------------------------------------------------------- /Dockerfiles/canvas.1.35.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get update && apt-get install -y wget mono-runtime mono-complete tar libunwind-dev && apt-get clean 4 | RUN cd /opt && wget https://download.microsoft.com/download/D/7/A/D7A9E4E9-5D25-4F0C-B071-210CB8267943/dotnet-ubuntu.16.04-x64.1.1.2.tar.gz && tar -xvf dotnet-ubuntu.16.04-x64.1.1.2.tar.gz && ln -s /opt/shared/Microsoft.NETCore.App/1.1.2/dotnet /usr/bin/dotnet 5 | RUN cd /opt && wget https://github.com/Illumina/canvas/releases/download/1.35.1.1316%2Bmaster/Canvas-1.35.1.1316.master_x64.tar.gz && tar -xvf Canvas-1.35.1.1316.master_x64.tar.gz && ln -s 'Canvas-1.35.1.1316+master_x64/' Canvas && chmod a+x Canvas/tabix 6 | 
-------------------------------------------------------------------------------- /Dockerfiles/jsm.0.7.5-1.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | RUN apt-get update && apt-get install -y python python-dev wget build-essential samtools zlib1g-dev cython && apt-get clean 4 | RUN cd /opt && wget https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/pysam/pysam-0.5.tar.gz && tar -xvf pysam-0.5.tar.gz 5 | RUN cd /opt/pysam-0.5 && wget https://pypi.python.org/packages/2.7/s/setuptools/setuptools-0.6c11-py2.7.egg && python setup.py build && python setup.py install 6 | 7 | RUN cd /opt && wget https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/joint-snv-mix/JointSNVMix-0.7.5.tar.gz && tar -xvf JointSNVMix-0.7.5.tar.gz && cd JointSNVMix-0.7.5 && python setup.py install 8 | RUN cd /opt && wget https://www.dropbox.com/s/rbegan3opz2fc4k/vcfsorter.pl && chmod a+x vcfsorter.pl 9 | -------------------------------------------------------------------------------- /Dockerfiles/lofreq.2.1.3.1-1.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get update && apt-get install -y wget zlib1g-dev bzip2 libncurses5-dev build-essential python automake libtool git && apt-get clean 4 | RUN cd /opt/ && wget https://downloads.sourceforge.net/project/samtools/samtools/1.1/samtools-1.1.tar.bz2 && tar -xvf samtools-1.1.tar.bz2 && cd samtools-1.1 && make && make install && cd /opt/samtools-1.1/htslib-1.1 && make && make install 5 | RUN cd /opt && git clone https://github.com/CSB5/lofreq.git && cd lofreq && libtoolize && ./bootstrap && ./configure SAMTOOLS=/opt/samtools-1.1 HTSLIB=/opt/samtools-1.1/htslib-1.1 && make && make install 6 | -------------------------------------------------------------------------------- /Dockerfiles/manta.1.4.0.dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | RUN apt update && apt -y install wget tar bzip2 python && apt clean 4 | RUN cd /opt && wget https://github.com/Illumina/manta/releases/download/v1.4.0/manta-1.4.0.centos6_x86_64.tar.bz2 && tar -xvf manta-1.4.0.centos6_x86_64.tar.bz2 && ln -s manta-1.4.0.centos6_x86_64 manta 5 | -------------------------------------------------------------------------------- /Dockerfiles/muse_1.0rc_c.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install wget 4 | RUN cd /usr/local/bin/ && wget http://bioinformatics.mdanderson.org/Software/MuSE/MuSEv1.0rc_submission_c039ffa && chmod a+x MuSEv1.0rc_submission_c039ffa 5 | -------------------------------------------------------------------------------- /Dockerfiles/nirvana.2.0.9.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get update && apt-get -y install wget tar mono-runtime mono-complete libunwind-dev libcurl3 libssl1.0.0 libssl-dev && apt-get clean 4 | RUN cd /opt && wget https://download.microsoft.com/download/2/E/C/2EC018A0-A0FC-40A2-849D-AA692F68349E/dotnet-sdk-2.1.105-linux-x64.tar.gz && tar -xvf dotnet-sdk-2.1.105-linux-x64.tar.gz && ln -s /opt/dotnet /usr/local/bin/dotnet 5 | RUN cd /opt && wget https://github.com/Illumina/Nirvana/archive/v2.0.9.tar.gz && tar -xvf v2.0.9.tar.gz && ln -s Nirvana-2.0.9 Nirvana && cd Nirvana-2.0.9 && /opt/dotnet build -c Release 6 | RUN chmod a+rx /root/ 7 | -------------------------------------------------------------------------------- /Dockerfiles/picard.2.27.5.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install 
openjdk-8-jdk wget picard-tools && apt clean 4 | RUN ln -s /usr/bin/PicardCommandLine /usr/bin/picard 5 | RUN cd /opt && wget https://github.com/broadinstitute/picard/releases/download/2.27.5/picard.jar 6 | RUN apt -y autoremove wget 7 | -------------------------------------------------------------------------------- /Dockerfiles/sambamba.1.0.0.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:23.04 2 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install sambamba 3 | -------------------------------------------------------------------------------- /Dockerfiles/samtools.1.19.2.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install samtools && apt clean 3 | -------------------------------------------------------------------------------- /Dockerfiles/scalpel.0.5.4.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get update && apt-get install -y wget perl make cmake build-essential zlib1g-dev libncurses5-dev libncursesw5-dev cpanminus && apt-get clean 4 | RUN cpanm -f Term::ReadKey && cpanm -f Term::ReadLine && cpanm -f FindBin 5 | RUN cd /opt && wget https://downloads.sourceforge.net/project/scalpel/scalpel-0.5.4.tar.gz && tar -xvf scalpel-0.5.4.tar.gz && ln -s scalpel-0.5.4 scalpel && cd scalpel-0.5.4 && make 6 | RUN cd /opt/scalpel-0.5.4/samtools-1.1 && make && make install 7 | RUN cd /opt && wget https://www.dropbox.com/s/rbegan3opz2fc4k/vcfsorter.pl && chmod a+x vcfsorter.pl 8 | -------------------------------------------------------------------------------- /Dockerfiles/somaticseq.base-1.7.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.10 2 | 3 | RUN export DEBIAN_FRONTEND=noninteractive && \ 4 | apt 
update && \ 5 | apt -y install r-base python3 python3-pip git wget default-jre bedtools && \ 6 | apt-get clean 7 | RUN R -e "install.packages('ada', repos = 'http://cran.rstudio.com/')" 8 | -------------------------------------------------------------------------------- /Dockerfiles/somaticseq.master.dockerfile: -------------------------------------------------------------------------------- 1 | FROM lethalfang/somaticseq:base-1.7 2 | 3 | RUN cd /opt && \ 4 | git clone https://github.com/bioinform/somaticseq && \ 5 | cd somaticseq && \ 6 | pip install --no-cache-dir --break-system-packages . 7 | -------------------------------------------------------------------------------- /Dockerfiles/somaticseq.release.dockerfile: -------------------------------------------------------------------------------- 1 | # Ex: docker build --build-arg VERSION='3.10.0' -f somaticseq.release.dockerfile . 2 | FROM lethalfang/somaticseq:base-1.7 3 | 4 | ARG VERSION 5 | RUN cd /opt && \ 6 | wget https://github.com/bioinform/somaticseq/archive/refs/tags/v${VERSION}.tar.gz && \ 7 | tar -xvf v${VERSION}.tar.gz && \ 8 | mv somaticseq-${VERSION} somaticseq && \ 9 | cd somaticseq && \ 10 | pip install --no-cache-dir --break-system-packages . 
11 | -------------------------------------------------------------------------------- /Dockerfiles/somaticsniper.1.0.5.0-2.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get update && apt-get install -y build-essential git-core cmake zlib1g-dev libncurses-dev 4 | RUN cd /opt/ && git clone https://github.com/genome/somatic-sniper.git && mkdir -p /opt/somatic-sniper/build && cd /opt/somatic-sniper/build && cmake ../ && make deps && make -j 5 | -------------------------------------------------------------------------------- /Dockerfiles/strelka.2.9.5.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get update && apt-get install -y wget bzip2 python && apt-get clean 4 | RUN cd /opt && wget https://github.com/Illumina/strelka/releases/download/untagged-839da48539154c23a780/strelka-2.9.5.centos6_x86_64.tar.bz2 && tar -xvf strelka-2.9.5.centos6_x86_64.tar.bz2 && rm strelka-2.9.5.centos6_x86_64.tar.bz2 && ln -s strelka-2.9.5.centos6_x86_64 strelka 5 | -------------------------------------------------------------------------------- /Dockerfiles/tabix.1.19.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install tabix && apt clean 3 | -------------------------------------------------------------------------------- /Dockerfiles/trimmomatic.0.39.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | RUN export DEBIAN_FRONTEND=noninteractive && apt update && apt -y install wget unzip default-jdk && apt clean 4 | RUN cd /opt && wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.39.zip && unzip Trimmomatic-0.39.zip && ln -s Trimmomatic-0.39 Trimmomatic && cd Trimmomatic && ln -s 
trimmomatic-0.39.jar trimmomatic.jar 5 | -------------------------------------------------------------------------------- /Dockerfiles/vardictjava.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get update && apt-get install -y wget default-jre r-base samtools git 4 | ENV JAVA_HOME='' 5 | RUN cd /opt && wget https://github.com/AstraZeneca-NGS/VarDictJava/releases/download/1.7.0/VarDict-1.7.0.tar && tar -xvf VarDict-1.7.0.tar && ln -s VarDict-1.7.0 VarDictJava && git clone https://github.com/AstraZeneca-NGS/VarDict.git && wget https://www.dropbox.com/s/rbegan3opz2fc4k/vcfsorter.pl && chmod a+x vcfsorter.pl 6 | -------------------------------------------------------------------------------- /Dockerfiles/vcftools.0.1.14-2.dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | RUN apt-get update && apt-get install -y vcftools wget && apt-get clean 4 | RUN cd /opt && wget https://www.dropbox.com/s/rbegan3opz2fc4k/vcfsorter.pl && chmod a+x vcfsorter.pl 5 | RUN cd /opt && wget https://www.dropbox.com/s/bpv098m36j8ljk4/vcftools.script.sh && chmod a+x vcftools.script.sh 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Bina Technologies inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, 14 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 15 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 16 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 17 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 18 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 19 | THE POSSIBILITY OF SUCH DAMAGE. 20 | 21 | -------------------------------------------------------------------------------- /MODULES.md: -------------------------------------------------------------------------------- 1 | # SomaticSeq Modules 2 | 3 | `somaticseq` is the overarching command that takes VCF outputs from individual 4 | callers all the way to the end. For customized or debugging purposes, a number 5 | of modules can be run independently. 6 | 7 | ### Extract features from tumor and normal BAM files for any VCF file 8 | 9 | After all the VCF files are combined, `somaticseq_paired_vcf2tsv` or 10 | `somaticseq_single_vcf2tsv` were invoked to extract genomic and sequencing 11 | features from BAM and VCF files. These modules can be used independently to 12 | extract BAM features with _any_ sorted VCF files, e.g., 13 | 14 | ``` 15 | somaticseq_paired_vcf2tsv -myvcf Variants_Of_Interest.vcf -nbam normal.bam -tbam tumor.bam -ref human.fasta -mincaller 0 -outfile Variants_with_BAM_Features.vcf 16 | ``` 17 | 18 | Notice the `-mincaller 0` option above, which tells the module to extract 19 | features if at least 0 callers have called the variant as somatic. 
In other 20 | words, `-mincaller 0` tells the module to extract features for every input 21 | candidate. Default in SomaticSeq is `-mincaller 0.5` which means it will keep 22 | variants that are LowQual in some callers, but REJECT calls can be excluded. 23 | 24 | Run `somaticseq_paired_vcf2tsv -h` or `somaticseq_single_vcf2tsv -h` to see 25 | command line options. 26 | 27 | ### Convert SomaticSeq TSV file to SomaticSeq VCF file 28 | 29 | Run `somaticseq_tsv2vcf -h` to see all the command line options. The VCF file 30 | (`-vcf/--vcf-out`) is the output file, e.g., 31 | 32 | ``` 33 | somaticseq_tsv2vcf --tsv-in predicted_snvs.tsv --vcf-out predicted_snvs.vcf --pass-threshold 0.7 --lowqual-threshold 0.1 --individual-mutation-tools MuTect2 VarDict Strelka --emit-all --phred-scale --paired-samples 34 | ``` 35 | 36 | It can only work on SomaticSeq generated TSV files. 37 | 38 | ### Train XGBoost model 39 | 40 | Run `somaticseq_xgboost train -h` to see all the options. 41 | 42 | You can combine multiple TSV files to create one single model, and try different 43 | parameters, e.g., 44 | 45 | ``` 46 | somaticseq_xgboost train -tsvs SAMPLE-01_SNVs.tsv SAMPLE-02_SNVs.tsv .... SAMPLE-NN_SNVs.tsv -out SNV.xgboost.classifier -threads 8 -depth 12 -seed 1234 -method hist -iter 250 --extra-params grow_policy:lossguide max_leaves:24 47 | ``` 48 | 49 | ### Train AdaBoost model 50 | 51 | You can only input one TSV file, or combine them manually, e.g., 52 | `somaticseq_concat -infiles */Ensemble.sSNV.tsv -outfile Ensemble.sSNVs.tsv`. 53 | 54 | ``` 55 | ada_model_builder_ntChange.R Ensemble.sSNVs.tsv 56 | ``` 57 | 58 | ### Predict using an XGBoost model 59 | 60 | Run `somaticseq_xgboost predict -h` to see all the options. Be absolutely sure 61 | the training and prediction data match.
62 | 63 | ``` 64 | somaticseq_xgboost predict -model SNV.xgboost.classifier -tsv variant_candidates.tsv -out predicted_variant_set.tsv -ntrees 50 65 | ``` 66 | 67 | ### Predict using an AdaBoost model 68 | 69 | ``` 70 | ada_model_predictor.R snv.classifier.RData snv_candidates.tsv predicted_snvs.tsv 71 | ``` 72 | 73 | ### To remove caller from a super set 74 | 75 | If you have previously created a classifier with MuTect2, MuSE, VarDict, and 76 | Strelka2, but now you want to create another classifier with only MuTect2 and 77 | Strelka2 (maybe you decided you don't want to run MuSE and VarDict anymore), you 78 | don't have to re-run the whole pipeline. You can take the original TSV file, and 79 | create another TSV file as if only MuTect2 and Strelka2 were run, i.e., it will 80 | remove variants that were only called by MuSE and/or VarDict, and then replace 81 | values extracted from those callers as nan. 82 | 83 | ``` 84 | remove_callers_from_somaticseq_tsv.py -infile Merged_from_4_callers.tsv -outfile With_only_MuTect2_Strelka2.tsv -subtract if_VarDict MuSE_Tier 85 | ``` 86 | -------------------------------------------------------------------------------- /docs/Manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/docs/Manual.pdf -------------------------------------------------------------------------------- /docs/SomaticSeqYoutube.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/docs/SomaticSeqYoutube.png -------------------------------------------------------------------------------- /docs/heatmap400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/docs/heatmap400.png 
-------------------------------------------------------------------------------- /docs/precisionfda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/docs/precisionfda.png -------------------------------------------------------------------------------- /docs/workflow400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/docs/workflow400.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "somaticseq" 7 | keywords = ["somatic mutations", "bioinformatics", "genomics", "ngs"] 8 | authors = [ 9 | {name = "Li Tai Fang"}, 10 | {name = "Pegah Tootoonchi Afshar"}, 11 | {name = "Aparna Chhibber"}, 12 | {name = "Marghoob Mohiyuddin"}, 13 | {name = "John C. Mu"}, 14 | {name = "Greg Gibeling"}, 15 | {name = "Sharon Barr"}, 16 | {name = "Narges Bani Asadi"}, 17 | {name = "Hugo Y.K. 
Lam"}, 18 | ] 19 | maintainers = [ 20 | {name = "Li Tai Fang", email = "ltfang@gmail.com"}, 21 | ] 22 | description = "SomaticSeq: An ensemble approach to accurately detect somatic mutations using SomaticSeq" 23 | requires-python = ">=3.11.0" 24 | license = {text = "BSD-2-Clause"} 25 | dependencies = [ 26 | "pysam", 27 | "numpy", 28 | "scipy", 29 | "pandas", 30 | "pybedtools>=0.12.0", 31 | "xgboost>=1.4", 32 | "pydantic>=2.0.0,<3.0", 33 | ] 34 | dynamic = ["version", "readme"] 35 | classifiers = [ 36 | "Development Status :: 5 - Production/Stable", 37 | "Topic :: Scientific/Engineering :: Bio-Informatics", 38 | "Intended Audience :: Science/Research", 39 | "Intended Audience :: Healthcare Industry", 40 | "Programming Language :: Python :: 3", 41 | ] 42 | 43 | [project.urls] 44 | Homepage = "https://github.com/bioinform/somaticseq" 45 | 46 | [project.scripts] 47 | somaticseq = "somaticseq.somaticseq_parallel:main" 48 | somaticseq_parallel = "somaticseq.somaticseq_parallel:main" 49 | somaticseq_xgboost = "somaticseq.somatic_xgboost:main" 50 | somaticseq_tsv2vcf = "somaticseq.somatic_tsv2vcf:main" 51 | somaticseq_single_vcf2tsv = "somaticseq.single_sample_vcf2tsv:main" 52 | somaticseq_paired_vcf2tsv = "somaticseq.somatic_vcf2tsv:main" 53 | somaticseq_concat = "somaticseq.genomic_file_parsers.concat:main" 54 | somaticseq_linguistic_sequence_complexity = "somaticseq.utilities.linguistic_sequence_complexity:main" 55 | somaticseq_loci_counter = "somaticseq.utilities.lociCounterWithLabels:main" 56 | somaticseq_paired_end_bam2fastq = "somaticseq.utilities.paired_end_bam2fastq:main" 57 | somaticseq_split_bed_into_equal_regions = "somaticseq.utilities.split_bed_into_equal_regions:main" 58 | somaticseq_make_alignment_scripts = "somaticseq.utilities.dockered_pipelines.makeAlignmentScripts:main" 59 | somaticseq_make_somatic_scripts = "somaticseq.utilities.dockered_pipelines.makeSomaticScripts:main" 60 | somaticseq_run_workflows = 
"somaticseq.utilities.dockered_pipelines.run_workflows:main" 61 | somaticseq_split_vcf = "somaticseq.vcf_modifier.split_vcf:main" 62 | 63 | [project.optional-dependencies] 64 | dev = [ 65 | "black", 66 | "flake8", 67 | "mypy", 68 | "pytest", 69 | "pytest-mock", 70 | "twine", 71 | ] 72 | 73 | [tool.pytest.ini_options] 74 | testpaths = ["tests"] 75 | addopts = "--import-mode=importlib" 76 | -------------------------------------------------------------------------------- /r_scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/r_scripts/__init__.py -------------------------------------------------------------------------------- /r_scripts/ada_cross_validation.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | require("ada") 4 | 5 | args <- commandArgs(TRUE) 6 | 7 | training_data_filename = args[1] 8 | 9 | ##### Main (entry point) 10 | train_filename = paste(training_data_filename) 11 | train_data = read.table(train_filename, header=TRUE) 12 | 13 | if (!(1 %in% train_data$TrueVariant_or_False && 0 %in% train_data$TrueVariant_or_False)) { 14 | stop("In training mode, there must be both true positives and false positives in the call set.") 15 | } 16 | 17 | # Use substitution identity for training 18 | train_data$GC2CG = 0 19 | train_data$GC2TA = 0 20 | train_data$GC2AT = 0 21 | train_data$TA2AT = 0 22 | train_data$TA2GC = 0 23 | train_data$TA2CG = 0 24 | 25 | train_data$GC2CG[ (train_data$REF=='G' & train_data$ALT=='C') | (train_data$REF=='C' & train_data$ALT=='G') ] = 1 26 | train_data$GC2TA[ (train_data$REF=='G' & train_data$ALT=='T') | (train_data$REF=='C' & train_data$ALT=='A') ] = 1 27 | train_data$GC2AT[ (train_data$REF=='G' & train_data$ALT=='A') | (train_data$REF=='C' & train_data$ALT=='T') ] = 1 28 | train_data$TA2AT[ (train_data$REF=='T' & 
train_data$ALT=='A') | (train_data$REF=='A' & train_data$ALT=='T') ] = 1 29 | train_data$TA2GC[ (train_data$REF=='T' & train_data$ALT=='G') | (train_data$REF=='A' & train_data$ALT=='C') ] = 1 30 | train_data$TA2CG[ (train_data$REF=='T' & train_data$ALT=='C') | (train_data$REF=='A' & train_data$ALT=='G') ] = 1 31 | 32 | # Do not use these for training 33 | train_data$CHROM <- NULL 34 | train_data$POS <- NULL 35 | train_data$ID <- NULL 36 | train_data$REF <- NULL 37 | train_data$ALT <- NULL 38 | train_data$if_COSMIC <- NULL 39 | train_data$COSMIC_CNT <- NULL 40 | train_data$T_VAF_REV <- NULL 41 | train_data$T_VAF_FOR <- NULL 42 | 43 | for (var_i in tail(args, -1) ) { 44 | train_data[, var_i] <- NULL 45 | cat("Remove feature:", var_i, "\n") 46 | } 47 | 48 | 49 | model_formula <- as.formula(TrueVariant_or_False ~ .) 50 | 51 | # Cross validation: 52 | 53 | for (ith_try in 1:10) 54 | 55 | { 56 | # split test/train 50-50 57 | sample <- sample.int(n = nrow(train_data), size = floor(.5*nrow(train_data)), replace = F) 58 | train <- train_data[sample, ] 59 | test <- train_data[-sample, ] 60 | 61 | # do model 62 | ada.model <- ada(model_formula, data = train, iter = 500) 63 | # print(ada.model) 64 | 65 | ada.pred <- predict(ada.model, newdata = test, type="both", n.iter=350) 66 | 67 | # probability > 0.5 68 | pass_calls <- ada.pred$prob[,2] > 0.5 69 | reject_calls <- ada.pred$prob[,2] < 0.1 70 | 71 | # Counting 72 | num_pass_calls <- sum( pass_calls ) 73 | num_reject_calls <- sum( reject_calls ) 74 | num_pass_true_positives <- sum( pass_calls[pass_calls == test$TrueVariant_or_False] ) 75 | num_true_positives <- sum(test$TrueVariant_or_False) 76 | 77 | # Calculate results 78 | precision <- num_pass_true_positives/num_pass_calls 79 | sensitivity <- num_pass_true_positives/num_true_positives 80 | F1_score <- 2 * num_pass_true_positives / ( num_true_positives + num_pass_calls ) 81 | 82 | # Print out 83 | cat (ith_try, 'th_try', '\n') 84 | 85 | cat("PASS_Calls =", num_pass_calls, 
"\n") 86 | cat("REJECT_Calls =", num_reject_calls, "\n") 87 | 88 | cat("PASS_TruePositives =", num_pass_true_positives, "\n") 89 | cat("PASS_FalsePositives =", num_pass_calls - num_pass_true_positives, "\n") 90 | 91 | cat("Sensitivity =", sensitivity, "\n") 92 | cat("Precision =", precision, "\n") 93 | cat("F1 =", F1_score, "\n") 94 | 95 | } 96 | -------------------------------------------------------------------------------- /r_scripts/ada_model_builder.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | require("ada") 4 | 5 | args <- commandArgs(TRUE) 6 | 7 | train_filename = args[1] 8 | 9 | ##### Main (entry point) 10 | train_data = read.table(train_filename, header=TRUE) 11 | 12 | if (!(1 %in% train_data$TrueVariant_or_False && 0 %in% train_data$TrueVariant_or_False)) { 13 | stop("In training mode, there must be both true positives and false positives in the call set.") 14 | } 15 | 16 | # train_data <- train_data[,-c(1, 2, 3, 4, 5)] 17 | 18 | # Do not use these for training 19 | train_data[,'CHROM'] <- NULL 20 | train_data[,'POS'] <- NULL 21 | train_data[,'ID'] <- NULL 22 | train_data[,'REF'] <- NULL 23 | train_data[,'ALT'] <- NULL 24 | train_data[,'if_COSMIC'] <- NULL 25 | train_data[,'COSMIC_CNT'] <- NULL 26 | 27 | train_data$SOR <- as.numeric(train_data$SOR) 28 | 29 | model_formula <- as.formula(TrueVariant_or_False ~ .) 
#!/usr/bin/env Rscript

# Train an AdaBoost ("ada") classifier on a labeled SomaticSeq TSV and save
# it as <input>.Classifier.RData. Unlike the ntChange variant, this script
# does not derive substitution-identity features.

require("ada")

args <- commandArgs(TRUE)

train_filename = args[1]

##### Main (entry point)
train_data = read.table(train_filename, header=TRUE)

# Training requires both classes (1 and 0) to be present in the labels.
if (!(1 %in% train_data$TrueVariant_or_False && 0 %in% train_data$TrueVariant_or_False)) {
    stop("In training mode, there must be both true positives and false positives in the call set.")
}

# train_data <- train_data[,-c(1, 2, 3, 4, 5)]

# Do not use these for training
train_data[,'CHROM'] <- NULL
train_data[,'POS'] <- NULL
train_data[,'ID'] <- NULL
train_data[,'REF'] <- NULL
train_data[,'ALT'] <- NULL
train_data[,'if_COSMIC'] <- NULL
train_data[,'COSMIC_CNT'] <- NULL

# SOR may be read as a factor/character column; force numeric for the model.
train_data$SOR <- as.numeric(train_data$SOR)

# Predict the label from every remaining column.
model_formula <- as.formula(TrueVariant_or_False ~ .)

print("Fitting model...")
ada.model <- ada(model_formula, data = train_data, iter = 500)

save(ada.model, file = paste(train_filename, ".Classifier.RData", sep="") )

print(ada.model)

#pdf( paste(train_filename, ".varplot.pdf", sep = "") )
#varplot(ada.model)
#dev.off()

#pdf( paste(train_filename, ".iterplot.pdf", sep = "") )
#plot(ada.model, TRUE, TRUE)
#dev.off()

#print("Computing prediction values...")
#ada.pred <- predict(ada.model, newdata = test_data, type="both")
#!/usr/bin/env Rscript

# Train an AdaBoost ("ada") classifier on a labeled SomaticSeq TSV, adding six
# strand-collapsed substitution-identity features, and save the model as
# <input>.ada.Classifier.RData. Extra arguments name feature columns to drop.

require("ada")

args <- commandArgs(TRUE)

training_data_filename = args[1]

##### Main (entry point)
train_filename = paste(training_data_filename)
train_data = read.table(train_filename, header=TRUE)

# Training requires both classes (1 and 0) to be present in the labels.
if (!(1 %in% train_data$TrueVariant_or_False && 0 %in% train_data$TrueVariant_or_False)) {
    stop("In training mode, there must be both true positives and false positives in the call set.")
}

# Use substitution identity for training: one binary indicator per class.
train_data$GC2CG = 0
train_data$GC2TA = 0
train_data$GC2AT = 0
train_data$TA2AT = 0
train_data$TA2GC = 0
train_data$TA2CG = 0

train_data$GC2CG[ (train_data$REF=='G' & train_data$ALT=='C') | (train_data$REF=='C' & train_data$ALT=='G') ] = 1
train_data$GC2TA[ (train_data$REF=='G' & train_data$ALT=='T') | (train_data$REF=='C' & train_data$ALT=='A') ] = 1
train_data$GC2AT[ (train_data$REF=='G' & train_data$ALT=='A') | (train_data$REF=='C' & train_data$ALT=='T') ] = 1
train_data$TA2AT[ (train_data$REF=='T' & train_data$ALT=='A') | (train_data$REF=='A' & train_data$ALT=='T') ] = 1
train_data$TA2GC[ (train_data$REF=='T' & train_data$ALT=='G') | (train_data$REF=='A' & train_data$ALT=='C') ] = 1
train_data$TA2CG[ (train_data$REF=='T' & train_data$ALT=='C') | (train_data$REF=='A' & train_data$ALT=='G') ] = 1

# Do not use these for training
train_data$CHROM <- NULL
train_data$POS <- NULL
train_data$ID <- NULL
train_data$REF <- NULL
train_data$ALT <- NULL
train_data$if_COSMIC <- NULL
train_data$COSMIC_CNT <- NULL
train_data$T_VAF_REV <- NULL
train_data$T_VAF_FOR <- NULL

# Optionally drop user-specified feature columns (arguments after the file name)
for (var_i in tail(args, -1) ) {
    train_data[, var_i] <- NULL
    cat("Remove", var_i, "\n")
}

model_formula <- as.formula(TrueVariant_or_False ~ .)

print("Fitting model...")

boosting_iters = 500

# Record the random seed so a run can be reproduced.
seed_value = floor(runif(1, min=100, max=50000))
print( paste("Seed =", seed_value) )
set.seed(seed_value)

# cp=-1/minsplit=0/xval=0 let each tree grow to maxdepth with no pruning or
# internal cross-validation (rpart is attached as a dependency of "ada").
ada.model <- ada(model_formula, data = train_data, iter = boosting_iters, control=rpart.control(cp=-1, maxdepth=16, minsplit=0, xval=0))
save(ada.model, file = paste(training_data_filename, ".ada.Classifier.RData", sep="") )

print(ada.model)
#!/usr/bin/env Rscript

# Score a SomaticSeq TSV with a saved "ada" classifier: load the model from
# args[1], read features from args[2], and write the input rows plus a SCORE
# column (P(true variant)) to args[3].

require("ada")

args <- commandArgs(TRUE)

trained_model = args[1]
test_filename = args[2]
output_filename = args[3]

# Make a copy of the input data since it will be modified, but don't output the modification into the output file
test_data_ = read.table(test_filename, header=TRUE)
test_data <- test_data_

# Create 6 features based on base substitution types, just in case those are training features. Otherwise, doesn't take much time.
test_data$GC2CG = 0
test_data$GC2TA = 0
test_data$GC2AT = 0
test_data$TA2AT = 0
test_data$TA2GC = 0
test_data$TA2CG = 0

test_data$GC2CG[ (test_data$REF=='G' & test_data$ALT=='C') | (test_data$REF=='C' & test_data$ALT=='G') ] = 1
test_data$GC2TA[ (test_data$REF=='G' & test_data$ALT=='T') | (test_data$REF=='C' & test_data$ALT=='A') ] = 1
test_data$GC2AT[ (test_data$REF=='G' & test_data$ALT=='A') | (test_data$REF=='C' & test_data$ALT=='T') ] = 1
test_data$TA2AT[ (test_data$REF=='T' & test_data$ALT=='A') | (test_data$REF=='A' & test_data$ALT=='T') ] = 1
test_data$TA2GC[ (test_data$REF=='T' & test_data$ALT=='G') | (test_data$REF=='A' & test_data$ALT=='C') ] = 1
test_data$TA2CG[ (test_data$REF=='T' & test_data$ALT=='C') | (test_data$REF=='A' & test_data$ALT=='G') ] = 1


# Handle empty input data
if ( nrow(test_data)>=1 ) {
    # load() restores the object saved by the builder under the name ada.model.
    load( trained_model )
    ada.pred <- predict(ada.model, newdata = test_data, type="both", n.iter=300)
    # Column 2 of $prob is the probability of the positive class.
    test_data_output <- cbind(test_data_, SCORE = ada.pred$prob[,2])

} else {

    test_data_output <- test_data_
}

write.table(test_data_output, row.names = FALSE, sep="\t", na = "nan", file = output_filename, quote=FALSE)
#!/usr/bin/env python3
"""Setuptools packaging script for SomaticSeq.

Rewrites relative links and image sources in README.md into absolute GitHub
URLs (pinned to the release tag) so the long description renders correctly on
PyPI, then calls ``setup()``.
"""

import os
import re

from setuptools import find_packages, setup

# Markdown links whose URL is relative (does not start with "http").
# BUG FIX: the previous pattern used "[^http]", a character class that only
# excludes the letters h/t/p as the URL's FIRST character — so relative links
# beginning with those letters (e.g. "pyproject.toml") were silently skipped,
# and the intended "not an absolute http(s) URL" test was never expressed.
# A negative lookahead states the intent correctly.
LINK_PATTERN = r"(\[.*?\]\()(?!http)([^)]*)\)"
# <img ... src="..."> attributes; the (?!http) lookahead keeps absolute image
# URLs untouched, mirroring LINK_PATTERN.
IMAGE_SRC_PATTERN = r'(<img[^>]*src=")(?!http)([^"]*)(")'
BASE_URL = "https://github.com/bioinform/somaticseq"


# Read __version__ from the _version.py file
version_file = os.path.join("somaticseq", "_version.py")
with open(version_file) as f:
    exec(f.read())  # This will define __version__


def modify_markdown_for_3rd_party(base_markdown: str) -> str:
    """Return *base_markdown* with relative links/images made absolute."""

    def _replace_link(match: re.Match) -> str:
        # Replace relative links in .md from [text](RELATIVE/LINK) into
        # [text]({BASE_URL}/blob/TAG/RELATIVE/LINK)
        text = match.group(1)
        url = match.group(2)
        return f"{text}{BASE_URL}/blob/v{__version__}/{url})"  # type: ignore[name-defined] # noqa

    def _replace_src(match: re.Match) -> str:
        # Replace relative image links
        prefix = match.group(1)  # part before the url
        url = match.group(2)  # original url
        suffix = match.group(3)  # part after the url
        return f"{prefix}{BASE_URL}/raw/v{__version__}/{url}{suffix}"  # type: ignore[name-defined] # noqa

    with_abs_url = re.sub(LINK_PATTERN, _replace_link, base_markdown)
    with_abs_img_src = re.sub(IMAGE_SRC_PATTERN, _replace_src, with_abs_url)
    return with_abs_img_src


with open("README.md") as fn:
    long_description = fn.read()
    description_for_3rd_party = modify_markdown_for_3rd_party(long_description)


setup(
    name="somaticseq",
    description=(
        "SomaticSeq: "
        "An ensemble approach to accurately detect somatic mutations using SomaticSeq"
    ),
    version=__version__,  # type: ignore[name-defined] # noqa
    long_description=description_for_3rd_party,
    long_description_content_type="text/markdown",
    author="Li Tai Fang",
    author_email="ltfang@gmail.com",
    url="https://github.com/bioinform/somaticseq",
    packages=find_packages(),
    package_data={"": ["*.R"]},
    python_requires=">=3.11.0",
    setup_requires=["setuptools"],
    install_requires=[  # pyproject.toml overrides them
        "pysam",
        "numpy",
        "scipy",
        "pandas",
        "xgboost>=1.4",
        "pybedtools>=0.12.0",
        "pydantic>=2.0.0,<3.0",
    ],
    scripts=[
        "somaticseq/somaticseq_parallel.py",
        "somaticseq/run_somaticseq.py",
        "somaticseq/single_sample_vcf2tsv.py",
        "somaticseq/somatic_vcf2tsv.py",
        "somaticseq/somatic_xgboost.py",
        "somaticseq/somatic_tsv2vcf.py",
        "somaticseq/genomic_file_parsers/concat.py",
        "somaticseq/utilities/linguistic_sequence_complexity.py",
        "somaticseq/utilities/lociCounterWithLabels.py",
        "somaticseq/utilities/paired_end_bam2fastq.py",
        "somaticseq/utilities/remove_callers_from_somaticseq_tsv.py",
        "somaticseq/utilities/split_bed_into_equal_regions.py",
        "somaticseq/utilities/tally_variants_from_multiple_vcfs.py",
        "somaticseq/utilities/variant_annotation.py",
        "somaticseq/utilities/vcfsorter.pl",
        "somaticseq/utilities/dockered_pipelines/makeAlignmentScripts.py",
        "somaticseq/utilities/dockered_pipelines/makeSomaticScripts.py",
        "somaticseq/utilities/dockered_pipelines/run_workflows.py",
        "somaticseq/vcf_modifier/split_vcf.py",
        "r_scripts/ada_model_builder_ntChange.R",
        "r_scripts/ada_model_predictor.R",
    ],
)
def ntchange(variant_frame):
    """Append six substitution-identity indicator columns to a variant table.

    Each REF/ALT pair (case-insensitive single bases) is classified into one
    of six strand-collapsed substitution classes (e.g. G>T and C>A are both
    "GC2TA"); the matching column gets 1 for that row, all others 0. Rows
    that fit no class (e.g. indels) get 0 in every column.

    Args:
        variant_frame: pandas DataFrame with "REF" and "ALT" columns.

    Returns:
        A new DataFrame (the input is not modified) with columns GC2CG,
        GC2TA, GC2AT, TA2AT, TA2GC, TA2CG added.
    """
    # BUG FIX: the assign() call previously passed the GC2CG list for all six
    # columns (GC2TA=GC2CG, GC2AT=GC2CG, ...), discarding five of the six
    # computed indicator lists. Each column now receives its own values.
    substitution_classes = {
        "GC2CG": {("G", "C"), ("C", "G")},
        "GC2TA": {("G", "T"), ("C", "A")},
        "GC2AT": {("G", "A"), ("C", "T")},
        "TA2AT": {("T", "A"), ("A", "T")},
        "TA2GC": {("T", "G"), ("A", "C")},
        "TA2CG": {("T", "C"), ("A", "G")},
    }
    indicator_columns = {name: [] for name in substitution_classes}
    for ref, alt in zip(variant_frame["REF"], variant_frame["ALT"]):
        pair = (ref.upper(), alt.upper())
        for name, pairs in substitution_classes.items():
            indicator_columns[name].append(1 if pair in pairs else 0)
    return variant_frame.assign(**indicator_columns)
#!/usr/bin/env python3
"""Filter reads out of a BAM file.

Keeps only reads that pass mapping-quality, edit-distance (NM tag),
concordance, and soft-clipping criteria, and writes the survivors to a new
coordinate-indexed BAM.
"""

import argparse

import pysam

parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    "-bamin",
    "--bam-file-in",
    type=str,
    help="Input BAM file",
    required=True,
    default=None,
)
parser.add_argument(
    "-bamout",
    "--bam-file-out",
    type=str,
    help="Output BAM file",
    required=True,
    default=None,
)

parser.add_argument(
    "-maxNM",
    "--max-NM",
    type=int,
    help="filter out high edit distance reads",
    required=False,
    default=8,
)
parser.add_argument(
    "-minMQ",
    "--min-MQ",
    type=float,
    help="filter out low MQ reads",
    required=False,
    default=20,
)
parser.add_argument(
    "-nodisc",
    "--no-discordant",
    action="store_true",
    help="filter out discordant reads",
    required=False,
    default=False,
)
parser.add_argument(
    "-noclip",
    "--no-clipping",
    action="store_true",
    help="filter out soft-clipped reads",
    required=False,
    default=False,
)

args = parser.parse_args()
bam_file = args.bam_file_in
bam_out = args.bam_file_out
maxNM = args.max_NM
minMQ = args.min_MQ
filter_discordant = args.no_discordant
filter_clip = args.no_clipping

# Output BAM uses the input's header as a template.
with (
    pysam.AlignmentFile(bam_file) as bam,
    pysam.AlignmentFile(bam_out, "wb", template=bam) as bamout,
):
    reads = bam.fetch()

    for read_i in reads:
        # Unmapped reads have no CIGAR; the "S" test below requires one.
        assert read_i.cigarstring is not None
        # Keep the read only if it passes every enabled filter:
        # - mapping quality at or above the threshold
        # - has an NM tag with edit distance at or below maxNM
        # - proper pair (unless discordant filtering is disabled)
        # - no soft clipping (unless clip filtering is disabled)
        if (
            read_i.mapping_quality >= minMQ
            and (read_i.has_tag("NM") and read_i.get_tag("NM") <= maxNM)
            and (read_i.is_proper_pair or not filter_discordant)
            and ("S" not in read_i.cigarstring or not filter_clip)
        ):
            bamout.write(read_i)
#!/usr/bin/env python3


class BedFile:
    """In-memory index of a BED file for point-in-region lookups."""

    def __init__(self, BedFile):
        """Read *BedFile* and build a per-contig list of (start, end) tuples."""
        self.BedFile = BedFile
        regions_by_contig = {}
        with open(self.BedFile) as bed_handle:
            for raw_line in bed_handle:
                stripped = raw_line.rstrip()
                if not stripped:
                    # Parsing stops at the first blank line.
                    break
                fields = stripped.split("\t")
                contig = fields[0]
                span = (int(fields[1]), int(fields[2]))
                regions_by_contig.setdefault(contig, []).append(span)
        self.bedRegions = regions_by_contig

    def inRegion(self, contig_i, position_i, ordered=True):
        """Return True if position_i on contig_i lies inside any BED region.

        A position matches region (start, end) when start < position_i <= end.
        With ordered=True the region list is assumed sorted by start, so the
        scan stops early once a region begins beyond position_i; otherwise
        every region of the contig is examined.
        """
        for start, end in self.bedRegions.get(contig_i, []):
            if start < position_i <= end:
                return True
            if ordered and start > position_i:
                break
        return False
#!/usr/bin/env python3
# mypy: ignore-errors
"""Tally calls and true positives for every combination of somatic callers.

Reads a SomaticSeq VCF whose INFO field carries a per-caller 0/1 vector under
the combo code (e.g. MVJSDULK), and prints, for each non-empty subset of
callers, how many variants at least one member of the subset called and how
many of those are labeled TruePositive.
"""

import argparse
import itertools

import somaticseq.genomic_file_parsers.genomic_file_handlers as genome

# argparse Stuff
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    "-vcf",
    "--input-vcf",
    type=str,
    help="SomaticSeq VCF file",
    required=True,
    default=None,
)
parser.add_argument(
    "-combo",
    "--combo-code",
    type=str,
    help="E.g., MVJSDULK",
    required=True,
    default="MVJSDULK",
)

args = parser.parse_args()
vcf = args.input_vcf
combo = args.combo_code

# One single-letter code per caller, in the same order as the INFO vector.
tool_code = list(combo)

# all_combos maps each caller subset to [calls_by_subset, true_positives].
all_combos = {}
for i in range(1, len(tool_code) + 1):
    combo_gen = itertools.combinations(tool_code, i)
    for j in combo_gen:
        all_combos[j] = [0, 0]


with open(vcf) as vcf:
    line_i = vcf.readline().rstrip()

    # Skip the VCF header.
    while line_i.startswith("#"):
        line_i = vcf.readline().rstrip()

    print("#ToolCombo\tTruePositiveCalls\tAllCalls")

    while line_i:
        vcf_i = genome.VCFVariantRecord.from_vcf_line(line_i)
        # INFO value under the combo code is a comma-separated 0/1 vector,
        # one entry per caller.
        combo_i = vcf_i.get_info_value(combo)
        tool_i = combo_i.split(",")
        tool_i = [int(i) for i in tool_i]

        # The set of callers that called this variant.
        current_call_set = set()
        for tool_code_j, tool_j in zip(tool_code, tool_i):
            if tool_j == 1:
                current_call_set.add(tool_code_j)

        # A subset "calls" the variant if any of its members did.
        for combo_j in all_combos:
            if set.intersection(set(combo_j), current_call_set):
                all_combos[combo_j][0] += 1

                # Truth label is carried in the VCF ID field.
                if "TruePositive" in vcf_i.identifier:
                    all_combos[combo_j][1] += 1

        line_i = vcf.readline().rstrip()


# Columns: subset, true positives (index 1), all calls (index 0).
for i in sorted(all_combos):
    print("".join(i) + "\t" + str(all_combos[i][1]) + "\t" + str(all_combos[i][0]))
#!/bin/bash
# Generate (and optionally leave for a scheduler) a script that runs GATK3
# DepthOfCoverage in a docker container against the given BAM.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long output-dir:,bam:,genome-reference:,selector:,minBaseQuality:,minMappingQuality:,extra-arguments:,out-script:,standalone, -n 'coverageDepth.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

# Timestamp makes the generated script name unique per run.
timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

# Default quality cutoffs (GATK DepthOfCoverage arguments).
minBaseQuality=0
minMappingQuality=0


while true; do
    case "$1" in
    -o | --output-dir )
        case "$2" in
        "") shift 2 ;;
        *) outdir=$2 ; shift 2 ;;
        esac ;;

    --bam )
        case "$2" in
        "") shift 2 ;;
        *) bamFile=$2 ; shift 2 ;;
        esac ;;

    --genome-reference )
        case "$2" in
        "") shift 2 ;;
        *) HUMAN_REFERENCE=$2 ; shift 2 ;;
        esac ;;

    --minBaseQuality )
        case "$2" in
        "") shift 2 ;;
        *) minBaseQuality=$2 ; shift 2 ;;
        esac ;;

    --minMappingQuality )
        case "$2" in
        "") shift 2 ;;
        *) minMappingQuality=$2 ; shift 2 ;;
        esac ;;

    --selector )
        case "$2" in
        "") shift 2 ;;
        *) SELECTOR=$2 ; shift 2 ;;
        esac ;;

    --extra-arguments )
        case "$2" in
        "") shift 2 ;;
        *) extra_arguments=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
        "") shift 2 ;;
        *) out_script_name=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

# Either append to a caller-provided script or start a fresh timestamped one.
if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/coverageDepth.${timestamp}.cmd"
fi


# --standalone writes an SGE-style preamble so the script can be qsub'ed.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script
echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
echo "" >> $out_script

# Optional interval restriction; paths are re-rooted under /mnt inside docker.
if [[ ${SELECTOR} ]]
then
    selector_text="-L /mnt/${SELECTOR}"
fi

bamFileName=`basename ${bamFile}`

# Emit the docker + GATK3 DepthOfCoverage command into the generated script.
echo "docker run --rm -v /:/mnt -u $UID broadinstitute/gatk3:3.8-0 \\" >> $out_script
echo "java -Xmx8g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script
echo "-T DepthOfCoverage \\" >> $out_script
echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "-I /mnt/${bamFile} \\" >> $out_script
echo "${selector_text} \\" >> $out_script
echo "--minBaseQuality ${minBaseQuality} \\" >> $out_script
echo "--minMappingQuality ${minMappingQuality} \\" >> $out_script
echo "${extra_arguments} \\" >> $out_script
echo "-o /mnt/${outdir}/${bamFileName}.depth" >> $out_script

echo "" >> $out_script

echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
#!/usr/bin/env python3
"""Generate (and optionally run) a shell script that concatenates fastq.gz
files and re-compresses the result with bgzip inside a tabix container."""

import os
import subprocess
from datetime import datetime

from somaticseq.utilities.dockered_pipelines.container_option import (
    DOCKER_IMAGES,
    container_params,
)

timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S%f")


DEFAULT_PARAMS = {
    "tabix_image": DOCKER_IMAGES.tabix,
    "MEM": 4,
    "output_directory": os.curdir,
    "action": "echo",
    "extra_docker_options": "",
    "script": f"mergeFastqs.{timestamp}.cmd",
    "threads": 1,
}


def gz(infiles, outfq, tech="docker", input_parameters=None, remove_infiles=False):
    """Write and dispatch a script that zcats *infiles* into bgzipped *outfq*.

    Args:
        infiles: paths of input fastq.gz files to merge.
        outfq: path of the merged, bgzipped output fastq.
        tech: container technology passed to container_params (e.g. "docker").
        input_parameters: optional overrides for DEFAULT_PARAMS entries.
        remove_infiles: if True, the generated script deletes the inputs.

    Returns:
        Path of the generated script.
    """
    # BUG FIX: the default used to be the module-level DEFAULT_PARAMS dict
    # itself, and missing keys were written directly into whichever dict the
    # caller passed (mutating the caller's dict, or DEFAULT_PARAMS itself).
    # Merge into a fresh dict instead; caller-supplied keys still win.
    if input_parameters is None:
        input_parameters = {}
    input_parameters = {**DEFAULT_PARAMS, **input_parameters}

    logdir = os.path.join(input_parameters["output_directory"], "logs")
    outfile = os.path.join(logdir, input_parameters["script"])
    all_paths = list(infiles) + [
        outfq,
    ]
    # container_params maps every host path to its mount point in-container.
    tabix_line, file_dictionary = container_params(
        input_parameters["tabix_image"],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters["extra_docker_options"],
    )
    mounted_outfile = file_dictionary[outfq]["mount_path"]
    infile_string = " ".join(
        [file_dictionary[file_i]["mount_path"] for file_i in infiles]
    )

    with open(outfile, "w") as out:
        # SGE-style preamble so the script can be submitted to a scheduler.
        out.write("#!/bin/bash\n\n")
        out.write(f"#$ -o {logdir}\n")
        out.write(f"#$ -e {logdir}\n")
        out.write("#$ -S /bin/bash\n")
        out.write("#$ -l h_vmem={}G\n".format(input_parameters["MEM"]))
        out.write("set -e\n\n")
        out.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n'
        )  # Do not change this: picard_fractional uses this to end the copying.
        out.write(f"{tabix_line} bash -c \\\n")
        out.write(
            '"zcat {} | bgzip -@{} > {}"\n'.format(
                infile_string, input_parameters["threads"], mounted_outfile
            )
        )
        if remove_infiles:
            out.write("rm {}\n\n".format(" ".join(infiles)))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated; default action "echo" just prints it.
    command_line = "{} {}".format(input_parameters["action"], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
#!/bin/bash
# Generate a script that runs GATK3 indel realignment (RealignerTargetCreator
# followed by IndelRealigner) on a single BAM inside a docker container.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long output-dir:,tumor-bam:,genome-reference:,selector:,threads:,extra-arguments:,out-script:,standalone, -n 'singleIndelRealign.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

# Timestamp makes generated file names unique per run.
timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )
threads=1

while true; do
    case "$1" in
    -o | --output-dir )
        case "$2" in
        "") shift 2 ;;
        *) outdir=$2 ; shift 2 ;;
        esac ;;

    --tumor-bam )
        case "$2" in
        "") shift 2 ;;
        *) tumorBam=$2 ; shift 2 ;;
        esac ;;

    --genome-reference )
        case "$2" in
        "") shift 2 ;;
        *) HUMAN_REFERENCE=$2 ; shift 2 ;;
        esac ;;

    --selector )
        case "$2" in
        "") shift 2 ;;
        *) SELECTOR=$2 ; shift 2 ;;
        esac ;;

    --threads )
        case "$2" in
        "") shift 2 ;;
        *) threads=$2 ; shift 2 ;;
        esac ;;

    --extra-arguments )
        case "$2" in
        "") shift 2 ;;
        *) extra_arguments=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
        "") shift 2 ;;
        *) out_script_name=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac
done

logdir=${outdir}/logs
mkdir -p ${logdir}

# Either append to a caller-provided script or start a fresh timestamped one.
if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/singleIndelRealign.${timestamp}.cmd"
fi


# --standalone writes an SGE-style preamble so the script can be qsub'ed.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script


# Optional interval restriction; paths are re-rooted under /mnt inside docker.
if [[ ${SELECTOR} ]]
then
    selector_text="-L /mnt/${SELECTOR}"
fi


# Step 1: RealignerTargetCreator writes candidate realignment intervals.
echo "docker run --rm -v /:/mnt -u $UID broadinstitute/gatk3:3.8-1 \\" >> $out_script
echo "java -Xmx8g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script
echo "-T RealignerTargetCreator \\" >> $out_script
echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "-I /mnt/${tumorBam} \\" >> $out_script
echo "-nt ${threads} \\" >> $out_script
echo "${selector_text} \\" >> $out_script
echo "-o /mnt/${outdir}/indelRealign.${timestamp}.intervals" >> $out_script

echo "" >> $out_script

tumorBamFileName=`basename ${tumorBam}`
tumorOut=${tumorBamFileName%.bam}.indelRealigned.bam

# Step 2: IndelRealigner realigns reads over those intervals.
echo "docker run --rm -v /:/mnt -u $UID broadinstitute/gatk3:3.8-1 \\" >> $out_script
echo "java -Xmx8g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script
echo "-T IndelRealigner \\" >> $out_script
echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "-I /mnt/${tumorBam} \\" >> $out_script
echo "-targetIntervals /mnt/${outdir}/indelRealign.${timestamp}.intervals \\" >> $out_script
echo "${selector_text} \\" >> $out_script
echo "${extra_arguments} \\" >> $out_script
echo "-o /mnt/${outdir}/${tumorOut}" >> $out_script


echo "" >> $out_script

# GATK writes <name>.bai; rename to <name>.bam.bai for downstream tools.
echo "mv ${outdir}/${tumorOut%.bam}.bai ${outdir}/${tumorOut}.bai" >> $out_script
import os
import subprocess
from datetime import datetime

from somaticseq.utilities.dockered_pipelines.container_option import (
    DOCKER_IMAGES,
    container_params,
)

timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S%f")


# Fallback values applied to any parameter the caller does not supply.
DEFAULT_PARAMS = {
    "somaticseq_image": DOCKER_IMAGES.somaticseq,
    "MEM": 2,
    "output_directory": os.curdir,
    "extra_docker_options": "",
    "script": f"spreadFastq.{timestamp}.cmd",
    "action": "echo",
    "threads": 1,
}


def spread(
    in_fastqs, out_fastqs, tech="docker", input_parameters=None, remove_infiles=False
):
    """Write a bash script that spreads ``in_fastqs`` into ``out_fastqs``.

    The generated script runs ``concat.py -spread -bgzip`` inside a container
    (docker or singularity, per ``tech``) with every input/output path mounted,
    then the script itself is passed to ``input_parameters["action"]`` (default
    ``echo``, i.e. just print its path) via ``subprocess.call``.

    Args:
        in_fastqs: host paths of the input fastq files.
        out_fastqs: host paths the spread output fastqs should be written to.
        tech: container runtime, "docker" or "singularity".
        input_parameters: optional overrides for DEFAULT_PARAMS keys.
        remove_infiles: if True, the generated script also ``rm``s the inputs.

    Returns:
        Path of the generated script file.
    """
    # BUGFIX: the original signature used a mutable default (input_parameters={})
    # and then mutated it, leaking filled-in defaults across calls. A None
    # sentinel preserves the old behavior for explicit callers (their dict is
    # still completed in place) without the shared-state hazard.
    if input_parameters is None:
        input_parameters = {}
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    logdir = os.path.join(input_parameters["output_directory"], "logs")
    # Ensure the log directory exists before opening the script file in it
    # (the bash scripts in this pipeline do the equivalent `mkdir -p`).
    os.makedirs(logdir, exist_ok=True)
    outfile = os.path.join(logdir, input_parameters["script"])

    all_paths = list(in_fastqs) + list(out_fastqs)
    spread_line, file_dictionary = container_params(
        input_parameters["somaticseq_image"],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters["extra_docker_options"],
    )

    # Translate host paths into their in-container mount paths.
    infastq_string = " ".join(
        [file_dictionary[file_i]["mount_path"] for file_i in in_fastqs]
    )
    outfastq_string = " ".join(
        [file_dictionary[file_i]["mount_path"] for file_i in out_fastqs]
    )

    with open(outfile, "w") as out:
        out.write("#!/bin/bash\n\n")

        # SGE directives so the script can be qsub'd directly.
        out.write(f"#$ -o {logdir}\n")
        out.write(f"#$ -e {logdir}\n")
        out.write("#$ -S /bin/bash\n")
        out.write("#$ -l h_vmem={}G\n".format(input_parameters["MEM"]))
        out.write("set -e\n\n")

        out.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n'
        )  # Do not change this: picard_fractional uses this to end the copying.

        out.write(f"{spread_line} \\\n")
        out.write(
            "concat.py -spread -bgzip -nt {} -infiles {} -outfiles {} \n".format(
                input_parameters["threads"], infastq_string, outfastq_string
            )
        )

        if remove_infiles:
            # Inputs are removed by their host paths (the rm runs outside the
            # container).
            out.write("rm {}\n\n".format(" ".join(in_fastqs)))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated (default action is just `echo`).
    command_line = "{} {}".format(input_parameters["action"], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
>&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | out_tag='JointRealigned' 18 | #extra_arguments='-dt NONE --maxReadsForConsensuses 150000 --maxReadsInMemory 500000 --maxReadsForRealignment 2000000' 19 | 20 | while true; do 21 | case "$1" in 22 | -o | --output-dir ) 23 | case "$2" in 24 | "") shift 2 ;; 25 | *) outdir=$2 ; shift 2 ;; 26 | esac ;; 27 | 28 | --tumor-bam ) 29 | case "$2" in 30 | "") shift 2 ;; 31 | *) tbam=$2 ; shift 2 ;; 32 | esac ;; 33 | 34 | --normal-bam ) 35 | case "$2" in 36 | "") shift 2 ;; 37 | *) nbam=$2 ; shift 2 ;; 38 | esac ;; 39 | 40 | --genome-reference ) 41 | case "$2" in 42 | "") shift 2 ;; 43 | *) HUMAN_REFERENCE=$2 ; shift 2 ;; 44 | esac ;; 45 | 46 | --selector ) 47 | case "$2" in 48 | "") shift 2 ;; 49 | *) SELECTOR=$2 ; shift 2 ;; 50 | esac ;; 51 | 52 | --out-tag ) 53 | case "$2" in 54 | "") shift 2 ;; 55 | *) out_tag=$2 ; shift 2 ;; 56 | esac ;; 57 | 58 | --extra-arguments ) 59 | case "$2" in 60 | "") shift 2 ;; 61 | *) extra_arguments=$2 ; shift 2 ;; 62 | esac ;; 63 | 64 | --out-script ) 65 | case "$2" in 66 | "") shift 2 ;; 67 | *) out_script_name=$2 ; shift 2 ;; 68 | esac ;; 69 | 70 | --standalone ) 71 | standalone=1 ; shift ;; 72 | 73 | -- ) shift; break ;; 74 | * ) break ;; 75 | esac 76 | done 77 | 78 | logdir=${outdir}/logs 79 | mkdir -p ${logdir} 80 | 81 | if [[ ${out_script_name} ]] 82 | then 83 | out_script="${out_script_name}" 84 | else 85 | out_script="${logdir}/indelRealign.${timestamp}.cmd" 86 | fi 87 | 88 | if [[ $standalone ]] 89 | then 90 | echo "#!/bin/bash" > $out_script 91 | echo "" >> $out_script 92 | echo "#$ -o ${logdir}" >> $out_script 93 | echo "#$ -e ${logdir}" >> $out_script 94 | echo "#$ -S /bin/bash" >> $out_script 95 | echo '#$ -l h_vmem=10G' >> $out_script 96 | echo 'set -e' >> $out_script 97 | fi 98 | 99 | echo "" >> $out_script 100 | 101 | if [[ $SELECTOR ]] 102 | then 
103 | selector_input="-L /mnt/${SELECTOR}" 104 | fi 105 | 106 | echo "docker run --rm -v /:/mnt -u $UID --memory 15g broadinstitute/gatk3:3.8-1 java -Xmx14g -jar GenomeAnalysisTK.jar \\" >> $out_script 107 | echo "-T RealignerTargetCreator \\" >> $out_script 108 | echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script 109 | echo "-I /mnt/${tbam} \\" >> $out_script 110 | echo "-I /mnt/${nbam} \\" >> $out_script 111 | echo "$selector_input \\" >> $out_script 112 | echo "-o /mnt/${outdir}/T.N.intervals" >> $out_script 113 | echo "" >> $out_script 114 | 115 | echo "docker run --rm -v /:/mnt -u $UID --memory 15g -w /mnt/${outdir} broadinstitute/gatk3:3.8-1 \\" >> $out_script 116 | echo "java -Xmx14g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script 117 | echo "-T IndelRealigner \\" >> $out_script 118 | echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script 119 | echo "-I /mnt/${tbam} \\" >> $out_script 120 | echo "-I /mnt/${nbam} \\" >> $out_script 121 | echo "-targetIntervals /mnt/${outdir}/T.N.intervals \\" >> $out_script 122 | echo "${extra_arguments} \\" >> $out_script 123 | echo "-nWayOut .${out_tag}.bam" >> $out_script 124 | echo "" >> $out_script 125 | 126 | realigned_normal=${nbam%.bam}.${out_tag}.bam 127 | realigned_tumor=${tbam%.bam}.${out_tag}.bam 128 | 129 | echo "mv ${realigned_normal%.bam}.bai ${realigned_normal}.bai" >> $out_script 130 | echo "mv ${realigned_tumor%.bam}.bai ${realigned_tumor}.bai" >> $out_script 131 | -------------------------------------------------------------------------------- /somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/MergeTN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,tumor-bam:,normal-bam:,bam-out:,out-script:,standalone -n 'MergeTN.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." 
>&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | keep_intermediates=0 18 | outSM='TN_Merged' 19 | 20 | while true; do 21 | case "$1" in 22 | -o | --output-dir ) 23 | case "$2" in 24 | "") shift 2 ;; 25 | *) outdir=$2 ; shift 2 ;; 26 | esac ;; 27 | 28 | --bam-out ) 29 | case "$2" in 30 | "") shift 2 ;; 31 | *) outbam=$2 ; shift 2 ;; 32 | esac ;; 33 | 34 | --tumor-bam ) 35 | case "$2" in 36 | "") shift 2 ;; 37 | *) tbam=$2 ; shift 2 ;; 38 | esac ;; 39 | 40 | --normal-bam ) 41 | case "$2" in 42 | "") shift 2 ;; 43 | *) nbam=$2 ; shift 2 ;; 44 | esac ;; 45 | 46 | --out-script ) 47 | case "$2" in 48 | "") shift 2 ;; 49 | *) out_script_name=$2 ; shift 2 ;; 50 | esac ;; 51 | 52 | --standalone ) 53 | standalone=1 ; shift ;; 54 | 55 | -- ) shift; break ;; 56 | * ) break ;; 57 | esac 58 | done 59 | 60 | logdir=${outdir}/logs 61 | mkdir -p ${logdir} 62 | 63 | if [[ ${out_script_name} ]] 64 | then 65 | out_script="${out_script_name}" 66 | else 67 | out_script="${logdir}/mergeBams.${timestamp}.cmd" 68 | fi 69 | 70 | if [[ $standalone ]] 71 | then 72 | echo "#!/bin/bash" > $out_script 73 | echo "" >> $out_script 74 | echo "#$ -o ${logdir}" >> $out_script 75 | echo "#$ -e ${logdir}" >> $out_script 76 | echo "#$ -S /bin/bash" >> $out_script 77 | echo '#$ -l h_vmem=8G' >> $out_script 78 | echo 'set -e' >> $out_script 79 | fi 80 | 81 | echo "" >> $out_script 82 | 83 | # Merge the 2 BAM files 84 | echo "docker run -v /:/mnt -u $UID --memory 6g --rm lethalfang/bamsurgeon:1.1-3 \\" >> $out_script 85 | echo "java -Xmx6g -jar /usr/local/bin/picard.jar MergeSamFiles \\" >> $out_script 86 | echo "I=/mnt/${nbam} \\" >> $out_script 87 | echo "I=/mnt/${tbam} \\" >> $out_script 88 | echo "ASSUME_SORTED=true \\" >> $out_script 89 | echo "CREATE_INDEX=true \\" >> $out_script 90 | echo "O=/mnt/${outdir}/${outbam}" >> $out_script 91 | echo "" >> $out_script 92 | 93 
| # Remove temp files 94 | echo "mv ${outdir}/${outbam%.bam}.bai ${outdir}/${outbam}.bai" >> $out_script 95 | echo "" >> $out_script 96 | -------------------------------------------------------------------------------- /somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/Reheader_SM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,out-SM:,out-script:,standalone -n 'Reheader_SM.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | keep_intermediates=0 18 | outSM='TN_Merged' 19 | 20 | while true; do 21 | case "$1" in 22 | -o | --output-dir ) 23 | case "$2" in 24 | "") shift 2 ;; 25 | *) outdir=$2 ; shift 2 ;; 26 | esac ;; 27 | 28 | --bam-out ) 29 | case "$2" in 30 | "") shift 2 ;; 31 | *) outbam=$2 ; shift 2 ;; 32 | esac ;; 33 | 34 | --bam-in ) 35 | case "$2" in 36 | "") shift 2 ;; 37 | *) inbam=$2 ; shift 2 ;; 38 | esac ;; 39 | 40 | --out-SM ) 41 | case "$2" in 42 | "") shift 2 ;; 43 | *) outSM=$2 ; shift 2 ;; 44 | esac ;; 45 | 46 | --out-script ) 47 | case "$2" in 48 | "") shift 2 ;; 49 | *) out_script_name=$2 ; shift 2 ;; 50 | esac ;; 51 | 52 | --standalone ) 53 | standalone=1 ; shift ;; 54 | 55 | -- ) shift; break ;; 56 | * ) break ;; 57 | esac 58 | done 59 | 60 | logdir=${outdir}/logs 61 | mkdir -p ${logdir} 62 | 63 | if [[ ${out_script_name} ]] 64 | then 65 | out_script="${out_script_name}" 66 | else 67 | out_script="${logdir}/reheader.${timestamp}.cmd" 68 | fi 69 | 70 | if [[ $standalone ]] 71 | then 72 | echo "#!/bin/bash" > $out_script 73 | echo "" >> $out_script 74 | echo "#$ -o ${logdir}" >> $out_script 75 | echo "#$ -e ${logdir}" >> $out_script 76 | echo "#$ -S 
/bin/bash" >> $out_script 77 | echo '#$ -l h_vmem=8G' >> $out_script 78 | echo 'set -e' >> $out_script 79 | fi 80 | 81 | echo "" >> $out_script 82 | 83 | # Uniform sample and read group names in the merged file 84 | echo "docker run -v /:/mnt -u $UID --rm lethalfang/bamsurgeon:1.1-3 \\" >> $out_script 85 | echo "java -Xmx6g -jar /usr/local/bin/picard.jar AddOrReplaceReadGroups \\" >> $out_script 86 | echo "I=/mnt/${outdir}/${inbam} \\" >> $out_script 87 | echo "RGID=BAMSurgeon \\" >> $out_script 88 | echo "RGLB=TNMerged \\" >> $out_script 89 | echo "RGPL=illumina \\" >> $out_script 90 | echo "RGPU=BAMSurgeon \\" >> $out_script 91 | echo "RGSM=${outSM} \\" >> $out_script 92 | echo "CREATE_INDEX=true \\" >> $out_script 93 | echo "O=/mnt/${outdir}/${outbam}" >> $out_script 94 | echo "" >> $out_script 95 | 96 | # Remove temp files 97 | echo "mv ${outdir}/${outbam%.bam}.bai ${outdir}/${outbam}.bai" >> $out_script 98 | echo "" >> $out_script 99 | -------------------------------------------------------------------------------- /somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/SortByCoordinate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,genome-reference:,out-script:,standalone -n 'SortByCoordinate.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." 
>&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | seed=$( date +"%Y" ) 18 | 19 | while true; do 20 | case "$1" in 21 | -o | --output-dir ) 22 | case "$2" in 23 | "") shift 2 ;; 24 | *) outdir=$2 ; shift 2 ;; 25 | esac ;; 26 | 27 | --bam-in ) 28 | case "$2" in 29 | "") shift 2 ;; 30 | *) inbam=$2 ; shift 2 ;; 31 | esac ;; 32 | 33 | --bam-out ) 34 | case "$2" in 35 | "") shift 2 ;; 36 | *) outbam=$2 ; shift 2 ;; 37 | esac ;; 38 | 39 | --genome-reference ) 40 | case "$2" in 41 | "") shift 2 ;; 42 | *) HUMAN_REFERENCE=$2 ; shift 2 ;; 43 | esac ;; 44 | 45 | --out-script ) 46 | case "$2" in 47 | "") shift 2 ;; 48 | *) out_script_name=$2 ; shift 2 ;; 49 | esac ;; 50 | 51 | --standalone ) 52 | standalone=1 ; shift ;; 53 | 54 | -- ) shift; break ;; 55 | * ) break ;; 56 | esac 57 | done 58 | 59 | hg_dict=${HUMAN_REFERENCE%\.fa*}.dict 60 | 61 | logdir=${outdir}/logs 62 | mkdir -p ${logdir} 63 | 64 | if [[ ${out_script_name} ]] 65 | then 66 | out_script="${out_script_name}" 67 | else 68 | out_script="${logdir}/sort.coordinates.${timestamp}.cmd" 69 | fi 70 | 71 | 72 | if [[ $standalone ]] 73 | then 74 | echo "#!/bin/bash" > $out_script 75 | echo "" >> $out_script 76 | echo "#$ -o ${logdir}" >> $out_script 77 | echo "#$ -e ${logdir}" >> $out_script 78 | echo "#$ -S /bin/bash" >> $out_script 79 | echo '#$ -l h_vmem=8G' >> $out_script 80 | echo 'set -e' >> $out_script 81 | fi 82 | 83 | 84 | echo "" >> $out_script 85 | 86 | echo "docker run -v /:/mnt -u $UID --rm lethalfang/samtools:1.7 \\" >> $out_script 87 | echo "samtools sort -m 4G --reference /mnt/${HUMAN_REFERENCE} \\" >> $out_script 88 | echo "-o /mnt/${outdir}/${outbam} /mnt/${inbam}" >> $out_script 89 | echo "" >> $out_script 90 | 91 | echo "docker run -v /:/mnt -u $UID --rm lethalfang/samtools:1.7 \\" >> $out_script 92 | echo "samtools index /mnt/${outdir}/${outbam}" >> $out_script 93 | 
#!/bin/bash
# Generate a command file that sorts a BAM by read name (queryname) with
# samtools inside a docker container. With --standalone, the command file is
# emitted as a self-contained SGE-submittable script.
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,out-script:,standalone -n 'SortByReadName.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

while true; do
    case "$1" in
    -o | --output-dir )
        case "$2" in
            "") shift 2 ;;
            *) outdir=$2 ; shift 2 ;;
        esac ;;

    --bam-in )
        case "$2" in
            "") shift 2 ;;
            *) inbam=$2 ; shift 2 ;;
        esac ;;

    --bam-out )
        case "$2" in
            "") shift 2 ;;
            *) outbam=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
            "") shift 2 ;;
            *) out_script_name=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac
done

# NOTE: removed unused `seed` and `hg_dict` (the latter referenced an
# undefined HUMAN_REFERENCE variable) present in the original copy-paste.

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/sort.qname.${timestamp}.cmd"
fi


if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=8G' >> $out_script
    echo 'set -e' >> $out_script
fi


echo "" >> $out_script

echo "docker run -v /:/mnt -u $UID --rm lethalfang/samtools:1.7 \\" >> $out_script
echo "samtools sort -n -m 4G \\" >> $out_script
echo "-o /mnt/${outdir}/${outbam} \\" >> $out_script
# BUGFIX: the input BAM is the last argument, so it must NOT end with a
# line-continuation backslash (the original dangling `\` swallowed the
# following blank line into the command).
echo "/mnt/${inbam}" >> $out_script
echo "" >> $out_script
--out-script ) 71 | case "$2" in 72 | "") shift 2 ;; 73 | *) out_script_name=$2 ; shift 2 ;; 74 | esac ;; 75 | 76 | --clean-bam ) 77 | clean_bam=1 ; shift ;; 78 | 79 | --standalone ) 80 | standalone=1 ; shift ;; 81 | 82 | -- ) shift; break ;; 83 | * ) break ;; 84 | esac 85 | done 86 | 87 | hg_dict=${HUMAN_REFERENCE%\.fa*}.dict 88 | 89 | logdir=${outdir}/logs 90 | mkdir -p ${logdir} 91 | 92 | if [[ ${out_script_name} ]] 93 | then 94 | out_script="${out_script_name}" 95 | else 96 | out_script="${logdir}/splitBams.${timestamp}.cmd" 97 | fi 98 | 99 | if [[ $standalone ]] 100 | then 101 | echo "#!/bin/bash" > $out_script 102 | echo "" >> $out_script 103 | echo "#$ -o ${logdir}" >> $out_script 104 | echo "#$ -e ${logdir}" >> $out_script 105 | echo "#$ -S /bin/bash" >> $out_script 106 | echo '#$ -l h_vmem=8G' >> $out_script 107 | echo 'set -e' >> $out_script 108 | fi 109 | 110 | 111 | echo "" >> $out_script 112 | 113 | 114 | # Then you can split 115 | echo "docker run -v /:/mnt -u $UID --rm --memory 8g lethalfang/bamsurgeon:1.1-3 \\" >> $out_script 116 | echo "/usr/local/bamsurgeon/scripts/sortedBamSplit.py \\" >> $out_script 117 | echo "--bam /mnt/${inbam} \\" >> $out_script 118 | echo "--proportion ${proportion} \\" >> $out_script 119 | echo "--downsample ${down_sample} \\" >> $out_script 120 | echo "--pick1 /mnt/${outdir}/${outbam1} \\" >> $out_script 121 | echo "--pick2 /mnt/${outdir}/${outbam2} \\" >> $out_script 122 | echo "--seed ${seed}" >> $out_script 123 | echo "" >> $out_script 124 | 125 | echo "docker run -v /:/mnt -u $UID --rm --memory 8g lethalfang/samtools:1.7 samtools index /mnt/${outdir}/${outbam1}" >> $out_script 126 | echo "docker run -v /:/mnt -u $UID --rm --memory 8g lethalfang/samtools:1.7 samtools index /mnt/${outdir}/${outbam2}" >> $out_script 127 | echo "" >> $out_script 128 | -------------------------------------------------------------------------------- /somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/cleanBam.sh: 
#!/bin/bash
# Generate a command file that runs bamsurgeon's
# remove_reads_with_many_qnames_or_bad_CIGAR.py on a BAM inside docker.
# With --standalone, the command file is emitted as a self-contained
# SGE-submittable script.
# Use getopt instead of getopts for long options

set -e

# BUGFIX: getopt error-message name was 'SortByReadName.sh' (copy-paste).
OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,out-script:,standalone -n 'cleanBam.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )

while true; do
    case "$1" in
    -o | --output-dir )
        case "$2" in
            "") shift 2 ;;
            *) outdir=$2 ; shift 2 ;;
        esac ;;

    --bam-in )
        case "$2" in
            "") shift 2 ;;
            *) inbam=$2 ; shift 2 ;;
        esac ;;

    --bam-out )
        case "$2" in
            "") shift 2 ;;
            *) outbam=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
            "") shift 2 ;;
            *) out_script_name=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac
done

# BUGFIX: ${logdir} was used below (default out_script path and the SGE
# -o/-e directives) but never defined in this script; define and create it
# the same way the sibling scripts in this directory do.
logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/cleanBam.${timestamp}.cmd"
fi

if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=4G' >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script

echo "docker run -v /:/mnt -u $UID --rm lethalfang/bamsurgeon:1.1-3 \\" >> $out_script
echo "/usr/local/bamsurgeon/scripts/remove_reads_with_many_qnames_or_bad_CIGAR.py \\" >> $out_script
echo "-bamin /mnt/${inbam} \\" >> $out_script
echo "-bamout /mnt/${outdir}/${outbam}" >> $out_script
echo "" >> $out_script
#!/usr/bin/env python3
"""Stream a VCF from stdin to stdout, replacing any character in the REF
(column 4) and ALT (column 5) fields that is not a standard base, an N, a
comma, or a digit with 'N'. Header lines (starting with '#') pass through
unchanged."""

import re
import sys

# Compiled once: anything outside [gctanGCTAN], ',' and '0'-'9' is replaced.
NON_STANDARD_BASE = re.compile(r"[^gctanGCTAN,0-9]")

for vcf_line in sys.stdin:
    if vcf_line.startswith("#"):
        # Header lines keep their original trailing newline.
        print(vcf_line, end="")
        continue

    fields = vcf_line.rstrip().split("\t")
    fields[3] = NON_STANDARD_BASE.sub("N", fields[3])
    fields[4] = NON_STANDARD_BASE.sub("N", fields[4])
    print("\t".join(fields))
>&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | keep_intermediates=0 18 | 19 | while true; do 20 | case "$1" in 21 | -o | --output-dir ) 22 | case "$2" in 23 | "") shift 2 ;; 24 | *) outdir=$2 ; shift 2 ;; 25 | esac ;; 26 | 27 | --bam-out ) 28 | case "$2" in 29 | "") shift 2 ;; 30 | *) outbam=$2 ; shift 2 ;; 31 | esac ;; 32 | 33 | --bam-string ) 34 | case "$2" in 35 | "") shift 2 ;; 36 | *) bam_string=$2 ; shift 2 ;; 37 | esac ;; 38 | 39 | --out-script ) 40 | case "$2" in 41 | "") shift 2 ;; 42 | *) out_script_name=$2 ; shift 2 ;; 43 | esac ;; 44 | 45 | --standalone ) 46 | standalone=1 ; shift ;; 47 | 48 | -- ) shift; break ;; 49 | * ) break ;; 50 | esac 51 | done 52 | 53 | logdir=${outdir}/logs 54 | mkdir -p ${logdir} 55 | 56 | if [[ ${out_script_name} ]] 57 | then 58 | out_script="${out_script_name}" 59 | else 60 | out_script="${logdir}/mergeBams.${timestamp}.cmd" 61 | fi 62 | 63 | if [[ $standalone ]] 64 | then 65 | echo "#!/bin/bash" > $out_script 66 | echo "" >> $out_script 67 | echo "#$ -o ${logdir}" >> $out_script 68 | echo "#$ -e ${logdir}" >> $out_script 69 | echo "#$ -S /bin/bash" >> $out_script 70 | echo '#$ -l h_vmem=8G' >> $out_script 71 | echo 'set -e' >> $out_script 72 | fi 73 | 74 | echo "" >> $out_script 75 | 76 | 77 | for file in ${bam_string} 78 | do 79 | input_file_string="I=/mnt/${file} ${input_file_string}" 80 | done 81 | 82 | # Merge the BAM files 83 | echo "docker run -v /:/mnt -u $UID --memory 8g --rm lethalfang/bamsurgeon:1.1-3 \\" >> $out_script 84 | echo "java -Xmx8g -jar /usr/local/bin/picard.jar MergeSamFiles \\" >> $out_script 85 | echo "${input_file_string} \\" >> $out_script 86 | echo "ASSUME_SORTED=true \\" >> $out_script 87 | echo "CREATE_INDEX=true \\" >> $out_script 88 | echo "O=/mnt/${outdir}/${outbam}" >> $out_script 89 | echo "" >> $out_script 90 | 91 | # Remove temp files 92 | echo "mv 
${outdir}/${outbam%.bam}.bai ${outdir}/${outbam}.bai" >> $out_script 93 | echo "" >> $out_script 94 | -------------------------------------------------------------------------------- /somaticseq/utilities/dockered_pipelines/bamSimulator/bamSurgeon/split_BAM_by_BED.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,bam-in:,bam-out:,selector:,out-script:,standalone -n 'split_BAM_by_BED.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | while true; do 18 | case "$1" in 19 | -o | --output-dir ) 20 | case "$2" in 21 | "") shift 2 ;; 22 | *) outdir=$2 ; shift 2 ;; 23 | esac ;; 24 | 25 | --bam-in ) 26 | case "$2" in 27 | "") shift 2 ;; 28 | *) inbam=$2 ; shift 2 ;; 29 | esac ;; 30 | 31 | --bam-out ) 32 | case "$2" in 33 | "") shift 2 ;; 34 | *) outbam=$2 ; shift 2 ;; 35 | esac ;; 36 | 37 | --selector ) 38 | case "$2" in 39 | "") shift 2 ;; 40 | *) SELECTOR=$2 ; shift 2 ;; 41 | esac ;; 42 | 43 | --out-script ) 44 | case "$2" in 45 | "") shift 2 ;; 46 | *) out_script_name=$2 ; shift 2 ;; 47 | esac ;; 48 | 49 | --standalone ) 50 | standalone=1 ; shift ;; 51 | 52 | -- ) shift; break ;; 53 | * ) break ;; 54 | esac 55 | done 56 | 57 | 58 | logdir=${outdir}/logs 59 | mkdir -p ${logdir} 60 | 61 | if [[ ${out_script_name} ]] 62 | then 63 | out_script="${out_script_name}" 64 | else 65 | out_script="${logdir}/splitByBed.${timestamp}.cmd" 66 | fi 67 | 68 | if [[ $standalone ]] 69 | then 70 | echo "#!/bin/bash" > $out_script 71 | echo "" >> $out_script 72 | echo "#$ -o ${logdir}" >> $out_script 73 | echo "#$ -e ${logdir}" >> $out_script 74 | echo "#$ -S /bin/bash" >> $out_script 75 | echo '#$ -l h_vmem=4G' >> $out_script 
76 | echo 'set -e' >> $out_script 77 | fi 78 | 79 | echo "" >> $out_script 80 | 81 | 82 | echo "docker run --rm -v /:/mnt -u $UID --memory 4g lethalfang/samtools:1.7 bash -c \\" >> $out_script 83 | echo "\"samtools view /mnt/${inbam} -L /mnt/${SELECTOR} -Sbh \\" >> $out_script 84 | echo "> /mnt/${outdir}/${outbam}\"" >> $out_script 85 | 86 | echo "" >> $out_script 87 | 88 | echo "docker run --rm -v /:/mnt -u $UID --memory 4g lethalfang/samtools:1.7 \\" >> $out_script 89 | echo "samtools index /mnt/${outdir}/${outbam}" >> $out_script 90 | -------------------------------------------------------------------------------- /somaticseq/utilities/dockered_pipelines/bamSimulator/dream_sim.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/somaticseq/utilities/dockered_pipelines/bamSimulator/dream_sim.jpg -------------------------------------------------------------------------------- /somaticseq/utilities/dockered_pipelines/bamSimulator/onkoinsight_sim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/somaticseq/utilities/dockered_pipelines/bamSimulator/onkoinsight_sim.png -------------------------------------------------------------------------------- /somaticseq/utilities/dockered_pipelines/bamSimulator/replicate_sim.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/somaticseq/utilities/dockered_pipelines/bamSimulator/replicate_sim.jpg -------------------------------------------------------------------------------- /somaticseq/utilities/dockered_pipelines/container_option.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 
3 | from dataclasses import dataclass 4 | from pathlib import Path 5 | from typing import Literal 6 | 7 | from somaticseq._version import __version__ as VERSION 8 | 9 | 10 | @dataclass 11 | class DockerImages: 12 | alientrimmer: str = "lethalfang/alientrimmer:0.4.0" 13 | bedtools: str = "lethalfang/bedtools:2.26.0" 14 | bwa: str = "lethalfang/bwa:0.7.17_samtools_1.19" 15 | jsm2: str = "lethalfang/jointsnvmix2:0.7.5" 16 | lofreq: str = "lethalfang/lofreq:2.1.3.1-1" 17 | muse: str = "marghoob/muse:1.0rc_c" 18 | mutect2: str = "broadinstitute/gatk:4.0.5.2" 19 | picard: str = "lethalfang/picard:2.22.7" 20 | sambamba: str = "lethalfang/sambamba:0.7.1" 21 | samtools: str = "lethalfang/samtools:1.19.2" 22 | scalpel: str = "lethalfang/scalpel:0.5.4" 23 | somaticseq: str = f"lethalfang/somaticseq:{VERSION}" 24 | somaticsniper: str = "lethalfang/somaticsniper:1.0.5.0-2" 25 | strelka2: str = "lethalfang/strelka:2.9.5" 26 | tabix: str = "lethalfang/tabix:1.10" 27 | trimmomatic: str = "lethalfang/trimmomatic:0.39" 28 | vardict: str = "lethalfang/vardictjava:1.7.0" 29 | varscan2: str = "djordjeklisic/sbg-varscan2:v1" 30 | 31 | 32 | @dataclass 33 | class MountedFileProperty: 34 | file: str 35 | filepath: Path 36 | filename: str 37 | directory: Path 38 | absolute_directory: Path 39 | mount_directory: str 40 | mount_path: str 41 | 42 | 43 | DOCKER_IMAGES = DockerImages() 44 | 45 | 46 | def container_params( 47 | container_image: str, 48 | tech: Literal["docker", "singularity"] = "docker", 49 | files: list[str] = [], 50 | extra_args: str = "", 51 | singularity_image_loc: str = "docker://", 52 | ) -> tuple[str, dict[str, dict[str, str]]]: 53 | 54 | file_paths = [Path(i) for i in files] 55 | file_names = [i.name for i in file_paths] 56 | file_dirs = [i.parent for i in file_paths] 57 | file_abs_dirs = [i.absolute().parent for i in file_paths] 58 | random_dirs = ["/" + uuid.uuid4().hex for _ in files] 59 | 60 | file_dictionary = {} 61 | for file_i, path_i, filename_i, dir_i, abs_dir_i, 
random_dir_i in zip( 62 | files, file_paths, file_names, file_dirs, file_abs_dirs, random_dirs 63 | ): 64 | file_dictionary[file_i] = { 65 | "filepath": path_i, 66 | "filename": filename_i, 67 | "dir": dir_i, 68 | "abs_dir": abs_dir_i, 69 | "mount_dir": random_dir_i, 70 | "mount_path": os.path.join(random_dir_i, filename_i), 71 | } 72 | 73 | if tech == "docker": 74 | MOUNT_STRING = "" 75 | for file_i in file_dictionary: 76 | sys_dir = file_dictionary[file_i]["abs_dir"] 77 | container_dir = file_dictionary[file_i]["mount_dir"] 78 | MOUNT_STRING = MOUNT_STRING + f" -v {sys_dir}:{container_dir}" 79 | 80 | container_string = ( 81 | f"docker run {MOUNT_STRING} -u $(id -u):$(id -g) " 82 | f"--rm {extra_args} {container_image}" 83 | ) 84 | 85 | elif tech == "singularity": 86 | MOUNT_STRING = "" 87 | for file_i in file_dictionary: 88 | sys_dir = file_dictionary[file_i]["abs_dir"] 89 | container_dir = file_dictionary[file_i]["mount_dir"] 90 | MOUNT_STRING = MOUNT_STRING + f" --bind {sys_dir}:{container_dir}" 91 | 92 | container_string = ( 93 | "singularity exec --cleanenv " 94 | f"{MOUNT_STRING} {extra_args} {singularity_image_loc}{container_image}" 95 | ) 96 | 97 | else: 98 | raise NotImplementedError("Only supports docker and singularity.") 99 | 100 | return container_string, file_dictionary 101 | -------------------------------------------------------------------------------- /somaticseq/utilities/dockered_pipelines/germline_variants/Canvas.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long out-dir:,bam:,in-vcf:,sample-name:,canvas-reference:,extra-arguments:,action:,MEM:,threads:,out-script:,standalone -n 'canvas.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." 
>&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )
action=echo
MEM=8
threads=12
SAMPLE_NAME='Canvas'

while true; do
    case "$1" in

    -o | --out-dir )
        case "$2" in
            "") shift 2 ;;
            *)  outdir=$2 ; shift 2 ;;
        esac ;;

    --bam )
        case "$2" in
            "") shift 2 ;;
            *)  bamFile=$2 ; shift 2 ;;
        esac ;;

    --in-vcf )
        case "$2" in
            "") shift 2 ;;
            *)  inVcf=$2 ; shift 2 ;;
        esac ;;

    --sample-name )
        case "$2" in
            "") shift 2 ;;
            *)  SAMPLE_NAME=$2 ; shift 2 ;;
        esac ;;

    --canvas-reference )
        case "$2" in
            "") shift 2 ;;
            *)  CANVAS_REFERENCE=$2 ; shift 2 ;;
        esac ;;

    --genome-reference-dir )
        case "$2" in
            "") shift 2 ;;
            *)  GENOMIC_REFERENCE_DIR=$2 ; shift 2 ;;
        esac ;;

    --filter-bed )
        case "$2" in
            "") shift 2 ;;
            *)  filterBed=$2 ; shift 2 ;;
        esac ;;

    --extra-arguments )
        case "$2" in
            "") shift 2 ;;
            *)  extra_arguments=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
            "") shift 2 ;;
            *)  out_script_name=$2 ; shift 2 ;;
        esac ;;

    --action )
        case "$2" in
            "") shift 2 ;;
            *)  action=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac

done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/canvas.${timestamp}.cmd"
fi


if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=12G' >> $out_script
    echo "#$ -pe smp ${threads}" >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script
echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
echo "" >> $out_script


# Cannot yet control number of threads to invoke
# First, create Canvas-ready fasta
# docker run --rm -v /:/mnt -u $UID lethalfang/canvas:1.35.1 bash -c 'export COMPlus_gcAllowVeryLargeObjects=1 && dotnet /opt/Canvas/Tools/FlagUniqueKmers/FlagUniqueKmers.dll /mnt/sc1/groups/bfx-red/data/datainsights/SEQC2_Resources/GRCh38.d1.vd1.fa /mnt/sc1/groups/bfx-red/data/datainsights/SEQC2_Resources/GRCh38.d1.vd1.Canvas-ready.fasta'

# Run Canvas
echo "docker run -u $UID --rm -v /:/mnt lethalfang/canvas:1.35.1 \\" >> $out_script
echo "dotnet /opt/Canvas/Canvas.dll Germline-WGS \\" >> $out_script
echo "--bam /mnt/${bamFile} \\" >> $out_script
echo "--sample-b-allele-vcf /mnt/${inVcf} \\" >> $out_script
echo "--sample-name ${SAMPLE_NAME} \\" >> $out_script
echo "--reference /mnt/${CANVAS_REFERENCE} \\" >> $out_script
echo "--genome-folder /mnt/${GENOMIC_REFERENCE_DIR} \\" >> $out_script
echo "--filter-bed /mnt/${filterBed} \\" >> $out_script
# BUGFIX: was ${out_dir}, an undefined variable; the option parser sets ${outdir}.
echo "--output /mnt/${outdir} " >> $out_script

echo "" >> $out_script
echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/germline_variants/Manta.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long out-dir:,bam:,genome-reference:,extra-arguments:,action:,MEM:,threads:,out-script:,standalone -n 'manta.sh' -- "$@"`

if [ $?
!= 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi

#echo "$OPTS"
eval set -- "$OPTS"

MYDIR="$( cd "$( dirname "$0" )" && pwd )"

timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )
action=echo
MEM=8
threads=12

while true; do
    case "$1" in

    -o | --out-dir )
        case "$2" in
            "") shift 2 ;;
            *)  outdir=$2 ; shift 2 ;;
        esac ;;

    --bam )
        case "$2" in
            "") shift 2 ;;
            *)  bamFile=$2 ; shift 2 ;;
        esac ;;

    --genome-reference )
        case "$2" in
            "") shift 2 ;;
            *)  GENOME_REFERENCE=$2 ; shift 2 ;;
        esac ;;

    --extra-arguments )
        case "$2" in
            "") shift 2 ;;
            *)  extra_arguments=$2 ; shift 2 ;;
        esac ;;

    --out-script )
        case "$2" in
            "") shift 2 ;;
            *)  out_script_name=$2 ; shift 2 ;;
        esac ;;

    --threads )
        case "$2" in
            "") shift 2 ;;
            *)  threads=$2 ; shift 2 ;;
        esac ;;

    --action )
        case "$2" in
            "") shift 2 ;;
            *)  action=$2 ; shift 2 ;;
        esac ;;

    --standalone )
        standalone=1 ; shift ;;

    -- ) shift; break ;;
    * ) break ;;
    esac

done

logdir=${outdir}/logs
mkdir -p ${logdir}

if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    # FIX: default script name said "canvas" (copy-paste from Canvas.sh).
    out_script="${logdir}/manta.${timestamp}.cmd"
fi


if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >> $out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=4G' >> $out_script
    echo "#$ -pe smp ${threads}" >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script
echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
echo "" >> $out_script


echo "docker run -v /:/mnt -u $UID --rm lethalfang/manta:1.4.0 \\" >> $out_script
echo "/opt/manta/bin/configManta.py \\" >> $out_script
echo "--bam /mnt/${bamFile} \\" >> $out_script
echo "--referenceFasta /mnt/${GENOME_REFERENCE} \\" >> $out_script
echo "--runDir /mnt/${outdir}" >> $out_script

echo "" >> $out_script

# BUGFIX: was "-j $thread" (undefined); the parsed option variable is $threads.
echo "docker run -v /:/mnt -u $UID --rm lethalfang/manta:1.4.0 /mnt/${outdir}/runWorkflow.py -m local -j $threads" >> $out_script
--------------------------------------------------------------------------------
/somaticseq/utilities/dockered_pipelines/germline_variants/Nirvana.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Use getopt instead of getopts for long options

set -e

OPTS=`getopt -o o: --long out-dir:,in-vcf:,nirvana-resources-dir:,sample:,extra-arguments:,action:,MEM:,threads:,out-script:,standalone -n 'nirvana.sh' -- "$@"`

if [ $? != 0 ] ; then echo "Failed parsing options."
>&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | action=echo 17 | MEM=8 18 | threads=12 19 | sampleID='Nirvana' 20 | 21 | while true; do 22 | case "$1" in 23 | 24 | -o | --out-dir ) 25 | case "$2" in 26 | "") shift 2 ;; 27 | *) outdir=$2 ; shift 2 ;; 28 | esac ;; 29 | 30 | --in-vcf ) 31 | case "$2" in 32 | "") shift 2 ;; 33 | *) inVcf=$2 ; shift 2 ;; 34 | esac ;; 35 | 36 | --nirvana-resources-dir ) 37 | case "$2" in 38 | "") shift 2 ;; 39 | *) NIRVANA_RESOURCES_DIR=$2 ; shift 2 ;; 40 | esac ;; 41 | 42 | --sample ) 43 | case "$2" in 44 | "") shift 2 ;; 45 | *) sampleID=$2 ; shift 2 ;; 46 | esac ;; 47 | 48 | --extra-arguments ) 49 | case "$2" in 50 | "") shift 2 ;; 51 | *) extra_arguments=$2 ; shift 2 ;; 52 | esac ;; 53 | 54 | --out-script ) 55 | case "$2" in 56 | "") shift 2 ;; 57 | *) out_script_name=$2 ; shift 2 ;; 58 | esac ;; 59 | 60 | --threads ) 61 | case "$2" in 62 | "") shift 2 ;; 63 | *) threads=$2 ; shift 2 ;; 64 | esac ;; 65 | 66 | --action ) 67 | case "$2" in 68 | "") shift 2 ;; 69 | *) action=$2 ; shift 2 ;; 70 | esac ;; 71 | 72 | --standalone ) 73 | standalone=1 ; shift ;; 74 | 75 | -- ) shift; break ;; 76 | * ) break ;; 77 | esac 78 | 79 | done 80 | 81 | logdir=${outdir}/logs 82 | mkdir -p ${logdir} 83 | 84 | if [[ ${out_script_name} ]] 85 | then 86 | out_script="${out_script_name}" 87 | else 88 | out_script="${logdir}/canvas.${timestamp}.cmd" 89 | fi 90 | 91 | 92 | if [[ $standalone ]] 93 | then 94 | echo "#!/bin/bash" > $out_script 95 | echo "" >> $out_script 96 | echo "#$ -o ${logdir}" >> $out_script 97 | echo "#$ -e ${logdir}" >> $out_script 98 | echo "#$ -S /bin/bash" >> $out_script 99 | echo '#$ -l h_vmem=4G' >> $out_script 100 | echo "#$ -pe smp ${threads}" >> $out_script 101 | echo 'set -e' >> $out_script 102 | fi 103 | 104 | echo "" >> $out_script 105 | echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> 
$out_script 106 | echo "" >> $out_script 107 | 108 | 109 | echo "docker run --rm -u $UID -v /:/mnt lethalfang/nirvana:2.0.9 \\" >> $out_script 110 | echo "dotnet /opt/Nirvana/bin/Release/netcoreapp2.0/Nirvana.dll \\" >> $out_script 111 | echo "-c /mnt/${NIRVANA_RESOURCES_DIR}/Cache/26/GRCh38/Ensembl \\" >> $out_script 112 | echo "--sd /mnt/${NIRVANA_RESOURCES_DIR}/GRCh38 \\" >> $out_script 113 | echo "-r /mnt/${NIRVANA_RESOURCES_DIR}/References/5/Homo_sapiens.GRCh38.Nirvana.dat \\" >> $out_script 114 | echo "-i /mnt/${inVcf} \\" >> $out_script 115 | echo "-o /mnt/${outdir}/${sampleID}" >> $out_script 116 | 117 | echo "" >> $out_script 118 | echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script 119 | -------------------------------------------------------------------------------- /somaticseq/utilities/dockered_pipelines/germline_variants/README.md: -------------------------------------------------------------------------------- 1 | **Requirement** 2 | 3 | - Have internet connection, and able to pull and run docker images from Docker 4 | Hub. 5 | - **Recommended**: Have cluster management system with valid "qsub" command, 6 | such as Sun Grid Engine (SGE). 7 | 8 | - Germline tasks starting with BAM file. 9 | -------------------------------------------------------------------------------- /somaticseq/utilities/dockered_pipelines/germline_variants/haplotypeCaller.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long out-dir:,out-vcf:,bam:,human-reference:,selector:,dbsnp:,extra-arguments:,action:,MEM:,threads:,out-script:,standalone -n 'gatk_haplotypecaller.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." 
>&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | action=echo 17 | MEM=8 18 | threads=12 19 | 20 | while true; do 21 | case "$1" in 22 | 23 | -o | --out-dir ) 24 | case "$2" in 25 | "") shift 2 ;; 26 | *) outdir=$2 ; shift 2 ;; 27 | esac ;; 28 | 29 | --out-vcf ) 30 | case "$2" in 31 | "") shift 2 ;; 32 | *) outVcfName=$2 ; shift 2 ;; 33 | esac ;; 34 | 35 | --bam ) 36 | case "$2" in 37 | "") shift 2 ;; 38 | *) bamFile=$2 ; shift 2 ;; 39 | esac ;; 40 | 41 | --human-reference ) 42 | case "$2" in 43 | "") shift 2 ;; 44 | *) HUMAN_REFERENCE=$2 ; shift 2 ;; 45 | esac ;; 46 | 47 | --selector ) 48 | case "$2" in 49 | "") shift 2 ;; 50 | *) SELECTOR=$2 ; shift 2 ;; 51 | esac ;; 52 | 53 | --dbsnp ) 54 | case "$2" in 55 | "") shift 2 ;; 56 | *) dbsnp=$2 ; shift 2 ;; 57 | esac ;; 58 | 59 | --MEM ) 60 | case "$2" in 61 | "") shift 2 ;; 62 | *) MEM=$2 ; shift 2 ;; 63 | esac ;; 64 | 65 | --threads ) 66 | case "$2" in 67 | "") shift 2 ;; 68 | *) threads=$2 ; shift 2 ;; 69 | esac ;; 70 | 71 | --extra-arguments ) 72 | case "$2" in 73 | "") shift 2 ;; 74 | *) extra_arguments=$2 ; shift 2 ;; 75 | esac ;; 76 | 77 | --out-script ) 78 | case "$2" in 79 | "") shift 2 ;; 80 | *) out_script_name=$2 ; shift 2 ;; 81 | esac ;; 82 | 83 | --action ) 84 | case "$2" in 85 | "") shift 2 ;; 86 | *) action=$2 ; shift 2 ;; 87 | esac ;; 88 | 89 | --standalone ) 90 | standalone=1 ; shift ;; 91 | 92 | -- ) shift; break ;; 93 | * ) break ;; 94 | esac 95 | 96 | done 97 | 98 | logdir=${outdir}/logs 99 | mkdir -p ${logdir} 100 | 101 | if [[ ${out_script_name} ]] 102 | then 103 | out_script="${out_script_name}" 104 | else 105 | out_script="${logdir}/HaplotypeCaller.${timestamp}.cmd" 106 | fi 107 | 108 | 109 | if [[ $standalone ]] 110 | then 111 | echo "#!/bin/bash" > $out_script 112 | echo "" >> $out_script 113 | echo "#$ -o ${logdir}" >> $out_script 114 | echo "#$ -e ${logdir}" >> 
$out_script 115 | echo "#$ -S /bin/bash" >> $out_script 116 | echo '#$ -l h_vmem=12G' >> $out_script 117 | echo "#$ -pe smp ${threads}" >> $out_script 118 | echo 'set -e' >> $out_script 119 | fi 120 | 121 | echo "" >> $out_script 122 | echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script 123 | echo "" >> $out_script 124 | 125 | 126 | if [[ ${SELECTOR} ]] 127 | then 128 | selector_text="-L /mnt/${SELECTOR}" 129 | fi 130 | 131 | dbsnp_text='' 132 | if [[ ${dbsnp} ]]; then 133 | dbsnp_text="--dbsnp /mnt/${dbsnp}" 134 | fi 135 | 136 | 137 | echo "docker run --rm -v /:/mnt -u $UID broadinstitute/gatk:4.0.5.2 \\" >> $out_script 138 | echo "java -Xmx${MEM}g -jar /gatk/gatk.jar \\" >> $out_script 139 | echo "HaplotypeCaller \\" >> $out_script 140 | echo "--reference /mnt/${HUMAN_REFERENCE} \\" >> $out_script 141 | echo "--input /mnt/${bamFile} \\" >> $out_script 142 | echo "--native-pair-hmm-threads ${threads} \\" >> $out_script 143 | echo "$selector_text \\" >> $out_script 144 | echo "$dbsnp_text \\" >> $out_script 145 | echo "${extra_arguments} \\" >> $out_script 146 | echo "--output /mnt/${outdir}/${outVcfName}" >> $out_script 147 | 148 | echo "" >> $out_script 149 | 150 | echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script 151 | -------------------------------------------------------------------------------- /somaticseq/utilities/dockered_pipelines/somatic_mutations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/somaticseq/utilities/dockered_pipelines/somatic_mutations/__init__.py -------------------------------------------------------------------------------- /somaticseq/utilities/linguistic_sequence_complexity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from copy import copy 5 | from sys 
import float_info 6 | 7 | import somaticseq.sequencing_features as seq_features 8 | 9 | eps = float_info.epsilon 10 | 11 | 12 | def all_possible_dna_sequences(seq_length): 13 | seqs = ["G", "C", "T", "A"] 14 | 15 | for _ in range(seq_length - 1): 16 | seqs_i = copy(seqs) 17 | seqs = [] 18 | for sub_seq in seqs_i: 19 | for i in "TCGA": 20 | extended_seq = sub_seq + i 21 | seqs.append(extended_seq) 22 | 23 | return set(seqs) 24 | 25 | 26 | def max_vocabularies(seq_length): 27 | # According to: 28 | # https://doi.org/10.1093/bioinformatics/18.5.679 29 | # Assume 4 different nucleotides 30 | counts = 0 31 | k = 1 32 | while k <= seq_length: 33 | if 4**k < (seq_length - k + 1): 34 | counts = counts + 4**k 35 | else: 36 | counts = ( 37 | counts + (seq_length - k + 1 + 1) * (seq_length - k + 1 - 1 + 1) / 2 38 | ) 39 | break 40 | 41 | k += 1 42 | 43 | return counts 44 | 45 | 46 | def LC(sequence): 47 | # Calculate linguistic sequence complexity according to 48 | # https://doi.org/10.1093/bioinformatics/18.5.679 49 | # Assume 4 different nucleotides 50 | sequence = sequence.upper() 51 | 52 | if "N" not in sequence: 53 | number_of_subseqs = 0 54 | seq_length = len(sequence) 55 | max_number_of_subseqs = max_vocabularies(seq_length) 56 | 57 | for i in range(1, seq_length + 1): 58 | # max_vocab_1 = 4**i 59 | # max_vocab_2 = seq_length - i + 1 60 | set_of_seq_n = set() 61 | 62 | for n, nth_base in enumerate(sequence): 63 | if n + i <= len(sequence): 64 | sub_seq = sequence[n : n + i] 65 | set_of_seq_n.add(sub_seq) 66 | 67 | num_uniq_subseqs = len(set_of_seq_n) 68 | number_of_subseqs = number_of_subseqs + num_uniq_subseqs 69 | 70 | lc = number_of_subseqs / max_number_of_subseqs 71 | 72 | else: 73 | lc = float("nan") 74 | 75 | return lc 76 | 77 | 78 | def main() -> None: 79 | parser = argparse.ArgumentParser( 80 | description=( 81 | "Calculate linguistic sequence complexity according to " 82 | "DOI:10.1093/bioinformatics/18.5.679" 83 | ), 84 | 
formatter_class=argparse.ArgumentDefaultsHelpFormatter, 85 | ) 86 | 87 | parser.add_argument("-seq", "--sequence", type=str, help="GCTA sequences") 88 | parser.add_argument( 89 | "-len", 90 | "--substring-length", 91 | type=int, 92 | help=( 93 | "Default is the whole length of the sequence. " 94 | "If specified, then it will calculate sub-length up to this value." 95 | ), 96 | ) 97 | args = parser.parse_args() 98 | if args.substring_length: 99 | length = args.substring_length 100 | assert length <= len(args.sequence) 101 | 102 | else: 103 | length = len(args.sequence) 104 | # This one adds up sub-strings up to a length 105 | print(seq_features.ling_seq_complexity_with_max_vocab_length(args.sequence, length)) 106 | 107 | 108 | if __name__ == "__main__": 109 | main() 110 | -------------------------------------------------------------------------------- /somaticseq/utilities/paired_end_bam2fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import gzip 5 | 6 | import pysam 7 | 8 | NT_PAIRS = { 9 | "G": "C", 10 | "T": "A", 11 | "C": "G", 12 | "A": "T", 13 | "N": "N", 14 | "g": "c", 15 | "t": "a", 16 | "c": "g", 17 | "a": "t", 18 | "n": "n", 19 | } 20 | 21 | 22 | def reverse_complement(seq): 23 | seq_j = "".join([NT_PAIRS[base_i] for base_i in seq[::-1]]) 24 | return seq_j 25 | 26 | 27 | def text_open_write(filename): 28 | if str(filename).endswith(".gz"): 29 | return gzip.open(filename, "wt") 30 | return open(filename, "w") 31 | 32 | 33 | def bam2fq(bam_file, fastq1, fastq2): 34 | with ( 35 | pysam.AlignmentFile(bam_file) as bam, 36 | text_open_write(fastq1) as fq1, 37 | text_open_write(fastq2) as fq2, 38 | ): 39 | reads1 = {} 40 | reads2 = {} 41 | reads = bam.fetch() 42 | for read_i in reads: 43 | if not read_i.is_secondary: 44 | seq_i = ( 45 | reverse_complement(read_i.query_sequence) 46 | if read_i.is_reverse 47 | else read_i.query_sequence 48 | ) 49 | qual_i = 
read_i.qual[::-1] if read_i.is_reverse else read_i.qual 50 | if read_i.is_read1: 51 | if read_i.query_name in reads2: 52 | fq1.write(f"@{read_i.query_name}/1\n") 53 | fq1.write(seq_i + "\n") 54 | fq1.write("+\n") 55 | fq1.write(qual_i + "\n") 56 | 57 | read_2 = reads2.pop(read_i.query_name) 58 | fq2.write("@{}/2\n".format(read_2["qname"])) 59 | fq2.write(read_2["seq"] + "\n") 60 | fq2.write("+\n") 61 | fq2.write(read_2["bq"] + "\n") 62 | 63 | else: 64 | reads1[read_i.query_name] = {} 65 | reads1[read_i.query_name]["qname"] = read_i.query_name 66 | reads1[read_i.query_name]["seq"] = seq_i 67 | reads1[read_i.query_name]["bq"] = qual_i 68 | 69 | elif read_i.is_read2: 70 | if read_i.query_name in reads1: 71 | read_1 = reads1.pop(read_i.query_name) 72 | fq1.write("@{}/1\n".format(read_1["qname"])) 73 | fq1.write(read_1["seq"] + "\n") 74 | fq1.write("+\n") 75 | fq1.write(read_1["bq"] + "\n") 76 | 77 | fq2.write(f"@{read_i.query_name}/2\n") 78 | fq2.write(seq_i + "\n") 79 | fq2.write("+\n") 80 | fq2.write(qual_i + "\n") 81 | else: 82 | reads2[read_i.query_name] = {} 83 | reads2[read_i.query_name]["qname"] = read_i.query_name 84 | reads2[read_i.query_name]["seq"] = seq_i 85 | reads2[read_i.query_name]["bq"] = qual_i 86 | 87 | return True 88 | 89 | 90 | def main() -> None: 91 | parser = argparse.ArgumentParser( 92 | description="Convert paired-end BAM to FASTQ1 and 2", 93 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 94 | ) 95 | parser.add_argument("-bam", "--bam", type=str, help="bam file in") 96 | parser.add_argument("-fq1", "--fastq1", type=str, help="fastq1 out") 97 | parser.add_argument("-fq2", "--fastq2", type=str, help="fastq2 out") 98 | args = parser.parse_args() 99 | bam2fq(args.bam, args.fastq1, args.fastq2) 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /somaticseq/utilities/remove_callers_from_somaticseq_tsv.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import gzip 5 | from math import isnan 6 | 7 | all_possible_callers = ( 8 | "if_MuTect", 9 | "if_VarScan2", 10 | "if_JointSNVMix2", 11 | "if_SomaticSniper", 12 | "if_VarDict", 13 | "MuSE_Tier", 14 | "if_LoFreq", 15 | "if_Scalpel", 16 | "if_Strelka", 17 | "if_TNscope", 18 | "if_Platypus", 19 | ) 20 | 21 | 22 | parser = argparse.ArgumentParser( 23 | description=( 24 | "In SomaticSeq TSV files, replace certain callers with nan and remove lines " 25 | "where they are only called by these. " 26 | "To mimic a TSV where only a subset of the callers were used." 27 | ), 28 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 29 | ) 30 | parser.add_argument("-infile", "--infile", type=str, help="input file", required=True) 31 | parser.add_argument("-outfile", "--outfile", type=str, help="input file", required=True) 32 | parser.add_argument( 33 | "-subtract", 34 | "--subtract-callers", 35 | type=str, 36 | nargs="+", 37 | help="columns to make nan", 38 | required=True, 39 | choices=all_possible_callers, 40 | ) 41 | 42 | args = parser.parse_args() 43 | 44 | for caller_i in args.subtract_callers: 45 | assert caller_i in all_possible_callers 46 | 47 | 48 | def open_textfile(file_name): 49 | if file_name.lower().endswith(".gz"): 50 | return gzip.open(file_name, "rt") 51 | else: 52 | return open(file_name) 53 | 54 | 55 | def items_to_make_nan(callers_to_subtract): 56 | out_items = [] 57 | for caller_i in callers_to_subtract: 58 | if caller_i == "if_MuTect": 59 | out_items.append("M2_NLOD") 60 | out_items.append("M2_TLOD") 61 | out_items.append("M2_STR") 62 | out_items.append("M2_ECNT") 63 | elif caller_i == "if_JointSNVMix2": 64 | out_items.append("SNVMix2_Score") 65 | elif caller_i == "if_SomaticSniper": 66 | out_items.append("Sniper_Score") 67 | elif caller_i == "if_VarDict": 68 | out_items.append("VarDict_Score") 69 | 
out_items.append("MSI") 70 | out_items.append("MSILEN") 71 | out_items.append("SHIFT3") 72 | elif caller_i == "if_Strelka": 73 | out_items.append("Strelka_Score") 74 | out_items.append("Strelka_QSS") 75 | out_items.append("Strelka_TQSS") 76 | 77 | return out_items 78 | 79 | 80 | with open_textfile(args.infile) as infile, open(args.outfile, "w") as outfile: 81 | line_in = infile.readline().rstrip() 82 | item_in = line_in.split("\t") 83 | out_indices = [item_in.index(i) for i in args.subtract_callers] 84 | remaining_indices = [ 85 | item_in.index(i) for i in all_possible_callers if i not in args.subtract_callers 86 | ] 87 | extra_nan_items = items_to_make_nan(args.subtract_callers) 88 | extra_nan_indices = [item_in.index(i) for i in extra_nan_items] 89 | outfile.write(line_in + "\n") 90 | line_in = infile.readline().rstrip() 91 | while line_in: 92 | item_in = line_in.split("\t") 93 | 94 | other_callers = 0 95 | for other_i in remaining_indices: 96 | classification_i = item_in[other_i] 97 | classification_i = eval(classification_i) 98 | if not isnan(classification_i): 99 | other_callers += classification_i 100 | 101 | if other_callers > 0: 102 | for out_i in out_indices + extra_nan_indices: 103 | item_in[out_i] = "nan" 104 | 105 | line_out = "\t".join(item_in) 106 | outfile.write(line_out + "\n") 107 | 108 | line_in = infile.readline().rstrip() 109 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/QC/extract_coverageDepth.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,bam:,genome-reference:,selector:,minBaseQuality:,minMappingQuality:,extra-arguments:,out-script:,standalone, -n 'coverageDepth.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." 
>&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | minBaseQuality=0 18 | minMappingQuality=0 19 | 20 | 21 | while true; do 22 | case "$1" in 23 | -o | --output-dir ) 24 | case "$2" in 25 | "") shift 2 ;; 26 | *) outdir=$2 ; shift 2 ;; 27 | esac ;; 28 | 29 | --bam ) 30 | case "$2" in 31 | "") shift 2 ;; 32 | *) bamFile=$2 ; shift 2 ;; 33 | esac ;; 34 | 35 | --genome-reference ) 36 | case "$2" in 37 | "") shift 2 ;; 38 | *) HUMAN_REFERENCE=$2 ; shift 2 ;; 39 | esac ;; 40 | 41 | --minBaseQuality ) 42 | case "$2" in 43 | "") shift 2 ;; 44 | *) minBaseQuality=$2 ; shift 2 ;; 45 | esac ;; 46 | 47 | --minMappingQuality ) 48 | case "$2" in 49 | "") shift 2 ;; 50 | *) minMappingQuality=$2 ; shift 2 ;; 51 | esac ;; 52 | 53 | --selector ) 54 | case "$2" in 55 | "") shift 2 ;; 56 | *) SELECTOR=$2 ; shift 2 ;; 57 | esac ;; 58 | 59 | --extra-arguments ) 60 | case "$2" in 61 | "") shift 2 ;; 62 | *) extra_arguments=$2 ; shift 2 ;; 63 | esac ;; 64 | 65 | --out-script ) 66 | case "$2" in 67 | "") shift 2 ;; 68 | *) out_script_name=$2 ; shift 2 ;; 69 | esac ;; 70 | 71 | --standalone ) 72 | standalone=1 ; shift ;; 73 | 74 | -- ) shift; break ;; 75 | * ) break ;; 76 | esac 77 | done 78 | 79 | logdir=${outdir}/logs 80 | mkdir -p ${logdir} 81 | 82 | if [[ ${out_script_name} ]] 83 | then 84 | out_script="${out_script_name}" 85 | else 86 | out_script="${logdir}/coverageDepth.${timestamp}.cmd" 87 | fi 88 | 89 | 90 | if [[ $standalone ]] 91 | then 92 | echo "#!/bin/bash" > $out_script 93 | echo "" >> $out_script 94 | echo "#$ -o ${logdir}" >> $out_script 95 | echo "#$ -e ${logdir}" >> $out_script 96 | echo "#$ -S /bin/bash" >> $out_script 97 | echo '#$ -l h_vmem=8G' >> $out_script 98 | echo 'set -e' >> $out_script 99 | fi 100 | 101 | echo "" >> $out_script 102 | echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script 103 | echo "" >> 
$out_script 104 | 105 | if [[ ${SELECTOR} ]] 106 | then 107 | selector_text="-L /mnt/${SELECTOR}" 108 | fi 109 | 110 | bamFileName=`basename ${bamFile}` 111 | 112 | echo "singularity exec --bind /:/mnt docker://broadinstitute/gatk3:3.8-0 \\" >> $out_script 113 | echo "java -Xmx8g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script 114 | echo "-T DepthOfCoverage \\" >> $out_script 115 | echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script 116 | echo "-I /mnt/${bamFile} \\" >> $out_script 117 | echo "${selector_text} \\" >> $out_script 118 | echo "--minBaseQuality ${minBaseQuality} \\" >> $out_script 119 | echo "--minMappingQuality ${minMappingQuality} \\" >> $out_script 120 | echo "${extra_arguments} \\" >> $out_script 121 | echo "-o /mnt/${outdir}/${bamFileName}.depth" >> $out_script 122 | 123 | echo "" >> $out_script 124 | 125 | echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script 126 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/README.md: -------------------------------------------------------------------------------- 1 | These scripts are deprecated. See [dockered pipeline](../dockered_pipelines/) 2 | instead with `-tech singularity` to invoke singularity scripts instead of 3 | docker. 4 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/bamSimulator/README.md: -------------------------------------------------------------------------------- 1 | Mutation Simulation Pipeline in Singularities 2 | 3 | **Requirement** 4 | 5 | - Have internet connection and Singularity. Be able to pull docker images from 6 | Docker Hub. 7 | - **Highly recommended**: Have cluster management system with valid "qsub" 8 | command, such as Sun Grid Engine (SGE). 9 | 10 | This is ported from the 11 | [dockered pipeline](../../dockered_pipelines/bamSimulator). 
Commands are 12 | identical, except these scripts will run on singularities instead of docker 13 | daemon. Use the same set of commands, substitute "dockered_pipelines" in the 14 | command path for "singularities." 15 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/bamSimulator/bamSurgeon/IndelRealign.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,tumor-bam:,normal-bam:,genome-reference:,selector:,out-tag:,extra-arguments:,out-script:,standalone, -n 'IndelRealign.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | out_tag='JointRealigned' 18 | #extra_arguments='-dt NONE --maxReadsForConsensuses 150000 --maxReadsInMemory 500000 --maxReadsForRealignment 2000000' 19 | 20 | while true; do 21 | case "$1" in 22 | -o | --output-dir ) 23 | case "$2" in 24 | "") shift 2 ;; 25 | *) outdir=$2 ; shift 2 ;; 26 | esac ;; 27 | 28 | --tumor-bam ) 29 | case "$2" in 30 | "") shift 2 ;; 31 | *) tbam=$2 ; shift 2 ;; 32 | esac ;; 33 | 34 | --normal-bam ) 35 | case "$2" in 36 | "") shift 2 ;; 37 | *) nbam=$2 ; shift 2 ;; 38 | esac ;; 39 | 40 | --genome-reference ) 41 | case "$2" in 42 | "") shift 2 ;; 43 | *) HUMAN_REFERENCE=$2 ; shift 2 ;; 44 | esac ;; 45 | 46 | --selector ) 47 | case "$2" in 48 | "") shift 2 ;; 49 | *) SELECTOR=$2 ; shift 2 ;; 50 | esac ;; 51 | 52 | --out-tag ) 53 | case "$2" in 54 | "") shift 2 ;; 55 | *) out_tag=$2 ; shift 2 ;; 56 | esac ;; 57 | 58 | --extra-arguments ) 59 | case "$2" in 60 | "") shift 2 ;; 61 | *) extra_arguments=$2 ; shift 2 ;; 62 | esac ;; 63 | 64 | --out-script ) 65 | case "$2" in 66 | "") shift 2 ;; 67 | *) 
out_script_name=$2 ; shift 2 ;; 68 | esac ;; 69 | 70 | --standalone ) 71 | standalone=1 ; shift ;; 72 | 73 | -- ) shift; break ;; 74 | * ) break ;; 75 | esac 76 | done 77 | 78 | logdir=${outdir}/logs 79 | mkdir -p ${logdir} 80 | 81 | if [[ ${out_script_name} ]] 82 | then 83 | out_script="${out_script_name}" 84 | else 85 | out_script="${logdir}/indelRealign.${timestamp}.cmd" 86 | fi 87 | 88 | if [[ $standalone ]] 89 | then 90 | echo "#!/bin/bash" > $out_script 91 | echo "" >> $out_script 92 | echo "#$ -o ${logdir}" >> $out_script 93 | echo "#$ -e ${logdir}" >> $out_script 94 | echo "#$ -S /bin/bash" >> $out_script 95 | echo '#$ -l h_vmem=10G' >> $out_script 96 | echo 'set -e' >> $out_script 97 | fi 98 | 99 | echo "" >> $out_script 100 | 101 | if [[ $SELECTOR ]] 102 | then 103 | selector_input="-L /mnt/${SELECTOR}" 104 | fi 105 | 106 | echo "singularity exec --bind /:/mnt docker://broadinstitute/gatk3:3.8-0 java -Xmx9g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script 107 | echo "-T RealignerTargetCreator \\" >> $out_script 108 | echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script 109 | echo "-I /mnt/${tbam} \\" >> $out_script 110 | echo "-I /mnt/${nbam} \\" >> $out_script 111 | echo "$selector_input \\" >> $out_script 112 | echo "-o /mnt/${outdir}/T.N.intervals" >> $out_script 113 | echo "" >> $out_script 114 | 115 | echo "singularity exec --bind /:/mnt --pwd /mnt/${outdir} docker://broadinstitute/gatk3:3.8-0 \\" >> $out_script 116 | echo "java -Xmx9g -jar /usr/GenomeAnalysisTK.jar \\" >> $out_script 117 | echo "-T IndelRealigner \\" >> $out_script 118 | echo "-R /mnt/${HUMAN_REFERENCE} \\" >> $out_script 119 | echo "-I /mnt/${tbam} \\" >> $out_script 120 | echo "-I /mnt/${nbam} \\" >> $out_script 121 | echo "-targetIntervals /mnt/${outdir}/T.N.intervals \\" >> $out_script 122 | echo "${extra_arguments} \\" >> $out_script 123 | echo "-nWayOut .${out_tag}.bam" >> $out_script 124 | echo "" >> $out_script 125 | 126 | realigned_normal=${nbam%.bam}.${out_tag}.bam 127 | 
realigned_tumor=${tbam%.bam}.${out_tag}.bam 128 | 129 | echo "mv ${realigned_normal%.bam}.bai ${realigned_normal}.bai" >> $out_script 130 | echo "mv ${realigned_tumor%.bam}.bai ${realigned_tumor}.bai" >> $out_script 131 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/bamSimulator/bamSurgeon/MergeTN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,tumor-bam:,normal-bam:,bam-out:,out-script:,standalone -n 'MergeTN.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | keep_intermediates=0 18 | outSM='TN_Merged' 19 | 20 | while true; do 21 | case "$1" in 22 | -o | --output-dir ) 23 | case "$2" in 24 | "") shift 2 ;; 25 | *) outdir=$2 ; shift 2 ;; 26 | esac ;; 27 | 28 | --bam-out ) 29 | case "$2" in 30 | "") shift 2 ;; 31 | *) outbam=$2 ; shift 2 ;; 32 | esac ;; 33 | 34 | --tumor-bam ) 35 | case "$2" in 36 | "") shift 2 ;; 37 | *) tbam=$2 ; shift 2 ;; 38 | esac ;; 39 | 40 | --normal-bam ) 41 | case "$2" in 42 | "") shift 2 ;; 43 | *) nbam=$2 ; shift 2 ;; 44 | esac ;; 45 | 46 | --out-script ) 47 | case "$2" in 48 | "") shift 2 ;; 49 | *) out_script_name=$2 ; shift 2 ;; 50 | esac ;; 51 | 52 | --standalone ) 53 | standalone=1 ; shift ;; 54 | 55 | -- ) shift; break ;; 56 | * ) break ;; 57 | esac 58 | done 59 | 60 | logdir=${outdir}/logs 61 | mkdir -p ${logdir} 62 | 63 | if [[ ${out_script_name} ]] 64 | then 65 | out_script="${out_script_name}" 66 | else 67 | out_script="${logdir}/mergeBams.${timestamp}.cmd" 68 | fi 69 | 70 | if [[ $standalone ]] 71 | then 72 | echo "#!/bin/bash" > $out_script 73 | echo "" >> $out_script 74 | echo "#$ -o 
${logdir}" >> $out_script 75 | echo "#$ -e ${logdir}" >> $out_script 76 | echo "#$ -S /bin/bash" >> $out_script 77 | echo '#$ -l h_vmem=8G' >> $out_script 78 | echo 'set -e' >> $out_script 79 | fi 80 | 81 | echo "" >> $out_script 82 | 83 | # Merge the 2 BAM files 84 | echo "singularity exec --bind /:/mnt docker://lethalfang/bamsurgeon:1.1-3 \\" >> $out_script 85 | echo "java -Xmx6g -jar /usr/local/bin/picard.jar MergeSamFiles \\" >> $out_script 86 | echo "I=/mnt/${nbam} \\" >> $out_script 87 | echo "I=/mnt/${tbam} \\" >> $out_script 88 | echo "ASSUME_SORTED=true \\" >> $out_script 89 | echo "CREATE_INDEX=true \\" >> $out_script 90 | echo "O=/mnt/${outdir}/${outbam}" >> $out_script 91 | echo "" >> $out_script 92 | 93 | # Remove temp files 94 | echo "mv ${outdir}/${outbam%.bam}.bai ${outdir}/${outbam}.bai" >> $out_script 95 | echo "" >> $out_script 96 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/bamSimulator/bamSurgeon/Reheader_SM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,out-SM:,out-script:,standalone -n 'Reheader_SM.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." 
>&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | keep_intermediates=0 18 | outSM='TN_Merged' 19 | 20 | while true; do 21 | case "$1" in 22 | -o | --output-dir ) 23 | case "$2" in 24 | "") shift 2 ;; 25 | *) outdir=$2 ; shift 2 ;; 26 | esac ;; 27 | 28 | --bam-out ) 29 | case "$2" in 30 | "") shift 2 ;; 31 | *) outbam=$2 ; shift 2 ;; 32 | esac ;; 33 | 34 | --bam-in ) 35 | case "$2" in 36 | "") shift 2 ;; 37 | *) inbam=$2 ; shift 2 ;; 38 | esac ;; 39 | 40 | --out-SM ) 41 | case "$2" in 42 | "") shift 2 ;; 43 | *) outSM=$2 ; shift 2 ;; 44 | esac ;; 45 | 46 | --out-script ) 47 | case "$2" in 48 | "") shift 2 ;; 49 | *) out_script_name=$2 ; shift 2 ;; 50 | esac ;; 51 | 52 | --standalone ) 53 | standalone=1 ; shift ;; 54 | 55 | -- ) shift; break ;; 56 | * ) break ;; 57 | esac 58 | done 59 | 60 | logdir=${outdir}/logs 61 | mkdir -p ${logdir} 62 | 63 | if [[ ${out_script_name} ]] 64 | then 65 | out_script="${out_script_name}" 66 | else 67 | out_script="${logdir}/reheader.${timestamp}.cmd" 68 | fi 69 | 70 | if [[ $standalone ]] 71 | then 72 | echo "#!/bin/bash" > $out_script 73 | echo "" >> $out_script 74 | echo "#$ -o ${logdir}" >> $out_script 75 | echo "#$ -e ${logdir}" >> $out_script 76 | echo "#$ -S /bin/bash" >> $out_script 77 | echo '#$ -l h_vmem=8G' >> $out_script 78 | echo 'set -e' >> $out_script 79 | fi 80 | 81 | echo "" >> $out_script 82 | 83 | # Uniform sample and read group names in the merged file 84 | echo "singularity exec --bind /:/mnt docker://lethalfang/bamsurgeon:1.1-3 \\" >> $out_script 85 | echo "java -Xmx6g -jar /usr/local/bin/picard.jar AddOrReplaceReadGroups \\" >> $out_script 86 | echo "I=/mnt/${outdir}/${inbam} \\" >> $out_script 87 | echo "RGID=BAMSurgeon \\" >> $out_script 88 | echo "RGLB=TNMerged \\" >> $out_script 89 | echo "RGPL=illumina \\" >> $out_script 90 | echo "RGPU=BAMSurgeon \\" >> $out_script 91 | echo 
"RGSM=${outSM} \\" >> $out_script 92 | echo "CREATE_INDEX=true \\" >> $out_script 93 | echo "O=/mnt/${outdir}/${outbam}" >> $out_script 94 | echo "" >> $out_script 95 | 96 | # Remove temp files 97 | echo "mv ${outdir}/${outbam%.bam}.bai ${outdir}/${outbam}.bai" >> $out_script 98 | echo "" >> $out_script 99 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/bamSimulator/bamSurgeon/SortByCoordinate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,genome-reference:,out-script:,standalone -n 'SortByCoordinate.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | seed=$( date +"%Y" ) 18 | 19 | while true; do 20 | case "$1" in 21 | -o | --output-dir ) 22 | case "$2" in 23 | "") shift 2 ;; 24 | *) outdir=$2 ; shift 2 ;; 25 | esac ;; 26 | 27 | --bam-in ) 28 | case "$2" in 29 | "") shift 2 ;; 30 | *) inbam=$2 ; shift 2 ;; 31 | esac ;; 32 | 33 | --bam-out ) 34 | case "$2" in 35 | "") shift 2 ;; 36 | *) outbam=$2 ; shift 2 ;; 37 | esac ;; 38 | 39 | --genome-reference ) 40 | case "$2" in 41 | "") shift 2 ;; 42 | *) HUMAN_REFERENCE=$2 ; shift 2 ;; 43 | esac ;; 44 | 45 | --out-script ) 46 | case "$2" in 47 | "") shift 2 ;; 48 | *) out_script_name=$2 ; shift 2 ;; 49 | esac ;; 50 | 51 | --standalone ) 52 | standalone=1 ; shift ;; 53 | 54 | -- ) shift; break ;; 55 | * ) break ;; 56 | esac 57 | done 58 | 59 | hg_dict=${HUMAN_REFERENCE%\.fa*}.dict 60 | 61 | logdir=${outdir}/logs 62 | mkdir -p ${logdir} 63 | 64 | if [[ ${out_script_name} ]] 65 | then 66 | out_script="${out_script_name}" 67 | else 68 | 
out_script="${logdir}/sort.coordinates.${timestamp}.cmd" 69 | fi 70 | 71 | 72 | if [[ $standalone ]] 73 | then 74 | echo "#!/bin/bash" > $out_script 75 | echo "" >> $out_script 76 | echo "#$ -o ${logdir}" >> $out_script 77 | echo "#$ -e ${logdir}" >> $out_script 78 | echo "#$ -S /bin/bash" >> $out_script 79 | echo '#$ -l h_vmem=8G' >> $out_script 80 | echo 'set -e' >> $out_script 81 | fi 82 | 83 | 84 | echo "" >> $out_script 85 | 86 | echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 \\" >> $out_script 87 | echo "samtools sort -m 4G --reference /mnt/${HUMAN_REFERENCE} \\" >> $out_script 88 | echo "-o /mnt/${outdir}/${outbam} /mnt/${inbam}" >> $out_script 89 | echo "" >> $out_script 90 | 91 | echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 \\" >> $out_script 92 | echo "samtools index /mnt/${outdir}/${outbam}" >> $out_script 93 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/bamSimulator/bamSurgeon/SortByReadName.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,out-script:,standalone -n 'SortByReadName.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." 
>&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | seed=$( date +"%Y" ) 18 | 19 | while true; do 20 | case "$1" in 21 | -o | --output-dir ) 22 | case "$2" in 23 | "") shift 2 ;; 24 | *) outdir=$2 ; shift 2 ;; 25 | esac ;; 26 | 27 | --bam-in ) 28 | case "$2" in 29 | "") shift 2 ;; 30 | *) inbam=$2 ; shift 2 ;; 31 | esac ;; 32 | 33 | --bam-out ) 34 | case "$2" in 35 | "") shift 2 ;; 36 | *) outbam=$2 ; shift 2 ;; 37 | esac ;; 38 | 39 | --out-script ) 40 | case "$2" in 41 | "") shift 2 ;; 42 | *) out_script_name=$2 ; shift 2 ;; 43 | esac ;; 44 | 45 | --standalone ) 46 | standalone=1 ; shift ;; 47 | 48 | -- ) shift; break ;; 49 | * ) break ;; 50 | esac 51 | done 52 | 53 | hg_dict=${HUMAN_REFERENCE%\.fa*}.dict 54 | 55 | logdir=${outdir}/logs 56 | mkdir -p ${logdir} 57 | 58 | if [[ ${out_script_name} ]] 59 | then 60 | out_script="${out_script_name}" 61 | else 62 | out_script="${logdir}/sort.qname.${timestamp}.cmd" 63 | fi 64 | 65 | 66 | if [[ $standalone ]] 67 | then 68 | echo "#!/bin/bash" > $out_script 69 | echo "" >> $out_script 70 | echo "#$ -o ${logdir}" >> $out_script 71 | echo "#$ -e ${logdir}" >> $out_script 72 | echo "#$ -S /bin/bash" >> $out_script 73 | echo '#$ -l h_vmem=8G' >> $out_script 74 | echo 'set -e' >> $out_script 75 | fi 76 | 77 | 78 | echo "" >> $out_script 79 | 80 | echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 \\" >> $out_script 81 | echo "samtools sort -n -m 4G \\" >> $out_script 82 | echo "-o /mnt/${outdir}/${outbam} \\" >> $out_script 83 | echo "/mnt/${inbam} \\" >> $out_script 84 | echo "" >> $out_script 85 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/bamSimulator/bamSurgeon/bamsurgeon_split_BAM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt 
instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,genome-reference:,bam-out1:,bam-out2:,bam-in:,split-proportion:,down-sample:,seed:,out-script:,clean-bam,standalone -n 'bamsurgeon_split_BAM.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | seed=$( date +"%Y" ) 17 | proportion=0.5 18 | down_sample=1 19 | 20 | while true; do 21 | case "$1" in 22 | -o | --output-dir ) 23 | case "$2" in 24 | "") shift 2 ;; 25 | *) outdir=$2 ; shift 2 ;; 26 | esac ;; 27 | 28 | --bam-in ) 29 | case "$2" in 30 | "") shift 2 ;; 31 | *) inbam=$2 ; shift 2 ;; 32 | esac ;; 33 | 34 | --bam-out1 ) 35 | case "$2" in 36 | "") shift 2 ;; 37 | *) outbam1=$2 ; shift 2 ;; 38 | esac ;; 39 | 40 | --bam-out2 ) 41 | case "$2" in 42 | "") shift 2 ;; 43 | *) outbam2=$2 ; shift 2 ;; 44 | esac ;; 45 | 46 | --genome-reference ) 47 | case "$2" in 48 | "") shift 2 ;; 49 | *) HUMAN_REFERENCE=$2 ; shift 2 ;; 50 | esac ;; 51 | 52 | --split-proportion ) 53 | case "$2" in 54 | "") shift 2 ;; 55 | *) proportion=$2 ; shift 2 ;; 56 | esac ;; 57 | 58 | --down-sample ) 59 | case "$2" in 60 | "") shift 2 ;; 61 | *) down_sample=$2 ; shift 2 ;; 62 | esac ;; 63 | 64 | --seed ) 65 | case "$2" in 66 | "") shift 2 ;; 67 | *) seed=$2 ; shift 2 ;; 68 | esac ;; 69 | 70 | --out-script ) 71 | case "$2" in 72 | "") shift 2 ;; 73 | *) out_script_name=$2 ; shift 2 ;; 74 | esac ;; 75 | 76 | --clean-bam ) 77 | clean_bam=1 ; shift ;; 78 | 79 | --standalone ) 80 | standalone=1 ; shift ;; 81 | 82 | -- ) shift; break ;; 83 | * ) break ;; 84 | esac 85 | done 86 | 87 | hg_dict=${HUMAN_REFERENCE%\.fa*}.dict 88 | 89 | logdir=${outdir}/logs 90 | mkdir -p ${logdir} 91 | 92 | if [[ ${out_script_name} ]] 93 | then 94 | out_script="${out_script_name}" 95 | else 96 | 
out_script="${logdir}/splitBams.${timestamp}.cmd" 97 | fi 98 | 99 | if [[ $standalone ]] 100 | then 101 | echo "#!/bin/bash" > $out_script 102 | echo "" >> $out_script 103 | echo "#$ -o ${logdir}" >> $out_script 104 | echo "#$ -e ${logdir}" >> $out_script 105 | echo "#$ -S /bin/bash" >> $out_script 106 | echo '#$ -l h_vmem=8G' >> $out_script 107 | echo 'set -e' >> $out_script 108 | fi 109 | 110 | 111 | echo "" >> $out_script 112 | 113 | 114 | # Then you can split 115 | echo "singularity exec --bind /:/mnt docker://lethalfang/bamsurgeon:1.1-3 \\" >> $out_script 116 | echo "/usr/local/bamsurgeon/scripts/sortedBamSplit.py \\" >> $out_script 117 | echo "--bam /mnt/${inbam} \\" >> $out_script 118 | echo "--proportion ${proportion} \\" >> $out_script 119 | echo "--downsample ${down_sample} \\" >> $out_script 120 | echo "--pick1 /mnt/${outdir}/${outbam1} \\" >> $out_script 121 | echo "--pick2 /mnt/${outdir}/${outbam2} \\" >> $out_script 122 | echo "--seed ${seed}" >> $out_script 123 | echo "" >> $out_script 124 | 125 | echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 samtools index /mnt/${outdir}/${outbam1}" >> $out_script 126 | echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 samtools index /mnt/${outdir}/${outbam2}" >> $out_script 127 | echo "" >> $out_script 128 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/bamSimulator/bamSurgeon/cleanBam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,bam-out:,bam-in:,out-script:,standalone -n 'SortByReadName.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." 
>&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | seed=$( date +"%Y" ) 18 | 19 | while true; do 20 | case "$1" in 21 | -o | --output-dir ) 22 | case "$2" in 23 | "") shift 2 ;; 24 | *) outdir=$2 ; shift 2 ;; 25 | esac ;; 26 | 27 | --bam-in ) 28 | case "$2" in 29 | "") shift 2 ;; 30 | *) inbam=$2 ; shift 2 ;; 31 | esac ;; 32 | 33 | --bam-out ) 34 | case "$2" in 35 | "") shift 2 ;; 36 | *) outbam=$2 ; shift 2 ;; 37 | esac ;; 38 | 39 | --out-script ) 40 | case "$2" in 41 | "") shift 2 ;; 42 | *) out_script_name=$2 ; shift 2 ;; 43 | esac ;; 44 | 45 | --standalone ) 46 | standalone=1 ; shift ;; 47 | 48 | -- ) shift; break ;; 49 | * ) break ;; 50 | esac 51 | done 52 | 53 | if [[ ${out_script_name} ]] 54 | then 55 | out_script="${out_script_name}" 56 | else 57 | out_script="${logdir}/cleanBam.${timestamp}.cmd" 58 | fi 59 | 60 | if [[ $standalone ]] 61 | then 62 | echo "#!/bin/bash" > $out_script 63 | echo "" >> $out_script 64 | echo "#$ -o ${logdir}" >> $out_script 65 | echo "#$ -e ${logdir}" >> $out_script 66 | echo "#$ -S /bin/bash" >> $out_script 67 | echo '#$ -l h_vmem=4G' >> $out_script 68 | echo 'set -e' >> $out_script 69 | fi 70 | 71 | echo "" >> $out_script 72 | 73 | # To split a BAM file, first you must sort by name: 74 | echo "singularity exec --bind /:/mnt docker://lethalfang/bamsurgeon:1.1-3 \\" >> $out_script 75 | echo "/usr/local/bamsurgeon/scripts/remove_reads_with_many_qnames_or_bad_CIGAR.py \\" >> $out_script 76 | echo "-bamin /mnt/${inbam} \\" >> $out_script 77 | echo "-bamout /mnt/${outdir}/${outbam}" >> $out_script 78 | echo "" >> $out_script 79 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/bamSimulator/bamSurgeon/concatVcfFiles.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt 
instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,vcf-string:,vcf-out:,out-script:,standalone -n 'MergeTN.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | keep_intermediates=0 18 | 19 | while true; do 20 | case "$1" in 21 | -o | --output-dir ) 22 | case "$2" in 23 | "") shift 2 ;; 24 | *) outdir=$2 ; shift 2 ;; 25 | esac ;; 26 | 27 | --vcf-out ) 28 | case "$2" in 29 | "") shift 2 ;; 30 | *) outvcf=$2 ; shift 2 ;; 31 | esac ;; 32 | 33 | --vcf-string ) 34 | case "$2" in 35 | "") shift 2 ;; 36 | *) vcf_string=$2 ; shift 2 ;; 37 | esac ;; 38 | 39 | --out-script ) 40 | case "$2" in 41 | "") shift 2 ;; 42 | *) out_script_name=$2 ; shift 2 ;; 43 | esac ;; 44 | 45 | --standalone ) 46 | standalone=1 ; shift ;; 47 | 48 | -- ) shift; break ;; 49 | * ) break ;; 50 | esac 51 | done 52 | 53 | logdir=${outdir}/logs 54 | mkdir -p ${logdir} 55 | 56 | if [[ ${out_script_name} ]] 57 | then 58 | out_script="${out_script_name}" 59 | else 60 | out_script="${logdir}/concatVcfFiles.${timestamp}.cmd" 61 | fi 62 | 63 | if [[ $standalone ]] 64 | then 65 | echo "#!/bin/bash" > $out_script 66 | echo "" >> $out_script 67 | echo "#$ -o ${logdir}" >> $out_script 68 | echo "#$ -e ${logdir}" >> $out_script 69 | echo "#$ -S /bin/bash" >> $out_script 70 | echo '#$ -l h_vmem=2G' >> $out_script 71 | echo 'set -e' >> $out_script 72 | fi 73 | 74 | echo "" >> $out_script 75 | 76 | 77 | for file in ${vcf_string} 78 | do 79 | input_file_string="/mnt/${file} ${input_file_string}" 80 | done 81 | 82 | # Merge the BAM files 83 | echo "singularity exec --bind /:/mnt docker://lethalfang/vcftools:0.1.15 bash -c \\" >> $out_script 84 | echo "\"vcf-concat \\" >> $out_script 85 | echo "${input_file_string} \\" >> $out_script 86 | echo "> /mnt/${outdir}/${outvcf}\"" 
>> $out_script 87 | echo "" >> $out_script 88 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/bamSimulator/bamSurgeon/mergeBamFiles.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,bam-string:,bam-out:,out-script:,standalone -n 'MergeTN.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | keep_intermediates=0 18 | 19 | while true; do 20 | case "$1" in 21 | -o | --output-dir ) 22 | case "$2" in 23 | "") shift 2 ;; 24 | *) outdir=$2 ; shift 2 ;; 25 | esac ;; 26 | 27 | --bam-out ) 28 | case "$2" in 29 | "") shift 2 ;; 30 | *) outbam=$2 ; shift 2 ;; 31 | esac ;; 32 | 33 | --bam-string ) 34 | case "$2" in 35 | "") shift 2 ;; 36 | *) bam_string=$2 ; shift 2 ;; 37 | esac ;; 38 | 39 | --out-script ) 40 | case "$2" in 41 | "") shift 2 ;; 42 | *) out_script_name=$2 ; shift 2 ;; 43 | esac ;; 44 | 45 | --standalone ) 46 | standalone=1 ; shift ;; 47 | 48 | -- ) shift; break ;; 49 | * ) break ;; 50 | esac 51 | done 52 | 53 | logdir=${outdir}/logs 54 | mkdir -p ${logdir} 55 | 56 | if [[ ${out_script_name} ]] 57 | then 58 | out_script="${out_script_name}" 59 | else 60 | out_script="${logdir}/mergeBams.${timestamp}.cmd" 61 | fi 62 | 63 | if [[ $standalone ]] 64 | then 65 | echo "#!/bin/bash" > $out_script 66 | echo "" >> $out_script 67 | echo "#$ -o ${logdir}" >> $out_script 68 | echo "#$ -e ${logdir}" >> $out_script 69 | echo "#$ -S /bin/bash" >> $out_script 70 | echo '#$ -l h_vmem=8G' >> $out_script 71 | echo 'set -e' >> $out_script 72 | fi 73 | 74 | echo "" >> $out_script 75 | 76 | 77 | for file in ${bam_string} 78 | do 79 | 
input_file_string="I=/mnt/${file} ${input_file_string}" 80 | done 81 | 82 | # Merge the BAM files 83 | echo "singularity exec --bind /:/mnt docker://lethalfang/bamsurgeon:1.1-3 \\" >> $out_script 84 | echo "java -Xmx8g -jar /usr/local/bin/picard.jar MergeSamFiles \\" >> $out_script 85 | echo "${input_file_string} \\" >> $out_script 86 | echo "ASSUME_SORTED=true \\" >> $out_script 87 | echo "CREATE_INDEX=true \\" >> $out_script 88 | echo "O=/mnt/${outdir}/${outbam}" >> $out_script 89 | echo "" >> $out_script 90 | 91 | # Remove temp files 92 | echo "mv ${outdir}/${outbam%.bam}.bai ${outdir}/${outbam}.bai" >> $out_script 93 | echo "" >> $out_script 94 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/bamSimulator/bamSurgeon/split_BAM_by_BED.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,bam-in:,bam-out:,selector:,out-script:,standalone -n 'split_BAM_by_BED.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." 
>&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | while true; do 18 | case "$1" in 19 | -o | --output-dir ) 20 | case "$2" in 21 | "") shift 2 ;; 22 | *) outdir=$2 ; shift 2 ;; 23 | esac ;; 24 | 25 | --bam-in ) 26 | case "$2" in 27 | "") shift 2 ;; 28 | *) inbam=$2 ; shift 2 ;; 29 | esac ;; 30 | 31 | --bam-out ) 32 | case "$2" in 33 | "") shift 2 ;; 34 | *) outbam=$2 ; shift 2 ;; 35 | esac ;; 36 | 37 | --selector ) 38 | case "$2" in 39 | "") shift 2 ;; 40 | *) SELECTOR=$2 ; shift 2 ;; 41 | esac ;; 42 | 43 | --out-script ) 44 | case "$2" in 45 | "") shift 2 ;; 46 | *) out_script_name=$2 ; shift 2 ;; 47 | esac ;; 48 | 49 | --standalone ) 50 | standalone=1 ; shift ;; 51 | 52 | -- ) shift; break ;; 53 | * ) break ;; 54 | esac 55 | done 56 | 57 | 58 | logdir=${outdir}/logs 59 | mkdir -p ${logdir} 60 | 61 | if [[ ${out_script_name} ]] 62 | then 63 | out_script="${out_script_name}" 64 | else 65 | out_script="${logdir}/splitByBed.${timestamp}.cmd" 66 | fi 67 | 68 | if [[ $standalone ]] 69 | then 70 | echo "#!/bin/bash" > $out_script 71 | echo "" >> $out_script 72 | echo "#$ -o ${logdir}" >> $out_script 73 | echo "#$ -e ${logdir}" >> $out_script 74 | echo "#$ -S /bin/bash" >> $out_script 75 | echo '#$ -l h_vmem=4G' >> $out_script 76 | echo 'set -e' >> $out_script 77 | fi 78 | 79 | echo "" >> $out_script 80 | 81 | 82 | echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 bash -c \\" >> $out_script 83 | echo "\"samtools view /mnt/${inbam} -L /mnt/${SELECTOR} -Sbh \\" >> $out_script 84 | echo "> /mnt/${outdir}/${outbam}\"" >> $out_script 85 | 86 | echo "" >> $out_script 87 | 88 | echo "singularity exec --bind /:/mnt docker://lethalfang/samtools:1.7 \\" >> $out_script 89 | echo "samtools index /mnt/${outdir}/${outbam}" >> $out_script 90 | -------------------------------------------------------------------------------- 
/somaticseq/utilities/singularities/germline_variants/Nirvana.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long out-dir:,in-vcf:,nirvana-resources-dir:,sample:,extra-arguments:,action:,MEM:,threads:,out-script:,standalone -n 'manta.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi 9 | 10 | #echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | action=echo 17 | MEM=8 18 | threads=12 19 | sampleID='Nirvana' 20 | 21 | while true; do 22 | case "$1" in 23 | 24 | -o | --out-dir ) 25 | case "$2" in 26 | "") shift 2 ;; 27 | *) outdir=$2 ; shift 2 ;; 28 | esac ;; 29 | 30 | --in-vcf ) 31 | case "$2" in 32 | "") shift 2 ;; 33 | *) inVcf=$2 ; shift 2 ;; 34 | esac ;; 35 | 36 | --nirvana-resources-dir ) 37 | case "$2" in 38 | "") shift 2 ;; 39 | *) NIRVANA_RESOURCES_DIR=$2 ; shift 2 ;; 40 | esac ;; 41 | 42 | --sample ) 43 | case "$2" in 44 | "") shift 2 ;; 45 | *) sampleID=$2 ; shift 2 ;; 46 | esac ;; 47 | 48 | --extra-arguments ) 49 | case "$2" in 50 | "") shift 2 ;; 51 | *) extra_arguments=$2 ; shift 2 ;; 52 | esac ;; 53 | 54 | --out-script ) 55 | case "$2" in 56 | "") shift 2 ;; 57 | *) out_script_name=$2 ; shift 2 ;; 58 | esac ;; 59 | 60 | --threads ) 61 | case "$2" in 62 | "") shift 2 ;; 63 | *) threads=$2 ; shift 2 ;; 64 | esac ;; 65 | 66 | --action ) 67 | case "$2" in 68 | "") shift 2 ;; 69 | *) action=$2 ; shift 2 ;; 70 | esac ;; 71 | 72 | --standalone ) 73 | standalone=1 ; shift ;; 74 | 75 | -- ) shift; break ;; 76 | * ) break ;; 77 | esac 78 | 79 | done 80 | 81 | logdir=${outdir}/logs 82 | mkdir -p ${logdir} 83 | 84 | if [[ ${out_script_name} ]] 85 | then 86 | out_script="${out_script_name}" 87 | else 88 | out_script="${logdir}/canvas.${timestamp}.cmd" 89 | fi 90 | 91 | 92 | if [[ 
$standalone ]] 93 | then 94 | echo "#!/bin/bash" > $out_script 95 | echo "" >> $out_script 96 | echo "#$ -o ${logdir}" >> $out_script 97 | echo "#$ -e ${logdir}" >> $out_script 98 | echo "#$ -S /bin/bash" >> $out_script 99 | echo '#$ -l h_vmem=4G' >> $out_script 100 | echo "#$ -pe smp ${threads}" >> $out_script 101 | echo 'set -e' >> $out_script 102 | fi 103 | 104 | echo "" >> $out_script 105 | echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script 106 | echo "" >> $out_script 107 | 108 | 109 | echo "singularity exec --bind /:/mnt docker://lethalfang/nirvana:2.0.9 \\" >> $out_script 110 | echo "dotnet /opt/Nirvana/bin/Release/netcoreapp2.0/Nirvana.dll \\" >> $out_script 111 | echo "-c /mnt/${NIRVANA_RESOURCES_DIR}/Cache/26/GRCh38/Ensembl \\" >> $out_script 112 | echo "--sd /mnt/${NIRVANA_RESOURCES_DIR}/GRCh38 \\" >> $out_script 113 | echo "-r /mnt/${NIRVANA_RESOURCES_DIR}/References/5/Homo_sapiens.GRCh38.Nirvana.dat \\" >> $out_script 114 | echo "-i /mnt/${inVcf} \\" >> $out_script 115 | echo "-o /mnt/${outdir}/${sampleID}" >> $out_script 116 | 117 | echo "" >> $out_script 118 | echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script 119 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/germline_variants/bam2vcf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long output-dir:,bam:,out-vcf:,genome-reference:,dbsnp:,hapmap:,omni:,thousandG:,mills:,out-script:,action:,threads:, -n 'bam2vcf.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." 
>&2 ; exit 1 ; fi 9 | 10 | echo "$OPTS" 11 | eval set -- "$OPTS" 12 | 13 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 14 | 15 | timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" ) 16 | 17 | tumor_bam_header='@RG\tID:myPipeline\tLB:myLibrary\tPL:illumina\tSM:TUMOR' 18 | normal_bam_header='@RG\tID:myPipeline\tLB:myLibrary\tPL:illumina\tSM:NORMAL' 19 | MEM=16 20 | threads=24 21 | action=echo 22 | 23 | while true; do 24 | case "$1" in 25 | 26 | -o | --output-dir ) 27 | case "$2" in 28 | "") shift 2 ;; 29 | *) outdir=$2 ; shift 2 ;; 30 | esac ;; 31 | 32 | --bam ) 33 | case "$2" in 34 | "") shift 2 ;; 35 | *) bam=$2 ; shift 2 ;; 36 | esac ;; 37 | 38 | --out-vcf ) 39 | case "$2" in 40 | "") shift 2 ;; 41 | *) outVcf=$2 ; shift 2 ;; 42 | esac ;; 43 | 44 | 45 | --genome-reference ) 46 | case "$2" in 47 | "") shift 2 ;; 48 | *) GENOME_REFERENCE=$2 ; shift 2 ;; 49 | esac ;; 50 | 51 | --dbsnp ) 52 | case "$2" in 53 | "") shift 2 ;; 54 | *) dbsnp=$2 ; shift 2 ;; 55 | esac ;; 56 | 57 | --hapmap ) 58 | case "$2" in 59 | "") shift 2 ;; 60 | *) hapmapFile=$2 ; shift 2 ;; 61 | esac ;; 62 | 63 | --thousandG ) 64 | case "$2" in 65 | "") shift 2 ;; 66 | *) thousandGFile=$2 ; shift 2 ;; 67 | esac ;; 68 | 69 | --omni ) 70 | case "$2" in 71 | "") shift 2 ;; 72 | *) omniFile=$2 ; shift 2 ;; 73 | esac ;; 74 | 75 | --mills ) 76 | case "$2" in 77 | "") shift 2 ;; 78 | *) millsFile=$2 ; shift 2 ;; 79 | esac ;; 80 | 81 | --threads ) 82 | case "$2" in 83 | "") shift 2 ;; 84 | *) threads=$2 ; shift 2 ;; 85 | esac ;; 86 | 87 | --MEM ) 88 | case "$2" in 89 | "") shift 2 ;; 90 | *) MEM=$2 ; shift 2 ;; 91 | esac ;; 92 | 93 | --out-script ) 94 | case "$2" in 95 | "") shift 2 ;; 96 | *) out_script_name=$2 ; shift 2 ;; 97 | esac ;; 98 | 99 | --action ) 100 | case "$2" in 101 | "") shift 2 ;; 102 | *) action=$2 ; shift 2 ;; 103 | esac ;; 104 | 105 | -- ) shift; break ;; 106 | * ) break ;; 107 | 108 | esac 109 | done 110 | 111 | 112 | 113 | logdir=${outdir}/logs 114 | mkdir -p ${logdir} 115 | 116 | 117 | if [[ 
${out_script_name} ]] 118 | then 119 | out_script="${logdir}/${out_script_name}" 120 | else 121 | out_script="${logdir}/bam2vcf.${timestamp}.cmd" 122 | fi 123 | 124 | 125 | echo "#!/bin/bash" > $out_script 126 | echo "" >> $out_script 127 | 128 | echo "#$ -o ${logdir}" >> $out_script 129 | echo "#$ -e ${logdir}" >> $out_script 130 | echo "#$ -S /bin/bash" >> $out_script 131 | echo "#$ -l h_vmem=6G" >> $out_script 132 | echo "#$ -pe smp ${threads}" >> $out_script 133 | 134 | echo 'set -e' >> $out_script 135 | echo "" >> $out_script 136 | 137 | files_to_delete='' 138 | 139 | echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script 140 | echo "" >> $out_script 141 | 142 | 143 | 144 | $MYDIR/haplotypeCaller.sh \ 145 | --out-dir ${outdir} \ 146 | --bam ${bam} \ 147 | --human-reference ${GENOME_REFERENCE} \ 148 | --dbsnp ${dbsnp} \ 149 | --out-vcf preVQSR.${outVcf} \ 150 | --threads ${threads} \ 151 | --MEM ${MEM} \ 152 | --out-script ${out_script} 153 | 154 | $MYDIR/VQSR.sh \ 155 | --out-dir ${outdir} \ 156 | --in-vcf ${outdir}/preVQSR.${outVcf} \ 157 | --human-reference ${GENOME_REFERENCE} \ 158 | --dbsnp ${dbsnp} \ 159 | --hapmap ${hapmapFile} \ 160 | --omni ${omniFile} \ 161 | --thousandG ${thousandGFile} \ 162 | --mills ${millsFile} \ 163 | --out-vcf ${outVcf} \ 164 | --out-script ${out_script} 165 | -------------------------------------------------------------------------------- /somaticseq/utilities/singularities/germline_variants/haplotypeCaller.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Use getopt instead of getopts for long options 3 | 4 | set -e 5 | 6 | OPTS=`getopt -o o: --long out-dir:,out-vcf:,bam:,human-reference:,selector:,dbsnp:,extra-arguments:,action:,MEM:,threads:,out-script:,standalone -n 'gatk_haplotypecaller.sh' -- "$@"` 7 | 8 | if [ $? != 0 ] ; then echo "Failed parsing options." 
>&2 ; exit 1 ; fi

#echo "$OPTS"
# Replace the positional parameters with the getopt-normalized option list.
eval set -- "$OPTS"

# Absolute directory of this script (kept for consistency with the sibling
# wrapper scripts).
MYDIR="$( cd "$( dirname "$0" )" && pwd )"

# Nanosecond-resolution timestamp used to name the generated .cmd script.
timestamp=$( date +"%Y-%m-%d_%H-%M-%S_%N" )
# Defaults: 'echo' the generated script path instead of executing it,
# 8 GB JVM heap, 12 pair-HMM threads.
action=echo
MEM=8
threads=12

# Walk the normalized options; every value-taking option consumes two
# positional parameters (the flag and its value).
while true; do
    case "$1" in

        -o | --out-dir )
            case "$2" in "") shift 2 ;; *) outdir=$2 ; shift 2 ;; esac ;;

        --out-vcf )
            case "$2" in "") shift 2 ;; *) outVcfName=$2 ; shift 2 ;; esac ;;

        --bam )
            case "$2" in "") shift 2 ;; *) bamFile=$2 ; shift 2 ;; esac ;;

        --human-reference )
            case "$2" in "") shift 2 ;; *) HUMAN_REFERENCE=$2 ; shift 2 ;; esac ;;

        --selector )
            case "$2" in "") shift 2 ;; *) SELECTOR=$2 ; shift 2 ;; esac ;;

        --dbsnp )
            case "$2" in "") shift 2 ;; *) dbsnp=$2 ; shift 2 ;; esac ;;

        --MEM )
            case "$2" in "") shift 2 ;; *) MEM=$2 ; shift 2 ;; esac ;;

        --threads )
            case "$2" in "") shift 2 ;; *) threads=$2 ; shift 2 ;; esac ;;

        --extra-arguments )
            case "$2" in "") shift 2 ;; *) extra_arguments=$2 ; shift 2 ;; esac ;;

        --out-script )
            case "$2" in "") shift 2 ;; *) out_script_name=$2 ; shift 2 ;; esac ;;

        --action )
            case "$2" in "") shift 2 ;; *) action=$2 ; shift 2 ;; esac ;;

        # Boolean flag: emit a self-contained script with its own SGE header.
        --standalone )
            standalone=1 ; shift ;;

        -- ) shift; break ;;
        * ) break ;;
    esac

done

logdir=${outdir}/logs
mkdir -p ${logdir}

# Write commands into the caller-supplied script path, or a timestamped
# default under the log directory.
if [[ ${out_script_name} ]]
then
    out_script="${out_script_name}"
else
    out_script="${logdir}/HaplotypeCaller.${timestamp}.cmd"
fi


# In standalone mode, start the script from scratch (note the truncating '>')
# with an SGE/qsub preamble; otherwise this section is skipped and commands
# are appended to a script a parent wrapper already started.
if [[ $standalone ]]
then
    echo "#!/bin/bash" > $out_script
    echo "" >> $out_script
    echo "#$ -o ${logdir}" >> $out_script
    echo "#$ -e ${logdir}" >>
$out_script
    echo "#$ -S /bin/bash" >> $out_script
    echo '#$ -l h_vmem=12G' >> $out_script
    echo "#$ -pe smp ${threads}" >> $out_script
    echo 'set -e' >> $out_script
fi

echo "" >> $out_script
# Have the generated script log its wall-clock start time to stderr.
echo 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
echo "" >> $out_script


# Restrict calling to the selector BED if one was given. Host paths are
# rewritten under /mnt because the container binds host / at /mnt below.
if [[ ${SELECTOR} ]]
then
    selector_text="-L /mnt/${SELECTOR}"
fi

dbsnp_text=''
if [[ ${dbsnp} ]]; then
    dbsnp_text="--dbsnp /mnt/${dbsnp}"
fi


# Emit the GATK4 HaplotypeCaller invocation, run through Singularity from the
# broadinstitute/gatk docker image with the host filesystem bound at /mnt.
echo "singularity exec --bind /:/mnt docker://broadinstitute/gatk:4.0.5.2 \\" >> $out_script
echo "java -Xmx${MEM}g -jar /gatk/gatk.jar \\" >> $out_script
echo "HaplotypeCaller \\" >> $out_script
echo "--reference /mnt/${HUMAN_REFERENCE} \\" >> $out_script
echo "--input /mnt/${bamFile} \\" >> $out_script
echo "--native-pair-hmm-threads ${threads} \\" >> $out_script
echo "$selector_text \\" >> $out_script
echo "$dbsnp_text \\" >> $out_script
echo "${extra_arguments} \\" >> $out_script
echo "--output /mnt/${outdir}/${outVcfName}" >> $out_script

echo "" >> $out_script

# Log completion time.
echo 'echo -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2' >> $out_script
6 | 7 | For SomaticSeq, it is sufficient to include the top-level directory and the 8 | utilities directory: 9 | 10 | ``` 11 | SOMATICSEQ_HOME=/path/to/somaticseq 12 | export PATH=$SOMATICSEQ_HOME:$SOMATICSEQ_HOME/utilities:$PATH 13 | ``` 14 | 15 | Example usage: 16 | 17 | ``` 18 | snakemake \ 19 | -j \ 20 | --config \ 21 | tumor=/ABSOLUTE/PATH/TO/tumor.bam \ 22 | normal=/ABSOLUTE/PATH/TO/normal.bam \ 23 | reference=/ABSOLUTE/PATH/TO/GRCh38.fa \ 24 | dbsnp=/ABSOLUTE/PATH/TO/dbSNP.GRCh38.vcf \ 25 | gatk=/ABSOLUTE/PATH/TO/GATK.jar \ 26 | varscan=/ABSOLUTE/PATH/TO/VarScan.jar \ 27 | caller_threads=36 \ 28 | somaticseq 29 | ``` 30 | 31 | **caller_threads** is the number of threads to be used for each of the variant 32 | callers that support parallelization. 33 | 34 | The `config.yaml` file specifies default options, mostly for specifying which 35 | variant callers' results you'd like to feed into SomaticSeq. You may pass those 36 | options on the command line, as is done for `caller_threads` above, and whatever 37 | is passed on the command line will override what is specified in the 38 | configuration file. 39 | -------------------------------------------------------------------------------- /somaticseq/utilities/snakemake/config.yaml: -------------------------------------------------------------------------------- 1 | lofreq: True 2 | muse: True 3 | mutect2: True 4 | scalpel: True 5 | sniper: True 6 | strelka: True 7 | vardict: True 8 | varscan: True 9 | # The number of threads to use for each variant caller that supports 10 | # parallelization. 
#!/usr/bin/env python3

import argparse
import logging
import os
import subprocess
import tempfile
import uuid

import pysam

COSMIC_STRING = "GENE,CDS,AA,CNT"
DBSNP_STRING = (
    "RSPOS,GENEINFO,dbSNPBuildID,SAO,SSR,VC,PM,MUT,KGPhase1,KGPhase3,OM,CDA,CAF,COMMON"
)


def _snpsift_annotate(
    snpsift_jar, input_vcf, annotation_vcf, output_vcf, info_string, logger_name
):
    """Run ``SnpSift annotate`` to copy INFO fields onto matching records.

    Shared implementation behind snpsift_snp() and snpsift_cosmic(), which
    were previously two byte-identical copies of this code.

    Args:
        snpsift_jar: path to the SnpSift JAR.
        input_vcf: VCF to be annotated.
        annotation_vcf: VCF supplying the annotations (e.g., dbSNP or COSMIC).
        output_vcf: path for the annotated output VCF.
        info_string: comma-separated INFO field names to transfer.
        logger_name: name under which the command line is logged.

    Returns:
        output_vcf.

    Raises:
        subprocess.CalledProcessError: if the java command exits nonzero.
    """
    logger = logging.getLogger(logger_name)
    # shell=True is required for the stdout redirection; the pieces of the
    # command are caller-supplied file paths, not untrusted input.
    sift_command = "java -Xmx8g -jar {} annotate -info {} {} {} > {}".format(
        snpsift_jar, info_string, annotation_vcf, input_vcf, output_vcf
    )
    logger.info(sift_command)
    subprocess.check_call(sift_command, shell=True)
    return output_vcf


def snpsift_snp(snpsift_jar, input_vcf, dbsnp_vcf, output_vcf, info_string):
    """Annotate input_vcf with dbSNP INFO fields. Returns output_vcf."""
    return _snpsift_annotate(
        snpsift_jar,
        input_vcf,
        dbsnp_vcf,
        output_vcf,
        info_string,
        snpsift_snp.__name__,
    )


def snpsift_cosmic(snpsift_jar, input_vcf, cosmic_vcf, output_vcf, info_string):
    """Annotate input_vcf with COSMIC INFO fields. Returns output_vcf."""
    return _snpsift_annotate(
        snpsift_jar,
        input_vcf,
        cosmic_vcf,
        output_vcf,
        info_string,
        snpsift_cosmic.__name__,
    )


def snpeff_annotate(snpeff_jar, input_vcf, output_vcf, db):
    """Run snpEff on input_vcf against database ``db``. Returns output_vcf.

    Raises:
        subprocess.CalledProcessError: if the java command exits nonzero.
    """
    logger = logging.getLogger(snpeff_annotate.__name__)
    eff_command = "java -Xmx8g -jar {} -noStats {} {} > {}".format(
        snpeff_jar, db, input_vcf, output_vcf
    )
    logger.info(eff_command)
    subprocess.check_call(eff_command, shell=True)
    return output_vcf


def annotate_small_variants(
    snpsift_jar,
    snpeff_jar,
    input_vcf,
    dbsnp_vcf,
    cosmic_vcf,
    output_vcf,
    snp_string,
    cosmic_string,
    eff_db,
):
    """Annotate a VCF with dbSNP, then COSMIC, then snpEff; bgzip and index it.

    Intermediate VCFs are written to the temp directory and removed even if
    one of the annotation steps fails (previously they were only cleaned up
    on success).

    Returns:
        Path of the final bgzipped output ("<output_vcf>.gz").
    """
    dirname = tempfile.gettempdir()
    dbsnp_annotated = os.path.join(dirname, uuid.uuid4().hex + ".vcf")
    cosmic_annotated = os.path.join(dirname, uuid.uuid4().hex + ".vcf")
    try:
        snpsift_snp(snpsift_jar, input_vcf, dbsnp_vcf, dbsnp_annotated, snp_string)
        snpsift_cosmic(
            snpsift_jar, dbsnp_annotated, cosmic_vcf, cosmic_annotated, cosmic_string
        )
        output_vcf = snpeff_annotate(snpeff_jar, cosmic_annotated, output_vcf, eff_db)
    finally:
        # Remove intermediates regardless of whether a java step failed.
        for intermediate_vcf in (dbsnp_annotated, cosmic_annotated):
            if os.path.exists(intermediate_vcf):
                os.remove(intermediate_vcf)
    # Compress to .vcf.gz and create the tabix index.
    pysam.tabix_index(output_vcf, force=True, preset="vcf")
    return output_vcf + ".gz"


def main() -> None:
    """Command-line entry point: parse arguments and run the annotation chain."""
    FORMAT = "%(levelname)s %(asctime)-15s %(name)-20s %(message)s"
    logging.basicConfig(level=logging.INFO, format=FORMAT)
    parser = argparse.ArgumentParser(
        description="Annotate with snpSift and snpEff with dbSNP and COSMIC",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("-infile", "--infile", help="input vcf file")
    parser.add_argument("-outfile", "--outfile", help="output vcf file")
    parser.add_argument(
        "-dbsnp", "--dbsnp", help="dbsnp vcf file to feed into GATK4 HaplotypeCaller"
    )
    parser.add_argument(
        "-cosmic", "--cosmic", help="cosmic vcf file to feed into GATK4 HaplotypeCaller"
    )
    parser.add_argument("-snpsift", "--snpsift", help="SnpSift JAR")
    parser.add_argument("-snpeff", "--snpeff", help="snpEff JAR")
    parser.add_argument("-db", "--snpeff-db", help="snpEff db", default="GRCh38.86")
    args = parser.parse_args()
    annotate_small_variants(
        args.snpsift,
        args.snpeff,
        args.infile,
        args.dbsnp,
        args.cosmic,
        args.outfile,
        DBSNP_STRING,
        COSMIC_STRING,
        args.snpeff_db,
    )


if __name__ == "__main__":
    main()
#!/usr/bin/perl
use strict;
use warnings;

# LICENSE: This file licensed under the GNU GPL v3

# Retrieved from: https://code.google.com/p/vcfsorter/ linking to
# https://drive.google.com/file/d/0B7jV6rjPCUApR1ZnWTMzakZfN3M/view?usp=sharing

######################################################
# vcfsorter.pl
#
# Copyright (C) 2011 German Gaston Leparc
#
# sorts VCF by reference genome
#
# usage:
#
# vcfsorter.pl genome.dict myvcf.file > mynewvcf.file
#
######################################################

# NOTE(review): the here-doc and the <FILEHANDLE> readline operators below
# were stripped by angle-bracket mangling in the dump ("my $usage = <",
# "while()"); they are restored here to their only syntactically valid form.
my $usage = <<EOF;

vcfsorter.pl genome.dict myvcf.file > mynewvcf.file 2>STDERR
EOF


my $dict_file = $ARGV[0];
my $vcf_file = $ARGV[1];

die "\nERROR: missing an argument!\n\n$usage" if (@ARGV < 2);


#---------------------------------------- LOAD IN FASTA DICT INTO MEMORY
open(DICT,$dict_file) or die "Can't open $dict_file!\n";
my @contig_order;
my $c=0;
while(<DICT>)
{
    if($_=~ /\@SQ/)
    {
        # Contig name is the SN: tag of each @SQ header line.
        my ($contig) = $_ =~ /SN:(\S+)/;
        $contig_order[$c]=$contig;
        ++$c;
        #print $contig,"\n";
    }
}
close(DICT);

#---------------------------------------- PARSE VCF FILE & OUTPUT SORTED VCF

open(VCF,$vcf_file) or die "Can't open $vcf_file!\n";

my %vcf_hash;
my $header;

while(<VCF>)
{
    if($_=~/^#/){ $header .= $_; next; } # store header and comment fields
    chomp($_);

    my @data = split(/\t/,$_);
    my $contig = $data[0]; #CHROM
    my $start = $data[1]; #POS
    my $variant = $data[3]."to".$data[4]; #REF and ALT
    my $line = $_;

    #print $contig,":",$start," ",$variant,"\n";

    # Key records by contig, position, and REF/ALT so overlapping variants
    # at the same position are all kept.
    $vcf_hash{$contig}{$start}{$variant}=$line;

}
close(VCF);

#------------------ print out the VCF in the order of the reference genome

#print standard
def resolve_complex_variants_into_snvs_and_indels(
    refbases: str, altbases: str
) -> list[dict]:
    """Decompose a complex variant into simple SNVs plus at most one indel.

    Each returned record is a dict with keys OFFSET (0-based shift from the
    variant's POS), REF, and ALT. A plain SNV or a simple indel is returned
    unchanged as a single record.
    """
    as_is = [{"OFFSET": 0, "REF": refbases, "ALT": altbases}]

    # Single-base REF and ALT: already a plain SNV.
    if len(refbases) == 1 and len(altbases) == 1:
        return as_is
    # One allele is a single base and the leading base matches: a simple indel.
    if (len(refbases) == 1 or len(altbases) == 1) and refbases[0] == altbases[0]:
        return as_is

    decomposed: list[dict] = []
    # Walk the shared prefix (zip stops at the shorter allele), emitting an
    # SNV wherever the two alleles disagree.
    for offset, (ref_nt, alt_nt) in enumerate(zip(refbases, altbases)):
        if ref_nt != alt_nt:
            decomposed.append({"OFFSET": offset, "REF": ref_nt, "ALT": alt_nt})
    # Any remaining length difference becomes a single indel anchored at the
    # last compared position.
    if len(refbases) > len(altbases):  # deletion of the extra REF tail
        decomposed.append(
            {"OFFSET": offset, "REF": refbases[offset:], "ALT": refbases[offset]}
        )
    elif len(altbases) > len(refbases):  # insertion of the extra ALT tail
        decomposed.append(
            {
                "OFFSET": offset,
                "REF": refbases[offset],
                "ALT": refbases[offset] + altbases[offset + 1 :],
            }
        )
    return decomposed
def open_textfile(file_name):
    """Open a text file for reading, transparently handling gzip files.

    Gzip compression is detected by the ".gz" suffix alone
    (case-insensitive), not by content sniffing.
    """
    is_gzipped = file_name.lower().endswith(".gz")
    return gzip.open(file_name, "rt") if is_gzipped else open(file_name)


def run():
    """Parse command-line arguments; return (input_vcf_list, output_vcf)."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-vcfs",
        "--input-vcfs",
        nargs="*",
        type=str,
        help="Input VCF file",
        required=True,
        default=None,
    )
    parser.add_argument(
        "-out", "--output-vcf", type=str, help="Output VCF file", required=True
    )

    args = parser.parse_args()
    return args.input_vcfs, args.output_vcf
#!/usr/bin/env python3
# flake8: noqa: E501

import os


def leftAlign(infile, outfile, ref, gatk3):
    """Left-align and trim variants in a VCF using GATK3's LeftAlignAndTrimVariants.

    Runs GATK3 through the shell, filtering its progress chatter
    ("N variants" / "INFO" lines) out of the stream written to ``outfile``.

    Args:
        infile: path to the input VCF.
        outfile: path for the left-aligned output VCF; must differ from infile.
        ref: path to the reference FASTA.
        gatk3: path to the GATK3 JAR.

    Returns:
        outfile, for chaining.

    Raises:
        ValueError: if infile and outfile are the same path.
        RuntimeError: if the shell pipeline exits with a nonzero status.
    """
    if infile == outfile:
        raise ValueError("infile and outfile must be different paths")
    command = (
        """java -jar {} -T LeftAlignAndTrimVariants -R {} --variant {} """
        """| egrep -v '^[0-9]+ variants|^INFO' > {}""".format(gatk3, ref, infile, outfile)
    )
    exit_code = os.system(command)
    # os.system returns 0 on success. The original `assert exit_code` was
    # inverted: it passed on *failure* and raised AssertionError on success.
    if exit_code != 0:
        raise RuntimeError(
            "Command failed with status {}: {}".format(exit_code, command)
        )
    return outfile
args = parser.parse_args() 24 | infile = args.input_vcf 25 | outfile = args.output_vcf 26 | 27 | return infile, outfile 28 | 29 | 30 | def convert(infile, outfile): 31 | idx_format, idx_SM1, idx_SM2 = 8, 9, 10 32 | with genome.open_textfile(infile) as vcf, open(outfile, "w") as vcfout: 33 | line_i = vcf.readline().rstrip() 34 | 35 | # VCF header 36 | while line_i.startswith("#"): 37 | if line_i.startswith("##FORMAT= 0.8: 60 | normal_gt = "1/1" 61 | elif vaf > 0.25: 62 | normal_gt = "0/1" 63 | else: 64 | normal_gt = "0/0" 65 | 66 | item_normal[idx_ad] = "{},{}".format( 67 | item_normal[idx_rd], item_normal[idx_ad] 68 | ) 69 | item_normal.pop(idx_rd) 70 | item_normal = [normal_gt] + item_normal 71 | 72 | # TUMOR 73 | item_tumor = item[idx_SM2].split(":") 74 | tumor_ad = int(item_tumor[idx_ad]) 75 | tumor_rd = int(item_tumor[idx_rd]) 76 | 77 | try: 78 | vaf = tumor_ad / (tumor_ad + tumor_rd) 79 | except ZeroDivisionError: 80 | vaf = 0 81 | 82 | if vaf > 0.8: 83 | tumor_gt = "1/1" 84 | else: 85 | tumor_gt = "0/1" 86 | 87 | item_tumor[idx_ad] = "{},{}".format( 88 | item_tumor[idx_rd], item_tumor[idx_ad] 89 | ) 90 | item_tumor.pop(idx_rd) 91 | item_tumor = [tumor_gt] + item_tumor 92 | 93 | # Rewrite 94 | item[idx_format] = "GT:" + ":".join(format_items) 95 | item[idx_SM1] = ":".join(item_normal) 96 | item[idx_SM2] = ":".join(item_tumor) 97 | 98 | line_i = "\t".join(item) 99 | vcfout.write(line_i + "\n") 100 | line_i = vcf.readline().rstrip() 101 | 102 | 103 | if __name__ == "__main__": 104 | infile, outfile = run() 105 | convert(infile, outfile) 106 | -------------------------------------------------------------------------------- /somaticseq/vcf_modifier/modify_SomaticSniper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import re 5 | 6 | import somaticseq.genomic_file_parsers.genomic_file_handlers as genome 7 | 8 | 9 | def run(): 10 | parser = argparse.ArgumentParser( 11 | 
def convert(infile, outfile):
    """Sanitize a SomaticSniper VCF so it conforms to the VCF standard.

    Header lines are copied through unchanged; in each record, any character
    in the REF column outside G/C/T/A (case-insensitive) is replaced by N.
    """
    ref_column = 3
    non_gcta = re.compile(r"[^GCTA]", re.I)
    with genome.open_textfile(infile) as vcf, open(outfile, "w") as vcfout:
        record = vcf.readline().rstrip()
        # Pass the header straight through.
        while record.startswith("#"):
            vcfout.write(record + "\n")
            record = vcf.readline().rstrip()
        # Process records until EOF (or a blank line, as before).
        while record:
            columns = record.split("\t")
            columns[ref_column] = non_gcta.sub("N", columns[ref_column])
            vcfout.write("\t".join(columns) + "\n")
            record = vcf.readline().rstrip()


if __name__ == "__main__":
    infile, outfile = run()
    convert(infile, outfile)
import argparse

import somaticseq.genomic_file_parsers.genomic_file_handlers as genome


def run():
    """Parse command-line arguments; return (input_vcf, output_vcf) paths."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-infile", "--input-vcf", type=str, help="Input VCF file", required=True
    )
    parser.add_argument(
        "-outfile", "--output-vcf", type=str, help="Output VCF file", required=True
    )
    args = parser.parse_args()
    return args.input_vcf, args.output_vcf


def convert(infile, outfile):
    """Prepend a dummy GT (0/1) to every sample column of a Strelka VCF.

    Strelka emits no GT field; downstream merging expects one, so the FORMAT
    column gains a leading "GT" key and every sample column a matching "0/1".
    The actual genotype content is irrelevant for that purpose.
    """
    with genome.open_textfile(infile) as vcf_in, open(outfile, "w") as vcf_out:
        record = vcf_in.readline().rstrip()

        # Copy the ## meta-information lines verbatim.
        while record.startswith("##"):
            vcf_out.write(record + "\n")
            record = vcf_in.readline().rstrip()

        # record now holds the #CHROM column-header line; remember how many
        # columns each record has so all sample columns get patched.
        column_count = len(record.split("\t"))
        vcf_out.write(record + "\n")

        record = vcf_in.readline().rstrip()
        while record:
            columns = record.split("\t")
            columns[8] = "GT:" + columns[8]
            for sample_index in range(9, column_count):
                columns[sample_index] = "0/1:" + columns[sample_index]
            vcf_out.write("\t".join(columns) + "\n")
            record = vcf_in.readline().rstrip()


if __name__ == "__main__":
    infile, outfile = run()
    convert(infile, outfile)
@pytest.fixture(scope="session")
def test_datadir(test_rootdir: Path) -> Path:
    # Directory holding the tiny example inputs shared by all tests.
    return test_rootdir / "example"


# Session-scoped inputs: tiny BAMs, reference FASTA, dbSNP, and truth VCF.
@pytest.fixture(scope="session")
def tiny_tumor_bam(test_datadir: Path) -> str:
    return os.fspath(test_datadir / "tumor.markdup.bam")


@pytest.fixture(scope="session")
def tiny_normal_bam(test_datadir: Path) -> str:
    return os.fspath(test_datadir / "normal.markdup.bam")


@pytest.fixture(scope="session")
def tiny_fasta(test_datadir: Path) -> str:
    return os.fspath(test_datadir / "tiny.fa")


@pytest.fixture(scope="session")
def tiny_dbsnp_vcf(test_datadir: Path) -> str:
    return os.fspath(test_datadir / "tiny_dbsnp.vcf")


@pytest.fixture(scope="session")
def tiny_truth_vcf(test_datadir: Path) -> str:
    return os.fspath(test_datadir / "Varsim.somatic.truth.vcf")


# Function-scoped paths to precomputed tumor-normal caller outputs.
@pytest.fixture
def tiny_paired_mutect2_vcf(test_datadir: Path) -> str:
    return os.fspath(test_datadir / "paired_example" / "MuTect2.vcf.gz")


@pytest.fixture
def tiny_paired_somaticsniper_vcf(test_datadir: Path) -> str:
    return os.fspath(test_datadir / "paired_example" / "SomaticSniper.vcf.gz")


@pytest.fixture
def tiny_paired_vardict_vcf(test_datadir: Path) -> str:
    return os.fspath(test_datadir / "paired_example" / "VarDict.vcf.gz")


@pytest.fixture
def tiny_paired_muse_vcf(test_datadir: Path) -> str:
    return os.fspath(test_datadir / "paired_example" / "MuSE.vcf.gz")


@pytest.fixture
def tiny_paired_lofreq_snv_vcf(test_datadir: Path) -> str:
    return os.fspath(test_datadir / "paired_example" / "LoFreq.snv.vcf.gz")


@pytest.fixture
def tiny_paired_lofreq_indel_vcf(test_datadir: Path) -> str:
    return os.fspath(test_datadir / "paired_example" / "LoFreq.indel.vcf.gz")


@pytest.fixture
def tiny_paired_scalpel_vcf(test_datadir:
Path) -> str: 77 | return os.fspath(test_datadir / "paired_example" / "Scalpel.vcf.gz") 78 | 79 | 80 | @pytest.fixture 81 | def tiny_paired_strelka_snv_vcf(test_datadir: Path) -> str: 82 | return os.fspath(test_datadir / "paired_example" / "Strelka.snv.vcf.gz") 83 | 84 | 85 | @pytest.fixture 86 | def tiny_paired_strelka_indel_vcf(test_datadir: Path) -> str: 87 | return os.fspath(test_datadir / "paired_example" / "Strelka.indel.vcf.gz") 88 | 89 | 90 | @pytest.fixture 91 | def tiny_single_mutect2_vcf(test_datadir: Path) -> str: 92 | return os.fspath(test_datadir / "tumor_only_example" / "MuTect2.vcf.gz") 93 | 94 | 95 | @pytest.fixture 96 | def tiny_single_vardict_vcf(test_datadir: Path) -> str: 97 | return os.fspath(test_datadir / "tumor_only_example" / "VarDict.vcf.gz") 98 | 99 | 100 | @pytest.fixture 101 | def tiny_single_strelka_vcf(test_datadir: Path) -> str: 102 | return os.fspath(test_datadir / "tumor_only_example" / "Strelka.vcf.gz") 103 | 104 | 105 | @pytest.fixture(scope="session") 106 | def reference_output( 107 | test_datadir: Path, 108 | ) -> dict[str, str]: 109 | return { 110 | "paired_consensus_snv_vcf": str( 111 | test_datadir / "paired_example" / "Consensus.sSNV.vcf.gz" 112 | ), 113 | "paired_consensus_indel_vcf": str( 114 | test_datadir / "paired_example" / "Consensus.sINDEL.vcf.gz" 115 | ), 116 | "single_consensus_snv_vcf": str( 117 | test_datadir / "tumor_only_example" / "Consensus.sSNV.vcf.gz" 118 | ), 119 | "single_consensus_indel_vcf": str( 120 | test_datadir / "tumor_only_example" / "Consensus.sINDEL.vcf.gz" 121 | ), 122 | } 123 | -------------------------------------------------------------------------------- /tests/example/README.md: -------------------------------------------------------------------------------- 1 | # Examples with tiny unrealistic demo data after SomaticSeq is properly installed 2 | 3 | ## Run SomaticSeq for tumor-normal mode 4 | 5 | ``` 6 | /PATH/TO/somaticseq/tests/example/paired_somaticseq_example.sh 7 | ``` 8 | 9 | This 
example uses the outputs from MuTect2, VarDict, and Strelka2 as input; additional
callers are officially supported besides those three. If this command runs
successfully, the directory `paired_somaticseq` will be created in your current
directory, containing SomaticSeq TSV files, VCF files, and ada classifiers. Do
_not_ use those classifiers for anything other than demo purposes — they are
trained on this tiny, unrealistic data set and are useless for real analyses.

## Run SomaticSeq for tumor-only mode

Similar to the above:

```
cd example
/PATH/TO/somaticseq/tests/example/single_somaticseq_example.sh
```

The output directory will be `single_somaticseq`.

### To check the results of `paired_somaticseq_example.sh` and/or `single_somaticseq_example.sh`, run `results_check.sh`.

## Run dockerized workflow with MuTect2, VarDict, and Strelka2 in tumor-normal mode

If you are able to run docker, you may test the following workflow:

```
cd example
/PATH/TO/somaticseq/tests/example/invoke_dockerized_tumor_normal_callers.sh
```

Then, the following scripts will be created and executed:

```
paired_example/{1,2}/logs/mutect2.year.month.date.timestamp.cmd
paired_example/{1,2}/logs/strelka.year.month.date.timestamp.cmd
paired_example/{1,2}/logs/vardict.year.month.date.timestamp.cmd
paired_example/{1,2}/SomaticSeq/logs/somaticSeq.year.month.date.timestamp.cmd
paired_example/logs/mergeResults.year.month.date.timestamp.cmd
```

Directories 1 and 2 are created because the script invokes two parallel
processes (`-nt 2`). The caller scripts (i.e., mutect2, strelka, and vardict)
are executed first, in those two parallel processes; then the somaticSeq
scripts are executed; finally, the mergeResults script is executed.
#!/bin/bash
# Demo driver: generate and run the dockerized tumor-normal somatic-calling
# workflow on the tiny example data shipped with the repository.

# Absolute directory of this script, so the example inputs resolve no matter
# where the caller's working directory is.
MYDIR="$( cd "$( dirname "$0" )" && pwd )"

mkdir -p paired_example

# Run all seven supported callers, then SomaticSeq in training mode against
# the bundled truth VCFs, with two parallel processes, executing immediately.
somaticseq_make_somatic_scripts \
    paired \
    --output-directory $(pwd -P)/paired_example \
    --tumor-bam ${MYDIR}/tumor.markdup.bam \
    --normal-bam ${MYDIR}/normal.markdup.bam \
    --genome-reference ${MYDIR}/tiny.fa \
    --truth-snv ${MYDIR}/Varsim.somatic.truth.vcf \
    --truth-indel ${MYDIR}/Varsim.somatic.truth.vcf \
    --dbsnp-vcf ${MYDIR}/tiny_dbsnp.vcf \
    --run-mutect2 --run-somaticsniper --run-vardict --run-muse --run-lofreq --run-scalpel --run-strelka2 \
    --run-somaticseq --train-somaticseq \
    --threads 2 --run-workflow
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/normal.markdup.bam -------------------------------------------------------------------------------- /tests/example/normal.markdup.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/normal.markdup.bam.bai -------------------------------------------------------------------------------- /tests/example/paired_example/Consensus.sINDEL.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/Consensus.sINDEL.vcf.gz -------------------------------------------------------------------------------- /tests/example/paired_example/Consensus.sSNV.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/Consensus.sSNV.vcf.gz -------------------------------------------------------------------------------- /tests/example/paired_example/LoFreq.indel.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/LoFreq.indel.vcf.gz -------------------------------------------------------------------------------- /tests/example/paired_example/LoFreq.snv.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/LoFreq.snv.vcf.gz 
-------------------------------------------------------------------------------- /tests/example/paired_example/MuSE.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/MuSE.vcf.gz -------------------------------------------------------------------------------- /tests/example/paired_example/MuTect2.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/MuTect2.vcf.gz -------------------------------------------------------------------------------- /tests/example/paired_example/Scalpel.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/Scalpel.vcf.gz -------------------------------------------------------------------------------- /tests/example/paired_example/SomaticSniper.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/SomaticSniper.vcf.gz -------------------------------------------------------------------------------- /tests/example/paired_example/Strelka.indel.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/Strelka.indel.vcf.gz -------------------------------------------------------------------------------- /tests/example/paired_example/Strelka.snv.vcf.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/Strelka.snv.vcf.gz -------------------------------------------------------------------------------- /tests/example/paired_example/VarDict.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/paired_example/VarDict.vcf.gz -------------------------------------------------------------------------------- /tests/example/paired_somaticseq_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 6 | VERSION=`head -n 1 ${MYDIR}/../../somaticseq/_version.py | awk -F "=" '{print $2}' | tr -d '[[:space:]]"'` 7 | 8 | somaticseq \ 9 | --somaticseq-train \ 10 | --algorithm xgboost \ 11 | --extra-hyperparameters scale_pos_weight:0.1 seed:100 \ 12 | --output-directory paired_somaticseq/training \ 13 | --genome-reference ${MYDIR}/tiny.fa \ 14 | --dbsnp-vcf ${MYDIR}/tiny_dbsnp.vcf \ 15 | --truth-snv ${MYDIR}/Varsim.somatic.truth.vcf \ 16 | --truth-indel ${MYDIR}/Varsim.somatic.truth.vcf \ 17 | --threads 3 \ 18 | paired \ 19 | --tumor-bam-file ${MYDIR}/tumor.markdup.bam \ 20 | --normal-bam-file ${MYDIR}/normal.markdup.bam \ 21 | --mutect2-vcf ${MYDIR}/paired_example/MuTect2.vcf.gz \ 22 | --somaticsniper-vcf ${MYDIR}/paired_example/SomaticSniper.vcf.gz \ 23 | --vardict-vcf ${MYDIR}/paired_example/VarDict.vcf.gz \ 24 | --muse-vcf ${MYDIR}/paired_example/MuSE.vcf.gz \ 25 | --lofreq-snv ${MYDIR}/paired_example/LoFreq.snv.vcf.gz \ 26 | --lofreq-indel ${MYDIR}/paired_example/LoFreq.indel.vcf.gz \ 27 | --scalpel-vcf ${MYDIR}/paired_example/Scalpel.vcf.gz \ 28 | --strelka-snv ${MYDIR}/paired_example/Strelka.snv.vcf.gz \ 29 | --strelka-indel ${MYDIR}/paired_example/Strelka.indel.vcf.gz 30 | 31 | 32 | somaticseq 
\ 33 | --algorithm xgboost \ 34 | --classifier-snv paired_somaticseq/training/Ensemble.sSNV.tsv.xgb.v${VERSION}.classifier \ 35 | --classifier-indel paired_somaticseq/training/Ensemble.sINDEL.tsv.xgb.v${VERSION}.classifier \ 36 | --output-directory paired_somaticseq/classification \ 37 | --genome-reference ${MYDIR}/tiny.fa \ 38 | --dbsnp-vcf ${MYDIR}/tiny_dbsnp.vcf \ 39 | --threads 3 \ 40 | paired \ 41 | --tumor-bam-file ${MYDIR}/tumor.markdup.bam \ 42 | --normal-bam-file ${MYDIR}/normal.markdup.bam \ 43 | --mutect2-vcf ${MYDIR}/paired_example/MuTect2.vcf.gz \ 44 | --somaticsniper-vcf ${MYDIR}/paired_example/SomaticSniper.vcf.gz \ 45 | --vardict-vcf ${MYDIR}/paired_example/VarDict.vcf.gz \ 46 | --muse-vcf ${MYDIR}/paired_example/MuSE.vcf.gz \ 47 | --lofreq-snv ${MYDIR}/paired_example/LoFreq.snv.vcf.gz \ 48 | --lofreq-indel ${MYDIR}/paired_example/LoFreq.indel.vcf.gz \ 49 | --scalpel-vcf ${MYDIR}/paired_example/Scalpel.vcf.gz \ 50 | --strelka-snv ${MYDIR}/paired_example/Strelka.snv.vcf.gz \ 51 | --strelka-indel ${MYDIR}/paired_example/Strelka.indel.vcf.gz 52 | -------------------------------------------------------------------------------- /tests/example/results_check.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | mkdir -p result_check 4 | 5 | ### Split snv and indel from the ground truth file 6 | split_vcf.py -infile Varsim.somatic.truth.vcf -snv result_check/true.snv.vcf -indel result_check/true.indel.vcf 7 | 8 | 9 | ### Check the results for paired somaticseq with the ground truth 10 | if [[ -r paired_somaticseq/Consensus.sSNV.vcf && -r paired_somaticseq/Consensus.sINDEL.vcf ]] 11 | then 12 | true_snv_positives=`cat paired_somaticseq/Consensus.sSNV.vcf | egrep -wf <(cat result_check/true.snv.vcf | egrep -v '^#' | awk -F '\t' '{print $1"\t"$2}') | wc -l` 13 | 14 | true_indel_positives=`cat paired_somaticseq/Consensus.sINDEL.vcf | egrep -wf <(cat result_check/true.indel.vcf | egrep -v '^#' | awk 
#!/usr/bin/env bash
# Compare the SomaticSeq demo outputs against the Varsim ground truth and
# report how many true SNVs/indels were recovered in the paired and the
# single-sample (tumor-only) runs.

mkdir -p result_check

### Split snv and indel from the ground truth file
split_vcf.py -infile Varsim.somatic.truth.vcf -snv result_check/true.snv.vcf -indel result_check/true.indel.vcf


### Check the results for paired somaticseq with the ground truth
if [[ -r paired_somaticseq/Consensus.sSNV.vcf && -r paired_somaticseq/Consensus.sINDEL.vcf ]]
then
    # Count consensus calls whose CHROM and POS match a true SNV position.
    # The <(...) process substitution turns the truth file's CHROM\tPOS pairs
    # into a pattern file for egrep -wf.
    true_snv_positives=$(cat paired_somaticseq/Consensus.sSNV.vcf | egrep -wf <(cat result_check/true.snv.vcf | egrep -v '^#' | awk -F '\t' '{print $1"\t"$2}') | wc -l)

    true_indel_positives=$(cat paired_somaticseq/Consensus.sINDEL.vcf | egrep -wf <(cat result_check/true.indel.vcf | egrep -v '^#' | awk -F '\t' '{print $1"\t"$2}') | wc -l)

    # BUG FIX in message text: "There were" -> "They were"; "nont" -> "none".
    echo -e "For paired SomaticSeq run, out of a total of 73 true SNVs in ground truth, ${true_snv_positives} were collected by SomaticSeq. In our own testing, the number was 70. Did you get identical results?
The 3 true SNVs not collected by SomaticSeq were 1:14062, 1:24700, and 1:223356. They were in the VarDict call set, but none was considered Somatic or LikelySomatic. Two of them were in the Strelka2 call set, but none was considered a PASS. They were not in the MuTect2 call set. Hence, they were not included in the SomaticSeq output.
Out of a total of 51 true indels in ground truth, ${true_indel_positives} were collected by SomaticSeq. In our own testing, the number was 51. Did you get identical results?\n"

else
    echo 'You did not run paired_somaticseq_example.sh'

fi


### Check the results for single (e.g., tumor-only) somaticseq with the ground truth
if [[ -r single_somaticseq/Consensus.sSNV.vcf && -r single_somaticseq/Consensus.sINDEL.vcf ]]
then
    true_single_snv_positives=$(cat single_somaticseq/Consensus.sSNV.vcf | egrep -wf <(cat result_check/true.snv.vcf | egrep -v '^#' | awk -F '\t' '{print $1"\t"$2}') | wc -l)

    true_single_indel_positives=$(cat single_somaticseq/Consensus.sINDEL.vcf | egrep -wf <(cat result_check/true.indel.vcf | egrep -v '^#' | awk -F '\t' '{print $1"\t"$2}') | wc -l)

    echo -e "For single-sample SomaticSeq run, out of a total of 73 true SNVs in ground truth, ${true_single_snv_positives} were collected by SomaticSeq. In our own testing, the number was 73. Did you get identical results?
Out of a total of 51 true indels in ground truth, ${true_single_indel_positives} were collected by SomaticSeq. In our own testing, the number was 51. Did you get identical results?"

else
    echo 'You did not run single_somaticseq_example.sh'

fi
35 | 36 | else 37 | echo 'You did not run single_somaticseq_example.sh' 38 | 39 | fi 40 | -------------------------------------------------------------------------------- /tests/example/single_somaticseq_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | MYDIR="$( cd "$( dirname "$0" )" && pwd )" 6 | VERSION=`head -n 1 ${MYDIR}/../../somaticseq/_version.py | awk -F "=" '{print $2}' | tr -d '[[:space:]]"'` 7 | 8 | somaticseq \ 9 | --somaticseq-train \ 10 | --algorithm xgboost \ 11 | --extra-hyperparameters scale_pos_weight:0.1 seed:100 \ 12 | --output-directory single_somaticseq/training \ 13 | --genome-reference ${MYDIR}/tiny.fa \ 14 | --dbsnp-vcf ${MYDIR}/tiny_dbsnp.vcf \ 15 | --truth-snv ${MYDIR}/Varsim.somatic.truth.vcf \ 16 | --truth-indel ${MYDIR}/Varsim.somatic.truth.vcf \ 17 | --threads 3 \ 18 | single \ 19 | --bam-file ${MYDIR}/tumor.markdup.bam \ 20 | --mutect2-vcf ${MYDIR}/tumor_only_example/MuTect2.vcf.gz \ 21 | --vardict-vcf ${MYDIR}/tumor_only_example/VarDict.vcf.gz \ 22 | --strelka-vcf ${MYDIR}/tumor_only_example/Strelka.vcf.gz 23 | 24 | 25 | somaticseq \ 26 | --algorithm xgboost \ 27 | --classifier-snv single_somaticseq/training/Ensemble.sSNV.tsv.xgb.v${VERSION}.classifier \ 28 | --classifier-indel single_somaticseq/training/Ensemble.sINDEL.tsv.xgb.v${VERSION}.classifier \ 29 | --output-directory single_somaticseq/classification \ 30 | --genome-reference ${MYDIR}/tiny.fa \ 31 | --dbsnp-vcf ${MYDIR}/tiny_dbsnp.vcf \ 32 | --threads 3 \ 33 | single \ 34 | --bam-file ${MYDIR}/tumor.markdup.bam \ 35 | --mutect2-vcf ${MYDIR}/tumor_only_example/MuTect2.vcf.gz \ 36 | --vardict-vcf ${MYDIR}/tumor_only_example/VarDict.vcf.gz \ 37 | --strelka-vcf ${MYDIR}/tumor_only_example/Strelka.vcf.gz 38 | -------------------------------------------------------------------------------- /tests/example/tiny.dict: -------------------------------------------------------------------------------- 1 | 
@HD VN:1.5 2 | @SQ SN:1 LN:300000 M5:08f4e39926679c06ed248cf2f51be5d1 UR:file:/sc1/groups/bfx-red/analysis/datainsights/projects/Somatic_Benchmarks/smallDemo/tiny.fa 3 | -------------------------------------------------------------------------------- /tests/example/tiny.fa.fai: -------------------------------------------------------------------------------- 1 | 1 300000 3 300000 300001 2 | -------------------------------------------------------------------------------- /tests/example/tiny_dbsnp.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##phasing=none 3 | ##INDIVIDUAL=TRUTH 4 | ##SAMPLE= 5 | ##INFO= 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##ALT= 14 | ##ALT= 15 | ##ALT= 16 | ##ALT= 17 | ##FORMAT= 18 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SPIKEIN 19 | 1 14062 rs000 T A 100 PASS SOMATIC;VAF=0.333333333333;DPR=5.0 GT 0/1 20 | 1 24700 rs001 T C 100 PASS SOMATIC;VAF=0.333333333333;DPR=6.0 GT 0/1 21 | 1 223356 rs002 T C 100 PASS SOMATIC;VAF=0.4;DPR=5.0 GT 0/1 22 | -------------------------------------------------------------------------------- /tests/example/tiny_dbsnp.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tiny_dbsnp.vcf.gz -------------------------------------------------------------------------------- /tests/example/tiny_dbsnp.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tiny_dbsnp.vcf.gz.tbi -------------------------------------------------------------------------------- /tests/example/tumor.markdup.bam: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tumor.markdup.bam -------------------------------------------------------------------------------- /tests/example/tumor.markdup.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tumor.markdup.bam.bai -------------------------------------------------------------------------------- /tests/example/tumor_only_example/Consensus.sINDEL.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tumor_only_example/Consensus.sINDEL.vcf.gz -------------------------------------------------------------------------------- /tests/example/tumor_only_example/Consensus.sSNV.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tumor_only_example/Consensus.sSNV.vcf.gz -------------------------------------------------------------------------------- /tests/example/tumor_only_example/MuTect2.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tumor_only_example/MuTect2.vcf.gz -------------------------------------------------------------------------------- /tests/example/tumor_only_example/Strelka.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/somaticseq/9b1d47b091ec6875afb3912a890d3bf77eb128da/tests/example/tumor_only_example/Strelka.vcf.gz -------------------------------------------------------------------------------- 
import pysam

from somaticseq.genomic_file_parsers.read_info_extractor import (
    AlignmentType,
    get_alignment_in_read,
)


def test_get_alignment() -> None:
    """
    Exercise `get_alignment_in_read` over every reference coordinate touched
    by a read with CIGAR 100M 10D 10M 10I 10M 5I 5M 5D 5M 5S, aligned at
    0-based reference position 100 (reference span: coordinates 100-244).

    Expected calls per coordinate:
      100-195          match, far (> 3 bp) from any indel
      196-198          match, within 3 bp of the 10D anchored at 199
      199              deletion (-10)
      200-209          unknown (inside the deleted reference segment)
      210-212, 216-218 match, within 3 bp of neighboring indels
      213-215, 223-225 match, far from any indel
      219, 229         insertions (+10, +5)
      234              deletion (-5)
      < 100 or >= 245  outside the alignment: no call
    """
    read_dict = {
        "name": "query_name",
        "flag": "97",  # top strand read1
        "ref_name": "chr1",
        "ref_pos": "101",  # 1-based coordinate
        "map_quality": "60",
        "cigar": "100M10D10M10I10M5I5M5D5M5S",
        "next_ref_name": "=",
        "next_ref_pos": "251",
        "length": "300",  # template_length
        "seq": "A" * 150,
        "qual": "J" * 150,
    }
    header_dict = {"SQ": [{"LN": 1, "SN": contig} for contig in ["chr1", "chr2"]]}
    header = pysam.AlignmentHeader.from_dict(header_dict)
    read = pysam.AlignedSegment.from_dict(read_dict, header)
    # matches that are more than 3 bps from the nearest indel
    simple_matches = set(list(range(100, 195 + 1)) + [213, 214, 215, 223, 224, 225])
    for coordinate in range(300):
        seq_call = get_alignment_in_read(read, coordinate)
        if coordinate in simple_matches:
            assert seq_call.call_type == AlignmentType.match
            assert seq_call.nearest_indel == float("inf")
        elif coordinate in (196, 197, 198):
            assert seq_call.call_type == AlignmentType.match
            assert seq_call.nearest_indel == 199 - coordinate
        elif coordinate == 199:
            assert seq_call.call_type == AlignmentType.deletion
            assert seq_call.indel_length == -10
            assert seq_call.nearest_indel == float("inf")
        elif coordinate in range(200, 210):
            assert seq_call.call_type == AlignmentType.unknown
        elif coordinate in (210, 211, 212):
            assert seq_call.call_type == AlignmentType.match
            assert seq_call.nearest_indel == coordinate - 209
        elif coordinate in (216, 217, 218):
            assert seq_call.call_type == AlignmentType.match
            assert seq_call.nearest_indel == 219 - coordinate
        elif coordinate == 219:
            assert seq_call.call_type == AlignmentType.insertion
            assert seq_call.indel_length == 10
            assert seq_call.nearest_indel == float("inf")
        elif coordinate == 229:
            assert seq_call.call_type == AlignmentType.insertion
            assert seq_call.indel_length == 5
            assert seq_call.nearest_indel == float("inf")
        elif coordinate == 234:
            assert seq_call.call_type == AlignmentType.deletion
            assert seq_call.indel_length == -5
            assert seq_call.nearest_indel == float("inf")
        elif coordinate < 100 or coordinate >= 245:
            # BUG FIX: the original condition `100 > coordinate >= 245` is a
            # chained comparison that can never be true (a value cannot be
            # both < 100 and >= 245), so coordinates outside the alignment
            # were never actually asserted. The intent — asserting no call
            # for coordinates before the alignment start or past its
            # reference end — is restored here.
            assert seq_call.call_type is None
@pytest.mark.parametrize(
    "expected_inlines,expected_outlines",
    [
        # Case 1: one input region, split evenly into 3 output files
        (
            ["chr1\t0\t100\n", ""],
            [["chr1\t0\t34\n"], ["chr1\t34\t68\n"], ["chr1\t68\t100\n"]],
        ),
        # Case 2: two input contigs; the third output file spans the
        # chr1/chr2 boundary
        (
            ["chr1\t0\t90\n", "chr2\t0\t10\n", ""],
            [
                ["chr1\t0\t34\n"],
                ["chr1\t34\t68\n"],
                ["chr1\t68\t90\n", "chr2\t0\t10\n"],
            ],
        ),
    ],
)
def test_split(
    expected_inlines: list[str],
    expected_outlines: list[list[str]],
    mocker: MockerFixture,
    tmp_path_factory: TempPathFactory,
) -> None:
    """
    Test case where input bed file is split into 3 output bed files of equal
    region lengths
    """

    def _mock_reader(expected_inlines: list[str]) -> Generator:
        # Yield one line per readline() call; the trailing "" in
        # expected_inlines mimics readline()'s empty-string EOF signal.
        yield from expected_inlines

    # Mock 1 input bed file and 3 output bed files
    mock_reader = mocker.MagicMock()
    mock_writer_1 = mocker.MagicMock()
    mock_writer_2 = mocker.MagicMock()
    mock_writer_3 = mocker.MagicMock()

    # Mocking context manager
    mock_reader.__enter__.return_value = mock_reader
    mock_writer_1.__enter__.return_value = mock_writer_1
    mock_writer_2.__enter__.return_value = mock_writer_2
    mock_writer_3.__enter__.return_value = mock_writer_3

    # Define the exit methods to do nothing
    mock_reader.__exit__.return_value = False
    mock_writer_1.__exit__.return_value = False
    mock_writer_2.__exit__.return_value = False
    mock_writer_3.__exit__.return_value = False

    mock_reader.readline.side_effect = _mock_reader(expected_inlines)

    def _mock_open(file: str, mode: str = "r") -> MagicMock:
        # Route each expected output path to its own writer mock; any other
        # path (i.e., the input bed) gets the reader mock.
        if "1.x.bed" in file:
            return mock_writer_1
        elif "2.x.bed" in file:
            return mock_writer_2
        elif "3.x.bed" in file:
            return mock_writer_3
        else:
            return mock_reader

    mocker.patch("builtins.open", side_effect=_mock_open)
    outdir = tmp_path_factory.mktemp("split_bed")
    out_files = split(
        infile="region.bed", outfiles=os.path.join(outdir, "x.bed"), num=3
    )
    # split() should return the 3 output paths, numbered 1.x.bed .. 3.x.bed
    assert out_files == [os.path.join(outdir, f"{i}.x.bed") for i in (1, 2, 3)]

    # Each writer must receive exactly the expected regions and nothing else
    # (call_count guards against extra writes).
    assert mock_writer_1.write.call_count == len(expected_outlines[0])
    for line in expected_outlines[0]:
        mock_writer_1.write.assert_any_call(line)

    assert mock_writer_2.write.call_count == len(expected_outlines[1])
    for line in expected_outlines[1]:
        mock_writer_2.write.assert_any_call(line)

    assert mock_writer_3.write.call_count == len(expected_outlines[2])
    for line in expected_outlines[2]:
        mock_writer_3.write.assert_any_call(line)
import pytest
from _pytest.tmpdir import TempPathFactory

import somaticseq.vcf_modifier.bed_util as bed_util


@pytest.fixture
def dummy_vcf(tmp_path_factory: TempPathFactory) -> str:
    """Write a VCF with one G>T SNV at each of chr1:1-99; return its path."""
    vcf_path = str(tmp_path_factory.mktemp("vcf") / "dummy.vcf")
    header_lines = [
        "##fileformat=VCFv4.1\n",
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
    ]
    records = [f"chr1\t{pos}\tid_{pos}\tG\tT\t.\t.\t.\n" for pos in range(1, 100)]
    with open(vcf_path, "w") as out:
        out.writelines(header_lines + records)
    return vcf_path


@pytest.fixture
def inclusion_bed(tmp_path_factory: TempPathFactory) -> str:
    """Two inclusion regions on chr1: [20, 40) and [60, 80)."""
    bed_path = str(tmp_path_factory.mktemp("bed") / "inclusion.bed")
    with open(bed_path, "w") as out:
        out.writelines(["chr1\t20\t40\n", "chr1\t60\t80\n"])
    return bed_path


@pytest.fixture
def exclusion_bed(tmp_path_factory: TempPathFactory) -> str:
    """One exclusion region on chr1: [30, 70)."""
    bed_path = str(tmp_path_factory.mktemp("bed") / "exclusion.bed")
    with open(bed_path, "w") as out:
        out.write("chr1\t30\t70\n")
    return bed_path


def test_bed_intersector(
    dummy_vcf: str,
    inclusion_bed: str,
    exclusion_bed: str,
    tmp_path_factory: TempPathFactory,
) -> None:
    """
    Inclusion minus exclusion leaves bed regions [20, 30) and [70, 80), so
    the surviving (1-based) VCF positions should be 21-30 and 71-80.
    """
    out_vcf = str(tmp_path_factory.mktemp("test") / "x.vcf")
    result = bed_util.bed_intersector(dummy_vcf, out_vcf, inclusion_bed, exclusion_bed)
    positions = []
    with open(result) as vcf_in:
        for record_line in vcf_in:
            if record_line.startswith("#"):
                continue
            fields = record_line.split("\t")
            assert fields[0] == "chr1"
            positions.append(int(fields[1]))

    assert result == out_vcf
    assert positions == list(range(21, 31)) + list(range(71, 81))
import pytest
from _pytest.tmpdir import TempPathFactory

from somaticseq.genomic_file_parsers.genomic_file_handlers import (
    VCFVariantRecord,
)
from somaticseq.vcf_modifier.split_vcf import split_into_snv_and_indel

COMPLEX_VCF = [
    ["1", "10", ".", "A", "C", "10\t.\t.\tGT\t0/0\t0/1"],  # snv
    ["1", "10", ".", "ATGAG", "A", "10\t.\t.\tGT\t0/0\t0/1"],  # deletion
    ["1", "11", ".", "T", "A,TC", "10\t.\t.\tGT\t0/0\t0/1"],  # snv and insertion
    ["1", "12", ".", "GAGGTCAGGA", "AAAA", "10\t.\t.\tGT\t0/0\t0/1"],  # complex
    ["1", "14", ".", "GGTC", "AAAAAA", "10\t.\t.\tGT\t0/0\t0/1"],  # complex
]


@pytest.fixture
def complex_vcf(tmp_path_factory: TempPathFactory) -> str:
    """Write the COMPLEX_VCF records to a temp VCF and return its path."""
    vcf_path = str(tmp_path_factory.mktemp("vcf") / "complex_variants.vcf")
    record_lines = ["\t".join(record) + "\n" for record in COMPLEX_VCF]
    with open(vcf_path, "w") as out:
        out.writelines(record_lines)
    return vcf_path


def _collect_variants(vcf_file: str) -> list:
    """Parse a VCF and return [chromosome, position, ref, alt] per record."""
    collected = []
    with open(vcf_file) as handle:
        for record_line in handle:
            record = VCFVariantRecord.from_vcf_line(record_line)
            collected.append(
                [record.chromosome, record.position, record.refbase, record.altbase]
            )
    return collected


def test_split_into_snv_and_indel(
    complex_vcf: str, tiny_fasta: str, tmp_path_factory: TempPathFactory
) -> None:
    """
    Complex/multi-allelic records should be decomposed into atomic SNVs and
    left-anchored indels split across the two output files.
    """
    outdir = tmp_path_factory.mktemp("test")
    out_snv = str(outdir / "snv.vcf")
    out_indel = str(outdir / "indel.vcf")
    split_into_snv_and_indel(
        infile=complex_vcf,
        out_snv_vcf=out_snv,
        out_indel_vcf=out_indel,
        genome_reference=tiny_fasta,
    )

    assert _collect_variants(out_snv) == [
        ["1", 10, "A", "C"],
        ["1", 11, "T", "A"],
        ["1", 12, "G", "A"],
        ["1", 14, "G", "A"],
        ["1", 15, "G", "A"],
        ["1", 16, "T", "A"],
        ["1", 17, "C", "A"],
    ]

    assert _collect_variants(out_indel) == [
        ["1", 10, "ATGAG", "A"],
        ["1", 11, "T", "TC"],
        ["1", 15, "GTCAGGA", "G"],
        ["1", 17, "C", "CAA"],
    ]