├── data ├── scheme.jpg ├── header.txt ├── CommonAdapters.fa ├── remapCOSMIC.txt └── remapNCBI.txt ├── nextflow.config ├── Dockerfile ├── README.md ├── setup.nf └── main.nf /data/scheme.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elyadlezmi/RNA2CM/HEAD/data/scheme.jpg -------------------------------------------------------------------------------- /data/header.txt: -------------------------------------------------------------------------------- 1 | ##INFO= 2 | ##INFO= 3 | ##INFO== 1% and for which 2 or more founders contribute to that minor allele frequency."> 4 | ##INFO= 5 | 6 | -------------------------------------------------------------------------------- /nextflow.config: --------------------------------------------------------------------------------

// Pipeline parameters; each can be overridden on the command line as --<name> <value>.
// Parameter names are case-sensitive.
params {

    cpu = 8                // thread count passed to the multi-threaded tools
    readlength = '100'     // read length used by setup.nf when building the STAR indices

    fastq = ''             // reads (single-end) or first mate (paired-end)
    fastq2 = false         // second mate; main.nf runs the single-end branch while this is false
    prefix = 'Sample'      // prefix of the output file names
    keepInter = true       // main.nf publishes the filtered BAM into the launch directory when true
    filterMouse = true     // main.nf aligns to the mouse genome and runs XenofilteR when true
}


profiles {

    // Local workstation: every process runs inside the Docker image.
    standard {

        process {

            executor = 'local'
            container = 'elyadl/rna2cm'
            memory = '4GB'              // default; individual processes declare larger values
            errorStrategy = 'retry'
            shell = ['/bin/bash', '-euo', 'pipefail']
        }

        docker.enabled = true
    }

    // SLURM cluster: the same image is run through Singularity.
    cluster {

        process {

            executor = 'slurm'
            beforeScript = "module load singularity"
            container = 'docker://elyadl/rna2cm'
            memory = '4GB'              // default; individual processes declare larger values
            time = '12h'
            errorStrategy = 'retry'
            shell = ['/bin/bash', '-euo', 'pipefail']
        }

        singularity {

            enabled = true
            // bind the launch directory and the reference data into the container
            runOptions = "-B $launchDir -B $projectDir/data"
            cacheDir = "$launchDir"
        }
    }
}

// remove the work directory after a successful run
cleanup = true

-------------------------------------------------------------------------------- /Dockerfile: --------------------------------------------------------------------------------
# Base image: Ubuntu 20.04 (focal)
FROM ubuntu:focal

# File Author / Maintainer
# (LABEL replaces the deprecated MAINTAINER instruction)
LABEL maintainer="Elyad Lezmi"

ENV DEBIAN_FRONTEND=noninteractive

# The pipeline scripts call /Trimmomatic-0.39/trimmomatic-0.39.jar and /gatk-4.1.3.0/gatk
# by absolute path, so everything below must be unpacked in the filesystem root.
WORKDIR /

# System packages; the apt lists are removed in the same layer so they do not end up in the image.
RUN apt-get update && apt-get install --yes --no-install-recommends \
    build-essential \
    unzip \
    wget \
    samtools \
    tabix \
    r-base \
    libbz2-dev \
    zlib1g-dev \
    liblzma-dev \
    libcurl4-openssl-dev \
    libssl-dev \
    libxml2-dev \
    rna-star \
    bcftools \
    python \
    python3-pip \
    openjdk-8-jdk \
 && rm -rf /var/lib/apt/lists/*

RUN pip3 install --no-cache-dir pandas

# Bioconductor dependencies of XenofilteR
RUN R -q -e "install.packages('BiocManager', repos='https://cran.r-project.org')"
RUN R -q -e 'BiocManager::install(c("Rsamtools", "GenomicAlignments", "BiocParallel", "futile.logger"))'

# Download, install and delete each archive within a single layer: a file removed by a
# later RUN instruction still occupies space in the layer that created it.
RUN wget https://github.com/PeeperLab/XenofilteR/releases/download/v1.6/XenofilteR_1.6.tar.gz \
 && R CMD INSTALL XenofilteR_1.6.tar.gz \
 && rm XenofilteR_1.6.tar.gz

RUN wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.39.zip \
 && unzip Trimmomatic-0.39.zip \
 && rm Trimmomatic-0.39.zip

RUN wget https://github.com/broadinstitute/gatk/releases/download/4.1.3.0/gatk-4.1.3.0.zip \
 && unzip gatk-4.1.3.0.zip \
 && rm gatk-4.1.3.0.zip

-------------------------------------------------------------------------------- /data/CommonAdapters.fa: -------------------------------------------------------------------------------- 1 | >TruSeq3_IndexedAdapter 2 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC 3 | >TruSeq3_UniversalAdapter 4 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA 5 | >PrefixNX/1 6 | AGATGTGTATAAGAGACAG 7 | >PrefixNX/2 8 | AGATGTGTATAAGAGACAG 9 | >Trans1 10 | TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG 11 | >Trans1_rc 12 | CTGTCTCTTATACACATCTGACGCTGCCGACGA 13 | >Trans2 14 | GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG 15 | >Trans2_rc 16 | CTGTCTCTTATACACATCTCCGAGCCCACGAGAC 17 | >PrefixPE/1 18 | AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 19 | >PrefixPE/2 20 | CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT 21 | 
>PCR_Primer1 22 | AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 23 | >PCR_Primer1_rc 24 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT 25 | >PCR_Primer2 26 | CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT 27 | >PCR_Primer2_rc 28 | AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG 29 | >FlowCell1 30 | TTTTTTTTTTAATGATACGGCGACCACCGAGATCTACAC 31 | >FlowCell2 32 | TTTTTTTTTTCAAGCAGAAGACGGCATACGA 33 | >TruSeq2_SE 34 | AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG 35 | >TruSeq2_PE_f 36 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT 37 | >TruSeq2_PE_r 38 | AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG 39 | >PrefixPE/1 40 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT 41 | >PrefixPE/2 42 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 43 | >PrefixPE/1 44 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT 45 | >PrefixPE/2 46 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 47 | >PE1 48 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT 49 | >PE1_rc 50 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA 51 | >PE2 52 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 53 | >PE2_rc 54 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RNA2CM 2 | 3 | RNA2CM is a tool for the identification of cancer-related mutations from RNA-seq data 4 | 5 | ![](./data/scheme.jpg) 6 | ## Pre-requisites 7 | 8 | Nextflow and Docker (or Singularity for execution on SLURM-clusters) are the only pre-requisites for the RNA2CM tool. Install both if needed and make sure they are properly running on your system. If the following commands do not generate any error message you are good to go. 9 | ```bash 10 | nextflow run hello # test that nextflow is working 11 | docker run hello-world # test that docker is working if you are working on a local workstation 12 | singularity help # test that singularity is working in case your are working on a SLURM cluster 13 | ``` 14 | 15 | ## Installation 16 | 17 | 1. 
Download the project directory: 18 | ```bash 19 | git clone https://github.com/elyadlezmi/RNA2CM.git # clone the project using git 20 | ``` 21 | 22 | 2. Download the files CosmicMutantExportCensus.tsv.gz and CosmicCodingMuts.vcf.gz from the COSMIC website (https://cancer.sanger.ac.uk/cosmic/download, login required), then move both files into the project’s subdirectory named data (RNA2CM/data). 23 | 24 | 3. Execute the script named setup.nf which is responsible for setting up all the reference data and will complete the installation (this might take a while). 25 | ```bash 26 | nextflow run /path/to/RNA2CM/setup.nf # run the installation script 27 | ``` 28 | The setup.nf script can take three optional arguments: 29 | 30 | -profile: Choose the executor profile between a standard dockerized usage on a local workstation or usage on a SLURM cluster (requires Singularity instead of Docker) (standard/cluster, default: standard). 31 | 32 | --cpu: The number of threads for multi-threading (int, default 8). 33 | 34 | --readlength: The expected Illumina read length for optimal alignment by STAR (int, default 100). Note that parameter names are case-sensitive. 35 | 36 | ## Usage 37 | 38 | ```bash 39 | nextflow run /path/to/RNA2CM --fastq your_sample.fastq.gz # for single-end reads 40 | nextflow run /path/to/RNA2CM --fastq your_sample_1.fastq.gz --fastq2 your_sample_2.fastq.gz # for paired-ends reads 41 | ``` 42 | Optional arguments (note that the only required arguments are the RNA-seq reads; output is generated into the working directory): 43 | 44 | -profile: (standard/cluster, default: standard). 45 | 46 | --cpu: (int, default 8). 47 | 48 | --prefix: Output files have standard names but a custom prefix can be added (str, default: Sample). 49 | 50 | --keepInter: Whether to keep intermediate alignment and VCF files (true/false, default: true). 51 | 52 | --filterMouse: Whether to perform mouse contamination cleanup (true/false, default true). 
53 | 54 | Example for a paired-ends RNA-seq run, using 4 CPUs, keeping intermediate files: 55 | ```bash 56 | nextflow run /path/to/RNA2CM --fastq esc_1.fastq.gz --fastq2 esc_2.fastq.gz --cpu 4 --keepInter true 57 | ``` 58 | 59 | Example for a single-end RNA-seq run, skipping mouse read filtration and running on a SLURM cluster (nextflow manages batch jobs, so no need to use sbatch): 60 | ```bash 61 | nextflow run /path/to/RNA2CM --fastq SRR1234567.fastq.gz --filterMouse false -profile cluster 62 | ``` 63 | -------------------------------------------------------------------------------- /data/remapCOSMIC.txt: -------------------------------------------------------------------------------- 1 | 1 chr1 2 | 10 chr10 3 | 11 chr11 4 | 12 chr12 5 | 13 chr13 6 | 14 chr14 7 | 15 chr15 8 | 16 chr16 9 | 17 chr17 10 | 18 chr18 11 | 19 chr19 12 | 2 chr2 13 | 20 chr20 14 | 21 chr21 15 | 22 chr22 16 | 3 chr3 17 | 4 chr4 18 | 5 chr5 19 | 6 chr6 20 | 7 chr7 21 | 8 chr8 22 | 9 chr9 23 | MT chrM 24 | X chrX 25 | Y chrY 26 | GL000008.2 chr4_GL000008v2_random 27 | GL000009.2 chr14_GL000009v2_random 28 | GL000194.1 chr14_GL000194v1_random 29 | GL000195.1 chrUn_GL000195v1 30 | GL000205.2 chr17_GL000205v2_random 31 | GL000208.1 chr5_GL000208v1_random 32 | GL000213.1 chrUn_GL000213v1 33 | GL000214.1 chrUn_GL000214v1 34 | GL000216.2 chrUn_GL000216v2 35 | GL000218.1 chrUn_GL000218v1 36 | GL000219.1 chrUn_GL000219v1 37 | GL000220.1 chrUn_GL000220v1 38 | GL000221.1 chr3_GL000221v1_random 39 | GL000224.1 chrUn_GL000224v1 40 | GL000225.1 chr14_GL000225v1_random 41 | GL000226.1 chrUn_GL000226v1 42 | KI270302.1 chrUn_KI270302v1 43 | KI270303.1 chrUn_KI270303v1 44 | KI270304.1 chrUn_KI270304v1 45 | KI270305.1 chrUn_KI270305v1 46 | KI270310.1 chrUn_KI270310v1 47 | KI270311.1 chrUn_KI270311v1 48 | KI270312.1 chrUn_KI270312v1 49 | KI270315.1 chrUn_KI270315v1 50 | KI270316.1 chrUn_KI270316v1 51 | KI270317.1 chrUn_KI270317v1 52 | KI270320.1 chrUn_KI270320v1 53 | KI270322.1 chrUn_KI270322v1 54 | 
KI270329.1 chrUn_KI270329v1 55 | KI270330.1 chrUn_KI270330v1 56 | KI270333.1 chrUn_KI270333v1 57 | KI270334.1 chrUn_KI270334v1 58 | KI270335.1 chrUn_KI270335v1 59 | KI270336.1 chrUn_KI270336v1 60 | KI270337.1 chrUn_KI270337v1 61 | KI270338.1 chrUn_KI270338v1 62 | KI270340.1 chrUn_KI270340v1 63 | KI270362.1 chrUn_KI270362v1 64 | KI270363.1 chrUn_KI270363v1 65 | KI270364.1 chrUn_KI270364v1 66 | KI270366.1 chrUn_KI270366v1 67 | KI270371.1 chrUn_KI270371v1 68 | KI270372.1 chrUn_KI270372v1 69 | KI270373.1 chrUn_KI270373v1 70 | KI270374.1 chrUn_KI270374v1 71 | KI270375.1 chrUn_KI270375v1 72 | KI270376.1 chrUn_KI270376v1 73 | KI270378.1 chrUn_KI270378v1 74 | KI270379.1 chrUn_KI270379v1 75 | KI270381.1 chrUn_KI270381v1 76 | KI270382.1 chrUn_KI270382v1 77 | KI270383.1 chrUn_KI270383v1 78 | KI270384.1 chrUn_KI270384v1 79 | KI270385.1 chrUn_KI270385v1 80 | KI270386.1 chrUn_KI270386v1 81 | KI270387.1 chrUn_KI270387v1 82 | KI270388.1 chrUn_KI270388v1 83 | KI270389.1 chrUn_KI270389v1 84 | KI270390.1 chrUn_KI270390v1 85 | KI270391.1 chrUn_KI270391v1 86 | KI270392.1 chrUn_KI270392v1 87 | KI270393.1 chrUn_KI270393v1 88 | KI270394.1 chrUn_KI270394v1 89 | KI270395.1 chrUn_KI270395v1 90 | KI270396.1 chrUn_KI270396v1 91 | KI270411.1 chrUn_KI270411v1 92 | KI270412.1 chrUn_KI270412v1 93 | KI270414.1 chrUn_KI270414v1 94 | KI270417.1 chrUn_KI270417v1 95 | KI270418.1 chrUn_KI270418v1 96 | KI270419.1 chrUn_KI270419v1 97 | KI270420.1 chrUn_KI270420v1 98 | KI270422.1 chrUn_KI270422v1 99 | KI270423.1 chrUn_KI270423v1 100 | KI270424.1 chrUn_KI270424v1 101 | KI270425.1 chrUn_KI270425v1 102 | KI270429.1 chrUn_KI270429v1 103 | KI270435.1 chrUn_KI270435v1 104 | KI270438.1 chrUn_KI270438v1 105 | KI270442.1 chrUn_KI270442v1 106 | KI270448.1 chrUn_KI270448v1 107 | KI270465.1 chrUn_KI270465v1 108 | KI270466.1 chrUn_KI270466v1 109 | KI270467.1 chrUn_KI270467v1 110 | KI270468.1 chrUn_KI270468v1 111 | KI270507.1 chrUn_KI270507v1 112 | KI270508.1 chrUn_KI270508v1 113 | KI270509.1 chrUn_KI270509v1 114 | 
KI270510.1 chrUn_KI270510v1 115 | KI270511.1 chrUn_KI270511v1 116 | KI270512.1 chrUn_KI270512v1 117 | KI270515.1 chrUn_KI270515v1 118 | KI270516.1 chrUn_KI270516v1 119 | KI270517.1 chrUn_KI270517v1 120 | KI270518.1 chrUn_KI270518v1 121 | KI270519.1 chrUn_KI270519v1 122 | KI270521.1 chrUn_KI270521v1 123 | KI270522.1 chrUn_KI270522v1 124 | KI270528.1 chrUn_KI270528v1 125 | KI270529.1 chrUn_KI270529v1 126 | KI270530.1 chrUn_KI270530v1 127 | KI270538.1 chrUn_KI270538v1 128 | KI270539.1 chrUn_KI270539v1 129 | KI270544.1 chrUn_KI270544v1 130 | KI270548.1 chrUn_KI270548v1 131 | KI270579.1 chrUn_KI270579v1 132 | KI270580.1 chrUn_KI270580v1 133 | KI270581.1 chrUn_KI270581v1 134 | KI270582.1 chrUn_KI270582v1 135 | KI270583.1 chrUn_KI270583v1 136 | KI270584.1 chrUn_KI270584v1 137 | KI270587.1 chrUn_KI270587v1 138 | KI270588.1 chrUn_KI270588v1 139 | KI270589.1 chrUn_KI270589v1 140 | KI270590.1 chrUn_KI270590v1 141 | KI270591.1 chrUn_KI270591v1 142 | KI270593.1 chrUn_KI270593v1 143 | KI270706.1 chr1_KI270706v1_random 144 | KI270707.1 chr1_KI270707v1_random 145 | KI270708.1 chr1_KI270708v1_random 146 | KI270709.1 chr1_KI270709v1_random 147 | KI270710.1 chr1_KI270710v1_random 148 | KI270711.1 chr1_KI270711v1_random 149 | KI270712.1 chr1_KI270712v1_random 150 | KI270713.1 chr1_KI270713v1_random 151 | KI270714.1 chr1_KI270714v1_random 152 | KI270715.1 chr2_KI270715v1_random 153 | KI270716.1 chr2_KI270716v1_random 154 | KI270717.1 chr9_KI270717v1_random 155 | KI270718.1 chr9_KI270718v1_random 156 | KI270719.1 chr9_KI270719v1_random 157 | KI270720.1 chr9_KI270720v1_random 158 | KI270721.1 chr11_KI270721v1_random 159 | KI270722.1 chr14_KI270722v1_random 160 | KI270723.1 chr14_KI270723v1_random 161 | KI270724.1 chr14_KI270724v1_random 162 | KI270725.1 chr14_KI270725v1_random 163 | KI270726.1 chr14_KI270726v1_random 164 | KI270727.1 chr15_KI270727v1_random 165 | KI270728.1 chr16_KI270728v1_random 166 | KI270729.1 chr17_KI270729v1_random 167 | KI270730.1 chr17_KI270730v1_random 168 | 
KI270731.1 chr22_KI270731v1_random 169 | KI270732.1 chr22_KI270732v1_random 170 | KI270733.1 chr22_KI270733v1_random 171 | KI270734.1 chr22_KI270734v1_random 172 | KI270735.1 chr22_KI270735v1_random 173 | KI270736.1 chr22_KI270736v1_random 174 | KI270737.1 chr22_KI270737v1_random 175 | KI270738.1 chr22_KI270738v1_random 176 | KI270739.1 chr22_KI270739v1_random 177 | KI270740.1 chrY_KI270740v1_random 178 | KI270741.1 chrUn_KI270741v1 179 | KI270742.1 chrUn_KI270742v1 180 | KI270743.1 chrUn_KI270743v1 181 | KI270744.1 chrUn_KI270744v1 182 | KI270745.1 chrUn_KI270745v1 183 | KI270746.1 chrUn_KI270746v1 184 | KI270747.1 chrUn_KI270747v1 185 | KI270748.1 chrUn_KI270748v1 186 | KI270749.1 chrUn_KI270749v1 187 | KI270750.1 chrUn_KI270750v1 188 | KI270751.1 chrUn_KI270751v1 189 | KI270752.1 chrUn_KI270752v1 190 | KI270753.1 chrUn_KI270753v1 191 | KI270754.1 chrUn_KI270754v1 192 | KI270755.1 chrUn_KI270755v1 193 | KI270756.1 chrUn_KI270756v1 194 | KI270757.1 chrUn_KI270757v1 195 | -------------------------------------------------------------------------------- /data/remapNCBI.txt: -------------------------------------------------------------------------------- 1 | NC_000001.11 chr1 2 | NC_000002.12 chr2 3 | NC_000003.12 chr3 4 | NC_000004.12 chr4 5 | NC_000005.10 chr5 6 | NC_000006.12 chr6 7 | NC_000007.14 chr7 8 | NC_000008.11 chr8 9 | NC_000009.12 chr9 10 | NC_000010.11 chr10 11 | NC_000011.10 chr11 12 | NC_000012.12 chr12 13 | NC_000013.11 chr13 14 | NC_000014.9 chr14 15 | NC_000015.10 chr15 16 | NC_000016.10 chr16 17 | NC_000017.11 chr17 18 | NC_000018.10 chr18 19 | NC_000019.10 chr19 20 | NC_000020.11 chr20 21 | NC_000021.9 chr21 22 | NC_000022.11 chr22 23 | NC_000023.11 chrX 24 | NC_000024.10 chrY 25 | NT_187361.1 chr1_KI270706v1_random 26 | NT_187362.1 chr1_KI270707v1_random 27 | NT_187363.1 chr1_KI270708v1_random 28 | NT_187364.1 chr1_KI270709v1_random 29 | NT_187365.1 chr1_KI270710v1_random 30 | NT_187366.1 chr1_KI270711v1_random 31 | NT_187367.1 
chr1_KI270712v1_random 32 | NT_187368.1 chr1_KI270713v1_random 33 | NT_187369.1 chr1_KI270714v1_random 34 | NT_187370.1 chr2_KI270715v1_random 35 | NT_187371.1 chr2_KI270716v1_random 36 | NT_167215.1 chr3_GL000221v1_random 37 | NT_113793.3 chr4_GL000008v2_random 38 | NT_113948.1 chr5_GL000208v1_random 39 | NT_187372.1 chr9_KI270717v1_random 40 | NT_187373.1 chr9_KI270718v1_random 41 | NT_187374.1 chr9_KI270719v1_random 42 | NT_187375.1 chr9_KI270720v1_random 43 | NT_187376.1 chr11_KI270721v1_random 44 | NT_113796.3 chr14_GL000009v2_random 45 | NT_113888.1 chr14_GL000194v1_random 46 | NT_167219.1 chr14_GL000225v1_random 47 | NT_187377.1 chr14_KI270722v1_random 48 | NT_187378.1 chr14_KI270723v1_random 49 | NT_187379.1 chr14_KI270724v1_random 50 | NT_187380.1 chr14_KI270725v1_random 51 | NT_187381.1 chr14_KI270726v1_random 52 | NT_187382.1 chr15_KI270727v1_random 53 | NT_187383.1 chr16_KI270728v1_random 54 | NT_113930.2 chr17_GL000205v2_random 55 | NT_187384.1 chr17_KI270729v1_random 56 | NT_187385.1 chr17_KI270730v1_random 57 | NT_187386.1 chr22_KI270731v1_random 58 | NT_187387.1 chr22_KI270732v1_random 59 | NT_187388.1 chr22_KI270733v1_random 60 | NT_187389.1 chr22_KI270734v1_random 61 | NT_187390.1 chr22_KI270735v1_random 62 | NT_187391.1 chr22_KI270736v1_random 63 | NT_187392.1 chr22_KI270737v1_random 64 | NT_187393.1 chr22_KI270738v1_random 65 | NT_187394.1 chr22_KI270739v1_random 66 | NT_187395.1 chrY_KI270740v1_random 67 | NT_113901.1 chrUn_GL000195v1 68 | NT_167208.1 chrUn_GL000213v1 69 | NT_167209.1 chrUn_GL000214v1 70 | NT_167211.2 chrUn_GL000216v2 71 | NT_113889.1 chrUn_GL000218v1 72 | NT_167213.1 chrUn_GL000219v1 73 | NT_167214.1 chrUn_GL000220v1 74 | NT_167218.1 chrUn_GL000224v1 75 | NT_167220.1 chrUn_GL000226v1 76 | NT_187396.1 chrUn_KI270302v1 77 | NT_187398.1 chrUn_KI270303v1 78 | NT_187397.1 chrUn_KI270304v1 79 | NT_187399.1 chrUn_KI270305v1 80 | NT_187402.1 chrUn_KI270310v1 81 | NT_187406.1 chrUn_KI270311v1 82 | NT_187405.1 chrUn_KI270312v1 83 | 
NT_187404.1 chrUn_KI270315v1 84 | NT_187403.1 chrUn_KI270316v1 85 | NT_187407.1 chrUn_KI270317v1 86 | NT_187401.1 chrUn_KI270320v1 87 | NT_187400.1 chrUn_KI270322v1 88 | NT_187459.1 chrUn_KI270329v1 89 | NT_187458.1 chrUn_KI270330v1 90 | NT_187461.1 chrUn_KI270333v1 91 | NT_187460.1 chrUn_KI270334v1 92 | NT_187462.1 chrUn_KI270335v1 93 | NT_187465.1 chrUn_KI270336v1 94 | NT_187466.1 chrUn_KI270337v1 95 | NT_187463.1 chrUn_KI270338v1 96 | NT_187464.1 chrUn_KI270340v1 97 | NT_187469.1 chrUn_KI270362v1 98 | NT_187467.1 chrUn_KI270363v1 99 | NT_187468.1 chrUn_KI270364v1 100 | NT_187470.1 chrUn_KI270366v1 101 | NT_187494.1 chrUn_KI270371v1 102 | NT_187491.1 chrUn_KI270372v1 103 | NT_187492.1 chrUn_KI270373v1 104 | NT_187490.1 chrUn_KI270374v1 105 | NT_187493.1 chrUn_KI270375v1 106 | NT_187489.1 chrUn_KI270376v1 107 | NT_187471.1 chrUn_KI270378v1 108 | NT_187472.1 chrUn_KI270379v1 109 | NT_187486.1 chrUn_KI270381v1 110 | NT_187488.1 chrUn_KI270382v1 111 | NT_187482.1 chrUn_KI270383v1 112 | NT_187484.1 chrUn_KI270384v1 113 | NT_187487.1 chrUn_KI270385v1 114 | NT_187480.1 chrUn_KI270386v1 115 | NT_187475.1 chrUn_KI270387v1 116 | NT_187478.1 chrUn_KI270388v1 117 | NT_187473.1 chrUn_KI270389v1 118 | NT_187474.1 chrUn_KI270390v1 119 | NT_187481.1 chrUn_KI270391v1 120 | NT_187485.1 chrUn_KI270392v1 121 | NT_187483.1 chrUn_KI270393v1 122 | NT_187479.1 chrUn_KI270394v1 123 | NT_187476.1 chrUn_KI270395v1 124 | NT_187477.1 chrUn_KI270396v1 125 | NT_187409.1 chrUn_KI270411v1 126 | NT_187408.1 chrUn_KI270412v1 127 | NT_187410.1 chrUn_KI270414v1 128 | NT_187415.1 chrUn_KI270417v1 129 | NT_187412.1 chrUn_KI270418v1 130 | NT_187411.1 chrUn_KI270419v1 131 | NT_187413.1 chrUn_KI270420v1 132 | NT_187416.1 chrUn_KI270422v1 133 | NT_187417.1 chrUn_KI270423v1 134 | NT_187414.1 chrUn_KI270424v1 135 | NT_187418.1 chrUn_KI270425v1 136 | NT_187419.1 chrUn_KI270429v1 137 | NT_187424.1 chrUn_KI270435v1 138 | NT_187425.1 chrUn_KI270438v1 139 | NT_187420.1 chrUn_KI270442v1 140 | NT_187495.1 
chrUn_KI270448v1 141 | NT_187422.1 chrUn_KI270465v1 142 | NT_187421.1 chrUn_KI270466v1 143 | NT_187423.1 chrUn_KI270467v1 144 | NT_187426.1 chrUn_KI270468v1 145 | NT_187437.1 chrUn_KI270507v1 146 | NT_187430.1 chrUn_KI270508v1 147 | NT_187428.1 chrUn_KI270509v1 148 | NT_187427.1 chrUn_KI270510v1 149 | NT_187435.1 chrUn_KI270511v1 150 | NT_187432.1 chrUn_KI270512v1 151 | NT_187436.1 chrUn_KI270515v1 152 | NT_187431.1 chrUn_KI270516v1 153 | NT_187438.1 chrUn_KI270517v1 154 | NT_187429.1 chrUn_KI270518v1 155 | NT_187433.1 chrUn_KI270519v1 156 | NT_187496.1 chrUn_KI270521v1 157 | NT_187434.1 chrUn_KI270522v1 158 | NT_187440.1 chrUn_KI270528v1 159 | NT_187439.1 chrUn_KI270529v1 160 | NT_187441.1 chrUn_KI270530v1 161 | NT_187443.1 chrUn_KI270538v1 162 | NT_187442.1 chrUn_KI270539v1 163 | NT_187444.1 chrUn_KI270544v1 164 | NT_187445.1 chrUn_KI270548v1 165 | NT_187450.1 chrUn_KI270579v1 166 | NT_187448.1 chrUn_KI270580v1 167 | NT_187449.1 chrUn_KI270581v1 168 | NT_187454.1 chrUn_KI270582v1 169 | NT_187446.1 chrUn_KI270583v1 170 | NT_187453.1 chrUn_KI270584v1 171 | NT_187447.1 chrUn_KI270587v1 172 | NT_187455.1 chrUn_KI270588v1 173 | NT_187451.1 chrUn_KI270589v1 174 | NT_187452.1 chrUn_KI270590v1 175 | NT_187457.1 chrUn_KI270591v1 176 | NT_187456.1 chrUn_KI270593v1 177 | NT_187497.1 chrUn_KI270741v1 178 | NT_187513.1 chrUn_KI270742v1 179 | NT_187498.1 chrUn_KI270743v1 180 | NT_187499.1 chrUn_KI270744v1 181 | NT_187500.1 chrUn_KI270745v1 182 | NT_187501.1 chrUn_KI270746v1 183 | NT_187502.1 chrUn_KI270747v1 184 | NT_187503.1 chrUn_KI270748v1 185 | NT_187504.1 chrUn_KI270749v1 186 | NT_187505.1 chrUn_KI270750v1 187 | NT_187506.1 chrUn_KI270751v1 188 | NT_187507.1 chrUn_KI270752v1 189 | NT_187508.1 chrUn_KI270753v1 190 | NT_187509.1 chrUn_KI270754v1 191 | NT_187510.1 chrUn_KI270755v1 192 | NT_187511.1 chrUn_KI270756v1 193 | NT_187512.1 chrUn_KI270757v1 194 | NC_012920.1 chrM 195 | -------------------------------------------------------------------------------- /setup.nf: 
--------------------------------------------------------------------------------
#!/usr/bin/env nextflow

// Installation script.
// Downloads the GENCODE human/mouse references and the dbSNP VCF, builds the two STAR
// indices under $projectDir/data/GRCh and $projectDir/data/GRCm, and publishes the
// indexed FASTA, exome intervals and chromosome-renamed dbSNP/COSMIC VCFs to $projectDir/data.

humanGTF = Channel.value('ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/gencode.v34.primary_assembly.annotation.gtf.gz')
humanFasta = Channel.value('ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/GRCh38.primary_assembly.genome.fa.gz')
mouseGTF = Channel.value('ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.primary_assembly.annotation.gtf.gz')
mouseFasta = Channel.value('ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz')
// NOTE(review): this URL is under the dbSNP "b155" archive while the published file is
// called "dbSNPbuild154Renamed.vcf.gz"; the file name is kept unchanged because other
// scripts presumably look it up by that name - confirm which build is intended.
dbSNP = Channel.value('https://ftp.ncbi.nih.gov/snp/archive/b155/VCF/GCF_000001405.39.gz')

// STAR recommends --sjdbOverhang = (read length - 1). The read length is taken from
// `readlength`; the camelCase spelling `readLength` is honoured as well, because Nextflow
// parameter names are case-sensitive and a mistyped name would otherwise be silently ignored.
readLengthArg = params.containsKey('readLength') ? params.readLength : params.readlength
sjdbOverhang = Math.max(1, readLengthArg.toString().toInteger() - 1)

// check for the cosmic files

def cencusTable = new File("$projectDir/data/CosmicMutantExportCensus.tsv.gz")
def cosmicVCF = new File("$projectDir/data/CosmicCodingMuts.vcf.gz")

if (!cencusTable.exists() || !cosmicVCF.exists()) {

    def missingCosmicMessage = """
    One or more of the cosmic files is missing!
    Please make sure both "CosmicMutantExportCensus.tsv.gz" and "CosmicCodingMuts.vcf.gz" are present in the "RNA2CM/data" directory.
    The files can be downloaded at https://cancer.sanger.ac.uk/cosmic/download
    """.stripIndent()

    // Stop with a non-zero status so that an incomplete installation is not reported as a success.
    exit(1, missingCosmicMessage)

} else {

    file("$projectDir/data/GRCh").mkdir()
    file("$projectDir/data/GRCm").mkdir()

    // Fetch and decompress the human annotation (used for the STAR index and the exome intervals).
    process downloadHumanGTF {

        input:
        val gtf from humanGTF

        output:
        file "gencode.v34.primary_assembly.annotation.gtf" into gtf4Intervals, gtf4Star

        """
        wget $gtf
        gunzip gencode.v34.primary_assembly.annotation.gtf.gz
        """
    }

    // Fetch and decompress the human genome; the FASTA itself is published to the data directory.
    process downloadHumanFasta {

        publishDir "$projectDir/data", mode: 'copy'

        input:
        val fasta from humanFasta

        output:
        file "GRCh38.primary_assembly.genome.fa" into fasta4Indexing, fasta4Dict, fasta4Star

        """
        wget $fasta
        gunzip GRCh38.primary_assembly.genome.fa.gz
        """
    }

    // Fetch and decompress the mouse genome and annotation (only needed to build the mouse STAR index).
    process downloadMouseGenome {

        input:
        val fasta from mouseFasta
        val gtf from mouseGTF

        output:
        file "GRCm38.primary_assembly.genome.fa" into mouseFasta4Star
        file "gencode.vM25.primary_assembly.annotation.gtf" into mouseGTF4Star

        """
        wget $fasta
        wget $gtf
        gunzip GRCm38.primary_assembly.genome.fa.gz gencode.vM25.primary_assembly.annotation.gtf.gz
        """
    }

    // Fetch the dbSNP VCF together with its tabix index.
    process downloadDBSNP {

        input:
        val vcf from dbSNP

        output:
        file "GCF_000001405.39.gz" into dbSNP4Rename
        file "GCF_000001405.39.gz.tbi" into dbSNPIndex4Rename

        """
        wget --no-check-certificate $vcf
        wget --no-check-certificate ${vcf}.tbi
        """
    }

    // Build the human STAR index directly inside $projectDir/data/GRCh.
    process generateStarIndex {

        cpus params.cpu
        memory '40GB'

        input:
        file fasta from fasta4Star
        file gtf from gtf4Star
        path genomeDir from Channel.fromPath("$projectDir/data/GRCh", type: 'dir')

        output:
        // 'foo' is a token consumed by generateStarIndexMouse, so the two index builds run one after the other
        val 'foo' into bar
        path "*" into out

        """
        STAR --runThreadN $params.cpu --runMode genomeGenerate --genomeDir $genomeDir --genomeFastaFiles $fasta --sjdbGTFfile $gtf --sjdbOverhang $sjdbOverhang
        """
    }


    // Build the mouse STAR index inside $projectDir/data/GRCm once the human index is finished.
    process generateStarIndexMouse {

        cpus params.cpu
        memory '40GB'

        input:
        file fasta from mouseFasta4Star
        file gtf from mouseGTF4Star
        path genomeDir from Channel.fromPath("$projectDir/data/GRCm", type: 'dir')
        val 'foo' from bar

        output:
        path "*" into outM

        // NOTE(review): none of the processes in this script publishes the three files removed
        // below into the data directory, so the rm looks like a leftover clean-up - confirm.
        """
        STAR --runThreadN $params.cpu --runMode genomeGenerate --genomeDir $genomeDir --genomeFastaFiles $fasta --sjdbGTFfile $gtf --sjdbOverhang $sjdbOverhang
        rm -rf $projectDir/data/GRCm38.primary_assembly.genome.fa $projectDir/data/gencode.vM25.primary_assembly.annotation.gtf $projectDir/data/gencode.v34.primary_assembly.annotation.gtf
        """
    }

    // samtools index (.fai) of the human FASTA.
    process indexFasta {

        publishDir "$projectDir/data", mode: 'copy'

        input:
        file fasta from fasta4Indexing

        output:
        path '*' into fastaIndex

        """
        samtools faidx $fasta
        """
    }

    // GATK sequence dictionary of the human FASTA.
    process createDictionery {

        publishDir "$projectDir/data", mode: 'copy'
        memory '8GB'

        input:
        file fasta from fasta4Dict
        file index from fastaIndex

        output:
        path '*'

        """
        /gatk-4.1.3.0/gatk CreateSequenceDictionary -R $fasta
        """
    }

    // Exon intervals padded by 100 bp on each side, sorted and bgzip-compressed.
    process createIntervals {

        publishDir "$projectDir/data", mode: 'copy'

        input:
        file gtf from gtf4Intervals

        output:
        file "GRCh38_exome.bed.gz" into exomeIntervals

        // the padded start is clamped at 0 so that exons closer than 100 bp to a
        // contig start do not produce a negative coordinate
        """
        awk '{if(\$3=="exon") {s=\$4-100; if(s<0) s=0; print \$1"\\t"s"\\t"\$5+100"\\t"substr(\$16,2,length(\$16)-3)}}' $gtf | sort -k 1,1 -k2,2n | bgzip > GRCh38_exome.bed.gz
        """
    }

    // tabix index of the exome intervals.
    process indexIntervals {

        publishDir "$projectDir/data", mode: 'copy'

        input:
        file intervals from exomeIntervals

        output:
        path "GRCh38_exome.bed.gz.tbi"

        """
        tabix $intervals
        """
    }

    // tabix index of the original COSMIC VCF (required by bcftools in renameCosmic).
    process indexCosmic {

        publishDir "$projectDir/data", mode: 'copy'

        input:
        file cosmic from Channel.fromPath("$projectDir/data/CosmicCodingMuts.vcf.gz")

        output:
        path "CosmicCodingMuts.vcf.gz.tbi" into CosmicIndex

        """
        tabix $cosmic
        """
    }

    // Rename the dbSNP contigs (RefSeq accessions) to the chr* names listed in remapNCBI.txt.
    process renameDBSNP {

        cpus params.cpu
        publishDir "$projectDir/data", mode: 'copy'

        input:
        file dbSNP from dbSNP4Rename
        file dbSNPindex from dbSNPIndex4Rename
        file remapNCBI from Channel.fromPath("$projectDir/data/remapNCBI.txt")

        output:
        path "dbSNPbuild154Renamed.vcf.gz" into dbsnpRenamed

        """
        bcftools annotate --threads $params.cpu --output-type z --rename-chrs $remapNCBI --output dbSNPbuild154Renamed.vcf.gz $dbSNP
        """
    }

    // Rename the COSMIC contigs to the chr* names listed in remapCOSMIC.txt.
    process renameCosmic {

        cpus params.cpu
        publishDir "$projectDir/data", mode: 'copy'

        input:
        file cosmic from Channel.fromPath("$projectDir/data/CosmicCodingMuts.vcf.gz")
        file index from CosmicIndex
        file remapCosmic from Channel.fromPath("$projectDir/data/remapCOSMIC.txt")

        output:
        path "CosmicCodingMutsRenamed.vcf.gz" into cosmicRenamed

        """
        bcftools annotate --threads $params.cpu --output-type z --rename-chrs $remapCosmic --output CosmicCodingMutsRenamed.vcf.gz $cosmic
        """
    }

    // tabix-index both renamed VCFs, then delete the original COSMIC VCF and its index
    // from the data directory.
    process indexRenamedVCFs {

        publishDir "$projectDir/data", mode: 'copy'

        input:
        file dbSNP from dbsnpRenamed
        file cosmic from cosmicRenamed

        output:
        path "dbSNPbuild154Renamed.vcf.gz.tbi"
        path "CosmicCodingMutsRenamed.vcf.gz.tbi"

        """
        tabix $dbSNP
        tabix $cosmic
        rm -rf $projectDir/data/CosmicCodingMuts.vcf.gz $projectDir/data/CosmicCodingMuts.vcf.gz.tbi
        """
    }
}

-------------------------------------------------------------------------------- /main.nf: --------------------------------------------------------------------------------
#!/usr/bin/env nextflow

// pipeline for identification of cancer related mutations

prefix = params.prefix

// The reads are the only mandatory argument and default to an empty string. Without this
// check a missing --fastq produces an empty input channel, so no process is ever started.
if ( !params.fastq ) {
    exit(1, "No input reads were given: please supply --fastq <reads.fastq.gz> (and --fastq2 <mates.fastq.gz> for paired-end data).")
}

if ( params.fastq2 == false ) {

    // ---------------------------- single-end branch ----------------------------

    // Adapter and quality trimming of the single-end reads.
    process trimmomaticSE {

        cpus params.cpu
        memory '16GB'

        input:
        // checkIfExists: a mistyped path aborts the run instead of yielding an empty channel
        path fastq from Channel.fromPath("$params.fastq", checkIfExists: true)
        path adapters from Channel.fromPath("$projectDir/data/CommonAdapters.fa")

        output:
        path "${prefix}.trimmed.fastq.gz" into trimmed, trimmed4mouse

        """
        java -jar /Trimmomatic-0.39/trimmomatic-0.39.jar SE -threads $params.cpu $fastq ${prefix}.trimmed.fastq.gz ILLUMINACLIP:${adapters}:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
        """
    }


    // Two-pass STAR alignment of the trimmed reads against the human index.
    process alignToHumanGenomeSE {

        cpus params.cpu
        memory '48GB'

        input:
        path fastq from trimmed
        path genomeDir from Channel.fromPath("$projectDir/data/GRCh", type: 'dir')

        output:
        path "${prefix}Aligned.sortedByCoord.out.bam" into aligned2human
        // token consumed by alignToMouseGenomeSE, so the mouse alignment starts after this one
        val 'foo' into bar

        """
        STAR --runThreadN $params.cpu --genomeDir $genomeDir --readFilesIn $fastq --outFileNamePrefix ${prefix} --outSAMtype BAM SortedByCoordinate --readFilesCommand zcat --outSAMattributes NM --twopassMode Basic --outFilterMultimapNmax 1 --outFilterMismatchNoverLmax 0.1
        """
    }


    // Same alignment against the mouse index; only executed when mouse filtering is requested.
    process alignToMouseGenomeSE {

        cpus params.cpu
        memory '48GB'

        input:
        path fastq from trimmed4mouse
        path genomeDir from Channel.fromPath("$projectDir/data/GRCm", type: 'dir')
        val 'foo' from bar

        output:
        path "${prefix}GRCmAligned.sortedByCoord.out.bam" into aligned2mouse

        when:
        params.filterMouse == true

        """
        STAR --runThreadN $params.cpu --genomeDir $genomeDir --readFilesIn $fastq --outFileNamePrefix ${prefix}GRCm --outSAMtype BAM SortedByCoordinate --readFilesCommand zcat --outSAMattributes NM --twopassMode Basic --outFilterMultimapNmax 1 --outFilterMismatchNoverLmax 0.1
        """
    }

} else {

    // ---------------------------- paired-end branch ----------------------------
    // NOTE(review): unlike the single-end branch, no SLIDINGWINDOW step is applied here and the
    // human/mouse alignments are not chained through a token - confirm both are intentional.

    // Adapter and quality trimming of both mates; unpaired reads are written but not used further.
    process trimmomaticPE {

        cpus params.cpu
        memory '16GB'

        input:
        // checkIfExists: a mistyped path aborts the run instead of yielding an empty channel
        path fastq1 from Channel.fromPath("$params.fastq", checkIfExists: true)
        path fastq2 from Channel.fromPath("$params.fastq2", checkIfExists: true)
        path adapters from Channel.fromPath("$projectDir/data/CommonAdapters.fa")

        output:
        path "${prefix}_1.trimmed.fastq.gz" into trimmed1
        path "${prefix}_2.trimmed.fastq.gz" into trimmed2
        path "${prefix}_1.trimmed.fastq.gz" into trimmed4mouse1
        path "${prefix}_2.trimmed.fastq.gz" into trimmed4mouse2

        """
        java -jar /Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads $params.cpu $fastq1 $fastq2 ${prefix}_1.trimmed.fastq.gz unpaired1.fastq.gz ${prefix}_2.trimmed.fastq.gz unpaired2.fastq.gz ILLUMINACLIP:$adapters:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36
        """
    }


    // Two-pass STAR alignment of the trimmed pairs against the human index.
    process alignToHumanGenomePE {

        cpus params.cpu
        memory '48GB'

        input:
        path fastq1 from trimmed1
        path fastq2 from trimmed2
        path genomeDir from Channel.fromPath("$projectDir/data/GRCh", type: 'dir')

        output:
        path "${prefix}Aligned.sortedByCoord.out.bam" into aligned2human

        """
        STAR --runThreadN $params.cpu --genomeDir $genomeDir --readFilesIn $fastq1 $fastq2 --outFileNamePrefix ${prefix} --outSAMtype BAM SortedByCoordinate --readFilesCommand zcat --outSAMattributes NM --twopassMode Basic --outFilterMultimapNmax 1 --outFilterMismatchNoverLmax 0.1
        """
    }


    // Same alignment against the mouse index; only executed when mouse filtering is requested.
    process alignToMouseGenomePE {

        cpus params.cpu
        memory '48GB'

        input:
        path fastq1 from trimmed4mouse1
        path fastq2 from trimmed4mouse2
        path genomeDir from Channel.fromPath("$projectDir/data/GRCm", type: 'dir')

        output:
        path "${prefix}GRCmAligned.sortedByCoord.out.bam" into aligned2mouse

        when:
        params.filterMouse == true

        """
        STAR --runThreadN $params.cpu --genomeDir $genomeDir --readFilesIn $fastq1 $fastq2 --outFileNamePrefix ${prefix}GRCm --outSAMtype BAM SortedByCoordinate --readFilesCommand zcat --outSAMattributes NM --twopassMode Basic --outFilterMultimapNmax 1 --outFilterMismatchNoverLmax 0.1
        """
    }
}

if ( params.filterMouse == true ) {

    // Run XenofilteR on the human/mouse BAM pair; the filtered BAM feeds duplicate marking.
    process XenofilteR {

        // the filtered BAM is copied to the launch directory only when intermediates are kept
        if (params.keepInter == true) {
            publishDir "$launchDir", mode: 'copy'}
        cpus params.cpu
        memory '16GB'

        input:
        path bam1 from aligned2human
        path bam2 from aligned2mouse

        output:
        file "Filtered_bams/${prefix}_Filtered.bam" into filteredBam

        """
        #!/usr/bin/Rscript --save
        library("XenofilteR")
        bp.param <- SnowParam(workers = $params.cpu, type = "SOCK")
        sample.list <- matrix(c('$bam1','$bam2'),ncol=2)
        output.names <- c('${prefix}')
        XenofilteR(sample.list, destination.folder = "./", MM_threshold = 8, bp.param = bp.param, output.names)
        """
    }


} else {

    // Pass the human alignment through unchanged when mouse filtering is switched off.
    process skipXenofilteR {

        if (params.keepInter == true) {
            publishDir "$launchDir", mode: 'copy'}

        input:
        path bam from aligned2human

        output:
        path bam into filteredBam

        """
        echo "--- mouse read filtering was not performed ---"
        """
    }
}

// Mark duplicate reads with GATK MarkDuplicates and create a BAM index alongside the output.
process markDuplicates {

    memory '16GB'

    input:
    path bam from filteredBam

    output:
    path "${prefix}marked_duplicates.bam" into markedDuplicates

    """
    /gatk-4.1.3.0/gatk MarkDuplicates --CREATE_INDEX true --I $bam --O ${prefix}marked_duplicates.bam --VALIDATION_STRINGENCY SILENT --M ${prefix}marked_dup_metrics.txt
    """
}

// NOTE(review): `prefix` is defined outside this section (presumably from
// params.prefix) -- confirm against the top of the file.

// GATK SplitNCigarReads on the duplicate-marked BAM, restricted to the exome
// intervals. The .tbi/.fai/.dict inputs are staged next to their parent files
// so GATK can find them; they are not referenced in the command itself.
process splitNcigar {

    memory '16GB'

    input:
    path bam from markedDuplicates
    path intervals from Channel.fromPath("$projectDir/data/GRCh38_exome.bed.gz")
    path intIndex from Channel.fromPath("$projectDir/data/GRCh38_exome.bed.gz.tbi")
    path reference_genome from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.fa")
    path index from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.fa.fai")
    path dict from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.dict")

    output:
    path "${prefix}splitN.bam" into splitN

    """
    /gatk-4.1.3.0/gatk SplitNCigarReads -L $intervals -R $reference_genome -I $bam -O ${prefix}splitN.bam
    """
}


// Assigns a single read group (sample name = prefix) and writes an indexed
// BAM. BAM and index are each emitted twice: once for BaseRecalibrator and
// once for ApplyBQSR.
process addGroups {

    memory '16GB'

    input:
    path bam from splitN

    output:
    path "${prefix}.grouped.bam" into grouped4BQSR
    path "${prefix}.grouped.bai" into grouped4BQSRindex
    path "${prefix}.grouped.bam" into grouped4applyBQSR
    path "${prefix}.grouped.bai" into grouped4applyBQSRindex

    """
    /gatk-4.1.3.0/gatk AddOrReplaceReadGroups --CREATE_INDEX true --I $bam --O ${prefix}.grouped.bam --RGID rnasq --RGLB lb --RGPL illumina --RGPU pu --RGSM $prefix
    """
}


// Builds the base-quality recalibration table with GATK BaseRecalibrator,
// using the dbSNP VCF as known sites.
process baseQualityRecalibration {

    memory '16GB'

    input:
    path bam from grouped4BQSR
    // BAM index; named `bai` (as in applyBQSR) so it no longer shares the
    // name `index` with the FASTA index declared below.
    path bai from grouped4BQSRindex
    path intervals from Channel.fromPath("$projectDir/data/GRCh38_exome.bed.gz")
    path intIndex from Channel.fromPath("$projectDir/data/GRCh38_exome.bed.gz.tbi")
    path reference_genome from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.fa")
    path index from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.fa.fai")
    path dict from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.dict")
    path dbSNP from Channel.fromPath("$projectDir/data/dbSNPbuild154Renamed.vcf.gz")
    path dbSNPindex from Channel.fromPath("$projectDir/data/dbSNPbuild154Renamed.vcf.gz.tbi")

    output:
    path "${prefix}.recal_data.table" into recalTable

    """
    /gatk-4.1.3.0/gatk BaseRecalibrator -L $intervals -I $bam --use-original-qualities --disable-sequence-dictionary-validation true -R $reference_genome --known-sites $dbSNP -O ${prefix}.recal_data.table
    """
}


// Applies the recalibration table to the grouped BAM with GATK ApplyBQSR.
process applyBQSR {

    memory '16GB'

    input:
    path table from recalTable
    path bam from grouped4applyBQSR
    path bai from grouped4applyBQSRindex
    path intervals from Channel.fromPath("$projectDir/data/GRCh38_exome.bed.gz")
    path intIndex from Channel.fromPath("$projectDir/data/GRCh38_exome.bed.gz.tbi")
    path reference_genome from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.fa")
    path index from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.fa.fai")
    path dict from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.dict")

    output:
    path "${prefix}.recal_output.bam" into recalibrated
    path "${prefix}.recal_output.bai" into recalibratedIndex

    """
    /gatk-4.1.3.0/gatk ApplyBQSR -L $intervals -R $reference_genome -I $bam --use-original-qualities --add-output-sam-program-record --bqsr-recal-file $table -O ${prefix}.recal_output.bam
    """
}


// Calls variants with GATK HaplotypeCaller over the exome intervals. The VCF
// and its index are copied to the launch directory when params.keepInter is
// true.
process callVariants {

    publishDir "$launchDir", mode: 'copy', enabled: params.keepInter == true
    memory '16GB'

    input:
    path bam from recalibrated
    path bai from recalibratedIndex
    path intervals from Channel.fromPath("$projectDir/data/GRCh38_exome.bed.gz")
    path intIndex from Channel.fromPath("$projectDir/data/GRCh38_exome.bed.gz.tbi")
    path reference_genome from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.fa")
    path index from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.fa.fai")
    path dict from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.dict")

    output:
    path "${prefix}.output.vcf.gz" into variants
    path "${prefix}.output.vcf.gz.tbi" into variantsIndex

    """
    /gatk-4.1.3.0/gatk HaplotypeCaller -L $intervals -R $reference_genome -I $bam -O ${prefix}.output.vcf.gz --dont-use-soft-clipped-bases --pcr-indel-model AGGRESSIVE
    """
}


// Annotates the FILTER column with GATK VariantFiltration: flags FS > 30.0,
// QD < 2.0 and SNP clusters (3 within a 35 bp window). Records are flagged
// here, not removed; removal happens in filterVariants.
process hardFilter {

    memory '16GB'

    input:
    path vcf from variants
    path vcfIndex from variantsIndex
    path reference_genome from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.fa")
    path index from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.fa.fai")
    path dict from Channel.fromPath("$projectDir/data/GRCh38.primary_assembly.genome.dict")

    output:
    path "${prefix}.hardfilter.vcf.gz" into hardfilterd

    """
    /gatk-4.1.3.0/gatk VariantFiltration --R $reference_genome --V $vcf --window 35 --cluster 3 --filter-name "FS" --filter "FS > 30.0" --filter-name "QD" --filter "QD < 2.0" -O ${prefix}.hardfilter.vcf.gz
    """
}


// Keeps only PASS records with FORMAT/DP >= 10 and at least 5 reads for the
// first ALT allele (FORMAT/AD[:1] >= 5).
process filterVariants {

    cpus params.cpu

    input:
    path hardfilterd

    output:
    path "${prefix}filtered.vcf.gz" into filterd

    // The script reads the staged input through its variable instead of
    // repeating the upstream file name.
    """
    bcftools view --threads $params.cpu -i 'FILTER="PASS" && FORMAT/DP >= 10 && FORMAT/AD[:1] >= 5' --output-type z --output-file ${prefix}filtered.vcf.gz $hardfilterd
    """
}


// Builds the tabix index for the filtered VCF and forwards VCF + index.
process IndexfilteredVariants {

    input:
    path filterd

    output:
    path "${prefix}filtered.vcf.gz" into filterd4annotation
    path "${prefix}filtered.vcf.gz.tbi" into filteredIndex

    """
    tabix $filterd
    """
}


// Adds an INFO/Gene annotation from the exome BED (columns CHROM,FROM,TO,Gene);
// header.txt supplies the INFO header lines for the new tags.
process geneAnnotation {

    cpus params.cpu

    input:
    path filterd4annotation
    path intervals from Channel.fromPath("$projectDir/data/GRCh38_exome.bed.gz")
    path header from Channel.fromPath("$projectDir/data/header.txt")
    path filteredIndex

    output:
    path "${prefix}named.vcf.gz" into gene
    path "${prefix}named.vcf.gz.tbi" into geneIndex

    """
    bcftools annotate --threads $params.cpu -a $intervals -h $header -c CHROM,FROM,TO,Gene --output-type z --output ${prefix}named.vcf.gz $filterd4annotation
    tabix ${prefix}named.vcf.gz
    """
}


// Copies INFO/RS and INFO/COMMON from the dbSNP VCF onto matching records.
process snpAnnotation {

    cpus params.cpu

    input:
    path gene
    path geneIndex
    path dbSNP from Channel.fromPath("$projectDir/data/dbSNPbuild154Renamed.vcf.gz")
    path dbSNPindex from Channel.fromPath("$projectDir/data/dbSNPbuild154Renamed.vcf.gz.tbi")

    output:
    path "${prefix}dbSNP.vcf.gz" into snp
    path "${prefix}dbSNP.vcf.gz.tbi" into snpIndex

    """
    bcftools annotate --threads $params.cpu -a $dbSNP -c INFO/RS,INFO/COMMON --output-type z --output ${prefix}dbSNP.vcf.gz $gene
    tabix ${prefix}dbSNP.vcf.gz
    """
}


// Copies ID and INFO/CNT from the COSMIC coding-mutations VCF onto matching
// records.
process cosmicAnnotation {

    cpus params.cpu

    input:
    path snp
    path snpIndex
    path cosmic_vcf from Channel.fromPath("$projectDir/data/CosmicCodingMutsRenamed.vcf.gz")
    // Named differently from the `cosmicIndex` output channel declared below.
    path cosmic_vcf_index from Channel.fromPath("$projectDir/data/CosmicCodingMutsRenamed.vcf.gz.tbi")

    output:
    path "${prefix}.annotated.vcf.gz" into cosmic
    path "${prefix}.annotated.vcf.gz.tbi" into cosmicIndex

    """
    bcftools annotate --threads $params.cpu -a $cosmic_vcf -c ID,INFO/CNT --output-type z --output ${prefix}.annotated.vcf.gz $snp
    tabix ${prefix}.annotated.vcf.gz
    """
}


// Flattens the annotated VCF into a tab-separated table with a header row
// (bcftools query -H). Copied to the launch directory when params.keepInter
// is true.
process variantTable {

    publishDir "$launchDir", mode: 'copy', enabled: params.keepInter == true

    input:
    path cosmic
    path cosmicIndex

    output:
    path "${prefix}varTable.tsv" into table

    """
    bcftools query -H -f '%CHROM\t%POS\t%REF\t%ALT\t%INFO/Gene\t%ID\t%INFO/CNT\t%INFO/RS\t%INFO/COMMON\t[%AD]\n' $cosmic > ${prefix}varTable.tsv
    """
}


// Final report: joins the variant table with the COSMIC census export and
// writes <prefix>_cancer_mutations.csv to the launch directory.
// Kept rows: COSMIC ID present, CNT >= 20, COMMON != 1, Tier 1, not
// "Substitution - coding silent", FATHMM prediction PATHOGENIC; one row per
// COSMIC ID.
process findCancerMutations {

    publishDir "$launchDir", mode: 'copy'

    input:
    path varTable from table
    path census from Channel.fromPath("$projectDir/data/CosmicMutantExportCensus.tsv.gz")

    output:
    path "${prefix}_cancer_mutations.csv" into results

    """
    #!/usr/bin/python3

    import pandas as pd

    # Read every column as text: bcftools writes '.' for missing values, and
    # letting pandas infer dtypes would make the COMMON test below depend on
    # whether a '.' happens to occur in that column.
    df = pd.read_csv('$varTable', sep='\t', dtype=str)

    # Variants with a COSMIC ID and a COSMIC CNT of at least 20.
    df = df[df['[6]ID'] != '.'].copy()
    # A '.' in CNT becomes NaN and fails the threshold instead of raising.
    df['[7]CNT'] = pd.to_numeric(df['[7]CNT'], errors='coerce')
    df = df[df['[7]CNT'] >= 20].copy()
    df['[7]CNT'] = df['[7]CNT'].astype('int32')

    # Drop variants flagged COMMON=1 by the dbSNP annotation.
    df = df[df['[9]COMMON'] != '1']

    mutations = pd.read_csv('$census', sep='\t', compression='gzip', encoding='latin1')
    mutations = mutations[['Tier','GENOMIC_MUTATION_ID', 'Mutation AA', 'Mutation Description', 'FATHMM prediction','FATHMM score']]

    df = df.merge(mutations, left_on='[6]ID', right_on='GENOMIC_MUTATION_ID')

    df = df[df['Tier'] == 1]
    df = df[df['Mutation Description'] != 'Substitution - coding silent']
    df = df[df['FATHMM prediction'] == 'PATHOGENIC']

    df = df.drop_duplicates(subset='[6]ID')
    df = df[['[5]Gene', '# [1]CHROM', '[2]POS', '[3]REF', '[4]ALT', '[7]CNT', '[8]RS', 'GENOMIC_MUTATION_ID', 'Mutation AA', 'Mutation Description', 'FATHMM score','[10]$prefix:AD']]
    df = df.rename(columns = {'[5]Gene': 'Gene', '# [1]CHROM': 'CHROM', '[2]POS': 'POS', '[3]REF':'REF','[4]ALT': 'ALT', '[7]CNT': 'COSMIC_CNT','[8]RS': 'RS(dbSNP)', 'Mutation AA': 'Mutation_AA', 'Mutation Description': 'Mutation_Description', '[10]$prefix:AD': 'AD'})

    df.to_csv('${prefix}_cancer_mutations.csv', index=False)
    """
}