├── slamdunk ├── __init__.py ├── dunks │ ├── __init__.py │ ├── dump.py │ ├── snps.py │ ├── deduplicator.py │ ├── mapper.py │ └── filter.py ├── plot │ ├── __init__.py │ ├── checkLibraries.R │ ├── PCAPlotter.R │ ├── conversion_per_read_position.R │ ├── compute_context_TC_rates.R │ ├── compute_sample_comparison_statistics.R │ ├── compute_overall_rates.R │ ├── compute_conversion_rate_mle.R │ ├── splash_eval_count_files.R │ ├── compute_halflifes.R │ ├── SNPeval.R │ ├── globalRatePlotter.R │ ├── eval_halflifes_error_plot.R │ ├── eval_conversion_rate_plots.R │ ├── eval_halflife_per_gene_plots.R │ └── merge_rate_files.R ├── slamseq │ └── __init__.py ├── test │ ├── __init__.py │ ├── data │ │ ├── actb.bed │ │ ├── reads_slamdunk_mapped_filtered_tcount.tsv │ │ └── reads.fq │ ├── test_sample.py │ └── test_sample.sh ├── utils │ ├── __init__.py │ ├── BedReader.py │ ├── SNPtools.py │ └── misc.py ├── contrib │ └── RNASeqReadSimulator │ │ ├── src │ │ ├── getSegs.pyc │ │ ├── splitfasta.py │ │ ├── addvariation2splicingbed.py │ │ ├── getSegs.py │ │ ├── genexplvprofile.py │ │ ├── getseqfrombed.py │ │ └── gensimreads.py │ │ ├── demo │ │ ├── input │ │ │ ├── samplereaderror.txt │ │ │ ├── sampleposbias.txt │ │ │ └── sample.bed │ │ ├── gensingleendreads.sh │ │ ├── genstrandedreads.sh │ │ └── genpairedendreads.sh │ │ └── README └── version.py ├── MANIFEST.in ├── .settings └── .gitignore ├── requirements.txt ├── .gitignore ├── environment.yml ├── hooks └── build ├── bin ├── splash ├── alleyoop ├── slamdunk └── _preamble.py ├── .travis.yml ├── Dockerfile ├── README.md └── setup.py /slamdunk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md -------------------------------------------------------------------------------- /slamdunk/dunks/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slamdunk/plot/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slamdunk/slamseq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slamdunk/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slamdunk/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.settings/.gitignore: -------------------------------------------------------------------------------- 1 | /org.eclipse.core.resources.prefs 2 | -------------------------------------------------------------------------------- /slamdunk/test/data/actb.bed: -------------------------------------------------------------------------------- 1 | chr5 120498 122492 Actb 0 + 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=0.9.4 2 | pybedtools>=0.6.4 3 | intervaltree>=2.1.0 4 | pandas>=0.13.1 5 | biopython>=1.63 6 | pysam>=0.8.3 7 | Cython>=0.20.1 -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/src/getSegs.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/t-neumann/slamdunk/HEAD/slamdunk/contrib/RNASeqReadSimulator/src/getSegs.pyc 
-------------------------------------------------------------------------------- /slamdunk/test/test_sample.py: -------------------------------------------------------------------------------- 1 | # content of test_sample.py 2 | def func(x): 3 | return x + 1 4 | 5 | #def test_run(): 6 | 7 | 8 | def test_answer(): 9 | assert func(4) == 5 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.project 2 | /.pydevproject 3 | /sync.sh 4 | *.pyc 5 | .DS_Store 6 | /doc/build/ 7 | /bin/NextGenMap/ 8 | /bin/ngm 9 | /slamdunk/plot/Rslamdunk 10 | .Rhistory 11 | .cache 12 | *-enc.2.ngm 13 | *-ht-13-2.3.ngm 14 | *.fai 15 | 16 | -------------------------------------------------------------------------------- /slamdunk/test/data/reads_slamdunk_mapped_filtered_tcount.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start End Name Length Strand ConversionRate ReadsCPM Tcontent CoverageOnTs ConversionsOnTs ReadCount TcReadCount multimapCount ConversionRateLower ConversionRateUpper 2 | chr5 120498 122492 Actb 1994 + 0.022222222222222223 666666.6666666666 445 90 2 8 4 0 -1.0 -1.0 3 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: slamdunk 2 | dependencies: 3 | - nextgenmap=0.5.5 4 | - samtools=1.10 5 | - varscan=2.4.4 6 | - r-tidyverse=1.3.0 7 | - r-matrixstats=0.55.0 8 | - r-gridextra=2.3 9 | - r-getopt=1.20.3 10 | - joblib=0.14.0 11 | - pandas=0.25.3 12 | - cython=0.29.14 13 | - biopython=1.74 14 | - pybedtools=0.8.0 15 | - intervaltree=3.0.2 16 | -------------------------------------------------------------------------------- /hooks/build: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | if [ 
"$DOCKER_TAG" = "latest" ]; then 4 | echo "Building :latest, without VERSION_ARG" 5 | TAG=`curl -s https://api.github.com/repos/t-neumann/slamdunk/releases | grep tag_name | head -n 1 | cut -d '"' -f 4` 6 | echo $TAG 7 | docker build --build-arg VERSION_ARG="$TAG" -t ${IMAGE_NAME} . 8 | else 9 | echo "Building :$DOCKER_TAG, with VERSION_ARG=\"--vers $DOCKER_TAG\"" 10 | docker build --build-arg VERSION_ARG="$DOCKER_TAG" -t ${IMAGE_NAME} . 11 | fi -------------------------------------------------------------------------------- /slamdunk/test/test_sample.sh: -------------------------------------------------------------------------------- 1 | 2 | slamdunk all -r slamdunk/test/data/ref.fa -b slamdunk/test/data/actb.bed -o slamdunk/test/data/output -rl 100 -mbq 27 -5 0 slamdunk/test/data/reads.fq 3 | 4 | grep -v "^#" slamdunk/test/data/output/count/reads_slamdunk_mapped_filtered_tcount.tsv > slamdunk/test/data/output/count/reads_slamdunk_mapped_filtered_tcount_noheader.tsv 5 | 6 | diff slamdunk/test/data/reads_slamdunk_mapped_filtered_tcount.tsv slamdunk/test/data/output/count/reads_slamdunk_mapped_filtered_tcount_noheader.tsv 7 | 8 | -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/src/splitfasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Split fasta files including paired-end reads into two separate files 4 | """ 5 | from __future__ import print_function; 6 | import sys; 7 | import re; 8 | 9 | outfile1=""; 10 | outfile2=""; 11 | 12 | for i in range(len(sys.argv)): 13 | if sys.argv[i]=="-o": 14 | outfile1=sys.argv[i+1]+"_1.fa"; 15 | outfile2=sys.argv[i+1]+"_2.fa"; 16 | 17 | if outfile1=="": 18 | sys.exit(-1); 19 | 20 | ofid1=open(outfile1,"w"); 21 | ofid2=open(outfile2,"w"); 22 | 23 | isleft=True; 24 | for lines in sys.stdin: 25 | if lines[0]=='>': 26 | if lines.strip()[-1]=='1': 27 | isleft=True; 28 | else: 29 | 
isleft=False; 30 | lines=re.sub("/[12]","",lines); 31 | if isleft: 32 | print(lines,file=ofid1,end=''); 33 | else: 34 | print(lines,file=ofid2,end=''); 35 | 36 | 37 | 38 | 39 | 40 | ofid1.close(); 41 | ofid2.close(); 42 | -------------------------------------------------------------------------------- /bin/splash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | 20 | import sys 21 | 22 | try: 23 | import _preamble 24 | except ImportError: 25 | sys.exc_clear() 26 | 27 | from slamdunk import splash 28 | splash.run() -------------------------------------------------------------------------------- /bin/alleyoop: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 
11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | 20 | import sys 21 | 22 | try: 23 | import _preamble 24 | except ImportError: 25 | sys.exc_clear() 26 | 27 | from slamdunk import alleyoop 28 | alleyoop.run() -------------------------------------------------------------------------------- /bin/slamdunk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | 20 | import sys 21 | 22 | try: 23 | import _preamble 24 | except ImportError: 25 | sys.exc_clear() 26 | 27 | from slamdunk import slamdunk 28 | slamdunk.run() 29 | -------------------------------------------------------------------------------- /slamdunk/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 
2 | # 3 | # This file is part of Slamdunk. 4 | # 5 | # Slamdunk is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Affero General Public License as 7 | # published by the Free Software Foundation, either version 3 of the 8 | # License, or (at your option) any later version. 9 | # 10 | # Slamdunk is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Affero General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Affero General Public License 16 | # along with this program. If not, see . 17 | 18 | # Overall slamDunk version 19 | __version__ = "0.4.3" 20 | # File format version of BAM files from slamdunk filter 21 | __bam_version__ = "3" 22 | # File format version of count files from slamdunk count 23 | __count_version__ = "3" 24 | # Required NextGenMap version 25 | __ngm_version__ = "0.5.5" 26 | -------------------------------------------------------------------------------- /bin/_preamble.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 2 | # 3 | # This file is part of Slamdunk. 4 | # 5 | # Slamdunk is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Affero General Public License as 7 | # published by the Free Software Foundation, either version 3 of the 8 | # License, or (at your option) any later version. 9 | # 10 | # Slamdunk is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Affero General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Affero General Public License 16 | # along with this program. If not, see . 
17 | 18 | import sys, os 19 | 20 | path = os.path.abspath(sys.argv[0]) 21 | while os.path.dirname(path) != path: 22 | if os.path.exists(os.path.join(path, 'slamdunk', '__init__.py')): 23 | 24 | #sys.path.insert(0, os.path.join(path, 'slamdunk')) 25 | sys.path.insert(0, path) 26 | break 27 | path = os.path.dirname(path) 28 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | # We don't actually use the Travis Python, but this keeps it organized. 4 | - "2.7" 5 | - "3.5" 6 | - "3.6" 7 | - "3.7" 8 | 9 | before_install: 10 | 11 | # Here we just install Miniconda, which you shouldn't have to change. 12 | - if [ "$TRAVIS_OS_NAME" == "osx" ]; then 13 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh; 14 | else 15 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 16 | fi 17 | - chmod +x miniconda.sh 18 | - ./miniconda.sh -b -p $HOME/miniconda 19 | - export PATH=/home/travis/miniconda/bin:$PATH 20 | - conda update --yes conda 21 | - conda config --add channels defaults 22 | - conda config --add channels bioconda 23 | - conda config --add channels conda-forge 24 | 25 | install: 26 | 27 | # We just set up a conda environment with the right Python version. This 28 | # should not need changing. 29 | 30 | - conda env create -f environment.yml 31 | - source activate slamdunk 32 | - pip install pytest 33 | - pip install . 
34 | 35 | # command to run tests 36 | script: 37 | - slamdunk -h 38 | - alleyoop -h 39 | - splash -h 40 | - slamdunk/test/test_sample.sh 41 | - pytest 42 | -------------------------------------------------------------------------------- /slamdunk/plot/checkLibraries.R: -------------------------------------------------------------------------------- 1 | # Helper function to check whether Rslamdunk libraries are available 2 | # Install if libraries are not available 3 | 4 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 5 | # 6 | # This file is part of Slamdunk. 7 | # 8 | # Slamdunk is free software: you can redistribute it and/or modify 9 | # it under the terms of the GNU Affero General Public License as 10 | # published by the Free Software Foundation, either version 3 of the 11 | # License, or (at your option) any later version. 12 | # 13 | # Slamdunk is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU Affero General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU Affero General Public License 19 | # along with this program. If not, see . 
20 | 21 | checkLib <- function(libLoc) { 22 | 23 | list.of.packages <- c("getopt","ggplot2","gridExtra","RColorBrewer","lattice","matrixStats","assertthat","lazyeval","tibble") 24 | new.packages <- list.of.packages[!(list.of.packages %in% installed.packages(lib.loc = libLoc)[,"Package"])] 25 | 26 | if(length(new.packages)) install.packages(new.packages, repos="http://cran.wu.ac.at/", lib = libLoc, dependencies = TRUE) 27 | } -------------------------------------------------------------------------------- /slamdunk/test/data/reads.fq: -------------------------------------------------------------------------------- 1 | @Read1_1_0 2 | CCGTTTGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @Read2_0_0 6 | TTTGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA 7 | + 8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @Read3_1_0 10 | CTTGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA 11 | + 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | @Read4_1_1 14 | TCTGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA 15 | + 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @Read5_2_2 18 | CCGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | @Read6_1_0 22 | TTGTGTAAGGTAAGGCGTGCACTTTTATTGGTCTCA 23 | + 24 | FFFFFFFFFFFFFFF. 
17 | 18 | FROM continuumio/miniconda3:4.7.12 19 | 20 | MAINTAINER Tobias Neumann 21 | 22 | ARG VERSION_ARG 23 | 24 | COPY environment.yml /tmp/environment.yml 25 | 26 | RUN apt-get update \ 27 | && apt-get install -y procps \ 28 | && apt-get clean -y \ 29 | && rm -rf /var/lib/apt/lists/* \ 30 | && conda config --add channels defaults \ 31 | && conda config --add channels bioconda \ 32 | && conda config --add channels conda-forge \ 33 | && conda env create --name slamdunk -f /tmp/environment.yml \ 34 | && /opt/conda/envs/slamdunk/bin/pip install git+https://github.com/t-neumann/slamdunk.git@${VERSION_ARG} \ 35 | && rm -rf /opt/conda/pkgs/* 36 | 37 | ENV PATH /opt/conda/envs/slamdunk/bin:$PATH 38 | -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/demo/gensingleendreads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #------------------------------------------- 4 | # Parameters 5 | # Required parameters: 6 | # Transcript annotation (BED file) 7 | BED=input/sample.bed 8 | 9 | # output FASTA prefix 10 | FASTAFILE=output/single.fa 11 | 12 | # reference chromosome 13 | REFERENCE=input/reference.fa 14 | 15 | # Optional parameters 16 | # Read length 17 | READLEN=75 18 | 19 | # Number of reads generated 20 | NREAD=100000 21 | 22 | # positional bias file 23 | POSBIAS=input/sampleposbias.txt 24 | 25 | # Read error position profile 26 | READERR=input/samplereaderror.txt 27 | 28 | # Intermediate files 29 | # File for random expression level assignment 30 | RANDEXPLV=output/explvprofile.txt 31 | 32 | 33 | #----------------------------------------------- 34 | # Add paths if users don't install the script 35 | export PATH=../src/:$PATH 36 | # Commands to randomly assign weights to each transcript 37 | 38 | if [ ! 
-d "output" ]; then 39 | mkdir output 40 | fi 41 | 42 | CMD0="genexplvprofile.py $BED > $RANDEXPLV" 43 | 44 | echo "Commands to randomly assign weights to each transcript:" 45 | echo $CMD0 46 | 47 | genexplvprofile.py $BED > $RANDEXPLV 48 | 49 | # Commands to simulate reads (output to STDOUT in BED format) 50 | # If you want single-end reads, don't use the "-p" option. 51 | CMD1="gensimreads.py -e $RANDEXPLV -n $NREAD -b $POSBIAS -l $READLEN $BED " 52 | 53 | echo "Commands to generate simulated paired-reads in BED format:" 54 | echo $CMD1 55 | 56 | 57 | # Commands to convert BED file to fasta file 58 | CMD2="getseqfrombed.py -b $READERR -f A -r 0.01 -l $READLEN - $REFERENCE" 59 | 60 | echo "Commands to generate FASTA file from the last command:" 61 | echo $CMD2 62 | echo "Output FASTA prefix: $FASTAFILE" 63 | 64 | 65 | 66 | # Execute two commands simultaneously 67 | # If you want single-end reads, do not use the splitfasta.py. Instead, execute: 68 | $CMD1 | $CMD2 > $FASTAFILE 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/demo/genstrandedreads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #------------------------------------------- 4 | # Parameters 5 | # Required parameters: 6 | # Transcript annotation (BED file) 7 | BED=input/sample.bed 8 | 9 | # output FASTA prefix 10 | FASTAFILE=output/single-stranded.fa 11 | 12 | # reference chromosome 13 | REFERENCE=input/reference.fa 14 | 15 | # Optional parameters 16 | # Read length 17 | READLEN=75 18 | 19 | # Number of reads generated 20 | NREAD=100000 21 | 22 | # positional bias file 23 | POSBIAS=input/sampleposbias.txt 24 | 25 | # Read error position profile 26 | READERR=input/samplereaderror.txt 27 | 28 | # Intermediate files 29 | # File for random expression level assignment 30 | RANDEXPLV=output/explvprofile.txt 31 | 32 | 33 | 
#----------------------------------------------- 34 | # Add paths if users don't install the script 35 | export PATH=../src/:$PATH 36 | # Commands to randomly assign weights to each transcript 37 | 38 | if [ ! -d "output" ]; then 39 | mkdir output 40 | fi 41 | 42 | CMD0="genexplvprofile.py $BED > $RANDEXPLV" 43 | 44 | echo "Commands to randomly assign weights to each transcript:" 45 | echo $CMD0 46 | 47 | genexplvprofile.py $BED > $RANDEXPLV 48 | 49 | # Commands to simulate reads (output to STDOUT in BED format) 50 | # If you want single-end reads, don't use the "-p" option. 51 | CMD1="gensimreads.py --stranded -e $RANDEXPLV -n $NREAD -b $POSBIAS -l $READLEN $BED " 52 | 53 | echo "Commands to generate simulated paired-reads in BED format:" 54 | echo $CMD1 55 | 56 | 57 | # Commands to convert BED file to fasta file 58 | CMD2="getseqfrombed.py -b $READERR -f A -r 0.01 -l $READLEN - $REFERENCE" 59 | 60 | echo "Commands to generate FASTA file from the last command:" 61 | echo $CMD2 62 | echo "Output FASTA prefix: $FASTAFILE" 63 | 64 | 65 | 66 | # Execute two commands simultaneously 67 | # If you want single-end reads, do not use the splitfasta.py. Instead, execute: 68 | $CMD1 | $CMD2 > $FASTAFILE 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ### Streamlining SLAM-Seq analysis with ultra-high sensitivity. 
4 | 5 | [![GitHub release](https://img.shields.io/github/release/t-neumann/slamdunk.svg)](https://github.com/t-neumann/slamdunk/releases/latest) 6 | [![Travis CI](https://img.shields.io/travis/t-neumann/slamdunk.svg)](https://travis-ci.org/t-neumann/slamdunk) 7 | 8 | [![Docker Pulls](https://img.shields.io/docker/pulls/tobneu/slamdunk.svg)](https://hub.docker.com/r/tobneu/slamdunk) 9 | [![Docker Automated build](https://img.shields.io/docker/automated/tobneu/slamdunk.svg)](https://hub.docker.com/r/tobneu/slamdunk/builds/) 10 | 11 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square)](http://bioconda.github.io/recipes/slamdunk/README.html) 12 | [![Anaconda build](https://anaconda.org/bioconda/slamdunk/badges/version.svg 13 | )](https://anaconda.org/bioconda/slamdunk) 14 | [![Anaconda downloads](https://anaconda.org/bioconda/slamdunk/badges/downloads.svg 15 | )](https://anaconda.org/bioconda/slamdunk) 16 | 17 | [![PyPI release](https://img.shields.io/pypi/v/slamdunk.svg)](https://pypi.python.org/pypi/slamdunk) 18 | ![Github Stars](https://img.shields.io/github/stars/t-neumann/slamdunk.svg?style=social&label=Star) 19 | 20 | ----- 21 | 22 | ### Slamdunk documentation 23 | 24 | http://t-neumann.github.io/slamdunk 25 | 26 | ### nf-core slamseq workflow 27 | 28 | [![nfcore/slamseq](https://github.com/nf-core/slamseq/raw/master/docs/images/nf-core-slamseq_logo.png)](https://nf-co.re/slamseq) 29 | 30 | ### Please cite 31 | 32 | Neumann, T., Herzog, V. A., Muhar, M., Haeseler, von, A., Zuber, J., Ameres, S. L., & Rescheneder, P. (2019). [Quantification of experimentally induced nucleotide conversions in high-throughput sequencing datasets](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2849-7). BMC Bioinformatics, 20(1), 258. 
http://doi.org/10.1186/s12859-019-2849-7 33 | 34 | -------------------------------------------------------------------------------- /slamdunk/dunks/dump.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 
19 | 20 | from __future__ import print_function 21 | 22 | from slamdunk.utils.misc import checkStep # @UnresolvedImport 23 | from slamdunk.slamseq.SlamSeqFile import SlamSeqBamFile, SlamSeqWriter # @UnresolvedImport 24 | from slamdunk.utils import SNPtools # @UnresolvedImport 25 | 26 | def dumpReadInfo(referenceFile, bam, minQual, outputCSV, snpsFile, log, printOnly=False, verbose=True, force=False): 27 | 28 | if(not checkStep([bam, referenceFile], [outputCSV], force)): 29 | print("Skipped computing T->C per reads position for file " + bam, file=log) 30 | else: 31 | 32 | snps = SNPtools.SNPDictionary(snpsFile) 33 | snps.read() 34 | 35 | outputFile = SlamSeqWriter(outputCSV) 36 | 37 | #Go through one chr after the other 38 | testFile = SlamSeqBamFile(bam, referenceFile, snps) 39 | 40 | chromosomes = testFile.getChromosomes() 41 | 42 | for chromosome in chromosomes: 43 | readIterator = testFile.readsInChromosome(chromosome) 44 | for read in readIterator: 45 | outputFile.write(read) 46 | 47 | 48 | outputFile.close() -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/demo/genpairedendreads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #------------------------------------------- 4 | # Parameters 5 | # Required parameters: 6 | # Transcript annotation (BED file) 7 | BED=input/sample.bed 8 | 9 | # output FASTA prefix 10 | FASTAFILE=output/paired 11 | 12 | # reference chromosome 13 | REFERENCE=input/reference.fa 14 | 15 | # Optional parameters 16 | # Read length 17 | READLEN=75 18 | 19 | # Number of reads generated 20 | NREAD=100000 21 | 22 | # The mean and std of spans for paired-end reads. 
23 | PAIREDEND="200,20" 24 | 25 | # positional bias file 26 | POSBIAS=input/sampleposbias.txt 27 | 28 | # Read error position profile 29 | READERR=input/samplereaderror.txt 30 | 31 | # Intermediate files 32 | # File for random expression level assignment 33 | RANDEXPLV=output/explvprofile.txt 34 | 35 | 36 | #----------------------------------------------- 37 | 38 | # Add paths if users don't install the script 39 | export PATH=../src/:$PATH 40 | 41 | # Commands to randomly assign weights to each transcript 42 | 43 | if [ ! -d "output" ]; then 44 | mkdir output 45 | fi 46 | 47 | CMD0=" genexplvprofile.py $BED > $RANDEXPLV" 48 | 49 | echo "Commands to randomly assign weights to each transcript:" 50 | echo $CMD0 51 | 52 | 53 | genexplvprofile.py $BED > $RANDEXPLV 54 | 55 | # Commands to simulate reads (output to STDOUT in BED format) 56 | # If you want single-end reads, don't use the "-p" option. 57 | CMD1="gensimreads.py -e $RANDEXPLV -n $NREAD -b $POSBIAS -l $READLEN -p $PAIREDEND $BED " 58 | 59 | echo "Commands to generate simulated paired-reads in BED format:" 60 | echo $CMD1 61 | 62 | 63 | # Commands to convert BED file to fasta file 64 | CMD2="getseqfrombed.py -b $READERR -f A -r 0.01 -l $READLEN - $REFERENCE" 65 | 66 | echo "Commands to generate FASTA file from the last command:" 67 | echo $CMD2 68 | echo "Output FASTA prefix: $FASTAFILE" 69 | 70 | 71 | 72 | # Execute two commands simultaneously 73 | # If you want single-end reads, do not use the splitfasta.py. Instead, execute: 74 | # $CMD1 | $CMD2 > $FASTAFILE 75 | $CMD1 | $CMD2 | splitfasta.py -o $FASTAFILE 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/src/addvariation2splicingbed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This script is used to add splicing variations from STDIN .BED file. 
4 | 5 | Usage: addvariation2splicingbed.py {OPTIONS} 6 | 7 | OPTIONS 8 | 9 | ATTENTION: 10 | 11 | HISTORY 12 | 01/09/2012 13 | 14 | """ 15 | from __future__ import print_function; 16 | import sys; 17 | import subprocess; 18 | import pydoc; 19 | import os; 20 | import random; 21 | import bisect; 22 | import math; 23 | from getSegs import *; 24 | 25 | import pdb; 26 | 27 | 28 | errorrate=0.2; 29 | onbedfile="-"; 30 | 31 | 32 | for i in range(len(sys.argv)): 33 | if sys.argv[i]=='-h': 34 | print(pydoc.render_doc(sys.modules[__name__])); 35 | sys.exit(); 36 | if i. 19 | 20 | from __future__ import print_function 21 | import subprocess 22 | import csv 23 | from slamdunk.utils.misc import checkStep, getBinary # @UnresolvedImport 24 | 25 | def SNPs(inputBAM, outputSNP, referenceFile, minVarFreq, minCov, minQual, log, printOnly=False, verbose=True, force=False): 26 | if(checkStep([inputBAM, referenceFile], [outputSNP], force)): 27 | fileSNP = open(outputSNP, 'w') 28 | 29 | mpileupCmd = "samtools mpileup -B -A -f " + referenceFile + " " + inputBAM 30 | if(verbose): 31 | print(mpileupCmd, file=log) 32 | if(not printOnly): 33 | mpileup = subprocess.Popen(mpileupCmd, shell=True, stdout=subprocess.PIPE, stderr=log) 34 | 35 | varscanCmd = "varscan mpileup2snp --strand-filter 0 --output-vcf --min-var-freq " + str(minVarFreq) + " --min-coverage " + str(minCov) + " --variants 1" 36 | if(verbose): 37 | print(varscanCmd, file=log) 38 | if(not printOnly): 39 | varscan = subprocess.Popen(varscanCmd, shell=True, stdin=mpileup.stdout, stdout=fileSNP, stderr=log) 40 | varscan.wait() 41 | 42 | fileSNP.close() 43 | else: 44 | print("Skipping SNP calling", file=log) 45 | 46 | def countSNPsInFile(inputFile): 47 | snpCount = 0 48 | tcSnpCount = 0 49 | with open(inputFile, "r") as snpFile: 50 | snpReader = csv.reader(snpFile, delimiter='\t') 51 | for row in snpReader: 52 | if((row[2].upper() == "T" and row[3].upper() == "C") or (row[2].upper() == "A" and row[3].upper() == "G")): 53 | tcSnpCount 
def bedToIntervallTree(bed):
    """Load a BED file into one IntervalTree per chromosome, interval -> name."""
    trees = {}
    for entry in BedIterator(bed):
        # One tree per chromosome, created on first sight.
        tree = trees.setdefault(entry.chromosome, IntervalTree())
        # Intervals are stored with an inclusive end, hence stop + 1.
        tree[entry.start:(entry.stop + 1)] = entry.name
    return trees


class BedEntry:
    """A single BED record: chromosome interval plus name, score and strand."""

    def __init__(self):
        # Defaults describe an empty record; "." is the conventional
        # BED placeholder for score and strand.
        self.chromosome = ""
        self.start = 0
        self.stop = 0
        self.name = ""
        self.score = "."
        self.strand = "."

    def __repr__(self):
        fields = [self.chromosome, str(self.start), str(self.stop), self.name]
        return "\t".join(fields)

    def getLength(self):
        """Return the interval length (stop - start)."""
        return self.stop - self.start

    def hasStrand(self):
        """True if an explicit '+' or '-' strand was set."""
        return self.strand in ("+", "-")

    def hasNonEmptyName(self):
        """True if the record carries a name."""
        return self.name != ""


class BedIterator:
    """Iterate over a BED file, yielding one BedEntry per line."""

    def __init__(self, filename):
        self._bedFile = open(filename, "r")

    def __iter__(self):
        return self

    def _toBED(self, line):
        # Parse one tab-separated BED line; columns beyond the fourth
        # (score, strand) are optional.
        cols = line.rstrip().split("\t")
        entry = BedEntry()
        entry.chromosome = cols[0]
        entry.start = int(cols[1])
        entry.stop = int(cols[2])
        entry.name = cols[3]
        if len(cols) > 4:
            entry.score = cols[4]
        if len(cols) > 5:
            entry.strand = cols[5]
        return entry

    def __next__(self):
        try:
            return self._toBED(next(self._bedFile))
        except StopIteration:
            # Exhausted: release the file handle before propagating.
            self._bedFile.close()
            raise
class SNPDictionary(object):
    """In-memory lookup of T>C and A>G SNP positions parsed from a VCF file.

    Entries are keyed by (chromosome, 1-based position string) tuples.
    Queries via isTCSnp/isAGSnp take 0-based positions and convert them to
    the VCF's 1-based coordinates before looking them up.
    """

    def __init__(self, vcfFile):
        # vcfFile may be None; read() is then a no-op.
        self._vcfFile = vcfFile
        self._tcSNPs = {}
        self._agSNPs = {}

    def _addSNP(self, snp):
        # snp is a VCF record: [chrom, pos, id, ref, alt, ...].
        # Bug fix: keys used to be plain string concatenation (chrom + pos),
        # so e.g. ("chr1", "234") and ("chr12", "34") collided. Tuple keys
        # keep chromosome and position separate.
        key = (snp[0], str(snp[1]))
        if(snp[3].upper() == "T" and snp[4].upper() == "C"):
            self._tcSNPs[key] = True

        if(snp[3].upper() == "A" and snp[4].upper() == "G"):
            self._agSNPs[key] = True

    def read(self):
        """Populate the SNP dictionaries from the VCF file, if one was given."""
        if (self._vcfFile != None):
            if(os.path.exists(self._vcfFile)):
                vcfReader = BedTool(self._vcfFile)

                if(vcfReader.file_type != "vcf"):
                    print("Wrong file type. Empty or not a vcf file.")

                for snp in vcfReader:
                    self._addSNP(snp)
            else:
                print("Warning: SNP file " + self._vcfFile + " not found.")

    def isAGSnp(self, chromosome, position):
        """True if an A>G SNP is recorded at the 0-based position."""
        key = (chromosome, str(int(position) + 1))
        return key in self._agSNPs

    def isTCSnp(self, chromosome, position):
        """True if a T>C SNP is recorded at the 0-based position."""
        key = (chromosome, str(int(position) + 1))
        return key in self._tcSNPs

    def getAGSNPsInUTR(self, chromosome, start, stop, snpType):
        """Count A>G SNPs in [start, stop). snpType is unused (kept for API compatibility)."""
        count = 0
        for i in range(start, stop):
            if(self.isAGSnp(chromosome, i)):
                count += 1
        return count

    def getTCSNPsInUTR(self, chromosome, start, stop, snpType):
        """Count T>C SNPs in [start, stop). snpType is unused (kept for API compatibility)."""
        count = 0
        for i in range(start, stop):
            if(self.isTCSnp(chromosome, i)):
                count += 1
        return count
3 354,127,1007, 0,721,1529, 3 | chr1 11873 14409 uc010nxr.1 0 + 11873 11873 0 3 354,52,1189, 0,772,1347, 4 | chr1 14362 16765 uc009vis.2 0 - 14362 14362 0 4 467,69,147,159, 0,607,1433,2244, 5 | chr1 14362 19759 uc001aae.3 0 - 14362 14362 0 10 467,69,152,159,198,136,137,147,99,847, 0,607,1433,2244,2495,2870,3243,3552,3905,4550, 6 | chr1 14362 19759 uc009vit.2 0 - 14362 14362 0 9 467,69,152,159,198,510,147,99,847, 0,607,1433,2244,2495,2870,3552,3905,4550, 7 | chr1 14362 19759 uc009viu.2 0 - 14362 14362 0 10 467,69,152,159,198,510,147,102,54,847, 0,607,1433,2244,2495,2870,3552,3905,4138,4550, 8 | chr1 14362 24901 uc001aab.3 0 - 14362 14362 0 10 467,69,152,159,202,136,137,147,112,164, 0,607,1433,2244,2491,2870,3243,3552,3905,10375, 9 | chr1 14362 29370 uc001aac.3 0 - 14362 14362 0 11 467,69,152,159,198,110,137,147,102,154,50, 0,607,1433,2244,2495,2896,3243,3552,3905,10375,14958, 10 | chr1 14362 29370 uc001aah.3 0 - 14362 14362 0 11 467,69,152,159,198,136,137,147,99,154,50, 0,607,1433,2244,2495,2870,3243,3552,3905,10375,14958, 11 | chr1 14362 29370 uc009viq.2 0 - 14362 14362 0 7 467,152,159,198,456,154,50, 0,1433,2244,2495,3243,10375,14958, 12 | chr1 14362 29370 uc009vir.2 0 - 14362 14362 0 10 467,69,152,159,198,510,147,99,154,50, 0,607,1433,2244,2495,2870,3552,3905,10375,14958, 13 | chr1 14406 29370 uc009viv.2 0 - 14406 14406 0 7 2359,198,136,137,147,154,50, 0,2451,2826,3199,3508,10331,14914, 14 | chr1 14406 29370 uc009viw.2 0 - 14406 14406 0 7 2359,198,510,147,99,154,50, 0,2451,2826,3508,3861,10331,14914, 15 | chr1 15602 29370 uc009vix.2 0 - 15602 15602 0 7 345,159,198,136,147,154,50, 0,1004,1255,1630,2312,9135,13718, 16 | chr1 15795 18061 uc009vjd.2 0 - 15795 15795 0 5 152,159,198,136,456, 0,811,1062,1437,1810, 17 | chr1 16606 29370 uc009viy.2 0 - 16606 16606 0 9 159,198,136,137,147,95,58,154,50, 0,251,626,999,1308,1661,1890,8131,12714, 18 | chr1 16606 29370 uc009viz.2 0 - 16606 16606 0 8 159,202,136,137,147,112,154,50, 0,247,626,999,1308,1661,8131,12714, 19 | chr1 
16857 17751 uc009vjc.1 0 - 16857 16857 0 2 198,519, 0,375, 20 | chr1 16857 19759 uc001aai.1 0 - 16857 16857 0 6 198,136,137,147,112,847, 0,375,748,1057,1410,2055, 21 | chr1 16857 29370 uc010nxs.1 0 - 16857 16857 0 8 198,136,137,147,99,227,154,50, 0,375,748,1057,1410,2055,7880,12463, 22 | chr1 16857 29961 uc009vjb.1 0 - 16857 16857 0 7 198,136,137,147,112,154,138, 0,375,748,1057,1410,7880,12966, 23 | chr1 17232 29370 uc009vje.2 0 - 17232 17232 0 4 510,147,99,50, 0,682,1035,12088, 24 | chr1 17605 29370 uc009vjf.2 0 - 17605 17605 0 7 137,147,95,58,227,154,50, 0,309,662,891,1307,7132,11715, 25 | chr1 34611 36081 uc001aak.2 0 - 34611 34611 0 3 563,205,361, 0,665,1109, 26 | chr1 69090 70008 uc001aal.1 0 + 69090 70008 0 1 918, 0, 27 | chr1 137838 139228 uc001aam.3 0 - 137838 137838 0 1 1390, 0, 28 | -------------------------------------------------------------------------------- /slamdunk/plot/PCAPlotter.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Plot PCA based on readcounts in UTRs 4 | 5 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 6 | # 7 | # This file is part of Slamdunk. 8 | # 9 | # Slamdunk is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU Affero General Public License as 11 | # published by the Free Software Foundation, either version 3 of the 12 | # License, or (at your option) any later version. 13 | # 14 | # Slamdunk is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU Affero General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU Affero General Public License 20 | # along with this program. If not, see . 
library(getopt)

spec = matrix(c(
  'help' , 'h', 0, "logical","print the usage of the command",
  'fileTab', "f", 2,"character","tsv table of rate files",
  'outputPDF', "O", 2,"character","output pdf file name",
  'outputPCA', "P", 2,"character","output PCA transformations file name"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==1 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Create mismatch plots from rate tabs.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}

if ( is.null(opt$fileTab) ) stop("arg fileTab must be specified")
# Bug fix: the fallback used to assign opt$outputFile (a field nothing reads),
# leaving opt$outputPDF NULL so that pdf(opt$outputPDF) below failed.
if ( is.null(opt$outputPDF) ) { opt$outputPDF = "out.pdf" }

library(ggplot2)

# Two-column sample sheet: sample name, path to its tcount file.
samples = read.table(opt$fileTab,stringsAsFactors=FALSE,col.names = c("sample","file"), comment.char = "")

# With a single sample a PCA is meaningless: emit a stub PCA file and exit.
if (nrow(samples) <= 1) {
  cat('# slamdunk PCA\n', file=opt$outputPCA)
  cat(paste(samples$sample,0,"0\n",sep="\t"),append=TRUE,file=opt$outputPCA)
  #signal success and exit.
  q(status=0)
}

# Collect the TcReadCount column of every sample.
countsList = list()

for (i in 1:nrow(samples)) {
  curTab = read.delim(samples$file[i],stringsAsFactors=FALSE, comment.char="#")

  countsList[[samples$sample[i]]] = curTab$TcReadCount

}

countMatrix = do.call(cbind, countsList)

# PCA on the (up to) 500 highest-variance rows.
variances = apply(countMatrix, 1, var)

sel = order(variances, decreasing=TRUE)[seq_len(min(500, length(variances)))]

pca = prcomp(t(countMatrix[sel,]))

# Proportion of variance explained per component.
PoV = pca$sdev ^ 2 / sum(pca$sdev ^ 2)

plotTab = data.frame(sample = row.names(pca$x), PC1 = pca$x[,1], PC2 = pca$x[,2])

pdf(opt$outputPDF)

ggplot(plotTab, aes(x=PC1, y=PC2, color = sample)) + geom_point(size = 3) +
  xlab(paste("PC1 (", round(PoV[1],digits=2), " % variance)",sep="")) +
  ylab(paste("PC2 (", round(PoV[2],digits=2), " % variance)",sep="")) +
  theme(legend.position="bottom", legend.title=element_blank()) + ggtitle("Slamdunk PCA")

dev.off()

cat('# slamdunk PCA\n', file=opt$outputPCA)
write.table(plotTab,file=opt$outputPCA,append=TRUE,quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE)

#signal success and exit.
q(status=0)
#
# Slamdunk is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

library(getopt)

spec = matrix(c(
  'help' , 'h', 0, "logical","print the usage of the command",
  'utr' , 'u', 0, "logical","utr plotting",
  'inputFile', "i", 2,"character","tsv table of mutations per position",
  'outputFile', "o", 2,"character","output pdf file name"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==1 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Create mismatches per read/UTR position plots.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}

# Axis labels default to per-read mode; --utr switches to per-UTR wording.
positionLabel = "Position on read"
mutationLabel = "% of reads with mutation"

if( !is.null(opt$utr)) {
  positionLabel = "Position at 3' UTR end (200 bp upstream)"
  mutationLabel = "% of UTRs with mutation"
}

if ( is.null(opt$inputFile) ) stop("arg input must be specified")
if ( is.null(opt$outputFile) ) { opt$outputFile = paste(opt$inputFile, ".pdf", sep="") }

# Headerless table of per-position counts. Column semantics are inferred from
# their use below and should be confirmed against the producer:
# V1/V2 presumably all-mutation counts (fwd/rev), V3/V4 T->C resp. A->G
# counts, V5/V6 coverage totals used as denominators.
mut = read.table(opt$inputFile, comment.char = "#")

# Older inputs lack a reverse-coverage column; reuse the forward one.
if (is.null(mut$V6)) {
  mut$V6 = mut$V5
}

#mut = read.table("test_mut_bowtie.csv")

#totalFwd = mut[1,1]
#totalRev = mut[1,2]
#tcFwd = mut[1,3]
#tcRev = mut[1,4]

#mut = mut[-1,]

# Percentages per position; row 1 = forward strand, row 2 = reverse strand.
counts = rbind(c(mut$V1)/c(mut$V5) * 100, c(mut$V2)/c(mut$V6) * 100)
countsTC = rbind(c(mut$V3)/c(mut$V5) * 100, c(mut$V4)/c(mut$V6) * 100)

##################################################################
# Workaround for 0 counts (need to work out what's going on there
# 0/0 divisions above yield NaN; treat those positions as 0%.

counts[is.nan(counts)] = 0
countsTC[is.nan(countsTC)] = 0

##################################################################
pdf(opt$outputFile, width=10, height=10)
par(mfrow=c(2,1))

# Scale to next 10
barplot(counts, beside=T, names.arg=1:nrow(mut), main="All mutations", ylim=c(0,max(10,ceiling(counts / 10) * 10)), xlab=positionLabel, ylab=mutationLabel, legend=c("forward", "reverse"))
#barplot(counts, beside=T, names.arg=1:nrow(mut), main="All mutations", ylim=c(0,10), xlab=positionLabel, ylab=mutationLabel, legend=c("forward", "reverse"))
# Scale to next 1
barplot(countsTC, beside=T, names.arg=1:nrow(mut), main="T->C on fwd, A->G on rev", ylim=c(0,max(1,ceiling(countsTC))), xlab=positionLabel, ylab=mutationLabel, legend=c("forward", "reverse"))
#barplot(countsTC, beside=T, names.arg=1:nrow(mut), main="T->C on fwd, A->G on rev", ylim=c(0,1), xlab=positionLabel, ylab=mutationLabel, legend=c("forward", "reverse"))

dev.off()
def tpos2pos(tpos,cumlen,exonstart):
    """Map a transcript coordinate onto a reference coordinate.

    cumlen holds the cumulative exon lengths and exonstart the reference
    start of each exon. Positions past the transcript end are clamped to
    the transcript's last base.
    """
    exon_idx = bisect.bisect_right(cumlen, tpos)
    # Clamp out-of-range queries to the final transcript position.
    if exon_idx >= len(cumlen):
        exon_idx = len(cumlen) - 1
        tpos = cumlen[-1] - 1
    # Offset of tpos within its exon: subtract the cumulative length of
    # all preceding exons (zero for the first exon).
    offset_in_exon = tpos if exon_idx == 0 else tpos - cumlen[exon_idx - 1]
    return exonstart[exon_idx] + offset_in_exon
def writeBedline(fid,lineid,chromosome,direction,startrange,lenrange):
    """Write one BED12 record to the open file handle fid.

    startrange/lenrange give absolute segment starts and lengths on the
    reference; direction may be +/-1 (translated to a strand character) or
    an already-formatted strand string. Returns None without writing when
    startrange is empty (malformed input).
    """
    # skip if startrange is malformed
    if not startrange:
        return None
    record_start = startrange[0]
    record_end = startrange[-1] + lenrange[-1]
    # BED block starts are relative to the record start.
    relative_starts = [s - record_start for s in startrange]
    # Translate numeric directions into strand characters.
    if direction == 1:
        strand = '+'
    elif direction == -1:
        strand = '-'
    else:
        strand = direction
    # Comma-separated lists with BED's conventional trailing comma.
    block_sizes = ''.join(str(n) + ',' for n in lenrange)
    block_starts = ''.join(str(n) + ',' for n in relative_starts)
    fields = [chromosome,                               # 0th, chromosome
              str(record_start), str(record_end),       # 1-2th, start and end
              lineid,                                   # 3th, id
              '0', strand,                              # 4-5th, score and strand
              str(record_start), str(record_end),       # 6-7th, same as 1-2
              '0', str(len(relative_starts)),           # 8th, 0; 9th, block count
              block_sizes,                              # 10th, block lengths
              block_starts,                             # 11th, block starts
              '']                                       # keep trailing tab before newline
    fid.write('\t'.join(fields) + '\n')
def Dedup(inputBAM, outputBAM, tcMutations, log, printOnly=False, verbose = True, force=False):
    """Remove duplicate reads from a BAM file.

    Reads are buffered per reference position; within one position, reads with
    an identical sequence and CIGAR string are collapsed to the first
    occurrence. Only reads carrying at least `tcMutations` conversions
    (TC tag; a missing tag counts as 0) are considered — reads below the
    threshold are dropped entirely and excluded from the counts.
    The flush-on-position-change scheme assumes a coordinate-sorted input;
    the result is written to outputBAM and indexed.

    inputBAM    -- path to the input BAM (read with pysam)
    tcMutations -- minimum TC-tag value a read needs to be kept
    log         -- open file handle for the retention summary
    printOnly/verbose/force -- standard step-control flags (verbose is unused here)

    NOTE(review): if no read passes the threshold, processedReads stays 0 and
    the summary ratio below raises ZeroDivisionError — confirm inputs are
    never empty or fully filtered.
    """

    if(printOnly or checkStep([inputBAM], [outputBAM], force)):

        samfile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=samfile)

        processedReads = 0
        retainedReads = 0

        # Position of the previously processed read; "" marks "none seen yet".
        prevChr = ""
        prevStart = ""

        # seq -> cigar -> [reads]: everything buffered at the current position.
        duplicateBuffer = {}

        for read in samfile:

            flag = read.cigarstring
            chr = read.reference_id
            start = read.reference_start
            seq = read.query_sequence
            if (read.has_tag("TC")) :
                tcflag = read.get_tag("TC")
            else :
                # No TC tag present: treat as zero conversions.
                tcflag = 0

            if (tcflag >= tcMutations) :

                # New reference position: flush the buffer of the previous
                # position before collecting reads for this one.
                if (chr != prevChr or start != prevStart) :

                    if (prevChr != "") :
                        for curSeq in duplicateBuffer :
                            for curFlag in duplicateBuffer[curSeq]:
                                for readEntry in duplicateBuffer[curSeq][curFlag]:
                                    if not readEntry.is_duplicate:
                                        retainedReads += 1
                                        outfile.write(readEntry)
                        duplicateBuffer.clear()

                # Any read after the first with the same (seq, cigar) at this
                # position is flagged as a duplicate and skipped on flush.
                if not seq in duplicateBuffer:
                    duplicateBuffer[seq] = {}
                if not flag in duplicateBuffer[seq]:
                    duplicateBuffer[seq][flag] = list()
                if len(duplicateBuffer[seq][flag]) > 0 :
                    read.is_duplicate = True
                duplicateBuffer[seq][flag].append(read)

                prevChr = chr
                prevStart = start

                processedReads += 1

        # Flush whatever is buffered for the final position.
        for seq in duplicateBuffer:
            for flag in duplicateBuffer[seq] :
                for readEntry in duplicateBuffer[seq][flag]:
                    if not readEntry.is_duplicate:
                        retainedReads += 1
                        outfile.write(readEntry)
        duplicateBuffer.clear()

        outfile.close()

        print("Retained " + str(retainedReads) + " of " + str(processedReads) + " reads (", file=log, end = "")
        print("{0:.2f}".format(float(retainedReads) / float(processedReads)),file=log,end="")
        print(" compression rate)", file=log)

        pysamIndex(outputBAM)

    else:
        print("Skipped deduplication for " + inputBAM, file=log)
library(getopt)

spec = matrix(c(
  'help' , 'h', 0, "logical","print the usage of the command",
  'rateTab', "f", 2,"character","tsv table of rate files",
  'outputFile', "O", 2,"character","output pdf file name"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==1 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Create mismatch plots from rate tabs.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1)
}


if ( is.null(opt$rateTab) ) stop("arg rateTab must be specified")
if ( is.null(opt$outputFile) ) { opt$outputFile = "out.pdf" }

library(ggplot2)
library(gridExtra)

# Two-column sample sheet: sample name, path to its context-rate file.
rates = read.table(opt$rateTab,stringsAsFactors=FALSE,col.names = c("sample","file"),comment.char = "")

pdf(opt$outputFile)
plotList = list()

for (i in 1:nrow(rates)) {
  curTab = read.table(rates$file[i],stringsAsFactors=FALSE,header=TRUE)

  # Rows 1-2 hold the 5' context counts (+/- strand); row 3 carries the
  # 3' context labels and rows 4-5 the corresponding counts — presumably a
  # fixed layout of the producer; confirm if the file format changes.
  subFront = curTab[1:2,]
  subBack = curTab[4:5,]
  names(subBack) = curTab[3,]

  #subFront = read.table(rates$file[i],stringsAsFactors=FALSE,header=TRUE, nrow=1)
  #subBack = read.table(rates$file[i],stringsAsFactors=FALSE,header=TRUE, nrow=1,skip=2)

  # Long format: one row per (context, strand) pair.
  printTabFront = data.frame(contexts=rep(names(subFront),each=2),strand = factor(rep(c("+","-"),ncol(subFront)),levels=c("+","-")),
                             rate_percent = as.numeric(unlist(subFront)))
  printTabBack = data.frame(contexts=rep(names(subBack),each=2),strand = factor(rep(c("+","-"),ncol(subBack)),levels=c("+","-")),
                            rate_percent = as.numeric(unlist(subBack)))

  # Normalize counts to fractions of the total per panel.
  printTabFront$rate_percent = printTabFront$rate_percent / sum(printTabFront$rate_percent)
  printTabBack$rate_percent = printTabBack$rate_percent / sum(printTabBack$rate_percent)

  # Ignore N contexts for now
  printTabFront = printTabFront[-grep("NT",printTabFront$contexts),]
  printTabBack = printTabBack[-grep("TN",printTabBack$contexts),]

  # One stacked bar plot per context side, labelled with the fraction.
  curPlot = qplot(x=contexts, y=rate_percent, fill=strand,data=printTabFront) + geom_bar(stat="identity") + geom_text(aes(label = round(rate_percent,digits=2)), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") + ylab("TC context percent %") + xlab(rates$sample[i]) +
    theme(text = element_text(size=6),axis.text.x = element_text(size=6), plot.title = element_text(size=10))
  plotList[[length(plotList)+1]] <- curPlot + ylim(0.0,1.0) + ggtitle("5' T->C context")
  curPlot = qplot(x=contexts, y=rate_percent, fill=strand,data=printTabBack) + geom_bar(stat="identity") + geom_text(aes(label = round(rate_percent,digits=2)), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") + ylab("TC context percent %") + xlab(rates$sample[i]) +
    theme(text = element_text(size=6),axis.text.x = element_text(size=6),plot.title = element_text(size=10))
  plotList[[length(plotList)+1]] <- curPlot + ylim(0.0,1.0) + ggtitle("3' T->C context")
}

do.call(grid.arrange, plotList)

dev.off()

#signal success and exit.
q(status=0)
#
# Slamdunk is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Helper

# Lower panel for pairs(): print the absolute Pearson correlation of the
# finite entries, scaled so stronger correlations print larger.
my_panel_cor <- function(x, y, digits=2, prefix="", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))

  # NOTE(review): (x|y>0) coerces x to logical and only compares y against 0 —
  # possibly meant (x>0 | y>0); kept as-is to preserve existing behavior.
  toUse = which(is.finite(x) & is.finite(y) & (x|y>0))
  r <- abs(cor(x[toUse], y[toUse]))


  txt <- format(c(r, 0.123456789), digits=digits)[1]
  txt <- paste(prefix, txt, sep="")
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor * r)
}

# Upper panel for pairs(): density scatter plus the identity line.
my_panel_smooth <- function(x, y,lcol="red")

{
  smoothScatter(x,y,add=T)
  abline(0,1,col=lcol)
}

library(getopt)

spec = matrix(c(
  'help' , 'h', 0, "logical","print the usage of the command",
  'sampleTab', "i", 2,"character","csv table of sample counts",
  'outputPrefix', "o", 2,"character","output file name prefix"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==1 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Compute sample comparison statistics from sample counts.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}


if ( is.null(opt$sampleTab) ) stop("arg sampleTab must be specified")
if ( is.null(opt$outputPrefix) ) { opt$outputPrefix = "sampleCorrelation" }

rates = read.table(opt$sampleTab,header=TRUE,sep=";", comment.char = "")

# Columns 5..n hold per-sample counts; fewer than two samples means
# fewer than 6 columns and nothing to compare.
if (ncol(rates) < 6) {
  print("No need for calculating pairwise statistics for single sample")
  quit(status=0)
}

# Bug fix: these calls referenced an undefined variable `libLoc`
# (library(..., lib.loc = libLoc)), which aborted the script whenever more
# than one sample was present. Load from the default library paths instead.
library(RColorBrewer)
library(lattice)
library(matrixStats)

values = data.matrix(rates[,c(5:ncol(rates))])

##################################################
# PCA
##################################################

# Use the (up to) 500 highest-variance rows for the PCA.
rowVariances = rowVars(data.matrix(values))

select = order(rowVariances, decreasing = TRUE)[seq_len(min(500, length(rowVariances)))]

pca = prcomp(t(values[select, ]))

# Pick a palette sized to the number of samples; brewer.pal needs n >= 3
# and supports at most 12 colors, hence the special cases.
if (ncol(values) == 2) {
  col = brewer.pal(3, "Paired")[1:2]
} else if (ncol(values) > 12) {
  getPalette = colorRampPalette(brewer.pal(9, "Set1"))
  col = getPalette(ncol(values))
} else {
  col = brewer.pal(ncol(values), "Paired")
}

# Get amount of explained variance (see summary(pca))
varianceProportion = pca$sdev ^ 2 / sum(pca$sdev ^ 2)

pdf(paste(opt$outputPrefix,"_PCA.pdf",sep=""))

# With many samples the color key would not fit; omit it.
if (ncol(values) > 12) {

  xyplot(PC2 ~ PC1, groups = colnames(values), data = as.data.frame(pca$x),
         pch = 20, cex = 2, aspect = "iso", col = col, xlab = paste("PC1 (", round(varianceProportion[1],digits=2), " variance)",sep=""),
         ylab = paste("PC2 (", round(varianceProportion[2],digits=2), " variance)",sep=""),
  )

} else {

  xyplot(PC2 ~ PC1, groups = colnames(values), data = as.data.frame(pca$x),
         pch = 20, cex = 2, aspect = "iso", col = col, main = draw.key(key = list(rect = list(col = col),
         text = list(colnames(values)), rep = FALSE)), xlab = paste("PC1 (", round(varianceProportion[1],digits=2), " variance)",sep=""),
         ylab = paste("PC2 (", round(varianceProportion[2],digits=2), " variance)",sep=""),
  )

}

dev.off()

##################################################
# Pairwise correlations
##################################################

# A pairs() matrix is only readable for a modest number of samples.
if (ncol(values) <= 12) {

  pdf(paste(opt$outputPrefix,"_pairwiseCorrelation.pdf",sep=""))

  pairs(values,upper.panel=my_panel_smooth,lower.panel=my_panel_cor)

  dev.off()
}

#signal success and exit.
q(status=0)
library(getopt)
library(ggplot2)
library(gridExtra)

# Command line interface:
#   -f/--rateTab     overall mutation-rate table produced by slamdunk (one sample)
#   -n/--name        sample name shown on the x-axis (default "Sample 1")
#   -O/--outputFile  output pdf file name (default "out.pdf")
spec = matrix(c(
  'help',       'h', 0, "logical",   "print the usage of the command",
  'rateTab',    "f", 2, "character", "tsv table of rate files",
  'name',       "n", 2, "character", "Sample name",
  'outputFile', "O", 2, "character", "output pdf file name"
), ncol = 5, byrow = T)

opt = getopt(spec)

if (!is.null(opt$help) || length(opt) < 1) {
  # get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file", cmd)], "=")[[1]][2]
  cat(basename(self), ": Create mismatch plots from rate tabs.\n\n")
  # print a friendly message and exit with a non-zero error code
  cat(getopt(spec, command = self, usage = T))
  q(status = 1)
}

if (is.null(opt$rateTab)) stop("arg rateTab must be specified")
# FIX: the original assigned the fallback to opt$outputFile, silently
# clobbering a user-supplied -O whenever -n was omitted; the "Sample 1"
# default belongs to opt$name.
if (is.null(opt$name)) { opt$name = "Sample 1" }
if (is.null(opt$outputFile)) { opt$outputFile = "out.pdf" }

fileName = opt$rateTab
sampleName = opt$name

pdf(opt$outputFile)

plotList = list()

curTab = read.table(fileName, stringsAsFactors = FALSE)

# Upper-case columns hold plus-strand counts, lower-case columns minus-strand
# counts; normalise each reference-base row to percentages per strand.
curTab[, c("A", "C", "G", "T")] <- curTab[, c("A", "C", "G", "T")] / rowSums(curTab[, c("A", "C", "G", "T")]) * 100
curTab[, c("a", "c", "g", "t")] <- curTab[, c("a", "c", "g", "t")] / rowSums(curTab[, c("a", "c", "g", "t")]) * 100

# One row per (conversion, strand) pair; the order of rate_percent must match
# the rates/strand columns built here.
printTab = data.frame(rates = c(rep("AT", 2), rep("AC", 2), rep("AG", 2),
                                rep("TA", 2), rep("TC", 2), rep("TG", 2),
                                rep("CA", 2), rep("CT", 2), rep("CG", 2),
                                rep("GA", 2), rep("GT", 2), rep("GC", 2)),
                      strand = rep(c("+", "-"), 12),
                      rate_percent = c(curTab["A", "T"], curTab["A", "t"], curTab["A", "C"], curTab["A", "c"], curTab["A", "G"], curTab["A", "g"],
                                       curTab["T", "A"], curTab["T", "a"], curTab["T", "C"], curTab["T", "c"], curTab["T", "G"], curTab["T", "g"],
                                       curTab["C", "A"], curTab["C", "a"], curTab["C", "T"], curTab["C", "t"], curTab["C", "G"], curTab["C", "g"],
                                       curTab["G", "A"], curTab["G", "a"], curTab["G", "T"], curTab["G", "t"], curTab["G", "C"], curTab["G", "c"])
)

# Leave at least 10% head room above the tallest bar so value labels fit.
maxRatePercent = max(10, max(printTab$rate_percent) * 1.1)

# Label positions: plus-strand labels sit just below the baseline,
# minus-strand labels sit on top of the stacked bar.
printTab$y = -0.3
printTab[printTab$strand == "-", ]$y = printTab[printTab$strand == "-", ]$rate_percent + printTab[printTab$strand == "+", ]$rate_percent

curPlot = qplot(x = rates, y = rate_percent, fill = strand, data = printTab) + ylim(-0.5, maxRatePercent) + geom_bar(stat = "identity") + geom_text(aes(y = printTab$y, label = round(rate_percent, digits = 2)), size = 3, hjust = 0.5, vjust = -0.50) + ylab("Rate percent %") + xlab(sampleName) +
  theme(text = element_text(size = 12), axis.text.x = element_text(size = 12))
plotList[[length(plotList) + 1]] <- curPlot

do.call(grid.arrange, plotList)

dev.off()

# signal success and exit.
q(status = 0)
-------------------------------------------------------------------------------- /slamdunk/dunks/mapper.py: --------------------------------------------------------------------------------
#!/usr/bin/env python

# Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
#
# This file is part of Slamdunk.
#
# Slamdunk is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Slamdunk is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
19 | 20 | from __future__ import print_function 21 | import os, re 22 | 23 | from slamdunk.utils.misc import files_exist, checkStep, run, pysamIndex, removeFile, getBinary, replaceExtension, shellerr # @UnresolvedImport 24 | from slamdunk.version import __ngm_version__ # @UnresolvedImport 25 | 26 | projectPath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 27 | 28 | def sort(inputSAM, outputBAM, log, threads=1, keepSam=True, dry=False, verbose=True): 29 | 30 | if(files_exist(inputSAM) and checkStep([inputSAM], [outputBAM + ".bai"])): 31 | runSam2bam(inputSAM, outputBAM, log, False, False, not keepSam, threads=threads, dry=dry, verbose=verbose) 32 | else: 33 | print("Skipped sorting for " + inputSAM, file=log) 34 | 35 | def checkNextGenMapVersion(): 36 | ngmHelp = shellerr("ngm", raiseError = False) 37 | matchObj = re.match( r'.*([0-9]+\.[0-9]+\.[0-9]+).*', str(ngmHelp), re.M|re.I) 38 | if matchObj: 39 | version = matchObj.group(1) 40 | if version != __ngm_version__: 41 | raise RuntimeError('NextGenMap version expected: ' + __ngm_version__ + " but found " + version + ". Please reinstall slamdunk package.") 42 | else: 43 | raise RuntimeError('Could not get NextGenMap version. 
Please reinstall slamdunk package.') 44 | 45 | def runSam2bam(inFile, outFile, log, index=True, sort=True, delinFile=False, onlyUnique=False, onlyProperPaired=False, filterMQ=0, L=None, threads=1, verbose=False, dry=False): 46 | if(delinFile and files_exist(outFile) and not files_exist(inFile)): 47 | print("Skipping sam2bam for " + outFile, file=log) 48 | else: 49 | if(onlyUnique and filterMQ == 0): 50 | filterMQ = 1; 51 | 52 | success = True 53 | cmd = ["samtools view", "-@", str(threads), "-Sb", "-o", outFile, inFile] 54 | if filterMQ > 0: 55 | cmd+=["-q", str(filterMQ)] 56 | if onlyProperPaired: 57 | cmd+=["-f", "2"] 58 | if not L is None: 59 | cmd+=["-L", L] 60 | run(" ".join(cmd), log, verbose=verbose, dry=dry) 61 | 62 | if(sort): 63 | tmp = outFile + "_tmp" 64 | if(not dry): 65 | os.rename(outFile, tmp) 66 | run(" ".join(["samtools sort", "-@", str(threads), "-o", outFile, tmp]), log, verbose=verbose, dry=dry) 67 | if(success): 68 | removeFile(tmp) 69 | if(success and delinFile): 70 | if(not dry): 71 | removeFile(inFile) 72 | 73 | if(index): 74 | pysamIndex(outFile) 75 | 76 | 77 | def Map(inputBAM, inputReference, outputSAM, log, quantseqMapping, endtoendMapping, threads=1, parameter="--no-progress --slam-seq 2" , outputSuffix="_ngm_slamdunk", trim5p=0, maxPolyA=-1, topn=1, sampleId=None, sampleName="NA", sampleType="NA", sampleTime=0, printOnly=False, verbose=True, force=False): 78 | 79 | if(quantseqMapping is True) : 80 | parameter = "--no-progress" 81 | 82 | if(trim5p > 0): 83 | parameter = parameter + " -5 " + str(trim5p) 84 | 85 | if(maxPolyA > -1): 86 | parameter = parameter + " --max-polya " + str(maxPolyA) 87 | 88 | if(endtoendMapping is True): 89 | parameter = parameter + " -e " 90 | else: 91 | parameter = parameter + " -l " 92 | 93 | if(sampleId != None): 94 | parameter = parameter + " --rg-id " + str(sampleId) 95 | if(sampleName != ""): 96 | parameter = parameter + " --rg-sm " + sampleName + ":" + sampleType + ":" + str(sampleTime) 97 | 98 | 
if(topn > 1): 99 | parameter = parameter + " -n " + str(topn) + " --strata " 100 | 101 | if(checkStep([inputReference, inputBAM], [replaceExtension(outputSAM, ".bam")], force)): 102 | if outputSAM.endswith(".sam"): 103 | # Output SAM 104 | run("ngm -r " + inputReference + " -q " + inputBAM + " -t " + str(threads) + " " + parameter + " -o " + outputSAM, log, verbose=verbose, dry=printOnly) 105 | else: 106 | # Output BAM directly 107 | run("ngm -b -r " + inputReference + " -q " + inputBAM + " -t " + str(threads) + " " + parameter + " -o " + outputSAM, log, verbose=verbose, dry=printOnly) 108 | else: 109 | print("Skipped mapping for " + inputBAM, file=log) 110 | -------------------------------------------------------------------------------- /slamdunk/plot/compute_conversion_rate_mle.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 
library(getopt)
library(bbmle)

spec = matrix(c(
  'help',   'h', 0, "logical",   "print the usage of the command",
  'file',   "f", 2, "character", "",
  'rate',   "r", 2, "character", "",
  'output', "o", 2, "character", "Output tsv"
), ncol = 5, byrow = T)

opt = getopt(spec)

# FIX: the help guard compared length(opt) == 3, which printed the usage for
# perfectly valid invocations; trigger help only on -h or an empty option list
# (same convention as the other slamdunk plot scripts).
if (!is.null(opt$help) || length(opt) < 1) {
  # get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file", cmd)], "=")[[1]][2]
  # FIX: the usage line was copy-pasted from the rate-plot script.
  cat(basename(self), ": Estimate per-UTR conversion rates by maximum likelihood.\n\n")
  # print a friendly message and exit with a non-zero error code
  cat(getopt(spec, command = self, usage = T))
  q(status = 1)
}

if (is.null(opt$file)) stop("arg file must be specified")
if (is.null(opt$output)) stop("arg output must be specified")
if (is.null(opt$rate)) stop("arg rate must be specified")

# Two-component likelihood of the read data:
#   a: fraction of converted (labeled) transcripts
#   b: per-position T>C conversion rate
# Each read covers n T positions of which k show a conversion.
LL <- function(a, b) {
  R = a * ((1 - b)^(sample$n - sample$k)) * (b^sample$k) * choose(sample$n, sample$k) + (1 - a) * as.numeric(sample$k == 0)
  -sum(log(R))
}

# Profile likelihood over a with b fixed to the global estimate 'estb'.
LL2 <- function(as) {
  b = estb
  rs = c()
  for (a in as) {
    R = a * ((1 - b)^(sample$n - sample$k)) * (b^sample$k) * choose(sample$n, sample$k) + (1 - a) * as.numeric(sample$k == 0)
    rs = c(rs, -sum(log(R)))
  }
  rs
}

# Parse the two slamdunk header lines (sample info + annotation info).
# (Renamed from the original typo 'readMeatInfo'; internal to this script.)
readMetaInfo <- function(fileName) {
  sampleInfo = read.table(fileName, nrows = 1, comment.char = "")
  version = paste(lapply(sampleInfo[1, 1:3], as.character), collapse = '\t')
  sampleID = as.character(sampleInfo[1, ]$V7)
  sampleName = as.character(sampleInfo[1, ]$V6)
  sampleType = as.character(sampleInfo[1, ]$V8)
  sampleTime = as.numeric(sampleInfo[1, ]$V9)
  sampleInfo = read.table(fileName, nrows = 1, skip = 1, comment.char = "")
  annotationMD5 = as.character(sampleInfo[1, ]$V3)
  annotationName = as.character(sampleInfo[1, ]$V2)
  c(sampleID, sampleName, sampleType, sampleTime, annotationName, annotationMD5, version)
}

file = opt$file
output = opt$output

# Fixed per-position conversion rate b used by LL2.
estb = as.numeric(opt$rate)

meta = readMetaInfo(file)
id = meta[1]
type = meta[3]
time = meta[4]

data = read.table(file, header = T, stringsAsFactors = F)

for (i in 1:nrow(data)) {
  # ReadCount/TcReadCount hold comma-separated per-read T counts and T>C counts.
  N = as.numeric(strsplit(data[i, ]$ReadCount, ",")[[1]])
  k = as.numeric(strsplit(data[i, ]$TcReadCount, ",")[[1]])
  sample = data.frame(n = N, k = k)
  fit = mle2(minuslogl = LL2, start = list(as = 0.29), method = "L-BFGS-B", lower = c(as = 0.000001), upper = c(as = 0.99))
  confinv = confint(fit)

  data[i, ]$ConversionRate = fit@coef[[1]]
  data[i, ]$ReadCount = length(N)
  data[i, ]$TcReadCount = sum(k > 0)
  data[i, ]$ConversionRateLower = confinv[[1]]
  data[i, ]$ConversionRateUpper = confinv[[2]]
}

# Missing confidence bounds default to the widest possible interval [0, 1].
# (FIX: parenthesis placement 'sum(is.na(x) > 0)' -> 'sum(is.na(x)) > 0';
# numerically equivalent, but the original read as a comparison of a sum.)
if (sum(is.na(data$ConversionRateLower)) > 0) {
  data[is.na(data$ConversionRateLower), ]$ConversionRateLower = 0
}
if (sum(is.na(data$ConversionRateUpper)) > 0) {
  data[is.na(data$ConversionRateUpper), ]$ConversionRateUpper = 1
}

# Copy the two slamdunk header lines, then append the updated table.
header = readLines(file, 2)
con <- file(output, open = "wt")
writeLines(header[1], con)
writeLines(header[2], con)
write.table(data, con, sep = "\t", quote = F, row.names = F, col.names = T)
close(con)
-------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/src/genexplvprofile.py: --------------------------------------------------------------------------------
#!/usr/bin/env python
'''
This file randomly assigns weights for each transcript, and gets the transcript statistics by a given transcript annotation file (BED File).

USAGE

genexplvprofile.py {OPTIONS}

OPTIONS

-h/--help\tPrint this message

-e/--lognormal\tmu,sigma Specify the mean and variance of the lognormal distribution used to assign expression levels. Default -4,4
--geometric\tmu Use geometric distribution with parameter mu instead of lognormal distribution to assign expression levels.

-f/--statonly\tPrint the statistics only; do not assign expression levels.

NOTE

1.
To get a good group information, the BED file is suggested to sort according to the chromosome name and start position. In Unix systems, use "sort -k 1,1 -k 2,2n in.BED > out.BED" to get a sorted version (out.BED) of the bed file (in.BED). 21 | 22 | 2. The weight is at the 8th column, if -f option is not specified. The expression level of each transcript (RPKM) can be calculated as column[8]*10^9/column[2]/sum(column[8]). 23 | 24 | HISTORY 25 | 26 | 07/24/2012 27 | Enable geometric distribution for expression level assignments. Require numpy package. 28 | 29 | 02/16/2012 30 | Run on Python 2.7 31 | 32 | 02/08/2012 33 | Initialization. 34 | ''' 35 | 36 | from __future__ import print_function 37 | import sys; 38 | import pydoc; 39 | import os; 40 | import re; 41 | import fileinput; 42 | import random; 43 | import numpy; 44 | 45 | def parsebed(lines): 46 | # Parse one line in count data 47 | fd=lines.strip().split('\t'); 48 | if len(fd)!=12: 49 | return ['',-1,-1,0]; 50 | if fd[10].endswith(','): 51 | fd[10]=fd[10][:-1]; 52 | if fd[11].endswith(','): 53 | fd[11]=fd[11][:-1]; 54 | seglen=[int(x) for x in fd[10].split(',')]; 55 | segstart=[int(x) for x in fd[11].split(',')]; 56 | #jstart=int(fd[1])+seglen[0]+1; 57 | #jend=int(fd[1])+segstart[1]+1; 58 | jstart=int(fd[1])+1; # start is 0-base; increase 1 to convert to 1-base 59 | jend=int(fd[2]); 60 | # jscore=int(fd[4]); 61 | #seg1=[jstart+segstart[i] for i in range(len(segstart))]; 62 | #seg2=[jstart+segstart[i]+seglen[i]-1 for i in range(len(segstart))]; 63 | # [seg1,seg2] are now 1-base inclusive 64 | return [fd[0],jstart,jend,fd[3],sum(seglen),fd[5],fd[9]]; 65 | 66 | argvi=1; 67 | mindist=50; 68 | minscore=2; 69 | mu=-4; 70 | sigma=4; 71 | assignexplv=True; 72 | 73 | 74 | allfile=[]; 75 | 76 | distype="lognormal"; 77 | 78 | while argvi <(len(sys.argv)): 79 | if sys.argv[argvi]=="-h" or sys.argv[argvi]=="--help" : 80 | print(pydoc.render_doc(sys.modules[__name__]),file=sys.stderr); 81 | sys.exit(); 82 | elif 
sys.argv[argvi]=="-f" or sys.argv[argvi]=="--statonly": 83 | assignexplv=False; 84 | elif sys.argv[argvi]=="-e" or sys.argv[argvi]=="--lognormal" : 85 | distype="lognormal"; 86 | ms=sys.argv[argvi+1].split(","); 87 | argvi=argvi+1; 88 | if len(ms)!=2: 89 | print('Error: incorrect parameter for -e.',file=sys.stderr); 90 | sys.exit(); 91 | try: 92 | mu=float(ms[0]); 93 | sigma=float(ms[1]); 94 | except ValueError: 95 | print('Error: incorrect parameter for -e.',file=sys.stderr); 96 | sys.exit(); 97 | print('Mean and variance for lognormal distribution: '+str(mu)+','+str(sigma),file=sys.stderr); 98 | elif sys.argv[argvi]=="--geometric": 99 | distype="geometric"; 100 | try: 101 | mu=float(sys.argv[argvi+1]); 102 | if mu<0 or mu>1: 103 | print('Error: the parameter for geometric distribution must be between 0 and 1.',file=sys.stderr); 104 | sys.exit(); 105 | except ValueError: 106 | print('Error: incorrect parameter for -e.',file=sys.stderr); 107 | sys.exit(); 108 | print('Mean for geometric distribution: '+str(mu),file=sys.stderr); 109 | argvi=argvi+1; 110 | else: 111 | allfile.append(sys.argv[argvi]); 112 | argvi=argvi+1; 113 | 114 | 115 | allid={}; 116 | 117 | prevchr=""; 118 | prevrange=[0,0]; 119 | rangeid=0; 120 | 121 | nline=0; 122 | 123 | currentgene=[]; 124 | groupid=0; 125 | 126 | print('#ID\tLength\tDir\tExons\tPosition\tGroupID\tNIsoformInGroup',end=''); 127 | if assignexplv==True: 128 | print('\tExplv'); 129 | else: 130 | print(); 131 | 132 | for lines in fileinput.input(allfile): 133 | nline=nline+1; 134 | pf=parsebed(lines); 135 | chrname=pf[0];jstart=pf[1];jend=pf[2];id=pf[3]; 136 | if len(chrname)==0 and jstart<0: 137 | continue; 138 | length=pf[4];direction=pf[5];nexon=pf[6]; 139 | if chrname!=prevchr or jstart-prevrange[1]>0: 140 | if len(prevchr)!=0: 141 | groupid=groupid+1; 142 | for item in currentgene: 143 | print(item[0]+"\t"+str(groupid)+"\t"+str(len(currentgene)),end=''); 144 | if assignexplv==True: 145 | if distype=="geometric": 146 | 
weight=numpy.random.geometric(mu)*item[1]; 147 | else: 148 | weight=random.lognormvariate(mu,sigma)*item[1]; 149 | print("\t"+str(weight)); 150 | else: 151 | print(); 152 | prevrange[0]=jstart; 153 | prevrange[1]=jend; 154 | prevchr=chrname; 155 | rangeid=rangeid+1; 156 | currentgene=[]; 157 | elif jstartprevrange[1]: 161 | prevrange[1]=jend; 162 | currentgene.append((id+"\t"+str(length)+"\t"+direction+"\t"+str(nexon)+"\t"+chrname+":"+str(jstart)+"-"+str(jend),length)); 163 | 164 | 165 | if len(prevchr)!=0: 166 | groupid=groupid+1; 167 | for item in currentgene: 168 | print(item[0]+"\t"+str(groupid)+"\t"+str(len(currentgene)),end=''); 169 | if assignexplv==True: 170 | if distype=="geometric": 171 | weight=numpy.random.geometric(mu)*item[1]; 172 | else: 173 | weight=random.lognormvariate(mu,sigma)*item[1]; 174 | print("\t"+str(weight)); 175 | else: 176 | print(); 177 | 178 | -------------------------------------------------------------------------------- /slamdunk/plot/splash_eval_count_files.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Script to evaluate Slamdunk count results 3 | # 4 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 5 | # 6 | # This file is part of Slamdunk. 7 | # 8 | # Slamdunk is free software: you can redistribute it and/or modify 9 | # it under the terms of the GNU Affero General Public License as 10 | # published by the Free Software Foundation, either version 3 of the 11 | # License, or (at your option) any later version. 12 | # 13 | # Slamdunk is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU Affero General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU Affero General Public License 19 | # along with this program. If not, see . 
library(getopt)

spec = matrix(c(
  'help',      'h', 0, "logical",   "print the usage of the command",
  'simulated', "s", 2, "character", "Summarized count file",
  'slamdunk',  "d", 2, "character", "Summarized count file",
  'output',    "o", 2, "character", "Output pdf"
), ncol = 5, byrow = T)

opt = getopt(spec)

# FIX: help was triggered by length(opt) == 1, i.e. whenever exactly one
# option was given; print it only on -h or an empty option list.
if (!is.null(opt$help) || length(opt) < 1) {
  # get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file", cmd)], "=")[[1]][2]
  # FIX: the usage line was copy-pasted from the rate-plot script.
  cat(basename(self), ": Evaluate slamdunk count results against simulated truth.\n\n")
  # print a friendly message and exit with a non-zero error code
  cat(getopt(spec, command = self, usage = T))
  q(status = 1)
}


if (is.null(opt$simulated)) stop("arg simulated must be specified")
if (is.null(opt$slamdunk)) stop("arg slamdunk must be specified")
if (is.null(opt$output)) stop("arg output must be specified")


# Root mean squared error (renamed from the original typo 'rsme';
# internal to this script).
rmse <- function(model, measure) {
  sqrt(mean((model - measure)^2, na.rm = TRUE))
}

simulatedFileRates = opt$simulated
slamDunkFile = opt$slamdunk

outputFile = opt$output
outputFileCSV = paste0(outputFile, ".tsv")

simulatedRates = read.table(simulatedFileRates, header = T, sep = "\t", stringsAsFactors = F)
slamDunkRates = read.table(slamDunkFile, header = T, sep = "\t", stringsAsFactors = F)

# Should not be neccessary, but for large datasets some entries are lost.
# Keep all that is found in both
inBoth = intersect(simulatedRates$Name, slamDunkRates$Name)
simulatedRates = simulatedRates[simulatedRates$Name %in% inBoth, ]
slamDunkRates = slamDunkRates[slamDunkRates$Name %in% inBoth, ]

# Columns 1..(fixedColumns-1) are annotation/meta columns; per-sample rates
# start at column 'fixedColumns'.
fixedColumns = 11
sampleNumber = ncol(simulatedRates) - fixedColumns + 1

sampleNames = colnames(simulatedRates)[fixedColumns:(fixedColumns + sampleNumber - 1)]
simulatedSamples = simulatedRates[, fixedColumns:(fixedColumns + sampleNumber - 1)]
slamDunkSamples = slamDunkRates[, fixedColumns:(fixedColumns + sampleNumber - 1)]

pdf(outputFile)
par(mfrow = c(2, 1))
boxplot(simulatedSamples - slamDunkSamples, ylim = c(-1, 1), names = sampleNames, ylab = "Simulated - Slamdunk", xlab = "Labeled Transcripts [%]", main = "", las = 2)
abline(h = 0, lty = 2, col = "grey")

boxplot(log2((simulatedSamples + 0.001) / (slamDunkSamples + 0.001)), names = sampleNames, ylab = "log2(Simulated / Slamdunk)", xlab = "Labeled Transcripts [%]", main = "", las = 2)
abline(h = 0, lty = 2, col = "grey")

boxplot(simulatedSamples - slamDunkSamples, ylim = c(-0.1, 0.1), names = sampleNames, ylab = "Simulated - Slamdunk", xlab = "Labeled Transcripts [%]", main = "", las = 2)
abline(h = 0, lty = 2, col = "grey")

boxplot(log2((simulatedSamples + 0.001) / (slamDunkSamples + 0.001)), ylim = c(-1, 1), names = sampleNames, ylab = "log2(Simulated / Slamdunk)", xlab = "Labeled Transcripts [%]", main = "", las = 2)
abline(h = 0, lty = 2, col = "grey")

merged = data.frame()
rmseTab = matrix("", ncol = 3, nrow = 0)
for (currentSample in 0:(sampleNumber - 1)) {
  # FIX: the two data columns were labelled the wrong way around (the
  # slamdunk values were named "Simulate" and the simulated values
  # "Slamdunk").  The labels now match the data and every downstream
  # reference is adjusted, so all plots and statistics are unchanged.
  current = cbind(slamDunkRates[, c(1:fixedColumns - 1, fixedColumns + currentSample)], simulatedRates[, fixedColumns + currentSample])
  colnames(current) = c(colnames(slamDunkRates[, c(1:fixedColumns - 1)]), "Slamdunk", "Simulate")
  merged = rbind(merged, current)

  rmseTab = rbind(rmseTab, c(as.character(simulatedFileRates), as.character(substring(sampleNames[currentSample + 1], 2)), as.character(rmse(current$Simulate, current$Slamdunk))))
}

par(mfrow = c(1, 1))
perr = round(rmse(merged$Simulate, merged$Slamdunk), digits = 4)
pcorr = round(cor(merged$Simulate, merged$Slamdunk), digits = 4)
plot(merged$Simulate, merged$Slamdunk, xlim = c(0, 1), ylim = c(0, 1), pch = 4, xlab = "Simulated", ylab = "Slamdunk", main = paste("Cor: ", pcorr, ", RMSE: ", perr))
abline(a = 0, b = 1, col = "grey", lty = 2)

plot(merged$avgTcontent, merged$Simulate - merged$Slamdunk, ylim = c(-1, 1), pch = 4)

plot(merged$avgReadsCPM, merged$Simulate - merged$Slamdunk, ylim = c(-1, 1), pch = 4)

plot(merged$avgMultimapper, merged$Simulate - merged$Slamdunk, ylim = c(-1, 1), pch = 4)

dev.off()

rmseTab = rbind(rmseTab, c(as.character(simulatedFileRates), as.character(-1), as.character(rmse(merged$Simulate, merged$Slamdunk))))

write.table(rmseTab, outputFileCSV, sep = "\t", quote = F, row.names = F, col.names = T)
-------------------------------------------------------------------------------- /slamdunk/plot/compute_halflifes.R: --------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# Script to compute half-lifes from SlamSeq data

# Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder, Bhat Pooja
#
# This file is part of Slamdunk.
#
# Slamdunk is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Slamdunk is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .

library(getopt)

spec = matrix(c(
  'help',       'h', 0, "logical",   "print the usage of the command",
  'slamdunk',   "f", 2, "character", "Comma separated list of SlamDunk results",
  'timepoints', "t", 2, "character", "Comma separated list of time points",
  'output',     "o", 2, "character", "Output tsv"
), ncol = 5, byrow = T)

opt = getopt(spec)

# FIX: help was triggered by length(opt) == 3, rejecting valid invocations;
# print it only on -h or an empty option list.  ("seperated" typos in the
# help strings above fixed as well.)
if (!is.null(opt$help) || length(opt) < 1) {
  # get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file", cmd)], "=")[[1]][2]
  # FIX: the usage line was copy-pasted from the rate-plot script.
  cat(basename(self), ": Compute RNA half-lifes from slamdunk tcount files.\n\n")
  # print a friendly message and exit with a non-zero error code
  cat(getopt(spec, command = self, usage = T))
  q(status = 1)
}

if (is.null(opt$slamdunk)) stop("arg slamdunk must be specified")
if (is.null(opt$output)) stop("arg output must be specified")
if (is.null(opt$timepoints)) stop("arg timepoints must be specified")

slamDunkFiles = opt$slamdunk
filesSlamDunk = as.character(ordered(strsplit(slamDunkFiles, ",")[[1]]))
outputFile = opt$output
timesParameter = opt$timepoints
times = as.numeric(strsplit(timesParameter, ",")[[1]])
times = times / 60   # minutes -> hours


# Read one tcount file per time point and collect the per-UTR conversion
# rates side by side; readsCPM and multimapper counts are averaged over all
# time points.  perRead = TRUE derives the rate from convertedReads/readCount
# instead of the precomputed conversionRate column.
mergeRates <- function(times, files, perRead) {
  mergedRates = data.frame()
  for (i in 1:length(times)) {
    time = times[i]
    print(time)
    simDataFile = files[i]
    simulation = read.table(simDataFile)
    colnames(simulation) = c("chr", "start", "stop", "name", "strand", "conversionRate", "readsCPM", "tCount", "tcCount", "readCount", "convertedReads", "multiMapCount")
    if (nrow(mergedRates) == 0) {
      mergedRates = simulation[, c("chr", "start", "stop", "name", "strand")]
      mergedRates$avgReadsCPM = simulation$readsCPM
      mergedRates$avgMultimapper = simulation$multiMapCount
      if (perRead == TRUE) {
        mergedRates$conversionRate = simulation$convertedReads / simulation$readCount
      } else {
        mergedRates$conversionRate = simulation$conversionRate
      }
    } else {
      mergedRates$avgReadsCPM = mergedRates$avgReadsCPM + simulation$readsCPM
      mergedRates$avgMultimapper = mergedRates$avgMultimapper + simulation$multiMapCount
      if (perRead == TRUE) {
        mergedRates = cbind(mergedRates, simulation$convertedReads / simulation$readCount)
      } else {
        mergedRates = cbind(mergedRates, simulation$conversionRate)
      }
    }
  }
  colnames(mergedRates) = c("chr", "start", "stop", "name", "strand", "readsCPM", "multiMapCount", times)
  mergedRates$readsCPM = mergedRates$readsCPM / length(times)
  mergedRates$multiMapCount = mergedRates$multiMapCount / length(times)
  mergedRates
}

# Fit rates ~ a * (1 - exp(-k * t)) and derive the half-life ln(2)/k (in
# minutes).  Returns c(halfLife, C, k, rsquared); all NA when nls fails.
computeHalfLife <- function(rates, timepoints) {
  a_start <- max(rates)                  # a is the plateau (y value for t -> Inf)
  k_start = log(2, base = exp(1)) / 5

  halfLifePred = NA
  C = NA
  k = NA
  rsquared = NA

  # FIX: the original used 'fit' (summary/residuals) *after* the tryCatch,
  # so a single UTR whose nls fit failed to converge crashed the whole
  # script.  All derived values are now computed inside the guarded block
  # and simply stay NA on failure.
  tryCatch({
    fit = nls(rates ~ a * (1 - exp(-k * (timepoints))), start = list(a = a_start, k = k_start))
    halfLifePred = log(2, base = exp(1)) / coef(fit)[2] * 60
    C = coef(fit)[1]
    k = coef(fit)[2]
    RSS.p <- sum(residuals(fit)^2)
    TSS <- sum((rates - mean(rates))^2)
    rsquared = 1 - (RSS.p / TSS)
  }, error = function(e) {})

  c(halfLifePred, C, k, rsquared)
}

perRead = F
slamDunkMergedRates = mergeRates(times, filesSlamDunk, perRead)

halfLifeTable = data.frame()

for (utr in 1:nrow(slamDunkMergedRates)) {
  # Columns 8..(7 + length(times)) hold the per-time-point conversion rates.
  pulseSlamDunk = data.frame(y = as.numeric(t(slamDunkMergedRates[utr, 8:(7 + length(times))])[, 1]), x = times)

  result = computeHalfLife(pulseSlamDunk$y, pulseSlamDunk$x)
  halfLifeTable = rbind(halfLifeTable, cbind(slamDunkMergedRates[utr, c("chr", "start", "stop", "name", "strand", "readsCPM", "multiMapCount")], result[1]))
}

colnames(halfLifeTable) = c("#chr", "start", "stop", "name", "strand", "readsCPM", "multiMapCount", "score")

write.table(halfLifeTable, outputFile, sep = "\t", quote = F, row.names = F, col.names = T)
-------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/README: --------------------------------------------------------------------------------
RNASeqReadSimulator
==================
Author: Wei Li (li.david.wei AT gmail.com)

Introduction
------------
RNASeqReadSimulator is a set of scripts generating simulated RNA-Seq reads. RNASeqReadSimulator provides users a simple tool to generate RNA-Seq reads for research purposes, and a framework to allow experienced users to expand functions. RNASeqReadSimulator offers the following features:

1. It allows users to randomly assign expression levels of transcripts and generate simulated single-end or paired-end RNA-Seq reads.

2. It is able to generate RNA-Seq reads that have a specified positional bias profile.

3. It is able to simulate random read errors from sequencing platforms.

4. The simulator consists of a few simple Python scripts. All scripts are command line driven, allowing users to invoke and design more functions.

Requirements
------------
RNASeqReadSimulator runs on python 2.7 with biopython package installed.

Installation
------------
After download, it is suggested that the path of the scripts (src) be added to the system path.
For example, if the scripts are located at /home/me/rnaseqsimulator, then add the following command to your .bashrc profile: 24 | 25 | export PATH="$PATH:/home/me/rnaseqsimulator/src" 26 | 27 | Demo 28 | ---- 29 | The demo folder includes a few scripts and sample input files to generate RNA-Seq reads from a simple example. Two bash scripts, gensingleendreads.sh and genpairedendreads.sh, are examples to generate single-end and paired-end reads. 30 | 31 | 32 | 33 | Usage 34 | ----- 35 | 36 | RNASeqReadSimulator includes the following essential scripts: 37 | 38 | genexplvprofile.py is used to assign a random expression level of transcripts; 39 | 40 | gensimreads.py simulates RNA-Seq reads in BED format; 41 | 42 | getseqfrombed.py converts reads from BED format to FASTA format; 43 | 44 | Other optional scripts and files include: 45 | 46 | splitfasta.py splits paired-end reads in a FASTA file into two separate files; 47 | 48 | addvariation2splicingbed.py is a supplementary script to generate variations in splicing RNA-Seq reads. 49 | 50 | 51 | 52 | genexplvprofile.py 53 | ------------------ 54 | 55 | This file randomly assigns weights for each transcript, and gets the transcript statistics from a given transcript annotation file (BED File). 56 | 57 | USAGE genexplvprofile.py {OPTIONS} 58 | 59 | OPTIONS 60 | 61 | -h/--help Print this message 62 | 63 | -e/--lognormal Specify the mean and variance of the lognormal distribution used to assign expression levels. Default -4,4 64 | 65 | -f/--statonly Print the statistics only; do not assign expression levels. 66 | 67 | NOTE: 68 | 69 | To get good group information, the BED file should be sorted according to the chromosome name and start position. In Unix systems, use "sort -k 1,1 -k 2,2n in.BED > out.BED" to get a sorted version (out.BED) of the bed file (in.BED). 70 | 71 | The weight is at the 8th column, if -f option is not specified.
The expression level of each transcript (RPKM) can be calculated as column[8]*10^9/column[2]/sum(column[8]). 72 | 73 | 74 | gensimreads.py 75 | ------------- 76 | This script generates simulated RNA-Seq reads (in .bed format) from known gene annotations. 77 | 78 | Usage: gensimreads.py {OPTIONS} 79 | 80 | BED-File: The gene annotation file (in BED format). Use '-' for STDIN input. 81 | 82 | OPTIONS 83 | 84 | -e/--expression Specify the weight of each transcript. Each line in the file should have at least (NFIELD+1) fields, with field 0 the annotation id, and field NFIELD the weight of this annotation. NFIELD is given by -f/--field option. If this file is not provided, uniform weight is applied. See the output of genexplvprofile.py as an example. 85 | 86 | -n/--nreads Specify the number of reads to be generated. Default 100000. 87 | 88 | -b/--posbias Specify the positional bias file. The file should include at least 100 lines, each contains only one integer number, showing the preference of the positional bias at this position. If no positional bias file is specified, use uniform distribution bias. 89 | 90 | -l/--readlen Specify the read length. Default 32. 91 | 92 | -o/--output Specify the output file. The default is STDOUT 93 | 94 | -f/--field The field of each line as weight input. Default is 7 (beginning from field 0). 95 | 96 | -p/--pairend Generate paired-end reads with specified insert length mean and standard derivation. The default is 200,20. 97 | 98 | --stranded Generate stranded RNA-Seq reads. 99 | 100 | NOTE 101 | 102 | The bed file is required to sort according to the chromosome name and position. In Unix systems, use "sort -k 1,1 -k 2,2n in.BED > out.BED" to get a sorted version (out.BED) of the bed file (in.BED). 103 | 104 | No problem to handle reads spanning multiple exons. 105 | 106 | 107 | getseqfrombed.py 108 | ---------------- 109 | This script is used to extract sequences from bed file. 
110 | 111 | USAGE getseqfrombed.py {OPTIONS} 112 | 113 | OPTIONS 114 | 115 | -b/--seqerror Specify the positional error profile to be used. The file should include at least 100 lines, each containing a positive number. The number at line x is the weight that an error is occured at x% position of the read. If no positional error file specified, uniform weight is assumed. 116 | 117 | -r/--errorrate Specify the overall error rate, a real positive number. The number of errors of each read will follow a Poisson distribution with its mean value specified by --errorrate. Default 0 (no errors). 118 | 119 | -l/--readlen Specify the read length. Default is 75. 120 | 121 | -f/--fill Fill at the end of each read by the sequence seq, if the read is shorter than the read length. Default is 'A' (to simulate poly-A tails in RNA-Seq reads). 122 | 123 | NOTE 124 | 125 | 1. The input .bed file is best to sort according to chromosome names. Use - to input from STDIN. 126 | 127 | 2. Biopython and numpy package are required. 128 | 129 | 3. I assume that all sequences are in the same length. The length information is given by the -l parameter. If the sequence length is greater than the read length, nucleotides outside the read length will not be simulated for error. 130 | -------------------------------------------------------------------------------- /slamdunk/plot/SNPeval.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Script to look at SNP distributions along UTRs ranked by # T>C SNPs 4 | # 5 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 6 | # 7 | # This file is part of Slamdunk. 8 | # 9 | # Slamdunk is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU Affero General Public License as 11 | # published by the Free Software Foundation, either version 3 of the 12 | # License, or (at your option) any later version. 
13 | # 14 | # Slamdunk is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU Affero General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU Affero General Public License 20 | # along with this program. If not, see . 21 | 22 | library(getopt) 23 | 24 | spec = matrix(c( 25 | 'help' , 'h', 0, "logical","print the usage of the command", 26 | 'inputFile', "i", 2,"character","tsv table of snp vs tc count files", 27 | 'coverageCutoff', "c", 2,"numeric","coverage cutoff for calling variants", 28 | 'variantFraction', "v", 2,"numeric","variant fraction cutoff for calling variants", 29 | 'outputFile', "o", 2,"character","output pdf file name" 30 | ),ncol = 5,byrow=T) 31 | 32 | opt = getopt(spec) 33 | 34 | if ( !is.null(opt$help) || length(opt)==1 ) { 35 | #get the script name 36 | cmd = commandArgs(FALSE) 37 | self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2] 38 | cat(basename(self),": Create mismatch plots from rate tabs.\n\n") 39 | #print a friendly message and exit with a non-zero error code 40 | cat(getopt(spec,command = self,usage=T)) 41 | q(status=1); 42 | } 43 | 44 | 45 | if ( is.null(opt$inputFile) ) stop("arg rateTab must be specified") 46 | if ( is.null(opt$outputFile) ) { opt$outputFile = "out.pdf" } 47 | if ( is.null(opt$coverageCutoff) ) { opt$coverageCutoff = 0 } 48 | if ( is.null(opt$variantFraction) ) { opt$variantFraction = 0 } 49 | 50 | tricubeMovingAverage <-function (x, span = 0.5, full.length = TRUE) { 51 | 52 | n <- length(x) 53 | width <- span * n 54 | hwidth <- as.integer(width%/%2L) 55 | if (hwidth <= 0L) 56 | return(x) 57 | width <- 2L * hwidth + 1L 58 | u <- seq(from = -1, to = 1, length = width) * width/(width + 59 | 1) 60 | tricube.weights <- (1 - abs(u)^3)^3 61 | tricube.weights <- tricube.weights/sum(tricube.weights) 62 | if (!full.length) 63 | 
return(as.vector(filter(x, tricube.weights), mode = "numeric")[(hwidth + 1):(n - hwidth)]) 64 | z <- numeric(hwidth) 65 | x <- as.vector(filter(c(z, x, z), tricube.weights), mode = "numeric")[(hwidth + 1):(n + hwidth)] 66 | cw <- cumsum(tricube.weights) 67 | x[1:hwidth] <- x[1:hwidth]/cw[(width - hwidth):(width - 1)] 68 | x[(n - hwidth + 1):n] <- x[(n - hwidth + 1):n]/cw[(width - 1):(width - hwidth)] 69 | x 70 | } 71 | 72 | rescale <- function(x, new, old = range(x)) { 73 | new[1] + (x - old[1])/(old[2] - old[1]) * 74 | (new[2] - new[1]) 75 | } 76 | 77 | GSEAplot <- function(counts, snps, ...) { 78 | 79 | num <- length(counts) 80 | 81 | sel = rep.int(FALSE, num) 82 | 83 | sel[snps] <- TRUE 84 | 85 | countOrder <- order(counts, na.last = TRUE, decreasing = TRUE) 86 | counts <- counts[countOrder] 87 | sel <- sel[countOrder] 88 | 89 | snpLoc <- which(sel) 90 | 91 | col.bars <- "black" 92 | 93 | ylim <- c(-1, 1.5) 94 | 95 | plot(1:num, xlim = c(0, num), ylim = c(0, 2.1), type = "n", 96 | axes = FALSE, ylab = "", ...) 
97 | 98 | lwd <- 50/length(snpLoc) 99 | lwd <- min(1.9, lwd) 100 | lwd <- max(0.2, lwd) 101 | 102 | barlim <- ylim[2] - c(1.5, 0.5) 103 | rect.yb <- 0 104 | rect.yt <- 0.5 105 | rect(0.5, 0, num + 0.5, 0.5, col = "pink", border = NA) 106 | 107 | if (length(snpLoc) > 0) { 108 | 109 | segments(snpLoc, barlim[1], snpLoc, barlim[2]/2, lwd = lwd, col = "black") 110 | segments(snpLoc, barlim[2]/2, snpLoc, barlim[2]/2 * 2, lwd = lwd, col = "black") 111 | 112 | } 113 | 114 | axis(side = 2, at = 0.5, padj = 3.8, cex.axis = 0.85, 115 | labels = "High # T>C reads", tick = FALSE) 116 | axis(side = 4, at = 0.5, padj = -3.8, cex.axis = 0.85, 117 | labels = "Low # T>C reads", tick = FALSE) 118 | prob <- (10:0)/10 119 | axis(at = seq(1, num, len = 11), side = 1, cex.axis = 0.7, 120 | las = 2, labels = format(quantile(counts, p = prob), 121 | digits = 1)) 122 | 123 | ave.enrich1 <- length(snpLoc)/num 124 | worm1 <- tricubeMovingAverage(sel, span = 0.45)/ave.enrich1 125 | 126 | r.worm1 <- c(0, max(worm1)) 127 | worm1.scale <- rescale(worm1, new = c(1.1 , 2.1 ), old = r.worm1) 128 | 129 | lines(x = 1:num, y = worm1.scale, col = "black", lwd = 2) 130 | abline(h = rescale(1, new = c(1.1 , 2.1), old = r.worm1), lty = 2) 131 | axis(side = 2, at = c(1.1 , 2.1 ), cex.axis = 0.8, labels = c(0, format(max(worm1), digits = 2))) 132 | axis(side = 2, labels = "Enrichment", at = 1.6 , padj = -0.6, tick = FALSE, cex.axis = 0.8) 133 | } 134 | 135 | pdf(opt$outputFile) 136 | 137 | minCounts = round(opt$coverageCutoff * opt$variantFraction) 138 | 139 | table = read.delim(opt$inputFile,header=FALSE,col.names = c("name","count","unmasked","masked","snp")) 140 | 141 | table = table[table$unmasked >= minCounts,] 142 | 143 | table = table[table$count >= quantile(table$count, 0.75),] 144 | 145 | par(mfrow=c(2,1)) 146 | 147 | if (length(table(table$snp)) > 1) { 148 | 149 | blindTest = wilcox.test(table$unmasked ~ table$snp == "1", alternative = "less") 150 | maskedTest = wilcox.test(table$masked ~ 
table$snp == "1", alternative = "less") 151 | 152 | blindPvalue = blindTest$p.value 153 | 154 | if (blindPvalue < 0.01) { 155 | blindPvalue = "< 0.01" 156 | } else { 157 | blindPvalue = paste("= ",round(blindPvalue,digits=2),sep="") 158 | } 159 | 160 | maskedPvalue = maskedTest$p.value 161 | 162 | if (maskedPvalue < 0.01) { 163 | maskedPvalue = "< 0.01" 164 | } else { 165 | maskedPvalue = paste("= ",round(maskedPvalue,digits=2),sep="") 166 | } 167 | 168 | GSEAplot(table$unmasked, which(table$snp == 1), main="Blind", xlab = paste("Mann-Whitney-U: p-value ",blindPvalue,sep="")) 169 | GSEAplot(table$masked, which(table$snp == 1), main="SNP-masked", xlab = paste("Mann-Whitney-U: p-value ",maskedPvalue,sep="")) 170 | 171 | } else { 172 | GSEAplot(table$unmasked, which(table$snp == 1), main="Blind", xlab = paste("Mann-Whitney-U: p-value NA",sep="")) 173 | GSEAplot(table$masked, which(table$snp == 1), main="SNP-masked", xlab = paste("Mann-Whitney-U: p-value NA",sep="")) 174 | } 175 | 176 | # wilcox.test(testTab$masked ~ testTab$snp == "1") 177 | # wilcox.test(testTab$unmasked ~ testTab$snp == "1") 178 | # 179 | # ks.test(testTab$masked, testTab$unmasked) 180 | # ks.test(testTab$unmasked[testTab$snp == "0"],testTab$unmasked[testTab$snp == "1"]) 181 | 182 | dev.off() 183 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 2 | # 3 | # This file is part of Slamdunk. 4 | # 5 | # Slamdunk is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Affero General Public License as 7 | # published by the Free Software Foundation, either version 3 of the 8 | # License, or (at your option) any later version. 
9 | # 10 | # Slamdunk is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Affero General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Affero General Public License 16 | # along with this program. If not, see . 17 | 18 | from __future__ import print_function 19 | import os, sys, re 20 | 21 | try: 22 | from setuptools import setup, find_packages 23 | from setuptools.command.install import install as _install 24 | from codecs import open 25 | from os import path 26 | except ImportError: 27 | from distutils.core import setup 28 | from distutils.command.install import install as _install 29 | 30 | here = path.abspath(path.dirname(__file__)) 31 | name = "slamdunk" 32 | 33 | #Get the long description from the README file 34 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 35 | long_description = f.read() 36 | 37 | # now we have a `__version__` variable 38 | exec(open(path.join(here, name, 'version.py')).read()) 39 | 40 | # Copy bin recursively 41 | def package_files(directory): 42 | paths = [] 43 | for (path, directories, filenames) in os.walk(directory): 44 | for filename in filenames: 45 | paths.append(os.path.join("..", "..", path, filename)) 46 | return paths 47 | 48 | bin_files = package_files(name + os.sep + 'contrib') 49 | plot_files = package_files(name + os.sep + 'plot') 50 | 51 | def _runExternalBuilds(dir): 52 | 53 | import subprocess 54 | 55 | print("Building RNASeqReadSimulator.") 56 | syscall = "(cd " + os.path.join(dir, name, "contrib") + " ; ./build-rnaseqreadsimulator.sh)" 57 | subprocess.call([syscall], shell=True) 58 | 59 | class install(_install): 60 | 61 | def initialize_options(self): 62 | _install.initialize_options(self) 63 | 64 | def finalize_options(self): 65 | _install.finalize_options(self) 66 | 67 | def run(self): 68 | _install.run(self) 
69 | self.execute(_runExternalBuilds, (self.install_lib) ,msg="Installing external dependencies") 70 | 71 | setup( 72 | name = name, 73 | 74 | # Versions should comply with PEP440. For a discussion on single-sourcing 75 | # the version across setup.py and the project code, see 76 | # https://packaging.python.org/en/latest/single_source_version.html 77 | version=__version__, 78 | 79 | description='SLAMdunk suite for analyzing SLAM-seq data', 80 | long_description=long_description, 81 | 82 | # The project's main homepage. 83 | url='http://t-neumann.github.io/slamdunk', 84 | 85 | # Author details 86 | author='Tobias Neumann, Philipp Rescheneder', 87 | author_email='tobias.neumann.at@gmail.com, philipp.rescheneder@univie.ac.at', 88 | 89 | # Choose your license 90 | license='GNU Affero General Public License v3 or later (AGPLv3+)', 91 | 92 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 93 | classifiers=[ 94 | # How mature is this project? Common values are 95 | # 3 - Alpha 96 | # 4 - Beta 97 | # 5 - Production/Stable 98 | 'Development Status :: 4 - Beta', 99 | 100 | # Indicate who your project is intended for 101 | 'Intended Audience :: Science/Research', 102 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 103 | 104 | # Pick your license as you wish (should match "license" above) 105 | 'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)', 106 | 107 | # Specify the Python versions you support here. In particular, ensure 108 | # that you indicate whether you support Python 2, Python 3 or both. 
109 | 'Programming Language :: Python :: 2', 110 | 'Programming Language :: Python :: 2.6', 111 | 'Programming Language :: Python :: 2.7', 112 | 'Programming Language :: Python :: 3', 113 | 'Programming Language :: Python :: 3.3', 114 | 'Programming Language :: Python :: 3.4', 115 | 'Programming Language :: Python :: 3.5', 116 | 'Programming Language :: C++', 117 | 'Programming Language :: Java', 118 | ], 119 | 120 | # What does your project relate to? 121 | keywords='Next-Generation-Sequencing NGS QuantSeq SLAMSeq', 122 | 123 | # You can just specify the packages manually here if your project is 124 | # simple. Or you can use find_packages(). 125 | packages=find_packages(exclude=['doc', 'tests']), 126 | 127 | # Alternatively, if you want to distribute just a my_module.py, uncomment 128 | # this: 129 | #py_modules=["slamdunk.main", "slamdunk.toolbox","slamdunk.simulate"], 130 | 131 | # List run-time dependencies here. These will be installed by pip when 132 | # your project is installed. For an analysis of "install_requires" vs pip's 133 | # requirements files see: 134 | # https://packaging.python.org/en/latest/requirements.html 135 | install_requires=['joblib>=0.9.4','pybedtools>=0.6.4','intervaltree>=2.1.0','pandas>=0.13.1','biopython>=1.63','pysam>=0.8.3', 'Cython>=0.20.1'], 136 | 137 | # List additional groups of dependencies here (e.g. development 138 | # dependencies). You can install these using the following syntax, 139 | # for example: 140 | # $ pip install -e .[dev,test] 141 | # extras_require={ 142 | # 'dev': ['check-manifest'], 143 | # 'test': ['coverage'], 144 | # }, 145 | 146 | # If there are data files included in your packages that need to be 147 | # installed, specify them here. If using Python 2.6 or less, then these 148 | # have to be included in MANIFEST.in as well. 
149 | package_data={ 150 | 'slamdunk.contrib': bin_files, 151 | 'slamdunk.plot': plot_files, 152 | }, 153 | 154 | # Although 'package_data' is the preferred approach, in some case you may 155 | # need to place data files outside of your packages. See: 156 | # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa 157 | # In this case, 'data_file' will be installed into '/my_data' 158 | #data_files=[('bin', extra_files)], 159 | 160 | # To provide executable scripts, use entry points in preference to the 161 | # "scripts" keyword. Entry points provide cross-platform support and allow 162 | # pip to create the appropriate form of executable for the target platform. 163 | entry_points={ 164 | 'console_scripts': [ 165 | 'slamdunk=slamdunk.slamdunk:run', 166 | 'alleyoop=slamdunk.alleyoop:run', 167 | 'splash=slamdunk.splash:run', 168 | ], 169 | }, 170 | ) 171 | -------------------------------------------------------------------------------- /slamdunk/plot/globalRatePlotter.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Plot overall conversion rates per UTR 4 | 5 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 6 | # 7 | # This file is part of Slamdunk. 8 | # 9 | # Slamdunk is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU Affero General Public License as 11 | # published by the Free Software Foundation, either version 3 of the 12 | # License, or (at your option) any later version. 13 | # 14 | # Slamdunk is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU Affero General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU Affero General Public License 20 | # along with this program. If not, see . 
21 | 22 | library(getopt) 23 | 24 | spec = matrix(c( 25 | 'help' , 'h', 0, "logical","print the usage of the command", 26 | 'rateTab', "f", 2,"character","tsv table of rate files", 27 | 'outputFile', "O", 2,"character","output pdf file name" 28 | ),ncol = 5,byrow=T) 29 | 30 | opt = getopt(spec) 31 | 32 | if ( !is.null(opt$help) || length(opt)==1 ) { 33 | #get the script name 34 | cmd = commandArgs(FALSE) 35 | self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2] 36 | cat(basename(self),": Create mismatch plots from rate tabs.\n\n") 37 | #print a friendly message and exit with a non-zero error code 38 | cat(getopt(spec,command = self,usage=T)) 39 | q(status=1); 40 | } 41 | 42 | 43 | if ( is.null(opt$rateTab) ) stop("arg rateTab must be specified") 44 | if ( is.null(opt$outputFile) ) { opt$outputFile = "out.pdf" } 45 | 46 | library(ggplot2) 47 | library(gridExtra) 48 | 49 | rates = read.table(opt$rateTab,stringsAsFactors=FALSE,col.names = c("sample","file"), comment.char = "") 50 | 51 | pdf(opt$outputFile) 52 | 53 | plotList = list() 54 | 55 | for (i in 1:nrow(rates)) { 56 | curTab = read.delim(rates$file[i],stringsAsFactors=FALSE,comment.char='#') 57 | 58 | plusTab = curTab[curTab$Strand == "+",] 59 | minusTab = curTab[curTab$Strand == "-",] 60 | 61 | # "Name" "Chr" "Start" "End" "Strand" "ReadCount" 62 | # "A_A" "A_C" "A_G" "A_T" "A_N" "C_A" 63 | # "C_C" "C_G" "C_T" "C_N" "G_A" "G_C" 64 | # "G_G" "G_T" "G_N" "T_A" "T_C" "T_G" 65 | # "T_T" "T_N" "N_A" "N_C" "N_G" "N_T" 66 | # "N_N" 67 | 68 | names(minusTab) = c("Name", "Chr", "Start", "End", "Strand", "ReadCount", 69 | "T_T", "T_G", "T_C", "T_A", "NNN", "G_T", 70 | "G_G", "G_C", "G_A", "NNN", "C_T", "C_G", 71 | "C_C", "C_A", "NNN", "A_T", "A_G", "A_C", 72 | "A_A", "NNN", "NNN", "NNN", "NNN", "NNN", 73 | "NNN") 74 | 75 | plusTab = plusTab[,c(1,grep("N",names(plusTab),invert=TRUE))] 76 | minusTab = minusTab[,grep("NNN",names(minusTab),invert=TRUE)] 77 | 78 | plusTab = plusTab[,c(-1,-2,-3,-4,-5,-6)] 79 | plusTab = 
plusTab[rowSums(plusTab) > 0,] 80 | 81 | plusTab$Asum = plusTab$A_A + plusTab$A_C + plusTab$A_G + plusTab$A_T 82 | plusTab$Csum = plusTab$C_A + plusTab$C_C + plusTab$C_G + plusTab$C_T 83 | plusTab$Gsum = plusTab$G_A + plusTab$G_C + plusTab$G_G + plusTab$G_T 84 | plusTab$Tsum = plusTab$T_A + plusTab$T_C + plusTab$T_G + plusTab$T_T 85 | 86 | plusTab$A_A = plusTab$A_A / plusTab$Asum 87 | plusTab$A_C = plusTab$A_C / plusTab$Asum 88 | plusTab$A_G = plusTab$A_G / plusTab$Asum 89 | plusTab$A_T = plusTab$A_T / plusTab$Asum 90 | 91 | plusTab$C_A = plusTab$C_A / plusTab$Csum 92 | plusTab$C_C = plusTab$C_C / plusTab$Csum 93 | plusTab$C_G = plusTab$C_G / plusTab$Csum 94 | plusTab$C_T = plusTab$C_T / plusTab$Csum 95 | 96 | plusTab$G_A = plusTab$G_A / plusTab$Gsum 97 | plusTab$G_C = plusTab$G_C / plusTab$Gsum 98 | plusTab$G_G = plusTab$G_G / plusTab$Gsum 99 | plusTab$G_T = plusTab$G_T / plusTab$Gsum 100 | 101 | plusTab$T_A = plusTab$T_A / plusTab$Tsum 102 | plusTab$T_C = plusTab$T_C / plusTab$Tsum 103 | plusTab$T_G = plusTab$T_G / plusTab$Tsum 104 | plusTab$T_T = plusTab$T_T / plusTab$Tsum 105 | 106 | plusTab = plusTab[,grep("sum",names(plusTab),invert=TRUE)] 107 | 108 | plusTab = plusTab * 100 109 | 110 | minusTab = minusTab[,c(-1,-2,-3,-4,-5,-6)] 111 | minusTab = minusTab[rowSums(minusTab) > 0,] 112 | 113 | minusTab$Asum = minusTab$A_A + minusTab$A_C + minusTab$A_G + minusTab$A_T 114 | minusTab$Csum = minusTab$C_A + minusTab$C_C + minusTab$C_G + minusTab$C_T 115 | minusTab$Gsum = minusTab$G_A + minusTab$G_C + minusTab$G_G + minusTab$G_T 116 | minusTab$Tsum = minusTab$T_A + minusTab$T_C + minusTab$T_G + minusTab$T_T 117 | 118 | minusTab$A_A = minusTab$A_A / minusTab$Asum 119 | minusTab$A_C = minusTab$A_C / minusTab$Asum 120 | minusTab$A_G = minusTab$A_G / minusTab$Asum 121 | minusTab$A_T = minusTab$A_T / minusTab$Asum 122 | 123 | minusTab$C_A = minusTab$C_A / minusTab$Csum 124 | minusTab$C_C = minusTab$C_C / minusTab$Csum 125 | minusTab$C_G = minusTab$C_G / minusTab$Csum 126 | 
minusTab$C_T = minusTab$C_T / minusTab$Csum 127 | 128 | minusTab$G_A = minusTab$G_A / minusTab$Gsum 129 | minusTab$G_C = minusTab$G_C / minusTab$Gsum 130 | minusTab$G_G = minusTab$G_G / minusTab$Gsum 131 | minusTab$G_T = minusTab$G_T / minusTab$Gsum 132 | 133 | minusTab$T_A = minusTab$T_A / minusTab$Tsum 134 | minusTab$T_C = minusTab$T_C / minusTab$Tsum 135 | minusTab$T_G = minusTab$T_G / minusTab$Tsum 136 | minusTab$T_T = minusTab$T_T / minusTab$Tsum 137 | 138 | minusTab = minusTab[,grep("sum",names(minusTab),invert=TRUE)] 139 | 140 | minusTab = minusTab * 100 141 | 142 | plotTab = rbind(plusTab, minusTab) 143 | 144 | plotTab = plotTab[,c("A_C","A_G","A_T","C_A","C_G","C_T","G_A","G_C","G_T","T_A","T_C","T_G")] 145 | quantiles = lapply(plotTab, function(x) { 146 | return(quantile(x, na.rm=TRUE, p=0.75) + 1.5 * IQR(x, na.rm=TRUE)) 147 | }) 148 | 149 | ymax = ceiling(max(unlist(quantiles))) 150 | 151 | plotTab = rbind( 152 | data.frame(class = "A_C", values = plotTab$A_C), 153 | data.frame(class = "A_G", values = plotTab$A_G), 154 | data.frame(class = "A_T", values = plotTab$A_T), 155 | data.frame(class = "C_A", values = plotTab$C_A), 156 | data.frame(class = "C_G", values = plotTab$C_G), 157 | data.frame(class = "C_T", values = plotTab$C_T), 158 | data.frame(class = "G_A", values = plotTab$G_A), 159 | data.frame(class = "G_C", values = plotTab$G_C), 160 | data.frame(class = "G_T", values = plotTab$G_T), 161 | data.frame(class = "T_A", values = plotTab$T_A), 162 | data.frame(class = "T_C", values = plotTab$T_C), 163 | data.frame(class = "T_G", values = plotTab$T_G) 164 | ) 165 | 166 | plotTab$highlight = "no" 167 | plotTab$highlight[plotTab$class == "T_C"] = "yes" 168 | plotTab$class = sub("_", ">", plotTab$class) 169 | plotTab$group = "A" 170 | plotTab$group[plotTab$class %in% c("C>A","C>G","C>T")] = "C" 171 | plotTab$group[plotTab$class %in% c("G>A","G>C","G>T")] = "G" 172 | plotTab$group[plotTab$class %in% c("T>A","T>C","T>G")] = "T" 173 | 174 | plotTab = 
plotTab[!is.na(plotTab$values),] 175 | 176 | curPlot = ggplot(plotTab, aes(x=class,y=values,fill=highlight,col=highlight)) + stat_boxplot(geom ='errorbar') + geom_boxplot(outlier.shape = NA,lwd=0.8,fatten=2) + facet_grid(~group, scales="free", space="free") + xlab("") + ylab("Mutation rate per UTR base [%]") + 177 | scale_fill_manual(values=c("white","white")) + scale_color_manual(values=c("black", "red")) + theme(axis.ticks.x = element_blank(), legend.position = "none") + coord_cartesian(ylim=c(0, ymax)) 178 | 179 | plotList[[length(plotList)+1]] <- curPlot + ggtitle(rates$sample[i]) 180 | 181 | anovaTest = aov(values ~ class, data = plotTab) 182 | print(paste("Sample: ",rates$sample[i],sep="")) 183 | print(TukeyHSD(x = anovaTest, 'class', conf.level = 0.95)$class) 184 | 185 | } 186 | 187 | do.call(grid.arrange, plotList) 188 | 189 | dev.off() 190 | 191 | #signal success and exit. 192 | q(status=0) 193 | -------------------------------------------------------------------------------- /slamdunk/plot/eval_halflifes_error_plot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 
19 | 20 | library(getopt) 21 | 22 | spec = matrix(c( 23 | 'help' , 'h', 0, "logical","print the usage of the command", 24 | 'simulated', "s", 2,"character","Half-lifes inferred from SlamDunk results", 25 | 'predicted', "p", 2,"character","Simulated Half-Lifes", 26 | 'truth', "t", 2,"character","True Half-Lifes", 27 | 'output', "o", 2,"character","Output pdf", 28 | 'missing', "m", 2,"character","List of utrs with missing half-life" 29 | ),ncol = 5,byrow=T) 30 | 31 | opt = getopt(spec) 32 | 33 | if ( !is.null(opt$help) || length(opt)==5 ) { 34 | #get the script name 35 | cmd = commandArgs(FALSE) 36 | self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2] 37 | cat(basename(self),": Create mismatch plots from rate tabs.\n\n") 38 | #print a friendly message and exit with a non-zero error code 39 | cat(getopt(spec,command = self,usage=T)) 40 | q(status=1); 41 | } 42 | 43 | 44 | if ( is.null(opt$simulated) ) stop("arg simulated must be specified") 45 | if ( is.null(opt$predicted) ) stop("arg slamdunk must be specified") 46 | if ( is.null(opt$truth) ) stop("arg truth must be specified") 47 | if ( is.null(opt$output) ) stop("arg output must be specified") 48 | if ( is.null(opt$missing) ) stop("arg missing must be specified") 49 | 50 | truthFile = opt$truth 51 | #truthFile = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_2/finalAnnotation_test_cut_chrM_correct_100_original_utrs.bed" 52 | #truthFile = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_3/finalAnnotation_test_cut_chrM_correct_original_utrs.bed" 53 | simHLFile = opt$simulated 54 | #simHLFile = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_2/pooja_UTR_annotation_examples_sample_1_0min_utrsummary_halflifes.tsv" 55 | #simHLFile = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_3/finalAnnotation_test_1_0min_utrsummary_halflifes.tsv" 56 | predHLFile = opt$predicted 57 | #predHLFile = 
"/project/ngs/philipp/slamdunk-analysis/simulation/simulation_2/slamdunk/halflifes/pooja_UTR_annotation_examples_sample_1_0min_reads_slamdunk_mapped_filtered_tcount_halflifes.tsv" 58 | #predHLFile = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_3/slamdunk/halflifes/finalAnnotation_test_1_0min_reads_slamdunk_mapped_filtered_tcount_halflifes.tsv" 59 | output = opt$output 60 | missing = opt$missing 61 | 62 | trueHL = read.csv(truthFile, sep="\t", header = F) 63 | colnames(trueHL) = c("chr", "start", "stop", "name", "halflife", "strand") 64 | simHL = read.csv(simHLFile, sep="\t", header = T) 65 | predHL = read.csv(predHLFile, sep="\t") 66 | 67 | predHL$simulated_hl = simHL$score 68 | predHL$true_hl = trueHL$halflife 69 | predHL$multiperc = predHL$multiMapCount / predHL$readsCPM 70 | head(simHL) 71 | 72 | tmp = predHL[is.na(predHL$score) | is.na(predHL$simulated_hl), ] 73 | predHL = predHL[!is.na(predHL$score) & !is.na(predHL$simulated_hl), ] 74 | 75 | predHL$log2DiffSim = log2((predHL$score + 0.1) / (predHL$simulated_hl + 0.1)) 76 | predHL$log2DiffTrue = log2((predHL$score + 0.1) / (predHL$true_hl + 0.1)) 77 | rmseSim = sqrt(sum((predHL$simulated_hl - predHL$score) ^ 2) / nrow(predHL)) 78 | rmseTrue = sqrt(sum((predHL$true_hl - predHL$score) ^ 2) / nrow(predHL)) 79 | avgHL = mean(predHL$true_hl) 80 | 81 | predHLUniq = predHL[predHL$multiMapCount == 0,] 82 | predHLMulti = predHL[predHL$multiMapCount > 0,] 83 | 84 | #ggplot(predHLUniq,aes(x=readsCPM,y=log2Diff)) + stat_binhex() 85 | head(predHLUniq) 86 | 87 | pdf(output, height = 6, width = 9) 88 | plot(0, main=paste0("Unique UTRs (", nrow(predHLUniq), ")\nAvg. 
HL: ", round(avgHL), ", RMSE: ", round(rmseSim)), pch=4, xlim=c(min(predHLUniq$readsCPM), max(predHLUniq$readsCPM)), ylim=c(-6,6), xlab="couts per million (reads)", ylab="log2 FC (half-lifes)", type="n") 89 | points(predHLUniq$readsCPM, predHLUniq$log2DiffSim, pch=1, col="#00000033") 90 | abline(h=0, lty=2, col="grey") 91 | plot(0, main=paste0("Multimapper UTRs (", nrow(predHLMulti) ,")\nAvg. HL: ", round(avgHL), ", RMSE: ", round(rmseSim)), pch=4, xlim=c(min(predHLUniq$readsCPM), max(predHLUniq$readsCPM)), ylim=c(-6,6), xlab="couts per million (reads)", ylab="log2 FC (half-lifes)", type="n") 92 | points(predHLMulti$readsCPM, predHLMulti$log2DiffSim, pch=1, col="#00000033") 93 | abline(h=0, lty=2, col="grey") 94 | 95 | plot(0, main=paste0("Unique UTRs (", nrow(predHLUniq), ")\nAvg. HL: ", round(avgHL), ", RMSE: ", round(rmseSim)), pch=4, xlim=c(min(predHLUniq$true_hl), max(predHLUniq$true_hl)), ylim=c(-6,6), xlab="couts per million (reads)", ylab="log2 FC (half-lifes)", type="n") 96 | points(predHLUniq$true_hl, predHLUniq$log2DiffSim, pch=1, col="#00000033") 97 | abline(h=0, lty=2, col="grey") 98 | plot(0, main=paste0("Multimapper UTRs (", nrow(predHLMulti) ,")\nAvg. HL: ", round(avgHL), ", RMSE: ", round(rmseSim)), pch=4, xlim=c(min(predHLUniq$true_hl), max(predHLUniq$true_hl)), ylim=c(-6,6), xlab="couts per million (reads)", ylab="log2 FC (half-lifes)", type="n") 99 | points(predHLMulti$true_hl, predHLMulti$log2DiffSim, pch=1, col="#00000033") 100 | abline(h=0, lty=2, col="grey") 101 | 102 | 103 | #plot(0, main=paste0("Multimapper UTRs (", nrow(predHLMulti) ,")\nAvg. 
HL: ", round(avgHL), ", RMSE: ", round(rmseSim)), pch=4, xlim=c(min(predHLUniq$multiperc), max(predHLUniq$multiperc)), ylim=c(-6,6), xlab="couts per million (reads)", ylab="log2 FC (half-lifes)", type="n") 104 | #points(predHLMulti$multiperc, predHLMulti$log2DiffSim, pch=1, col="#00000033") 105 | #abline(h=0, lty=2, col="grey") 106 | 107 | 108 | #lim = max(predHLUniq$simulated_hl) * 1.25 109 | lim = 1440 110 | corr = cor(predHLUniq$simulated_hl, predHLUniq$score) 111 | plot(predHLUniq$simulated_hl ~ predHLUniq$score, main=paste0("Simulated vs. SlamDunk (unique)\nPearson: ", round(corr, digits = 4)), xlim=c(0, lim), ylim=c(0, lim), ylab="Half-life (simulated)", xlab="Half-Life (slamDunk)", col="#00000033") 112 | abline(a = 0, b = 1, col="grey", lty=2) 113 | 114 | corr = cor(predHLMulti$simulated_hl, predHLMulti$score) 115 | plot(predHLMulti$simulated_hl ~ predHLMulti$score, main=paste0("Simulated vs. SlamDunk (multimapper)\nPearson: ", round(corr, digits = 4)), xlim=c(0, lim), ylim=c(0, lim), ylab="Half-life (simulated)", xlab="Half-Life (slamDunk)", col="#00000033") 116 | abline(a = 0, b = 1, col="grey", lty=2) 117 | 118 | 119 | corr = cor(predHLUniq$true_hl, predHLUniq$score) 120 | plot(predHLUniq$true_hl ~ predHLUniq$score, main=paste0("True vs. SlamDunk (unique)\nPearson: ", round(corr, digits = 4)), xlim=c(0, lim), ylim=c(0, lim), ylab="Half-life (true)", xlab="Half-Life (slamDunk)", col="#00000033") 121 | abline(a = 0, b = 1, col="grey", lty=2) 122 | 123 | corr = cor(predHLMulti$true_hl, predHLMulti$score) 124 | plot(predHLMulti$true_hl ~ predHLMulti$score, main=paste0("True vs. 
SlamDunk (multimapper)\nPearson: ", round(corr, digits = 4)), xlim=c(0, lim), ylim=c(0, lim), ylab="Half-life (true)", xlab="Half-Life (slamDunk)", col="#00000033") 125 | abline(a = 0, b = 1, col="grey", lty=2) 126 | 127 | 128 | dev.off() 129 | 130 | #ggplot(predHLUniq,aes(x=simulated_hl,y=log2Diff)) + geom_point(alpha = 0.3) 131 | #p = p + stat_binhex(bins=100) 132 | #p = p 133 | #ggplot(predHLMulti,aes(x=simulated_hl,y=log2Diff)) + stat_binhex(bins=100) 134 | 135 | 136 | 137 | write.table(tmp, missing, quote = F, row.names = F, col.names = T) 138 | -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/src/getseqfrombed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This script is used to extract sequences from bed file. 4 | 5 | USAGE 6 | getseqfrombed.py {OPTIONS} <.bed file|-> 7 | 8 | OPTIONS 9 | 10 | -b/--seqerror [error file]\tSpecify the positional error profile to be used. The file should include at least 100 lines, each containing a positive number. The number at line x is the weight that an error is occured at x% position of the read. If no positional error file specified, uniform weight is assumed. 11 | 12 | -r/--errorrate [error rate]\tSpecify the overall error rate, a real positive number. The number of errors of each read will follow a Poisson distribution with its mean value specified by --errorrate. Default 0 (no errors). 13 | 14 | -l/--readlen [read length]\tSpecify the read length. Default 75. 15 | 16 | -f/--fill [seq]\tFill at the end of each read by the sequence seq, if the read is shorter than the read length. Default A (to simulate poly-A tails in RNA-Seq reads). 17 | 18 | NOTE 19 | 20 | 1. The input .bed file is best to sort according to chromosome names. Use - to input from STDIN. 21 | 2. Biopython and numpy package are required. 22 | 23 | 3. 
When applying models, we assume that all sequences are in the same length. The length information is given by the -l parameter. If the sequence length is greater than read length, nucleotides outside the read length will not be simulated for error. 24 | 25 | HISTORY 26 | 27 | 28 | 14/12/2019: 29 | Tobias Neumann: Make python3 compatible. 30 | Fix a bug with error profiles in the minus strand. 31 | 02/01/2013: 32 | Fix a bug with no read errors generated. 33 | Fix a bug with error profiles in the minus strand. 34 | 08/25/2011: 35 | Rename makebedseq.py to getseqfrombed.py. 36 | Print results to stdout. 37 | """ 38 | 39 | import sys; 40 | import pydoc; 41 | import os; 42 | import random; 43 | import bisect; 44 | import math; 45 | import numpy; 46 | from Bio import SeqIO; 47 | from Bio.SeqRecord import SeqRecord; 48 | 49 | # import argparse; 50 | # parser=argparse.ArgumentParser('Extract sequences from bed file'); 51 | # parser.add_argument('-b','--seqerror',help='Specify the positional error profile to be used. The file should include at least 100 lines, each containing a positive number. The number at line x is the weight that an error is occured at x% position of the read. If no positional error file specified, uniform weight is assumed.'); 52 | # parser.add_argument('-r','--errorrate',type=float,default=0.0,help='Specify the overall error rate, a number between 0 and 1. Default 0 (no errors).'); 53 | # parser.add_argument('-l','--readlen',type=int,default=75,help='Specify the read length. Default 75.'); 54 | # parser.add_argument('-f','--fill',default='A',help='Fill at the end of each read by the sequence seq, if the read is shorter than the read length. 
Default A (to simulate poly-A tails in RNA-Seq reads).'); 55 | 56 | if len(sys.argv)<2: 57 | print>>sys.stderr, (pydoc.render_doc(sys.modules[__name__])); 58 | sys.exit(); 59 | 60 | # analyzing parameters 61 | posweight=[]; 62 | errrate=0.00; 63 | readlength=75; 64 | forcelength=False; 65 | filledseq='A'; 66 | 67 | for i in range(len(sys.argv)): 68 | if i100: 76 | break; 77 | tbweight=float(lines.strip()); 78 | posweight.append(tbweight); 79 | if len(posweight)!=100: 80 | print('Error: the bias file should include at least 100 lines.',file=sys.stderr); 81 | sys.exit(); 82 | if sys.argv[i]=='-r' or sys.argv[i]=='--errorrate': 83 | errrate=float(sys.argv[i+1]); 84 | if errrate<0: # or errrate>1: 85 | print('Error: the error rate should be between 0-1.',file=sys.stderr); 86 | sys.exit(); 87 | print('Error rate: '+str(errrate),file=sys.stderr); 88 | if sys.argv[i]=='-l' or sys.argv[i]=='--readlen': 89 | readlength=int(sys.argv[i+1]); 90 | print('Read length:'+str(readlength),file=sys.stderr); 91 | if sys.argv[i]=='-f' or sys.argv[i]=='--fill': 92 | forcelength=True; 93 | filledseq=sys.argv[i+1]; 94 | print('Force same read length with filled :'+(filledseq),file=sys.stderr); 95 | 96 | 97 | 98 | # construct weight probability for read length, if possible 99 | rlenweight=[]; 100 | if len(posweight)!=0: 101 | kweight=0; 102 | for i in range(readlength): 103 | nfrac=i*100.0/readlength; 104 | lower=int(math.floor(nfrac)); 105 | higher=int(math.ceil(nfrac)); 106 | if higher==lower: higher=lower+1; 107 | #print('higher:'+str(higher)+',lower:'+str(lower)); 108 | if higher<100: 109 | val=posweight[lower]*(nfrac-lower)+posweight[higher]*(higher-nfrac); 110 | else: 111 | val=posweight[99]; 112 | kweight+=val; 113 | rlenweight.append(kweight); 114 | 115 | bedfile=sys.argv[-2]; 116 | reffile=sys.argv[-1]; 117 | #ofastafile=sys.argv[-1]; 118 | 119 | # build reference 120 | seqref=SeqIO.index(reffile,'fasta'); 121 | refkeys = list(seqref.keys()) 122 | 123 | # read bed file, and ready 
for writing 124 | if bedfile!="-": 125 | fid=open(bedfile); 126 | else: 127 | fid=sys.stdin; 128 | #ofid=open(ofastafile,'w'); 129 | ofid=sys.stdout 130 | 131 | nlines=0; 132 | 133 | prevchr=''; 134 | previndex=''; 135 | 136 | for lines in fid: 137 | # update line counter 138 | nlines=nlines+1; 139 | if nlines %10000==1: 140 | print('Processing '+str(nlines)+' lines...',file=sys.stderr); 141 | # parse lines 142 | bedfield=lines.strip().split('\t'); 143 | if len(bedfield)!=12: 144 | print('Error: incorrect number of fields at line %d (should be 12, observed %d)' % (nlines, len(bedfield)) ,file=sys.stderr); 145 | continue; 146 | # clustering 147 | fieldrange=[int(bedfield[1]),int(bedfield[2])]; 148 | # parse all exons 149 | exonlen=[int(x) for x in bedfield[10][:-1].split(',')]; 150 | exonstart=[int(x)+fieldrange[0] for x in bedfield[11][:-1].split(',')]; 151 | if not bedfield[0] in refkeys: 152 | print('Warning: '+bedfield[0]+ ' not in the reference. Ignore...' ,file=sys.stderr); 153 | continue; 154 | if bedfield[0]!=prevchr: 155 | print('Switching to %s ...' 
% bedfield[0],file=sys.stderr); 156 | prevchr=bedfield[0]; 157 | previndex=seqref[bedfield[0]]; 158 | # extract sequences 159 | thisseq=SeqRecord(''); 160 | for i in range(len(exonlen)): 161 | thisseq+=previndex[exonstart[i]:(exonstart[i]+exonlen[i])]; 162 | if forcelength: 163 | if sum(exonlen)0: 170 | newseq=thisseq.seq; 171 | for n in range(nmut): 172 | if len(posweight)==0: 173 | # uniform distrib 174 | modifyposition=random.choice(range(len(newseq))); 175 | else: 176 | rchosen=random.random()*kweight; 177 | modifyposition=bisect.bisect_right(posweight,rchosen); 178 | # mutate the position 179 | if len(newseq)>modifyposition: 180 | topos=random.choice('ATGC'); 181 | while topos==newseq[modifyposition]: 182 | topos=random.choice('ATGC'); 183 | print ('MUTATION at position '+str(modifyposition)+','+newseq[modifyposition]+'->'+topos,file=sys.stderr); 184 | # print >>sys.stderr,('SEQ:'+newseq); 185 | newseq=newseq[:modifyposition]+topos+newseq[(modifyposition+1):]; 186 | # print >>sys.stderr,('SEQ:'+newseq); 187 | #print>>sys.stderr,('NMUTATION:'+str(nmut)); 188 | #print>>sys.stderr, (str(thisseq.seq)); 189 | #print>>sys.stderr,(newseq); 190 | thisseq.seq=newseq; 191 | # reverse-complement the sequence if it is on the negative strand 192 | if bedfield[5]=='-': 193 | #print >>sys.stderr,('SEQ:'+thisseq.seq); 194 | thisseq.seq=thisseq.seq.reverse_complement(); 195 | #print >>sys.stderr,('RVCSEQ:'+thisseq.seq); 196 | # write to record 197 | try: 198 | SeqIO.write(thisseq,ofid,'fasta'); 199 | except ValueError: 200 | print('Skip at line '+str(nlines)+', sequence object:',file=sys.stderr); 201 | print(thisseq,file=sys.stderr); 202 | 203 | 204 | 205 | # ofid.close(); 206 | if bedfile!="-": 207 | fid.close(); 208 | -------------------------------------------------------------------------------- /slamdunk/plot/eval_conversion_rate_plots.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 
2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | 20 | library(getopt) 21 | 22 | spec = matrix(c( 23 | 'help' , 'h', 0, "logical","print the usage of the command", 24 | 'simulated', "s", 2,"character","Comma seperated list of simulated files", 25 | 'slamdunk', "f", 2,"character","Comma seperated lost of SlamDunk results", 26 | 'output', "o", 2,"character","Output pdf", 27 | 'conversionrate', "c", 2,"character","Simulated conversion rate" 28 | ),ncol = 5,byrow=T) 29 | 30 | opt = getopt(spec) 31 | 32 | if ( !is.null(opt$help) || length(opt)==3 ) { 33 | #get the script name 34 | cmd = commandArgs(FALSE) 35 | self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2] 36 | cat(basename(self),": Create mismatch plots from rate tabs.\n\n") 37 | #print a friendly message and exit with a non-zero error code 38 | cat(getopt(spec,command = self,usage=T)) 39 | q(status=1); 40 | } 41 | 42 | 43 | if ( is.null(opt$simulated) ) stop("arg simulated must be specified") 44 | if ( is.null(opt$slamdunk) ) stop("arg slamdunk must be specified") 45 | if ( is.null(opt$output) ) stop("arg output must be specified") 46 | if ( is.null(opt$conversionrate) ) { opt$conversionrate = 0.03 } 47 | 48 | #simulatedFiles = 
"/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_1_0min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_2_15min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_3_30min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_4_60min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_5_180min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_6_360min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_7_720min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_8_1440min_utrsummary.csv" 49 | #slamDunkFiles = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_1_0min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_2_15min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_3_30min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_4_60min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_5_180min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_6_360min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annota
tion_examples_7_720min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_8_1440min_reads_slamdunk_mapped_filtered_tcount.csv" 50 | #outputFile = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/eval/conversion_rate_per_gene_eval_plots.pdf" 51 | #timesParameter = "0,15,30,60,180,360,720,1440" 52 | #conversionRate = 0.03 53 | 54 | simulatedFiles = opt$simulated 55 | #simulatedFiles = "simulation_1/pooja_UTR_annotation_examples_sample_1_0min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_2_15min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_3_30min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_4_60min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_5_180min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_6_360min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_7_720min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_8_1440min_utrsummary.csv" 56 | slamDunkFiles = opt$slamdunk 57 | #slamDunkFiles = 
"simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_1_0min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_2_15min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_3_30min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_4_60min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_5_180min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_6_360min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_7_720min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_8_1440min_reads_slamdunk_mapped_filtered_tcount.csv" 58 | filesSimulated = as.character(ordered(strsplit(simulatedFiles, ",")[[1]])) 59 | filesSlamDunk = as.character(ordered(strsplit(slamDunkFiles, ",")[[1]])) 60 | outputFile = opt$output 61 | conversionRate = opt$conversionrate 62 | 63 | pdf(outputFile, width = 15, height = 6) 64 | for(timepoint in 1:length(filesSimulated)) { 65 | #timepoint = 5 66 | simDataFile = filesSimulated[timepoint] 67 | slamDunkFile = filesSlamDunk[timepoint] 68 | name = basename(simDataFile) 69 | 70 | simulation = read.table(simDataFile) 71 | colnames(simulation) = c("chr", "start", "stop", "name", "strand", "conversionRate", "readsCPM", "tCount", "tcCount", "readCount", "convertedReads", "multiMapCount") 72 | #simulation$coverage = (simulation$simulatedReads / (simulation$stop - simulation$start) * 50) 73 | simulation$convertedReadsRate = simulation$convertedReads / simulation$readCount 74 | 75 | slamdunk = read.table(slamDunkFile) 76 | colnames(slamdunk) = c("chr", "start", "stop", "name", "strand", "conversionRate", "readsCPM", "tCount", "tcCount", "readCount", "convertedReads", 
"multiMapCount") 77 | slamdunk$readsCPM_sim = simulation$readsCPM 78 | slamdunk$log2diff = log2((simulation$conversionRate + 0.0000001) / (slamdunk$conversionRate + 0.0000001)) 79 | slamdunk$diff = (simulation$conversionRate - slamdunk$conversionRate) 80 | #slamdunk$convertedReadsRate = slamdunk$convertedReads / slamdunk$readCount 81 | #slamdunk$diffconvertedReadsRate = (simulation$convertedReads - slamdunk$convertedReads) 82 | #slamdunk$diffconvertedReadsRatediff = ((simulation$convertedReads / simulation$readCount) - (slamdunk$convertedReads / slamdunk$readCount)) 83 | #plot(simulation$V11 ~ slamdunk$V6, xlim=c(0, 0.01), ylim=c(0, 0.01)) 84 | 85 | par(mfrow=c(1,2)) 86 | #yLim = max(abs(slamdunk$diffconvertedReadsRate)) 87 | yLim = as.numeric(conversionRate) 88 | #boxplot(slamdunk$log2diff) 89 | slamDunkUniq = slamdunk[slamdunk$multiMapCount <= 0, ] 90 | slamDunkMulti = slamdunk[slamdunk$multiMapCount > 0, ] 91 | plot(slamDunkUniq$readsCPM_sim, slamDunkUniq$diff, main=name, pch=4, ylim=c(-yLim, yLim), ylab="conversion (sim) - conversion (slamdunk)", xlab="read counts per million") 92 | points(slamDunkMulti$readsCPM_sim, slamDunkMulti$diff, pch=4, col="red") 93 | abline(h=0, lty=2, col="grey") 94 | 95 | yLim = 4 96 | plot(slamDunkUniq$readsCPM_sim, slamDunkUniq$log2diff, main=name, pch=4, ylim=c(-yLim, yLim), ylab="log2(conversion (sim) / conversion (slamdunk))", xlab="read counts per million") 97 | points(slamDunkMulti$readsCPM_sim, slamDunkMulti$log2diff, pch=4, col="red") 98 | abline(h=0, lty=2, col="grey") 99 | } 100 | dev.off() 101 | -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/src/gensimreads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This script generates simulated RNA-Seq reads (in .bed format) from known gene annotations. 
4 | 5 | USAGE 6 | 7 | gensimreads.py {OPTIONS} 8 | 9 | PARAMETER 10 | 11 | BED-File\tThe gene annotation file (in BED format). Use '-' for STDIN input 12 | 13 | OPTIONS 14 | 15 | -e/--expression [expression level file] \tSpecify the weight of each transcript. Each line in the file should have at least (NFIELD+1) fields, with field 0 the annotation id, and field NFIELD the weight of this annoation. If this file is not provided, uniform weight is applied. 16 | 17 | -n/--nreads readcnt \tSpecify the number of reads to be generated. Default 100000. 18 | 19 | -b/--posbias [positional bias file] \tSpecify the positional bias file. The file should include at least 100 lines, each contains only one integer number, showing the preference of the positional bias at this position. If no positional bias file is specified, use uniform distribution bias. 20 | 21 | -l/--readlen [read length] \tSpecify the read length. Default 32. 22 | 23 | -o/--output [output .bed file] \tSpecify the output file. Default STDOUT 24 | 25 | -f/--field [NFIELD] \tThe field of each line as weight input. Default 7 (beginning from field 0) to compatible to genexplvprofile.py. 26 | 27 | -p/--pairend [PELENMEAN,PELENSTD]\t Generate paired-end reads with specified insert length mean and standard derivation. The default is 200,20. 28 | 29 | --stranded \tThe reads are strand specific. 30 | 31 | NOTE 32 | 33 | 1. The bed file is required to sort according to the chromosome name and position. In Unix systems, use "sort -k 1,1 -k 2,2n in.BED > out.BED" to get a sorted version (out.BED) of the bed file (in.BED). 34 | 35 | 2. No problem to handle reads spanning multiple exons. 36 | 37 | HISTORY 38 | 39 | 04/30/2012 40 | Support generating stranded RNA-Seq reads 41 | 42 | 02/16/2012 43 | Now runs on python 2.7 44 | 45 | 02/08/2012 46 | Change default value of NFIELD from 4 to 7 to be compatible with default genexplvprofile values. 47 | 48 | 01/29/2012 49 | Add paired-end support. 
50 | 51 | 01/09/2012 52 | Add -f option. 53 | 54 | AUTHOR 55 | Wei Li (li.david.wei AT gmail.com) 56 | """ 57 | 58 | from __future__ import print_function 59 | import sys; 60 | import subprocess; 61 | import pydoc; 62 | import os; 63 | import random; 64 | import bisect; 65 | import math; 66 | from getSegs import *; 67 | 68 | import pdb; 69 | 70 | # read length 71 | readlen=32; 72 | # number of reads to sample 73 | readcnt=100000; 74 | 75 | nfield=7; 76 | 77 | if len(sys.argv)<2: 78 | print(pydoc.render_doc(sys.modules[__name__])); 79 | sys.exit(); 80 | 81 | allids={}; 82 | allidl=[]; 83 | allexp=[]; 84 | 85 | posweight=[]; 86 | 87 | #onbedfile=sys.argv[-1]+'.reads.bed'; 88 | onbedfile="-"; 89 | 90 | genpereads=False; 91 | pemean=200; 92 | pestd=20; 93 | 94 | stranded=False; 95 | 96 | for i in range(len(sys.argv)): 97 | if i100: 123 | break; 124 | tbweight=float(lines.strip()); 125 | posweight.append(tbweight); 126 | if len(posweight)!=100: 127 | print('Error: the bias file should include at least 100 lines.',file=sys.stderr); 128 | sys.exit(); 129 | if sys.argv[i]=='-n' or sys.argv[i]=='--nreads': 130 | readcnt=int(sys.argv[i+1]); 131 | print('Read count:',readcnt,file=sys.stderr); 132 | if sys.argv[i]=='-l' or sys.argv[i]=='--readlen': 133 | readlen=int(sys.argv[i+1]); 134 | print('Read length:',readlen,file=sys.stderr); 135 | if sys.argv[i]=='-o' or sys.argv[i]=='--output': 136 | onbedfile=sys.argv[i+1]; 137 | print('Output bed file:',onbedfile,file=sys.stderr); 138 | if sys.argv[i]=='-f' or sys.argv[i]=='--field': 139 | nfield=int(sys.argv[i+1]); 140 | print('Field:',nfield,file=sys.stderr); 141 | if sys.argv[i]=='-p' or sys.argv[i]=='--pairend': 142 | genpereads=True; 143 | pef=sys.argv[i+1].split(','); 144 | pemean=int(pef[0]); 145 | pestd=int(pef[1]); 146 | print('Generate paired-end reads with mean and std '+str(pemean)+','+str(pestd),file=sys.stderr); 147 | if sys.argv[i]=='-h' or sys.argv[i]=='--help': 148 | print(pydoc.render_doc(sys.modules[__name__])); 
149 | sys.exit(); 150 | if sys.argv[i]=='--stranded': 151 | stranded=True; 152 | 153 | 154 | 155 | bedfile=sys.argv[-1]; 156 | 157 | # if no annotation file is specified, use uniform distri. 158 | print('Assigning weights...',file=sys.stderr); 159 | if len(allexp)==0: 160 | totalweight=0; 161 | for lines in open(bedfile): 162 | bedfield=lines.strip().split(); 163 | allids[bedfield[3]]=0; 164 | totalweight+=1; 165 | allexp.append(totalweight); 166 | allidl.append(bedfield[3]); 167 | 168 | # sampling process 169 | print('Sampling...',file=sys.stderr); 170 | for j in range(readcnt): 171 | k=random.random()*totalweight; 172 | sel=bisect.bisect_right(allexp,k); 173 | allids[allidl[sel]]=allids[allidl[sel]]+1; 174 | 175 | # if no bias file specified, use uniform distrib 176 | 177 | print('Total assigned reads:',sum(allids.values()),file=sys.stderr); 178 | 179 | 180 | #debug info: 181 | #for k in allidl: 182 | # print (k, allids[k]); 183 | 184 | #sys.exit(); 185 | 186 | if onbedfile!="-": 187 | onfid=open(onbedfile,'w'); 188 | else: 189 | onfid=sys.stdout; 190 | 191 | 192 | nlines=0; 193 | 194 | totalgenreads=0; 195 | # read bed file 196 | for lines in open(bedfile): 197 | # update line counter 198 | nlines=nlines+1; 199 | if nlines %10000==1: 200 | print('Processing '+str(nlines)+' lines...',file=sys.stderr); 201 | # parse lines 202 | bedfield=lines.strip().split(); 203 | if len(bedfield)!=12: 204 | print('Error: incorrect number of fields (should be 12)',file=sys.stderr); 205 | continue; 206 | if bedfield[5]=='+': 207 | direction=1; 208 | elif bedfield[5]=='-': 209 | direction=-1; 210 | else: 211 | print('Error: incorrect field in field[5] %s:' %bedfield[5],file=sys.stderr); 212 | if bedfield[3] not in allids: 213 | # the current id not found, continue 214 | continue; 215 | nreads=allids[bedfield[3]]; 216 | if nreads<1: 217 | continue; 218 | # parse all segments 219 | fieldrange=(int(bedfield[1]),int(bedfield[2])); 220 | if bedfield[10][-1]==',': 221 | 
bedfield[10]=bedfield[10][:-1]; 222 | if bedfield[11][-1]==',': 223 | bedfield[11]=bedfield[11][:-1]; 224 | exonlen=[int(x) for x in bedfield[10].split(',')]; 225 | exonstart=[int(x)+fieldrange[0] for x in bedfield[11].split(',')]; 226 | # old code: for each possible position in the transcript, build its segments 227 | # for ne in range(len(exonlen)): 228 | # for pos in range(exonstart[ne],exonstart[ne]+exonlen[ne]): 229 | # create a position 230 | totallen=sum(exonlen); 231 | # here, we randomly choose one position 232 | if genpereads==False: 233 | selrange=totallen-readlen+1; 234 | else: 235 | selrange=totallen-pemean+2*pestd; 236 | if selrange<1: 237 | if genpereads==False: 238 | print('Ignore annoatation',bedfield[3],'of length',totallen,'Reads:',allids[bedfield[3]],file=sys.stderr); 239 | else: 240 | print('Ignore annoatation',bedfield[3],'of length',totallen,'since its shorter than paired-end mean insert length. Reads:',allids[bedfield[3]],file=sys.stderr); 241 | continue; 242 | totalgenreads+=nreads; 243 | cumlen=[];cumlen.extend(exonlen); 244 | for i in range(1,len(cumlen)): 245 | cumlen[i]=cumlen[i]+cumlen[i-1]; 246 | # for nun-uniform distribution, construct a new array for selection 247 | thistbweight=[]; 248 | if len(posweight)!=0: 249 | kweight=0; 250 | for i in range(selrange): 251 | nfrac=i*100.0/selrange; # a value between 0-100 252 | nlower=int(math.floor(nfrac)); # 0-100 253 | nhigher=int(math.ceil(nfrac)); # 0-100 254 | if nhigher==nlower: nhigher=nlower+1; 255 | if nhigher<100: 256 | val=posweight[nlower]*(nfrac-nlower)+posweight[nhigher]*(nhigher-nfrac); 257 | else: 258 | val=posweight[99]; 259 | kweight+=val; 260 | thistbweight.append(kweight); 261 | for t in range(nreads): 262 | if len(posweight)==0: 263 | tpos=random.choice(range(selrange)); 264 | else: 265 | rd=random.random()*kweight; 266 | bsl=bisect.bisect_right(thistbweight,rd); 267 | # for reverse transcripts: flip the position 268 | if direction==-1: 269 | bsl=selrange-1-bsl; 270 | 
tpos=bsl; 271 | pos=tpos2pos(tpos,cumlen,exonstart); 272 | if genpereads==True: 273 | tpos2=tpos+int(random.normalvariate(pemean-readlen+1,pestd)); 274 | pos2=tpos2pos(tpos2,cumlen,exonstart); 275 | # get the segments 276 | if True: 277 | (startrange,lenrange,status)=getSegs(pos,readlen,1,exonstart,exonlen); 278 | if status!=0: 279 | print('Status:',status,', pos:', pos,'out of',len(cumlen),file=sys.stderr); 280 | #pdb.set_trace(); 281 | continue; 282 | # generate another pair 283 | if genpereads==True: 284 | (startrange2,lenrange2,status2)=getSegs(pos2,readlen,1,exonstart,exonlen); 285 | if status==1: 286 | print('Status:',status,', pos:', pos,'out of',len(cumlen),file=sys.stderr); 287 | if genpereads==False: 288 | lineid="%s_e_%d_%s_%d" % (bedfield[3],t,bedfield[0],pos); 289 | else: 290 | lineid="%s_e_%d_%s_%d/1" % (bedfield[3],t,bedfield[0],pos); 291 | lineid2="%s_e_%d_%s_%d/2" % (bedfield[3],t,bedfield[0],pos); 292 | # random direction 293 | if stranded==False or direction==0: 294 | thisdir=random.choice([1,-1]); 295 | else: 296 | thisdir=direction; 297 | writeBedline(onfid,lineid,bedfield[0],thisdir,startrange,lenrange); 298 | if genpereads==True: 299 | writeBedline(onfid,lineid2,bedfield[0],thisdir*(-1),startrange2,lenrange2); 300 | else: 301 | print(bedfield[0],file=sys.stdout); 302 | 303 | #print('Pospool:'); 304 | #for k in sorted(pospool.keys()): 305 | # print(str(k)+":"+str(pospool[k]),end=","); 306 | #print(); 307 | 308 | 309 | print('Total '+str(nlines)+' lines...',file=sys.stderr); 310 | print('Total '+str(totalgenreads)+' reads...',file=sys.stderr); 311 | if onbedfile!="-": 312 | onfid.close(); 313 | 314 | -------------------------------------------------------------------------------- /slamdunk/utils/misc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 
6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | 20 | # Date located in: - 21 | from __future__ import print_function 22 | import sys, os 23 | import pysam 24 | import subprocess 25 | import collections 26 | import csv 27 | import ast 28 | import hashlib 29 | 30 | ReadStat = collections.namedtuple('ReadStat' , 'SequencedReads MappedReads DedupReads FilteredReads SNPs AnnotationName AnnotationMD5') 31 | SampleInfo = collections.namedtuple('SampleInfo' , 'ID Name Type Time') 32 | 33 | class SlamSeqInfo: 34 | 35 | ID_SequencedRead = "sequenced" 36 | ID_MappedReads = "mapped" 37 | ID_FilteredReads = "filtered" 38 | ID_DedupReads = "dedup" 39 | ID_MQFilteredReads = "mqfiltered" 40 | ID_IdFilteredReads = "idfiltered" 41 | ID_NmFilteredReads = "nmfiltered" 42 | ID_MultimapperReads = "multimapper" 43 | ID_SNPs = "snps" 44 | ID_AnnotationName = "annotation" 45 | ID_AnnotationMD5 = "annotationmd5" 46 | 47 | def getFromReadStat(self, name, stats): 48 | if(name in stats): 49 | return stats[name] 50 | else: 51 | return "NA" 52 | 53 | def __init__(self, bam = None): 54 | if bam is None: 55 | self.SequencedReads = 0 56 | self.MappedReads = 0 57 | self.DedupReads = 0 58 | self.FilteredReads = 0 59 | self.MQFilteredReads = 0 60 | self.IdFilteredReads = 0 61 | self.NmFilteredReads = 0 62 | self.MultimapperReads = 0 63 | self.SNPs = 0 64 | self.AnnotationName = 
"NA" 65 | self.AnnotationMD5 = "NA" 66 | else: 67 | DS = ast.literal_eval(getReadGroup(bam)['DS']) 68 | 69 | self.SequencedReads = self.getFromReadStat(self.ID_SequencedRead, DS) 70 | self.MappedReads = self.getFromReadStat(self.ID_MappedReads, DS) 71 | self.DedupReads = self.getFromReadStat(self.ID_DedupReads, DS) 72 | self.FilteredReads = self.getFromReadStat(self.ID_FilteredReads, DS) 73 | self.MQFilteredReads = self.getFromReadStat(self.ID_MQFilteredReads, DS) 74 | self.IdFilteredReads = self.getFromReadStat(self.ID_IdFilteredReads, DS) 75 | self.NmFilteredReads = self.getFromReadStat(self.ID_NmFilteredReads, DS) 76 | self.MultimapperReads = self.getFromReadStat(self.ID_MultimapperReads, DS) 77 | self.SNPs = self.getFromReadStat(self.ID_SNPs, DS) 78 | self.AnnotationName = self.getFromReadStat(self.ID_AnnotationName, DS) 79 | self.AnnotationMD5 = self.getFromReadStat(self.ID_AnnotationMD5, DS) 80 | 81 | def __repr__(self): 82 | return "{" + "'" + self.ID_SequencedRead + "':" + str(self.SequencedReads) + "," + "'" + self.ID_MappedReads + "':" + str(self.MappedReads) + "," + "'" + self.ID_FilteredReads + "':" + str(self.FilteredReads) + "," + "'" + self.ID_MQFilteredReads + "':" + str(self.MQFilteredReads) + "," + "'" + self.ID_IdFilteredReads + "':" + str(self.IdFilteredReads) + "," + "'" + self.ID_NmFilteredReads + "':" + str(self.NmFilteredReads) + "," + "'" + self.ID_MultimapperReads + "':" + str(self.MultimapperReads) + "," + "'" + self.ID_DedupReads + "':" + str(self.DedupReads) + "," + "'" + self.ID_SNPs + "':" + str(self.SNPs) + "," + "'" + self.ID_AnnotationName + "':'" + str(self.AnnotationName) + "'," + "'" + self.ID_AnnotationMD5 + "':'" + str(self.AnnotationMD5) + "'}" 83 | 84 | def md5(fname): 85 | hash_md5 = hashlib.md5() 86 | with open(fname, "rb") as f: 87 | for chunk in iter(lambda: f.read(4096), b""): 88 | hash_md5.update(chunk) 89 | return hash_md5.hexdigest() 90 | 91 | def estimateMaxReadLength(bam): 92 | 93 | readfile = 
pysam.AlignmentFile(bam, "rb") 94 | 95 | minLength = sys.maxsize 96 | maxLength = 0 97 | 98 | for read in readfile.head(n = 1000) : 99 | minLength = min(minLength, read.query_length + read.get_tag("XA")) 100 | maxLength = max(maxLength, read.query_length + read.get_tag("XA")) 101 | 102 | range = maxLength - minLength 103 | 104 | if (range <= 10) : 105 | return(maxLength + 10) 106 | else: 107 | return(-1) 108 | 109 | #Replaces the file extension of inFile to with and adds a suffix 110 | #Example replaceExtension("reads.fq", ".sam", suffix="_namg") => reads_ngm.sam 111 | def replaceExtension(inFile, newExtension, suffix=""): 112 | return os.path.splitext(inFile)[0] + suffix + newExtension 113 | 114 | #Removes right-most extension from file name 115 | def removeExtension(inFile): 116 | name = os.path.splitext(inFile)[0] 117 | ext = os.path.splitext(inFile)[1] 118 | if(ext == ".gz"): 119 | name = os.path.splitext(name)[0] 120 | return name 121 | 122 | def getchar(): 123 | print("Waiting for input", file=sys.stderr) 124 | sys.stdin.readline() 125 | 126 | def files_exist(files): 127 | if (type(files) is list) : 128 | for f in files: 129 | if not os.path.exists(f): 130 | return False 131 | else: 132 | if not os.path.exists(files): 133 | return False 134 | return True 135 | 136 | # remove a (list of) file(s) (if it/they exists) 137 | def removeFile(files): 138 | if (type(files) is list) : 139 | for f in files: 140 | if os.path.exists(f): 141 | os.remove(f) 142 | else: 143 | if os.path.exists(files): 144 | os.remove(files) 145 | 146 | 147 | def checkStep(inFiles, outFiles, force=False): 148 | if not files_exist(inFiles): 149 | raise RuntimeError("One or more input files don't exist: " + str(inFiles)) 150 | inFileDate = os.path.getmtime(inFiles[0]) 151 | for x in inFiles[1:]: 152 | inFileDate = max(inFileDate, os.path.getmtime(x)) 153 | 154 | if len(outFiles) > 0 and files_exist(outFiles): 155 | outFileDate = os.path.getmtime(outFiles[0]) 156 | for x in outFiles[1:]: 157 | 
outFileDate = min(outFileDate, os.path.getmtime(x)) 158 | if outFileDate > inFileDate: 159 | if(force == True): 160 | return True 161 | else: 162 | return False 163 | 164 | return True 165 | 166 | def getBinary(name): 167 | 168 | projectPath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 169 | 170 | return os.path.join(projectPath, "contrib", name) 171 | 172 | def getRNASeqReadSimulator(name): 173 | 174 | projectPath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 175 | 176 | return os.path.join(projectPath, "contrib", "RNASeqReadSimulator", "src", name) 177 | 178 | def getPlotter(name): 179 | 180 | projectPath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 181 | 182 | return os.path.join(projectPath, "plot", name + ".R") 183 | 184 | def run(cmd, log=sys.stderr, verbose=False, dry=False): 185 | if(verbose or dry): 186 | print(cmd, file=log) 187 | 188 | if(not dry): 189 | #ret = os.system(cmd) 190 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) 191 | lines_iterator = iter(p.stdout.readline, b"") 192 | for line in lines_iterator: 193 | print(line, end="", file=log) # yield line 194 | p.wait(); 195 | if(p.returncode != 0): 196 | raise RuntimeError("Error while executing command: \"" + cmd + "\"") 197 | 198 | def callR(cmd, log=sys.stderr, verbose=False, dry=False): 199 | 200 | if(verbose or dry): 201 | print(cmd, file=log) 202 | 203 | if(not dry): 204 | 205 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) 206 | lines_iterator = iter(p.stdout.readline, b"") 207 | for line in lines_iterator: 208 | print(line, end="", file=log) # yield line 209 | p.wait(); 210 | if(p.returncode != 0): 211 | raise RuntimeError("Error while executing command: \"" + cmd + "\"") 212 | 213 | def pysamIndex(outputBam): 214 | pysam.index(outputBam) # @UndefinedVariable 215 | 216 | def countReads(bam): 217 | bamFile = pysam.AlignmentFile(bam) 218 | mapped = 0 219 | 
unmapped = 0 220 | for read in bamFile.fetch(until_eof=True): 221 | if(not read.is_secondary and not read.is_supplementary): 222 | if(read.is_unmapped): 223 | unmapped += 1 224 | else: 225 | mapped += 1 226 | bamFile.close() 227 | return mapped, unmapped 228 | 229 | def getReadGroup(bam): 230 | bamFile = pysam.AlignmentFile(bam) 231 | header = bamFile.header 232 | bamFile.close() 233 | if('RG' in header and len(header['RG']) > 0): 234 | return header['RG'][0] 235 | else: 236 | raise RuntimeError("Could not get mapped/unmapped/filtered read counts from BAM file. RG is missing. Please rerun slamdunk filter.") 237 | 238 | def getSampleInfo(bam): 239 | sampleInfo = getReadGroup(bam) 240 | sampleInfos = sampleInfo['SM'].split(":") 241 | return SampleInfo(ID = sampleInfo['ID'], Name = sampleInfos[0], Type = sampleInfos[1], Time = sampleInfos[2]) 242 | 243 | def readSampleNames(sampleNames, bams): 244 | samples = None 245 | 246 | if(sampleNames != None and files_exist(sampleNames)): 247 | samples = {} 248 | with open(sampleNames, "r") as sampleFile: 249 | samplesReader = csv.reader(sampleFile, delimiter='\t') 250 | for row in samplesReader: 251 | samples[removeExtension(row[0])] = row[1] 252 | 253 | return samples 254 | 255 | def getSampleName(fileName, samples): 256 | if samples == None: 257 | return removeExtension(fileName) 258 | else: 259 | for key in samples: 260 | if(key in fileName): 261 | return samples[key] 262 | 263 | return 264 | 265 | def matchFile(sample, files): 266 | fileName = None 267 | for item in files: 268 | if(sample in item): 269 | if(fileName == None): 270 | fileName = item 271 | else: 272 | raise RuntimeError("Found more than one matching file in list.") 273 | 274 | return fileName 275 | 276 | def complement(seq): 277 | complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N' : 'N'} 278 | bases = list(seq) 279 | bases = [complement[base] for base in bases] 280 | return ''.join(bases) 281 | 282 | def shell(cmd): 283 | p = subprocess.Popen(cmd, 
stdout=subprocess.PIPE, shell=True) 284 | p.wait() 285 | if(p.returncode != 0): 286 | raise RuntimeError("Error while executing command: " + cmd) 287 | else: 288 | return p.communicate()[0] 289 | 290 | def shellerr(cmd, raiseError=True): 291 | p = subprocess.Popen(cmd, stderr=subprocess.PIPE, shell=True) 292 | p.wait() 293 | if(p.returncode != 0 and raiseError == True): 294 | raise RuntimeError("Error while executing command: " + cmd) 295 | else: 296 | return p.communicate()[1] 297 | -------------------------------------------------------------------------------- /slamdunk/plot/eval_halflife_per_gene_plots.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 

library(getopt)

spec = matrix(c(
  'help' , 'h', 0, "logical","print the usage of the command",
  'simulated', "s", 2,"character","Comma seperated list of simulated files",
  'slamdunk', "f", 2,"character","Comma seperated list of SlamDunk results",
  'timepoints', "t", 2,"character","Comma seperated list of time points",
  'bed', "b", 2,"character","BED file containing half lifes",
  'output', "o", 2,"character","Output pdf",
  'conversionrate', "c", 2,"character","Simulated conversion rate"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==4 ) {
  # Get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Create mismatch plots from rate tabs.\n\n")
  # Print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}

if ( is.null(opt$simulated) ) stop("arg simulated must be specified")
if ( is.null(opt$slamdunk) ) stop("arg slamdunk must be specified")
if ( is.null(opt$output) ) stop("arg output must be specified")
if ( is.null(opt$timepoints) ) stop("arg timepoints must be specified")
# Fixed: error message previously read "arg bed specified"
if ( is.null(opt$bed) ) stop("arg bed must be specified")
if ( is.null(opt$conversionrate) ) { opt$conversionrate = 0.03 }

simulatedFiles = opt$simulated
slamDunkFiles = opt$slamdunk

filesSimulated = as.character(ordered(strsplit(simulatedFiles, ",")[[1]]))
filesSlamDunk = as.character(ordered(strsplit(slamDunkFiles, ",")[[1]]))
outputFile = opt$output
timesParameter = opt$timepoints
# Time points are given in minutes on the command line; convert to hours.
times = as.numeric(strsplit(timesParameter, ",")[[1]])
times = times / 60
bedFile = opt$bed
conversionRate = as.numeric(opt$conversionrate)

# Fit the exponential saturation model a * (1 - exp(-k * t)) to the observed
# conversion rates and derive the RNA half-life (in minutes) from k.
# Returns c(half-life, plateau a, rate constant k); all NA when the fit fails.
computeHalfLife <- function(rates, timepoints) {
  a_start<-max(rates) #param a is the y value when x=0
  k_start = log(2, base = exp(1))/5

  halfLifePred = NA
  C = NA
  k = NA

  tryCatch( {
    fit = nls(rates ~ a*(1-exp(-k*(timepoints))), start=list(a=a_start,k=k_start))
    halfLifePred = log(2, base = exp(1))/coef(fit)[2] * 60
    C = coef(fit)[1]
    k = coef(fit)[2]
  }, error=function(e){})

  c(halfLifePred, C, k)
}

# Read one UTR summary file per time point and merge the conversion rates
# into one data frame (one column per time point). With perRead == TRUE the
# fraction of converted reads is used instead of the conversion rate.
mergeRates <- function(times, files, perRead) {
  mergedRates = data.frame()
  for(i in 1:length(times)) {
    time = times[i]
    simDataFile = files[i]
    simulation = read.table(simDataFile)
    colnames(simulation) = c("chr", "start", "stop", "name", "strand", "conversionRate", "readsCPM", "tCount", "tcCount", "readCount", "convertedReads", "multiMapCount")
    if(nrow(mergedRates) == 0) {
      mergedRates = simulation[, c("chr", "start", "stop", "name", "strand")]
      if(perRead == TRUE) {
        mergedRates$conversionRate = simulation$convertedReads / simulation$readCount
      } else {
        mergedRates$conversionRate = simulation$conversionRate
      }
    } else {
      if(perRead == TRUE) {
        mergedRates = cbind(mergedRates, simulation$convertedReads / simulation$readCount)
      } else {
        mergedRates = cbind(mergedRates, simulation$conversionRate)
      }
    }
  }
  colnames(mergedRates) = c("chr", "start", "stop", "name", "strand", times)
  mergedRates
}

bed = read.table(bedFile)
colnames(bed) = c("char", "start", "stop", "name", "score", "strand")

perRead = F
slamDunkMergedRates = mergeRates(times, filesSlamDunk, perRead)
simMergedRates = mergeRates(times, filesSimulated, perRead)

# One plot per UTR; a new PDF file is started every 100 UTRs.
pageNumber = 1
pdf(paste0(outputFile, "_page_", pageNumber, ".pdf"), height=6, width=9)
for(utr in 1:nrow(slamDunkMergedRates)) {
  pulseSlamDunk = data.frame(y = as.numeric(t(slamDunkMergedRates[utr, 6:(5 + length(times))])[,1]), x = times)
  pulseSimulated = data.frame(y = as.numeric(t(simMergedRates[utr, 6:(5 + length(times))])[,1]), x = times)
  yLim = conversionRate * 1.25
  yLab = "conversion rate"
  if(perRead) {
    yLab = "% of T->C reads"
  }

  # Infer half-life from the fitted saturation curves
  halfLifeResultSlamDunk = computeHalfLife(pulseSlamDunk$y, pulseSlamDunk$x)
  halfLifePred = halfLifeResultSlamDunk[1]
  halfLifeResultSimulated = computeHalfLife(pulseSimulated$y, pulseSimulated$x)
  halfLifeSim = halfLifeResultSimulated[1]
  halfLifeTruth = bed[utr, ]$score

  plot(0, type="n", main=paste0(slamDunkMergedRates[utr, ]$name, "\n half life: ", round(halfLifeTruth, digits = 0), " (truth), ", round(halfLifeSim, digits = 0), " (sim), ", round(halfLifePred, digits = 0)," (slamDunk)"), xlab="Time (hours)", ylab=yLab, ylim=c(0, yLim), xlim=c(times[1], times[length(times)]), pch=4)
  lines(pulseSimulated$x, pulseSimulated$y, type="p", col="green", lty=1, pch=4)
  lines(pulseSlamDunk$x, pulseSlamDunk$y, type="p", col="blue", lty=1, pch=4)
  legend("bottomright", c("rates (slamDunk)", "rates (simulated)", "slamDunk", "simulated", "truth"), col=c("blue", "green", "blue", "green", "grey"), lty=c(1, 1, 2, 2, 2), bty="n")

  t = 0:max(times)
  # Plot ground-truth curve from the half-life in the BED score column
  lambda = log(2) / (halfLifeTruth / 60)
  lines((1 - exp(-lambda*t)) * conversionRate ~ t, type="l", lty=2, col="grey")
  # Plot curve fitted to the simulated rates
  lines((1 - exp(-halfLifeResultSimulated[3]*t)) * halfLifeResultSimulated[2] ~ t, type="l", lty=2, col="green")
  # Plot curve fitted to the slamdunk rates
  lines((1 - exp(-halfLifeResultSlamDunk[3]*t)) * halfLifeResultSlamDunk[2] ~ t, type="l", lty=2, col="blue")

  if(utr %% 100 == 0) {
    dev.off()
    pageNumber = pageNumber + 1
    pdf(paste0(outputFile, "_page_", pageNumber, ".pdf"), height=6, width=9)
    print(paste0(outputFile, "_page_", pageNumber, ".pdf"))
  }

}
dev.off()
--------------------------------------------------------------------------------
/slamdunk/plot/merge_rate_files.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
#
# Script to merge SlamDunk count files
#
# Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
#
# This file is part of Slamdunk.
#
# Slamdunk is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Slamdunk is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .

library(getopt)

spec = matrix(c(
  'help' , 'h', 0, "logical","print the usage of the command",
  'slamdunk', "f", 2,"character","Comma seperated list of SlamDunk results",
  'output', "o", 2,"character","Output tsv",
  'column', "c", 2,"character","Column or Expression used to summarize files",
  'columnname', "n", 2,"character","Index of meta data field to use as column name"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==2 ) {
  # Get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Create mismatch plots from rate tabs.\n\n")
  # Print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}

if ( is.null(opt$slamdunk) ) stop("arg slamdunk must be specified")
if ( is.null(opt$output) ) stop("arg output must be specified")
if ( is.null(opt$column) ) { opt$column = "TcReadCount / ReadCount" }
# Fixed: this previously re-tested opt$column, so --columnname never received
# its default when only --column was supplied.
if ( is.null(opt$columnname) ) { opt$columnname = 2 }

slamDunkFiles = opt$slamdunk
filesSlamDunk = as.character(ordered(strsplit(slamDunkFiles, ",")[[1]]))
outputFile = opt$output
evalExpression = opt$column
columnName = as.integer(opt$columnname)

# Parse the two slamdunk header lines of a tcount file and return
# c(sampleID, sampleName, sampleType, sampleTime, annotationName,
#   annotationMD5, version).
readMetaInfo <- function(fileName) {
  sampleInfo = read.table(fileName, nrows = 1, comment.char = "")
  version = paste(lapply(sampleInfo[1,1:3], as.character), collapse = '\t')
  sampleID = as.character(sampleInfo[1, ]$V7)
  sampleName = as.character(sampleInfo[1, ]$V6)
  sampleType = as.character(sampleInfo[1, ]$V8)
  sampleTime = as.numeric(sampleInfo[1, ]$V9)
  sampleInfo = read.table(fileName, nrows = 1, skip = 1, comment.char = "")
  annotationMD5 = as.character(sampleInfo[1, ]$V3)
  annotationName = as.character(sampleInfo[1, ]$V2)
  c(sampleID, sampleName, sampleType, sampleTime, annotationName, annotationMD5, version)
}

sampleNumber = length(filesSlamDunk)
mergedRates = data.frame()

annotationName = ""
annotationMD5 = ""
version = ""
IDs = c()

# Merge rates from all samples
for(i in 1:length(filesSlamDunk)) {
  file = filesSlamDunk[i]
  meta = readMetaInfo(file)
  sampleName = meta[columnName]

  if(i == 1) {
    version = meta[7]
    annotationName = meta[5]
    annotationMD5 = meta[6]
  } else {
    if(annotationMD5 != meta[6]) {
      # Previously an empty branch: make a mismatch visible instead of
      # silently merging counts produced with different annotations.
      warning(paste0("Annotation MD5 of ", file, " differs from ", filesSlamDunk[1]))
    }
  }

  IDs = c(IDs, as.numeric(meta[1]))
  data = read.table(file, header = T)
  if(i == 1) {
    mergedRates = data[, c(1:6)]
    mergedRates$avgReadsCPM = data$ReadsCPM
    mergedRates$avgMultimapper = data$multimapCount
    mergedRates$avgTcontent = data$Tcontent
    mergedRates$avgCoverageOnTs = data$CoverageOnTs
  } else {
    mergedRates$avgReadsCPM = mergedRates$avgReadsCPM + data$ReadsCPM
    mergedRates$avgMultimapper = mergedRates$avgMultimapper + data$multimapCount
    mergedRates$avgTcontent = mergedRates$avgTcontent + data$Tcontent
    mergedRates$avgCoverageOnTs = mergedRates$avgCoverageOnTs + data$CoverageOnTs
  }
  # Evaluate the summary expression with the sample's columns in scope
  # (replaces the previous attach()/detach() pattern).
  mergedRates[,sampleName] = eval(parse(text=evalExpression), envir = data)
  # UTRs without reads get an explicit 0 instead of NaN
  mergedRates[data$ReadCount == 0,sampleName] = 0
}

# Compute average CPM, multimapper, T content and coverage per UTR
mergedRates$avgReadsCPM = mergedRates$avgReadsCPM / sampleNumber
mergedRates$avgMultimapper = mergedRates$avgMultimapper / sampleNumber
mergedRates$avgTcontent = mergedRates$avgTcontent / sampleNumber
mergedRates$avgCoverageOnTs = mergedRates$avgCoverageOnTs / sampleNumber

# Sort sample columns by sample ID
colNumber = length(colnames(mergedRates))
firstSampleColumn = (colNumber - sampleNumber + 1)
sampleNames = colnames(mergedRates)[firstSampleColumn:colNumber]
sampleColumnOrder = order(IDs)
mergedRates = mergedRates[, c(1:(firstSampleColumn - 1), (sampleColumnOrder + firstSampleColumn - 1))]

# Write version header, annotation header, expression header and the table
con <- file(outputFile, open="wt")
writeLines(version, con)
writeLines(paste0("#Annotation:\t", annotationName, "\t", annotationMD5), con)
writeLines(paste0("#Expression:\t", evalExpression), con)
write.table(mergedRates, con, sep = "\t", quote = F, row.names = F, col.names = T)
close(con)
--------------------------------------------------------------------------------
/slamdunk/dunks/filter.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
#
# This file is part of Slamdunk.
#
# Slamdunk is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Slamdunk is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | 20 | # Date located in: - 21 | from __future__ import print_function 22 | import pysam, random, os 23 | 24 | from slamdunk.version import __version__, __bam_version__ # @UnresolvedImport 25 | 26 | from slamdunk.utils.BedReader import bedToIntervallTree # @UnresolvedImport 27 | from slamdunk.utils.misc import checkStep, run, removeFile, getBinary, pysamIndex, SlamSeqInfo, md5 # @UnresolvedImport 28 | 29 | # def Filter_old(inputBAM, outputBAM, log, MQ=2, printOnly=False, verbose=True, force=True): 30 | # if(printOnly or checkStep([inputBAM], [outputBAM], force)): 31 | # run(" ".join([ getBinary("samtools"), "view -q", str(MQ), "-b", inputBAM, ">", outputBAM]), log, verbose=verbose, dry=printOnly) 32 | # else: 33 | # print("Skipped filtering for " + inputBAM, file=log) 34 | # 35 | # runIndexBam(outputBAM, log, verbose=verbose, dry=printOnly) 36 | # runFlagstat(outputBAM, log, verbose=verbose, dry=printOnly) 37 | 38 | def bamSort(outputBAM, log, newHeader, verbose): 39 | 40 | tmp = outputBAM + "_tmp" 41 | if(newHeader != None): 42 | pyOutputBAM = pysam.AlignmentFile(outputBAM, "rb") 43 | pyTmp = pysam.AlignmentFile(tmp, "wb", header=newHeader) 44 | for read in pyOutputBAM: 45 | pyTmp.write(read) 46 | pyOutputBAM.close() 47 | pyTmp.close() 48 | else: 49 | os.rename(outputBAM, tmp) 50 | 51 | #run(" ".join(["samtools", "sort", "-@", str(threads) , tmp, replaceExtension(outFile, "")]), log, verbose=verbose, dry=dry) 52 | run(" ".join(["samtools sort", "-o", outputBAM, tmp]), log, verbose=verbose, dry=False) 53 | #pysam.sort(tmp, outputBAM) # @UndefinedVariable 54 | removeFile(tmp) 55 | 56 | def dumpBufferToBam (buffer, multimapList, outbam, infile): 57 | # Randomly write hit from read 58 | #read = random.choice(buffer.values()).pop() 59 | read = list(buffer.values()).pop().pop() 60 | 61 | # printer = read.query_name + "\t" + 
infile.getrname(read.reference_id) + "\t" + str(read.reference_start) + "\t" + str(read.reference_end) + "\tPRINT\tTrue" 62 | read.set_tag("RD", multimapList.rstrip(" "), "Z") 63 | read.is_secondary = False 64 | read.is_supplementary = False 65 | outbam.write(read) 66 | 67 | # return printer 68 | # for key in buffer.keys(): 69 | # for read in buffer[key]: 70 | # outbam.write(read) 71 | 72 | def multimapUTRRetainment (infile, outfile, bed, minIdentity, NM, log): 73 | 74 | mappedReads = 0 75 | unmappedReads = 0 76 | filteredReads = 0 77 | 78 | mqFiltered = 0 79 | idFiltered = 0 80 | nmFiltered = 0 81 | 82 | utrIntervallTreeDict = bedToIntervallTree(bed) 83 | 84 | # debugLog = os.path.join("multimapdebug.log") 85 | # 86 | # fo = open(debugLog, "w") 87 | 88 | # Buffers for multimappers 89 | multimapBuffer = {} 90 | prevRead = "" 91 | # If read maps to another than previously recorded UTR -> do not dump reads to file 92 | dumpBuffer = True 93 | # This string tracks all multiple alignments 94 | multimapList = "" 95 | # logList = [] 96 | 97 | for read in infile: 98 | if(not read.is_secondary and not read.is_supplementary): 99 | if(read.is_unmapped): 100 | unmappedReads += 1 101 | else: 102 | mappedReads += 1 103 | 104 | # First pass general filters 105 | if(read.is_unmapped): 106 | continue 107 | if(float(read.get_tag("XI")) < minIdentity): 108 | idFiltered += 1 109 | continue 110 | if(NM > -1 and int(read.get_tag("NM")) > NM): 111 | nmFiltered += 1 112 | continue 113 | if (read.mapping_quality == 0) : 114 | # Previous read was also multimapper 115 | if (read.query_name != prevRead and prevRead != "") : 116 | 117 | #if (dumpBuffer and (len(multimapBuffer) > 1 or len(multimapBuffer["nonUTR"]) > 0)) : 118 | if (dumpBuffer and len(multimapBuffer) > 0) : 119 | dumpBufferToBam(multimapBuffer, multimapList, outfile, infile) 120 | filteredReads += 1 121 | 122 | # ret = dumpBufferToBam(multimapBuffer, outfile, infile) 123 | # print(ret,file = fo) 124 | #multimapBuffer = {} 125 | 
#multimapBuffer["nonUTR"] = [] 126 | 127 | # for entry in logList: 128 | # print(prevRead + "\t" + entry + "\t" + str(dumpBuffer), file = fo) 129 | # logList = [] 130 | 131 | dumpBuffer = True 132 | multimapList = "" 133 | multimapBuffer = {} 134 | 135 | # Query Intervall tree for given chromosome for UTs 136 | chr = infile.getrname(read.reference_id) 137 | start = read.reference_start 138 | end = read.reference_end 139 | 140 | if (chr in utrIntervallTreeDict) : 141 | query = utrIntervallTreeDict[chr][start:end] 142 | else : 143 | query = set() 144 | 145 | if len(query) > 0: 146 | # First UTR hit is recorded without checks 147 | if (len(multimapBuffer) == 0) : 148 | for result in query : 149 | if (not result.data in multimapBuffer) : 150 | multimapBuffer[result.data] = [] 151 | multimapBuffer[result.data].append(read) 152 | # Second UTR hit looks at previous UTR hits -> no dump if hit on different UTR 153 | else : 154 | for result in query : 155 | if (not result.data in multimapBuffer) : 156 | multimapBuffer[result.data] = [] 157 | multimapBuffer[result.data].append(read) 158 | dumpBuffer = False 159 | else : 160 | multimapBuffer[result.data].append(read) 161 | 162 | # else : 163 | # # If no overlap -> nonUTR 164 | # multimapBuffer["nonUTR"].append(read) 165 | # for result in query : 166 | # logList.append(chr + "\t" + str(start) + "\t" + str(end) + "\t" + result.data) 167 | # else : 168 | # logList.append(chr + "\t" + str(start) + "\t" + str(end) + "\t" + "OFF") 169 | 170 | multimapList = multimapList + chr + ":" + str(start) + "-" + str(end) + " " 171 | 172 | prevRead = read.query_name 173 | else : 174 | # Dump any multimappers before a unique mapper 175 | #if (len(multimapBuffer) > 1 or len(multimapBuffer["nonUTR"]) > 0) : 176 | if (len(multimapBuffer) > 0) : 177 | if (dumpBuffer) : 178 | dumpBufferToBam(multimapBuffer, multimapList, outfile, infile) 179 | filteredReads += 1 180 | # ret = dumpBufferToBam(multimapBuffer, outfile, infile) 181 | # print(ret,file = 
fo) 182 | multimapBuffer = {} 183 | # for entry in logList: 184 | # print(prevRead + "\t" + entry + "\t" + str(dumpBuffer), file = fo) 185 | # logList = [] 186 | #multimapBuffer["nonUTR"] = [] 187 | dumpBuffer = True 188 | multimapList = "" 189 | 190 | # Record all unique mappers 191 | prevRead = read.query_name 192 | outfile.write(read) 193 | filteredReads += 1 194 | 195 | # Dump last portion if it was multimapper 196 | #if (dumpBuffer and (len(multimapBuffer) > 1 or len(multimapBuffer["nonUTR"]) > 0)) : 197 | if (dumpBuffer and len(multimapBuffer) > 0) : 198 | dumpBufferToBam(multimapBuffer, multimapList, outfile, infile) 199 | filteredReads += 1 200 | 201 | multimapper = mappedReads - filteredReads - idFiltered - nmFiltered 202 | 203 | print("Criterion\tFiltered reads",file=log) 204 | print("MQ < 0\t0",file=log) 205 | print("ID < " + str(minIdentity) + "\t" + str(idFiltered),file=log) 206 | print("NM > " + str(NM) + "\t" + str(nmFiltered),file=log) 207 | print("MM\t" + str(multimapper),file=log) 208 | 209 | # fo.close() 210 | return mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper 211 | 212 | 213 | def Filter(inputBAM, outputBAM, log, bed, MQ=2, minIdentity=0.8, NM=-1, printOnly=False, verbose=True, force=False): 214 | if(printOnly or checkStep([inputBAM], [outputBAM], force)): 215 | 216 | mappedReads = 0 217 | unmappedReads = 0 218 | filteredReads = 0 219 | 220 | mqFiltered = 0 221 | idFiltered = 0 222 | nmFiltered = 0 223 | multimapper = 0 224 | 225 | infile = pysam.AlignmentFile(inputBAM, "rb") 226 | outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile) 227 | 228 | # Default filtering without bed 229 | if (bed == None) : 230 | 231 | print("#No bed-file supplied. 
Running default filtering on " + inputBAM + ".",file=log) 232 | 233 | for read in infile: 234 | 235 | if(not read.is_secondary and not read.is_supplementary): 236 | if(read.is_unmapped): 237 | unmappedReads += 1 238 | else: 239 | mappedReads += 1 240 | 241 | if(read.is_unmapped): 242 | continue 243 | if(read.mapping_quality < MQ): 244 | mqFiltered += 1 245 | continue 246 | if(float(read.get_tag("XI")) < minIdentity): 247 | idFiltered += 1 248 | continue 249 | if(NM > -1 and int(read.get_tag("NM")) > NM): 250 | nmFiltered += 1 251 | continue 252 | 253 | if(not read.is_secondary and not read.is_supplementary): 254 | filteredReads += 1 255 | 256 | outfile.write(read) 257 | 258 | print("Criterion\tFiltered reads",file=log) 259 | print("MQ < " + str(MQ) + "\t" + str(mqFiltered),file=log) 260 | print("ID < " + str(minIdentity) + "\t" + str(idFiltered),file=log) 261 | print("NM > " + str(NM) + "\t" + str(nmFiltered),file=log) 262 | print("MM\t0",file=log) 263 | else : 264 | # Multimap retention strategy filtering when bed is supplied 265 | 266 | random.seed(1) 267 | 268 | print("#Bed-file supplied. 
Running multimap retention filtering strategy on " + inputBAM + ".",file=log) 269 | 270 | mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper = multimapUTRRetainment (infile, outfile, bed, minIdentity, NM, log) 271 | #mappedReads, unmappedReads, filteredReads = multimapUTRRetainment (infile, outfile, bed, minIdentity, NM, log) 272 | 273 | # Add number of sequenced and number of mapped reads to the read group description 274 | # Used for creating summary file 275 | inFileBamHeader = outfile.header 276 | if('RG' in inFileBamHeader and len(inFileBamHeader['RG']) > 0): 277 | slamseqInfo = SlamSeqInfo() 278 | slamseqInfo.SequencedReads = mappedReads + unmappedReads 279 | slamseqInfo.MappedReads = mappedReads 280 | slamseqInfo.FilteredReads = filteredReads 281 | slamseqInfo.MQFilteredReads = mqFiltered 282 | slamseqInfo.IdFilteredReads = idFiltered 283 | slamseqInfo.NmFilteredReads = nmFiltered 284 | slamseqInfo.MultimapperReads = multimapper 285 | 286 | if (bed != None) : 287 | slamseqInfo.AnnotationName = os.path.basename(bed) 288 | slamseqInfo.AnnotationMD5 = md5(bed) 289 | else : 290 | slamseqInfo.AnnotationName = "" 291 | slamseqInfo.AnnotationMD5 = "" 292 | 293 | if not isinstance(inFileBamHeader, dict): 294 | inFileBamHeader = inFileBamHeader.to_dict() 295 | inFileBamHeader['RG'][0]['DS'] = str(slamseqInfo) 296 | #inFileBamHeader['RG'][0]['DS'] = "{'sequenced':" + str(mappedReads + unmappedReads) + "," + "'mapped':" + str(mappedReads) + "," + "'filtered':" + str(filteredReads) + "}" 297 | 298 | slamDunkPG = { 'ID': 'slamdunk', 'PN': 'slamdunk filter v' + __version__, 'VN': __bam_version__ } 299 | if('PG' in inFileBamHeader): 300 | inFileBamHeader['PG'].append(slamDunkPG) 301 | else: 302 | inFileBamHeader['PG'] = [ slamDunkPG ] 303 | 304 | infile.close() 305 | outfile.close() 306 | 307 | # Sort afterwards 308 | bamSort(outputBAM, log, inFileBamHeader, verbose) 309 | 310 | pysamIndex(outputBAM) 311 | #pysamFlagstat(outputBAM) 
312 | #runFlagstat(outputBAM, log, verbose=verbose, dry=printOnly) 313 | 314 | else: 315 | print("Skipped filtering for " + inputBAM, file=log) 316 | --------------------------------------------------------------------------------