├── slamdunk
├── __init__.py
├── dunks
│ ├── __init__.py
│ ├── dump.py
│ ├── snps.py
│ ├── deduplicator.py
│ ├── mapper.py
│ └── filter.py
├── plot
│ ├── __init__.py
│ ├── checkLibraries.R
│ ├── PCAPlotter.R
│ ├── conversion_per_read_position.R
│ ├── compute_context_TC_rates.R
│ ├── compute_sample_comparison_statistics.R
│ ├── compute_overall_rates.R
│ ├── compute_conversion_rate_mle.R
│ ├── splash_eval_count_files.R
│ ├── compute_halflifes.R
│ ├── SNPeval.R
│ ├── globalRatePlotter.R
│ ├── eval_halflifes_error_plot.R
│ ├── eval_conversion_rate_plots.R
│ ├── eval_halflife_per_gene_plots.R
│ └── merge_rate_files.R
├── slamseq
│ └── __init__.py
├── test
│ ├── __init__.py
│ ├── data
│ │ ├── actb.bed
│ │ ├── reads_slamdunk_mapped_filtered_tcount.tsv
│ │ └── reads.fq
│ ├── test_sample.py
│ └── test_sample.sh
├── utils
│ ├── __init__.py
│ ├── BedReader.py
│ ├── SNPtools.py
│ └── misc.py
├── contrib
│ └── RNASeqReadSimulator
│ │ ├── src
│ │ ├── getSegs.pyc
│ │ ├── splitfasta.py
│ │ ├── addvariation2splicingbed.py
│ │ ├── getSegs.py
│ │ ├── genexplvprofile.py
│ │ ├── getseqfrombed.py
│ │ └── gensimreads.py
│ │ ├── demo
│ │ ├── input
│ │ │ ├── samplereaderror.txt
│ │ │ ├── sampleposbias.txt
│ │ │ └── sample.bed
│ │ ├── gensingleendreads.sh
│ │ ├── genstrandedreads.sh
│ │ └── genpairedendreads.sh
│ │ └── README
└── version.py
├── MANIFEST.in
├── .settings
└── .gitignore
├── requirements.txt
├── .gitignore
├── environment.yml
├── hooks
└── build
├── bin
├── splash
├── alleyoop
├── slamdunk
└── _preamble.py
├── .travis.yml
├── Dockerfile
├── README.md
└── setup.py
/slamdunk/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
--------------------------------------------------------------------------------
/slamdunk/dunks/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/slamdunk/plot/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/slamdunk/slamseq/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/slamdunk/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/slamdunk/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.settings/.gitignore:
--------------------------------------------------------------------------------
1 | /org.eclipse.core.resources.prefs
2 |
--------------------------------------------------------------------------------
/slamdunk/test/data/actb.bed:
--------------------------------------------------------------------------------
1 | chr5 120498 122492 Actb 0 +
2 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | joblib>=0.9.4
2 | pybedtools>=0.6.4
3 | intervaltree>=2.1.0
4 | pandas>=0.13.1
5 | biopython>=1.63
6 | pysam>=0.8.3
7 | Cython>=0.20.1
--------------------------------------------------------------------------------
/slamdunk/contrib/RNASeqReadSimulator/src/getSegs.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/t-neumann/slamdunk/HEAD/slamdunk/contrib/RNASeqReadSimulator/src/getSegs.pyc
--------------------------------------------------------------------------------
/slamdunk/test/test_sample.py:
--------------------------------------------------------------------------------
1 | # content of test_sample.py
def func(x):
    """Return ``x`` increased by one."""
    incremented = x + 1
    return incremented
4 |
5 | #def test_run():
6 |
7 |
def test_answer():
    """Check that ``func`` turns 4 into 5."""
    expected = 5
    actual = func(4)
    assert actual == expected
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.project
2 | /.pydevproject
3 | /sync.sh
4 | *.pyc
5 | .DS_Store
6 | /doc/build/
7 | /bin/NextGenMap/
8 | /bin/ngm
9 | /slamdunk/plot/Rslamdunk
10 | .Rhistory
11 | .cache
12 | *-enc.2.ngm
13 | *-ht-13-2.3.ngm
14 | *.fai
15 |
16 |
--------------------------------------------------------------------------------
/slamdunk/test/data/reads_slamdunk_mapped_filtered_tcount.tsv:
--------------------------------------------------------------------------------
1 | Chromosome Start End Name Length Strand ConversionRate ReadsCPM Tcontent CoverageOnTs ConversionsOnTs ReadCount TcReadCount multimapCount ConversionRateLower ConversionRateUpper
2 | chr5 120498 122492 Actb 1994 + 0.022222222222222223 666666.6666666666 445 90 2 8 4 0 -1.0 -1.0
3 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: slamdunk
2 | dependencies:
3 | - nextgenmap=0.5.5
4 | - samtools=1.10
5 | - varscan=2.4.4
6 | - r-tidyverse=1.3.0
7 | - r-matrixstats=0.55.0
8 | - r-gridextra=2.3
9 | - r-getopt=1.20.3
10 | - joblib=0.14.0
11 | - pandas=0.25.3
12 | - cython=0.29.14
13 | - biopython=1.74
14 | - pybedtools=0.8.0
15 | - intervaltree=3.0.2
16 |
--------------------------------------------------------------------------------
/hooks/build:
--------------------------------------------------------------------------------
#!/usr/bin/env sh
# Build hook: builds the Docker image and passes the slamdunk git ref to
# install as the VERSION_ARG build argument.
#   DOCKER_TAG  tag being built (read from the environment)
#   IMAGE_NAME  name given to `docker build -t` (read from the environment)

if [ "$DOCKER_TAG" = "latest" ]; then
    echo "Building :latest, without VERSION_ARG"
    # Take the first tag_name found in the GitHub releases listing.
    TAG=$(curl -s https://api.github.com/repos/t-neumann/slamdunk/releases | grep tag_name | head -n 1 | cut -d '"' -f 4)
    # An empty tag (network failure, API rate limit) would otherwise be passed
    # on as an empty VERSION_ARG; stop here with a clear message instead.
    if [ -z "$TAG" ]; then
        echo "Could not determine a slamdunk release tag from the GitHub API" >&2
        exit 1
    fi
    echo "$TAG"
    docker build --build-arg VERSION_ARG="$TAG" -t "${IMAGE_NAME}" .
else
    echo "Building :$DOCKER_TAG, with VERSION_ARG=\"--vers $DOCKER_TAG\""
    docker build --build-arg VERSION_ARG="$DOCKER_TAG" -t "${IMAGE_NAME}" .
fi
--------------------------------------------------------------------------------
/slamdunk/test/test_sample.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# End-to-end check: run `slamdunk all` on the bundled reads and compare the
# resulting count table (without its "#" header lines) to the expected table.

# Stop at the first failing step so its exit status is the one reported.
set -e

slamdunk all -r slamdunk/test/data/ref.fa -b slamdunk/test/data/actb.bed -o slamdunk/test/data/output -rl 100 -mbq 27 -5 0 slamdunk/test/data/reads.fq

grep -v "^#" slamdunk/test/data/output/count/reads_slamdunk_mapped_filtered_tcount.tsv > slamdunk/test/data/output/count/reads_slamdunk_mapped_filtered_tcount_noheader.tsv

diff slamdunk/test/data/reads_slamdunk_mapped_filtered_tcount.tsv slamdunk/test/data/output/count/reads_slamdunk_mapped_filtered_tcount_noheader.tsv
7 |
8 |
--------------------------------------------------------------------------------
/slamdunk/contrib/RNASeqReadSimulator/src/splitfasta.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
Split fasta files including paired-end reads into two separate files
"""
from __future__ import print_function
import sys
import re


def parse_output_prefix(argv):
    """Return the value following the last ``-o`` in ``argv``, or ``None``.

    A trailing ``-o`` without a value is treated as missing instead of
    raising ``IndexError``.
    """
    prefix = None
    for position, argument in enumerate(argv[:-1]):
        if argument == "-o":
            prefix = argv[position + 1]
    return prefix


def split_fasta(source, left_out, right_out):
    """Copy FASTA records from ``source`` to ``left_out`` or ``right_out``.

    A header whose stripped text ends in ``1`` sends the record (header and
    the lines after it) to ``left_out``; any other header sends it to
    ``right_out``. Every ``/1`` or ``/2`` is removed from header lines.
    Lines seen before the first header go to ``left_out``.
    """
    is_left = True
    for line in source:
        if line.startswith('>'):
            # Decide the side before the mate suffix is stripped.
            is_left = line.strip()[-1] == '1'
            line = re.sub("/[12]", "", line)
        target = left_out if is_left else right_out
        target.write(line)


def main(argv=None):
    """Read FASTA from STDIN and write ``<prefix>_1.fa`` and ``<prefix>_2.fa``."""
    argv = sys.argv if argv is None else argv
    prefix = parse_output_prefix(argv)
    if prefix is None:
        print("Usage: splitfasta.py -o OUTPUT_PREFIX < reads.fa", file=sys.stderr)
        sys.exit(-1)

    # Context managers close both files even if reading STDIN fails.
    with open(prefix + "_1.fa", "w") as left_out, \
            open(prefix + "_2.fa", "w") as right_out:
        split_fasta(sys.stdin, left_out, right_out)


if __name__ == "__main__":
    main()
42 |
--------------------------------------------------------------------------------
/bin/splash:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
4 | #
5 | # This file is part of Slamdunk.
6 | #
7 | # Slamdunk is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Affero General Public License as
9 | # published by the Free Software Foundation, either version 3 of the
10 | # License, or (at your option) any later version.
11 | #
12 | # Slamdunk is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 |
import sys

try:
    # bin/_preamble.py puts a source checkout on sys.path when one is found.
    import _preamble
except ImportError:
    # sys.exc_clear() exists only on Python 2; calling it unconditionally
    # raises AttributeError on Python 3 whenever _preamble is not importable.
    if hasattr(sys, "exc_clear"):
        sys.exc_clear()

from slamdunk import splash
splash.run()
--------------------------------------------------------------------------------
/bin/alleyoop:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
4 | #
5 | # This file is part of Slamdunk.
6 | #
7 | # Slamdunk is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Affero General Public License as
9 | # published by the Free Software Foundation, either version 3 of the
10 | # License, or (at your option) any later version.
11 | #
12 | # Slamdunk is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 |
import sys

try:
    # bin/_preamble.py puts a source checkout on sys.path when one is found.
    import _preamble
except ImportError:
    # sys.exc_clear() exists only on Python 2; calling it unconditionally
    # raises AttributeError on Python 3 whenever _preamble is not importable.
    if hasattr(sys, "exc_clear"):
        sys.exc_clear()

from slamdunk import alleyoop
alleyoop.run()
--------------------------------------------------------------------------------
/bin/slamdunk:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
4 | #
5 | # This file is part of Slamdunk.
6 | #
7 | # Slamdunk is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Affero General Public License as
9 | # published by the Free Software Foundation, either version 3 of the
10 | # License, or (at your option) any later version.
11 | #
12 | # Slamdunk is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 |
import sys

try:
    # bin/_preamble.py puts a source checkout on sys.path when one is found.
    import _preamble
except ImportError:
    # sys.exc_clear() exists only on Python 2; calling it unconditionally
    # raises AttributeError on Python 3 whenever _preamble is not importable.
    if hasattr(sys, "exc_clear"):
        sys.exc_clear()

from slamdunk import slamdunk
slamdunk.run()
29 |
--------------------------------------------------------------------------------
/slamdunk/version.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
2 | #
3 | # This file is part of Slamdunk.
4 | #
5 | # Slamdunk is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Affero General Public License as
7 | # published by the Free Software Foundation, either version 3 of the
8 | # License, or (at your option) any later version.
9 | #
10 | # Slamdunk is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU Affero General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Affero General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | # Overall slamDunk version
19 | __version__ = "0.4.3"
20 | # File format version of BAM files from slamdunk filter
21 | __bam_version__ = "3"
22 | # File format version of count files from slamdunk count
23 | __count_version__ = "3"
24 | # Required NextGenMap version
25 | __ngm_version__ = "0.5.5"
26 |
--------------------------------------------------------------------------------
/bin/_preamble.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
2 | #
3 | # This file is part of Slamdunk.
4 | #
5 | # Slamdunk is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Affero General Public License as
7 | # published by the Free Software Foundation, either version 3 of the
8 | # License, or (at your option) any later version.
9 | #
10 | # Slamdunk is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU Affero General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Affero General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import sys, os
19 |
def _locate_checkout(start):
    """Walk upwards from ``start`` looking for a ``slamdunk`` package.

    Returns ``(directory, found)``: ``directory`` is the first ancestor
    (``start`` included, filesystem root excluded) that contains
    ``slamdunk/__init__.py``, or the filesystem root when none does.
    """
    candidate = start
    while True:
        parent = os.path.dirname(candidate)
        if parent == candidate:
            # Reached the root without a match.
            return candidate, False
        if os.path.exists(os.path.join(candidate, 'slamdunk', '__init__.py')):
            return candidate, True
        candidate = parent


path, _found = _locate_checkout(os.path.abspath(sys.argv[0]))
if _found:
    # Prefer the source checkout over any installed copy.
    sys.path.insert(0, path)
28 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | # We don't actually use the Travis Python, but this keeps it organized.
4 | - "2.7"
5 | - "3.5"
6 | - "3.6"
7 | - "3.7"
8 |
before_install:

  # Here we just install Miniconda, which you shouldn't have to change.
  - if [ "$TRAVIS_OS_NAME" == "osx" ]; then
      wget https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh;
    else
      wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
    fi
  - chmod +x miniconda.sh
  - ./miniconda.sh -b -p $HOME/miniconda
  # Use $HOME instead of a hard-coded /home/travis so PATH always matches the
  # "-p $HOME/miniconda" install prefix above, whatever the build image.
  - export PATH=$HOME/miniconda/bin:$PATH
  - conda update --yes conda
  - conda config --add channels defaults
  - conda config --add channels bioconda
  - conda config --add channels conda-forge
24 |
25 | install:
26 |
27 | # We just set up a conda environment with the right Python version. This
28 | # should not need changing.
29 |
30 | - conda env create -f environment.yml
31 | - source activate slamdunk
32 | - pip install pytest
33 | - pip install .
34 |
35 | # command to run tests
36 | script:
37 | - slamdunk -h
38 | - alleyoop -h
39 | - splash -h
40 | - slamdunk/test/test_sample.sh
41 | - pytest
42 |
--------------------------------------------------------------------------------
/slamdunk/plot/checkLibraries.R:
--------------------------------------------------------------------------------
1 | # Helper function to check whether Rslamdunk libraries are available
2 | # Install if libraries are not available
3 |
4 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
5 | #
6 | # This file is part of Slamdunk.
7 | #
8 | # Slamdunk is free software: you can redistribute it and/or modify
9 | # it under the terms of the GNU Affero General Public License as
10 | # published by the Free Software Foundation, either version 3 of the
11 | # License, or (at your option) any later version.
12 | #
13 | # Slamdunk is distributed in the hope that it will be useful,
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | # GNU Affero General Public License for more details.
17 | #
18 | # You should have received a copy of the GNU Affero General Public License
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 |
checkLib <- function(libLoc) {
  # Install into libLoc every required package that is not already
  # installed there.

  required <- c("getopt","ggplot2","gridExtra","RColorBrewer","lattice","matrixStats","assertthat","lazyeval","tibble")
  present <- installed.packages(lib.loc = libLoc)[,"Package"]

  # Keep only the required packages that have no match among the installed ones.
  absent <- required[is.na(match(required, present))]

  if (length(absent)) {
    install.packages(absent, repos="http://cran.wu.ac.at/", lib = libLoc, dependencies = TRUE)
  }
}
--------------------------------------------------------------------------------
/slamdunk/test/data/reads.fq:
--------------------------------------------------------------------------------
1 | @Read1_1_0
2 | CCGTTTGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA
3 | +
4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
5 | @Read2_0_0
6 | TTTGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA
7 | +
8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
9 | @Read3_1_0
10 | CTTGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA
11 | +
12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
13 | @Read4_1_1
14 | TCTGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA
15 | +
16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
17 | @Read5_2_2
18 | CCGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA
19 | +
20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
21 | @Read6_1_0
22 | TTGTGTAAGGTAAGGCGTGCACTTTTATTGGTCTCA
23 | +
24 | FFFFFFFFFFFFFFF.
17 |
FROM continuumio/miniconda3:4.7.12

# MAINTAINER is deprecated; a label carries the same information.
LABEL maintainer="Tobias Neumann"

# Git ref of slamdunk to install (used after "@" in the pip URL below);
# supplied with --build-arg.
ARG VERSION_ARG

COPY environment.yml /tmp/environment.yml

RUN apt-get update \
    && apt-get install -y procps \
    && apt-get clean -y \
    && rm -rf /var/lib/apt/lists/* \
    && conda config --add channels defaults \
    && conda config --add channels bioconda \
    && conda config --add channels conda-forge \
    && conda env create --name slamdunk -f /tmp/environment.yml \
    && /opt/conda/envs/slamdunk/bin/pip install git+https://github.com/t-neumann/slamdunk.git@${VERSION_ARG} \
    && rm -rf /opt/conda/pkgs/*

# key=value form; the space-separated ENV form is legacy syntax.
ENV PATH=/opt/conda/envs/slamdunk/bin:$PATH
38 |
--------------------------------------------------------------------------------
/slamdunk/contrib/RNASeqReadSimulator/demo/gensingleendreads.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Demo: simulate single-end reads for the transcripts in $BED and write them
# to $FASTAFILE as FASTA.

#-------------------------------------------
# Parameters
# Required parameters:
# Transcript annotation (BED file)
BED=input/sample.bed

# output FASTA file
FASTAFILE=output/single.fa

# reference chromosome
REFERENCE=input/reference.fa

# Optional parameters
# Read length
READLEN=75

# Number of reads generated
NREAD=100000

# positional bias file
POSBIAS=input/sampleposbias.txt

# Read error position profile
READERR=input/samplereaderror.txt

# Intermediate files
# File for random expression level assignment
RANDEXPLV=output/explvprofile.txt

#-----------------------------------------------
# Add paths if users don't install the script
export PATH=../src/:$PATH

# -p: succeed when the directory already exists
mkdir -p output

# Randomly assign weights to each transcript
echo "Commands to randomly assign weights to each transcript:"
echo "genexplvprofile.py $BED > $RANDEXPLV"

genexplvprofile.py "$BED" > "$RANDEXPLV"

# Simulate reads (output to STDOUT in BED format).
# Single-end reads: the "-p" option is not used.
# Commands are kept as arrays so arguments survive paths containing spaces.
CMD1=(gensimreads.py -e "$RANDEXPLV" -n "$NREAD" -b "$POSBIAS" -l "$READLEN" "$BED")

echo "Commands to generate simulated single-end reads in BED format:"
echo "${CMD1[@]}"

# Convert the BED stream to FASTA
CMD2=(getseqfrombed.py -b "$READERR" -f A -r 0.01 -l "$READLEN" - "$REFERENCE")

echo "Commands to generate FASTA file from the last command:"
echo "${CMD2[@]}"
echo "Output FASTA prefix: $FASTAFILE"

# Run both commands as one pipeline; splitfasta.py is not needed for
# single-end reads.
"${CMD1[@]}" | "${CMD2[@]}" > "$FASTAFILE"
69 |
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
/slamdunk/contrib/RNASeqReadSimulator/demo/genstrandedreads.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Demo: simulate stranded single-end reads for the transcripts in $BED and
# write them to $FASTAFILE as FASTA.

#-------------------------------------------
# Parameters
# Required parameters:
# Transcript annotation (BED file)
BED=input/sample.bed

# output FASTA file
FASTAFILE=output/single-stranded.fa

# reference chromosome
REFERENCE=input/reference.fa

# Optional parameters
# Read length
READLEN=75

# Number of reads generated
NREAD=100000

# positional bias file
POSBIAS=input/sampleposbias.txt

# Read error position profile
READERR=input/samplereaderror.txt

# Intermediate files
# File for random expression level assignment
RANDEXPLV=output/explvprofile.txt

#-----------------------------------------------
# Add paths if users don't install the script
export PATH=../src/:$PATH

# -p: succeed when the directory already exists
mkdir -p output

# Randomly assign weights to each transcript
echo "Commands to randomly assign weights to each transcript:"
echo "genexplvprofile.py $BED > $RANDEXPLV"

genexplvprofile.py "$BED" > "$RANDEXPLV"

# Simulate reads (output to STDOUT in BED format).
# Single-end reads: the "-p" option is not used.
# Commands are kept as arrays so arguments survive paths containing spaces.
CMD1=(gensimreads.py --stranded -e "$RANDEXPLV" -n "$NREAD" -b "$POSBIAS" -l "$READLEN" "$BED")

echo "Commands to generate simulated stranded single-end reads in BED format:"
echo "${CMD1[@]}"

# Convert the BED stream to FASTA
CMD2=(getseqfrombed.py -b "$READERR" -f A -r 0.01 -l "$READLEN" - "$REFERENCE")

echo "Commands to generate FASTA file from the last command:"
echo "${CMD2[@]}"
echo "Output FASTA prefix: $FASTAFILE"

# Run both commands as one pipeline; splitfasta.py is not needed for
# single-end reads.
"${CMD1[@]}" | "${CMD2[@]}" > "$FASTAFILE"
69 |
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ### Streamlining SLAM-Seq analysis with ultra-high sensitivity.
4 |
5 | [GitHub release](https://github.com/t-neumann/slamdunk/releases/latest)
6 | [Travis CI build status](https://travis-ci.org/t-neumann/slamdunk)
7 |
8 | [Docker Hub image](https://hub.docker.com/r/tobneu/slamdunk)
9 | [Docker Hub builds](https://hub.docker.com/r/tobneu/slamdunk/builds/)
10 |
11 | [Bioconda recipe](http://bioconda.github.io/recipes/slamdunk/README.html)
12 | [Anaconda.org package](https://anaconda.org/bioconda/slamdunk)
14 | [Anaconda.org package](https://anaconda.org/bioconda/slamdunk)
16 |
17 | [PyPI package](https://pypi.python.org/pypi/slamdunk)
18 | 
19 |
20 | -----
21 |
22 | ### Slamdunk documentation
23 |
24 | http://t-neumann.github.io/slamdunk
25 |
26 | ### nf-core slamseq workflow
27 |
28 | [nf-core/slamseq](https://nf-co.re/slamseq)
29 |
30 | ### Please cite
31 |
32 | Neumann, T., Herzog, V. A., Muhar, M., Haeseler, von, A., Zuber, J., Ameres, S. L., & Rescheneder, P. (2019). [Quantification of experimentally induced nucleotide conversions in high-throughput sequencing datasets](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2849-7). BMC Bioinformatics, 20(1), 258. http://doi.org/10.1186/s12859-019-2849-7
33 |
34 |
--------------------------------------------------------------------------------
/slamdunk/dunks/dump.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
4 | #
5 | # This file is part of Slamdunk.
6 | #
7 | # Slamdunk is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Affero General Public License as
9 | # published by the Free Software Foundation, either version 3 of the
10 | # License, or (at your option) any later version.
11 | #
12 | # Slamdunk is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 |
20 | from __future__ import print_function
21 |
22 | from slamdunk.utils.misc import checkStep # @UnresolvedImport
23 | from slamdunk.slamseq.SlamSeqFile import SlamSeqBamFile, SlamSeqWriter # @UnresolvedImport
24 | from slamdunk.utils import SNPtools # @UnresolvedImport
25 |
def dumpReadInfo(referenceFile, bam, minQual, outputCSV, snpsFile, log, printOnly=False, verbose=True, force=False):
    """Write every read of ``bam`` to ``outputCSV`` through a ``SlamSeqWriter``.

    The step is skipped, with a message printed to ``log``, when
    ``checkStep`` reports it does not need to run for the inputs
    ``[bam, referenceFile]`` and the output ``[outputCSV]``.

    ``snpsFile`` is loaded into a ``SNPDictionary`` that is handed to the
    BAM reader. ``minQual``, ``printOnly`` and ``verbose`` are accepted but
    not used by this function.
    """
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing T->C per reads position for file " + bam, file=log)
        return

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    outputFile = SlamSeqWriter(outputCSV)
    try:
        # Go through one chromosome after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)
        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome):
                outputFile.write(read)
    finally:
        # Close the writer even when reading the BAM file fails part-way.
        outputFile.close()
--------------------------------------------------------------------------------
/slamdunk/contrib/RNASeqReadSimulator/demo/genpairedendreads.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Demo: simulate paired-end reads for the transcripts in $BED and split them
# into ${FASTAFILE}_1.fa and ${FASTAFILE}_2.fa.

#-------------------------------------------
# Parameters
# Required parameters:
# Transcript annotation (BED file)
BED=input/sample.bed

# output FASTA prefix
FASTAFILE=output/paired

# reference chromosome
REFERENCE=input/reference.fa

# Optional parameters
# Read length
READLEN=75

# Number of reads generated
NREAD=100000

# The mean and std of spans for paired-end reads.
PAIREDEND="200,20"

# positional bias file
POSBIAS=input/sampleposbias.txt

# Read error position profile
READERR=input/samplereaderror.txt

# Intermediate files
# File for random expression level assignment
RANDEXPLV=output/explvprofile.txt

#-----------------------------------------------
# Add paths if users don't install the script
export PATH=../src/:$PATH

# -p: succeed when the directory already exists
mkdir -p output

# Randomly assign weights to each transcript
echo "Commands to randomly assign weights to each transcript:"
echo "genexplvprofile.py $BED > $RANDEXPLV"

genexplvprofile.py "$BED" > "$RANDEXPLV"

# Simulate reads (output to STDOUT in BED format).
# If you want single-end reads, don't use the "-p" option.
# Commands are kept as arrays so arguments survive paths containing spaces.
CMD1=(gensimreads.py -e "$RANDEXPLV" -n "$NREAD" -b "$POSBIAS" -l "$READLEN" -p "$PAIREDEND" "$BED")

echo "Commands to generate simulated paired-reads in BED format:"
echo "${CMD1[@]}"

# Convert the BED stream to FASTA
CMD2=(getseqfrombed.py -b "$READERR" -f A -r 0.01 -l "$READLEN" - "$REFERENCE")

echo "Commands to generate FASTA file from the last command:"
echo "${CMD2[@]}"
echo "Output FASTA prefix: $FASTAFILE"

# Run the commands as one pipeline.
# If you want single-end reads, do not use splitfasta.py. Instead, execute:
# "${CMD1[@]}" | "${CMD2[@]}" > "$FASTAFILE"
"${CMD1[@]}" | "${CMD2[@]}" | splitfasta.py -o "$FASTAFILE"
76 |
77 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
/slamdunk/contrib/RNASeqReadSimulator/src/addvariation2splicingbed.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | This script is used to add splicing variations from STDIN .BED file.
4 |
5 | Usage: addvariation2splicingbed.py {OPTIONS}
6 |
7 | OPTIONS
8 |
9 | ATTENTION:
10 |
11 | HISTORY
12 | 01/09/2012
13 |
14 | """
15 | from __future__ import print_function;
16 | import sys;
17 | import subprocess;
18 | import pydoc;
19 | import os;
20 | import random;
21 | import bisect;
22 | import math;
23 | from getSegs import *;
24 |
25 | import pdb;
26 |
27 |
28 | errorrate=0.2;
29 | onbedfile="-";
30 |
31 |
32 | for i in range(len(sys.argv)):
33 | if sys.argv[i]=='-h':
34 | print(pydoc.render_doc(sys.modules[__name__]));
35 | sys.exit();
36 | if i.
19 |
20 | from __future__ import print_function
21 | import subprocess
22 | import csv
23 | from slamdunk.utils.misc import checkStep, getBinary # @UnresolvedImport
24 |
def SNPs(inputBAM, outputSNP, referenceFile, minVarFreq, minCov, minQual, log, printOnly=False, verbose=True, force=False):
    """Call SNPs by piping ``samtools mpileup`` into ``varscan mpileup2snp``.

    The step is skipped, with a message printed to ``log``, when
    ``checkStep`` reports it does not need to run for the inputs
    ``[inputBAM, referenceFile]`` and the output ``[outputSNP]``.

    Parameters:
        inputBAM, referenceFile: paths passed to ``samtools mpileup``.
        outputSNP: file that receives varscan's standard output. It is
            opened for writing (and therefore truncated) even when
            ``printOnly`` is set.
        minVarFreq, minCov: values for ``--min-var-freq`` / ``--min-coverage``.
        minQual: accepted but not used by this function.
        log: file object receiving the echoed commands and both tools' stderr.
        printOnly: when true, the commands are not executed.
        verbose: when true, each command line is printed to ``log``.
        force: forwarded to ``checkStep``.
    """
    if not checkStep([inputBAM, referenceFile], [outputSNP], force):
        print("Skipping SNP calling", file=log)
        return

    # Argument lists instead of shell strings: file names containing spaces
    # or shell metacharacters are passed through untouched.
    mpileupArgs = ["samtools", "mpileup", "-B", "-A", "-f", referenceFile, inputBAM]
    varscanArgs = ["varscan", "mpileup2snp", "--strand-filter", "0", "--output-vcf",
                   "--min-var-freq", str(minVarFreq),
                   "--min-coverage", str(minCov),
                   "--variants", "1"]

    # The with-block closes the output file even if starting a tool fails.
    with open(outputSNP, 'w') as fileSNP:
        if verbose:
            print(" ".join(mpileupArgs), file=log)
        if not printOnly:
            mpileup = subprocess.Popen(mpileupArgs, stdout=subprocess.PIPE, stderr=log)

        if verbose:
            print(" ".join(varscanArgs), file=log)
        if not printOnly:
            varscan = subprocess.Popen(varscanArgs, stdin=mpileup.stdout, stdout=fileSNP, stderr=log)
            # Drop the parent's copy of the pipe so mpileup gets SIGPIPE if
            # varscan exits early, then reap both children.
            mpileup.stdout.close()
            varscan.wait()
            mpileup.wait()
45 |
def countSNPsInFile(inputFile):
    """Count the rows of a tab-separated file and how many are T>C or A>G.

    Every row counts towards the first number. A row counts towards the
    second when its third column is ``T`` and its fourth is ``C``, or its
    third is ``A`` and its fourth is ``G`` (case-insensitive).

    Returns:
        ``(rowCount, tcRowCount)``.
    """
    # NOTE(review): columns 3/4 are read as reference/variant base, but SNPs()
    # in this file asks varscan for --output-vcf, where REF/ALT are columns
    # 4/5 and "##" header lines have fewer columns -- confirm the input format.
    variantFor = {"T": "C", "A": "G"}
    total = 0
    tcTotal = 0
    with open(inputFile, "r") as handle:
        for fields in csv.reader(handle, delimiter='\t'):
            wanted = variantFor.get(fields[2].upper())
            # The fourth column is only looked at when the third is T or A.
            if wanted is not None and fields[3].upper() == wanted:
                tcTotal += 1
            total += 1
    return total, tcTotal
56 |
--------------------------------------------------------------------------------
/slamdunk/utils/BedReader.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
2 | #
3 | # This file is part of Slamdunk.
4 | #
5 | # Slamdunk is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Affero General Public License as
7 | # published by the Free Software Foundation, either version 3 of the
8 | # License, or (at your option) any later version.
9 | #
10 | # Slamdunk is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU Affero General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Affero General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | from intervaltree import IntervalTree
19 |
def bedToIntervallTree(bed):
    """Load a BED file into one IntervalTree per chromosome.

    Each entry produced by BedIterator is stored in the tree of its
    chromosome under the slice start:(stop + 1), with the entry name as
    the stored value.

    Returns a dict mapping chromosome name to its IntervalTree.
    """
    treesByChromosome = {}

    for entry in BedIterator(bed):
        tree = treesByChromosome.get(entry.chromosome)
        if tree is None:
            # first entry seen on this chromosome
            tree = IntervalTree()
            treesByChromosome[entry.chromosome] = tree

        tree[entry.start:(entry.stop + 1)] = entry.name

    return treesByChromosome
31 |
32 |
class BedEntry:
    """Record holding the fields of a single BED line."""

    def __init__(self):
        # Positional fields
        self.chromosome = ""
        self.start = 0
        self.stop = 0
        # Optional fields; "." marks score and strand as not provided
        self.name = ""
        self.score = "."
        self.strand = "."

    def __repr__(self):
        """Return chromosome, start, stop and name joined by tabs."""
        return "\t".join([self.chromosome, str(self.start), str(self.stop), self.name])

    def getLength(self):
        """Return stop minus start."""
        return self.stop - self.start

    def hasStrand(self):
        """Return True when the strand is "+" or "-"."""
        return self.strand in ("+", "-")

    def hasNonEmptyName(self):
        """Return True when a name has been set."""
        return not self.name == ""
54 |
class BedIterator:
    """Iterate over a tab-separated BED file, yielding one BedEntry per record.

    The file is opened on construction and closed when the end of the file
    is reached or when a line cannot be parsed. Blank lines and lines
    starting with "#" are skipped.
    """

    def __init__(self, filename):
        self._bedFile = open(filename, "r")

    def __iter__(self):
        return self

    def _toBED(self, line):
        """Parse one BED line into a BedEntry.

        Chromosome, start and stop are mandatory; name, score and strand
        are filled in only when the corresponding column is present, so
        the BedEntry defaults are kept otherwise.
        """
        cols = line.rstrip().split("\t")
        bedEntry = BedEntry()
        bedEntry.chromosome = cols[0]
        bedEntry.start = int(cols[1])
        bedEntry.stop = int(cols[2])

        # The name column is optional (three-column BED files)
        if (len(cols) > 3) :
            bedEntry.name = cols[3]
        if (len(cols) > 4) :
            bedEntry.score = cols[4]
        # Add strand info if available
        if (len(cols) > 5) :
            bedEntry.strand = cols[5]

        return bedEntry

    def __next__(self):
        # Keep raising StopIteration once exhausted instead of failing on
        # the closed file handle.
        if self._bedFile.closed:
            raise StopIteration
        try:
            for line in self._bedFile:
                # Skip empty lines and comment lines, they carry no record
                if line.strip() and not line.startswith("#"):
                    return self._toBED(line)
        except Exception:
            # Do not leak the file handle when a line is malformed
            self._bedFile.close()
            raise
        self._bedFile.close()
        raise StopIteration
85 |
--------------------------------------------------------------------------------
/slamdunk/utils/SNPtools.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
2 | #
3 | # This file is part of Slamdunk.
4 | #
5 | # Slamdunk is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU Affero General Public License as
7 | # published by the Free Software Foundation, either version 3 of the
8 | # License, or (at your option) any later version.
9 | #
10 | # Slamdunk is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU Affero General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Affero General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import os
19 |
20 | from pybedtools import BedTool
21 |
22 |
class SNPDictionary(object):
    """Lookup of T>C and A>G SNP positions loaded from a VCF file.

    SNPs are stored under their (chromosome, position) pair using the
    position as written in the VCF record. The query methods add 1 to the
    position they are given before looking it up.
    """

    def __init__(self, vcfFile):
        self._vcfFile = vcfFile
        # Sets of (chromosome, position) tuples. Tuples avoid the ambiguity
        # of concatenated keys, where "chr1" + "11" equals "chr11" + "1".
        self._tcSNPs = set()
        self._agSNPs = set()

    @staticmethod
    def _key(chromosome, position):
        """Build the lookup key for a chromosome / VCF position pair."""
        return (chromosome, int(position))

    def _addSNP(self, snp):
        """Register one VCF record if it is a T>C or an A>G SNP.

        snp is indexed like a VCF line: [0] chromosome, [1] position,
        [3] reference allele, [4] alternative allele.
        """
        ref = snp[3].upper()
        alt = snp[4].upper()

        if(ref == "T" and alt == "C"):
            self._tcSNPs.add(self._key(snp[0], snp[1]))

        if(ref == "A" and alt == "G"):
            self._agSNPs.add(self._key(snp[0], snp[1]))

    def read(self):
        """Load the SNPs from the VCF file given at construction.

        Does nothing when no file was given; prints a warning when the
        file does not exist or is not recognised as VCF.
        """
        if (self._vcfFile is not None):
            if(os.path.exists(self._vcfFile)):
                vcfReader = BedTool(self._vcfFile)

                if(vcfReader.file_type != "vcf"):
                    print("Wrong file type. Empty or not a vcf file.")

                for snp in vcfReader:
                    self._addSNP(snp)
            else:
                print("Warning: SNP file " + self._vcfFile + " not found.")

    def isAGSnp(self, chromosome, position):
        """Return True if an A>G SNP is stored at position + 1."""
        return self._key(chromosome, int(position) + 1) in self._agSNPs

    def isTCSnp(self, chromosome, position):
        """Return True if a T>C SNP is stored at position + 1."""
        return self._key(chromosome, int(position) + 1) in self._tcSNPs

    def getAGSNPsInUTR(self, chromosome, start, stop, snpType):
        """Count A>G SNPs for positions in range(start, stop).

        snpType is accepted for interface compatibility and is not used.
        """
        return sum(1 for i in range(start, stop) if self.isAGSnp(chromosome, i))

    def getTCSNPsInUTR(self, chromosome, start, stop, snpType):
        """Count T>C SNPs for positions in range(start, stop).

        snpType is accepted for interface compatibility and is not used.
        """
        return sum(1 for i in range(start, stop) if self.isTCSnp(chromosome, i))
75 |
--------------------------------------------------------------------------------
/slamdunk/contrib/RNASeqReadSimulator/demo/input/sample.bed:
--------------------------------------------------------------------------------
1 | chr1 11873 14409 uc001aaa.3 0 + 11873 11873 0 3 354,109,1189, 0,739,1347,
2 | chr1 11873 14409 uc010nxq.1 0 + 12189 13639 0 3 354,127,1007, 0,721,1529,
3 | chr1 11873 14409 uc010nxr.1 0 + 11873 11873 0 3 354,52,1189, 0,772,1347,
4 | chr1 14362 16765 uc009vis.2 0 - 14362 14362 0 4 467,69,147,159, 0,607,1433,2244,
5 | chr1 14362 19759 uc001aae.3 0 - 14362 14362 0 10 467,69,152,159,198,136,137,147,99,847, 0,607,1433,2244,2495,2870,3243,3552,3905,4550,
6 | chr1 14362 19759 uc009vit.2 0 - 14362 14362 0 9 467,69,152,159,198,510,147,99,847, 0,607,1433,2244,2495,2870,3552,3905,4550,
7 | chr1 14362 19759 uc009viu.2 0 - 14362 14362 0 10 467,69,152,159,198,510,147,102,54,847, 0,607,1433,2244,2495,2870,3552,3905,4138,4550,
8 | chr1 14362 24901 uc001aab.3 0 - 14362 14362 0 10 467,69,152,159,202,136,137,147,112,164, 0,607,1433,2244,2491,2870,3243,3552,3905,10375,
9 | chr1 14362 29370 uc001aac.3 0 - 14362 14362 0 11 467,69,152,159,198,110,137,147,102,154,50, 0,607,1433,2244,2495,2896,3243,3552,3905,10375,14958,
10 | chr1 14362 29370 uc001aah.3 0 - 14362 14362 0 11 467,69,152,159,198,136,137,147,99,154,50, 0,607,1433,2244,2495,2870,3243,3552,3905,10375,14958,
11 | chr1 14362 29370 uc009viq.2 0 - 14362 14362 0 7 467,152,159,198,456,154,50, 0,1433,2244,2495,3243,10375,14958,
12 | chr1 14362 29370 uc009vir.2 0 - 14362 14362 0 10 467,69,152,159,198,510,147,99,154,50, 0,607,1433,2244,2495,2870,3552,3905,10375,14958,
13 | chr1 14406 29370 uc009viv.2 0 - 14406 14406 0 7 2359,198,136,137,147,154,50, 0,2451,2826,3199,3508,10331,14914,
14 | chr1 14406 29370 uc009viw.2 0 - 14406 14406 0 7 2359,198,510,147,99,154,50, 0,2451,2826,3508,3861,10331,14914,
15 | chr1 15602 29370 uc009vix.2 0 - 15602 15602 0 7 345,159,198,136,147,154,50, 0,1004,1255,1630,2312,9135,13718,
16 | chr1 15795 18061 uc009vjd.2 0 - 15795 15795 0 5 152,159,198,136,456, 0,811,1062,1437,1810,
17 | chr1 16606 29370 uc009viy.2 0 - 16606 16606 0 9 159,198,136,137,147,95,58,154,50, 0,251,626,999,1308,1661,1890,8131,12714,
18 | chr1 16606 29370 uc009viz.2 0 - 16606 16606 0 8 159,202,136,137,147,112,154,50, 0,247,626,999,1308,1661,8131,12714,
19 | chr1 16857 17751 uc009vjc.1 0 - 16857 16857 0 2 198,519, 0,375,
20 | chr1 16857 19759 uc001aai.1 0 - 16857 16857 0 6 198,136,137,147,112,847, 0,375,748,1057,1410,2055,
21 | chr1 16857 29370 uc010nxs.1 0 - 16857 16857 0 8 198,136,137,147,99,227,154,50, 0,375,748,1057,1410,2055,7880,12463,
22 | chr1 16857 29961 uc009vjb.1 0 - 16857 16857 0 7 198,136,137,147,112,154,138, 0,375,748,1057,1410,7880,12966,
23 | chr1 17232 29370 uc009vje.2 0 - 17232 17232 0 4 510,147,99,50, 0,682,1035,12088,
24 | chr1 17605 29370 uc009vjf.2 0 - 17605 17605 0 7 137,147,95,58,227,154,50, 0,309,662,891,1307,7132,11715,
25 | chr1 34611 36081 uc001aak.2 0 - 34611 34611 0 3 563,205,361, 0,665,1109,
26 | chr1 69090 70008 uc001aal.1 0 + 69090 70008 0 1 918, 0,
27 | chr1 137838 139228 uc001aam.3 0 - 137838 137838 0 1 1390, 0,
28 |
--------------------------------------------------------------------------------
/slamdunk/plot/PCAPlotter.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | # Plot PCA based on readcounts in UTRs
4 |
5 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
6 | #
7 | # This file is part of Slamdunk.
8 | #
9 | # Slamdunk is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU Affero General Public License as
11 | # published by the Free Software Foundation, either version 3 of the
12 | # License, or (at your option) any later version.
13 | #
14 | # Slamdunk is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU Affero General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU Affero General Public License
20 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 |
library(getopt)

# Command line options; one row per option:
# long name, short flag, argument mask, type, description
spec = matrix(c(
  'help'     , 'h', 0, "logical"  , "print the usage of the command",
  'fileTab'  , "f", 2, "character", "tsv table of rate files",
  'outputPDF', "O", 2, "character", "output pdf file name",
  'outputPCA', "P", 2, "character", "output PCA transformations file name"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==1 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Plot PCA based on read counts in UTRs.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}


if ( is.null(opt$fileTab) ) stop("arg fileTab must be specified")
# The default has to go into outputPDF, which is the option handed to pdf() below
if ( is.null(opt$outputPDF) ) { opt$outputPDF = "out.pdf" }

library(ggplot2)

# Two-column table: sample name and path of its count file
samples = read.table(opt$fileTab,stringsAsFactors=FALSE,col.names = c("sample","file"), comment.char = "")

# A PCA needs at least two samples; otherwise only write a placeholder table
if (nrow(samples) <= 1) {
  cat('# slamdunk PCA\n', file=opt$outputPCA)
  cat(paste(samples$sample,0,"0\n",sep="\t"),append=TRUE,file=opt$outputPCA)
  #signal success and exit.
  q(status=0)
}

countsList = list()

# Collect the TcReadCount column of every sample
for (i in 1:nrow(samples)) {
  curTab = read.delim(samples$file[i],stringsAsFactors=FALSE, comment.char="#")

  countsList[[samples$sample[i]]] = curTab$TcReadCount

}

countMatrix = do.call(cbind, countsList)

variances = apply(countMatrix, 1, var)

# Restrict the PCA to the (at most) 500 rows with the highest variance
sel = order(variances, decreasing=TRUE)[seq_len(min(500, length(variances)))]

pca = prcomp(t(countMatrix[sel,]))

# Proportion of variance explained by each principal component (fraction of 1)
PoV = pca$sdev ^ 2 / sum(pca$sdev ^ 2)

plotTab = data.frame(sample = row.names(pca$x), PC1 = pca$x[,1], PC2 = pca$x[,2])

pdf(opt$outputPDF)

# PoV is a fraction, so scale by 100 to match the "%" in the axis labels
ggplot(plotTab, aes(x=PC1, y=PC2, color = sample)) + geom_point(size = 3) +
  xlab(paste("PC1 (", round(PoV[1] * 100,digits=2), " % variance)",sep="")) +
  ylab(paste("PC2 (", round(PoV[2] * 100,digits=2), " % variance)",sep="")) +
  theme(legend.position="bottom", legend.title=element_blank()) + ggtitle("Slamdunk PCA")

dev.off()

cat('# slamdunk PCA\n', file=opt$outputPCA)
write.table(plotTab,file=opt$outputPCA,append=TRUE,quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE)

#signal success and exit.
q(status=0)
93 |
--------------------------------------------------------------------------------
/slamdunk/plot/conversion_per_read_position.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
4 | #
5 | # This file is part of Slamdunk.
6 | #
7 | # Slamdunk is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Affero General Public License as
9 | # published by the Free Software Foundation, either version 3 of the
10 | # License, or (at your option) any later version.
11 | #
12 | # Slamdunk is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 |
library(getopt)

# Option table for getopt, one row per option:
# long name, short flag, argument mask, type, description
optionSpec <- matrix(c(
  'help' , 'h', 0, "logical","print the usage of the command",
  'utr' , 'u', 0, "logical","utr plotting",
  'inputFile', "i", 2,"character","tsv table of mutations per position",
  'outputFile', "o", 2,"character","output pdf file name"
), ncol = 5, byrow = TRUE)

opt <- getopt(optionSpec)

# Print the usage and leave with a non-zero status when help is requested
# or when the parsed option list holds a single element only
usageRequested <- !is.null(opt$help) || length(opt) == 1
if (usageRequested) {
  invocation <- commandArgs(FALSE)
  self <- strsplit(invocation[grep("--file", invocation)], "=")[[1]][2]
  cat(basename(self),": Create mismatches per read/UTR position plots.\n\n")
  cat(getopt(optionSpec, command = self, usage = TRUE))
  q(status = 1)
}

# Axis labels depend on whether read positions or UTR positions are plotted
utrMode <- !is.null(opt$utr)
positionLabel <- if (utrMode) "Position at 3' UTR end (200 bp upstream)" else "Position on read"
mutationLabel <- if (utrMode) "% of UTRs with mutation" else "% of reads with mutation"

if (is.null(opt$inputFile)) {
  stop("arg input must be specified")
}
if (is.null(opt$outputFile)) {
  opt$outputFile <- paste(opt$inputFile, ".pdf", sep = "")
}

mut <- read.table(opt$inputFile, comment.char = "#")

# Without a sixth column, the fifth one serves as denominator for both rows
if (is.null(mut$V6)) {
  mut$V6 <- mut$V5
}

# Express a count column as percentage of its total column
asPercent <- function(hits, totals) {
  c(hits) / c(totals) * 100
}

# Row 1: columns 1 and 3 relative to column 5; row 2: columns 2 and 4 relative to column 6
counts <- rbind(asPercent(mut$V1, mut$V5), asPercent(mut$V2, mut$V6))
countsTC <- rbind(asPercent(mut$V3, mut$V5), asPercent(mut$V4, mut$V6))

# Workaround for 0 counts: NaN entries are plotted as 0
counts[is.nan(counts)] <- 0
countsTC[is.nan(countsTC)] <- 0

pdf(opt$outputFile, width = 10, height = 10)
par(mfrow = c(2, 1))

positions <- 1:nrow(mut)
strandLegend <- c("forward", "reverse")

# Upper panel: y axis scaled to the next multiple of 10 (at least 10)
upperAll <- max(10, ceiling(counts / 10) * 10)
barplot(counts, beside = TRUE, names.arg = positions, main = "All mutations",
        ylim = c(0, upperAll), xlab = positionLabel, ylab = mutationLabel,
        legend = strandLegend)

# Lower panel: y axis scaled to the next integer (at least 1)
upperTC <- max(1, ceiling(countsTC))
barplot(countsTC, beside = TRUE, names.arg = positions, main = "T->C on fwd, A->G on rev",
        ylim = c(0, upperTC), xlab = positionLabel, ylab = mutationLabel,
        legend = strandLegend)

dev.off()
89 |
--------------------------------------------------------------------------------
/slamdunk/contrib/RNASeqReadSimulator/src/getSegs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import bisect;
4 |
def getSegs(pos, seglen, direction, exonstart, exonlen):
    """
    Split a segment of seglen bases that begins at pos into per-exon pieces.

    parameters:
      pos: the position to be queried
      seglen: the length of the segment
      direction: 1 (walk forward from pos), -1 (walk backward from pos)
      exonstart, exonlen: start position and length of every exon
    Return value: (segstart, seglength, status), the two lists being
      ordered by increasing start position.
      status reports possible errors
      0: normal
      1: pos does not lie inside any exon
      2: the exons ran out before seglen bases were collected
    """
    starts = []
    lengths = []
    exon_count = len(exonstart)

    # first exon whose range [start, start + length) contains pos
    exon_idx = next(
        (k for k in range(exon_count)
         if pos in range(exonstart[k], exonstart[k] + exonlen[k])),
        -1,
    )
    if exon_idx == -1:
        return (starts, lengths, 1)

    remaining = seglen
    if direction == 1:
        while remaining > 0:
            # take what is left of the current exon, at most the rest
            take = min(remaining, exonlen[exon_idx] + exonstart[exon_idx] - pos)
            starts.append(pos)
            lengths.append(take)
            remaining -= take
            exon_idx += 1
            if exon_idx >= exon_count:
                return (starts, lengths, 2 if remaining > 0 else 0)
            pos = exonstart[exon_idx]
    elif direction == -1:
        while remaining > 0:
            # pos itself belongs to the piece, hence the + 1
            take = min(remaining, pos - exonstart[exon_idx] + 1)
            # pieces are found right-to-left, so prepend them
            starts.insert(0, pos - take + 1)
            lengths.insert(0, take)
            remaining -= take
            exon_idx -= 1
            if exon_idx < 0:
                return (starts, lengths, 2 if remaining > 0 else 0)
            # continue at the last base of the previous exon
            pos = exonstart[exon_idx] + exonlen[exon_idx] - 1
    return (starts, lengths, 0)
56 |
def tpos2pos(tpos, cumlen, exonstart):
    """
    Convert a coordinate inside a transcript into a reference coordinate.

    cumlen holds the cumulative exon lengths and exonstart the exon start
    positions. A transcript position at or beyond the total length is
    clamped to the last position of the last exon.
    """
    exon_idx = bisect.bisect_right(cumlen, tpos)
    last_idx = len(cumlen) - 1
    if exon_idx > last_idx:
        # past the end: fall back to the final base of the transcript
        exon_idx = last_idx
        tpos = cumlen[-1] - 1
    # offset of tpos inside the selected exon
    offset = tpos - cumlen[exon_idx - 1] if exon_idx > 0 else tpos
    return exonstart[exon_idx] + offset
72 |
def writeBedline(fid, lineid, chromosome, direction, startrange, lenrange):
    """
    Write one 12-column line of a .bed file to fid.

    lineid and chromosome are written as given; direction 1 / -1 is written
    as '+' / '-' (any other value is written unchanged). startrange and
    lenrange are the segment starts and segment lengths. Nothing is written
    when startrange is empty.
    """
    # skip if startrange is malformed
    if not startrange:
        return None

    first_start = startrange[0]
    range_start = str(first_start)
    range_end = str(startrange[-1] + lenrange[-1])
    # segment starts relative to the start of the line's range
    offsets = [s - first_start for s in startrange]

    strand = '+' if direction == 1 else ('-' if direction == -1 else direction)

    columns = [
        chromosome,                               # 0: chromosome
        range_start, range_end,                   # 1-2: start and end
        lineid,                                   # 3: id
        '0', strand,                              # 4-5: 0 and direction
        range_start, range_end,                   # 6-7: same as 1-2
        '0', str(len(offsets)),                   # 8: 0; 9: number of segments
        ','.join(map(str, lenrange)) + ',',       # 10: segment lengths
        ','.join(map(str, offsets)) + ',',        # 11: segment starts
        '\n',                                     # line ends with a tab before the newline
    ]
    fid.write('\t'.join(columns))
98 |
99 |
100 |
--------------------------------------------------------------------------------
/slamdunk/dunks/deduplicator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
4 | #
5 | # This file is part of Slamdunk.
6 | #
7 | # Slamdunk is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Affero General Public License as
9 | # published by the Free Software Foundation, either version 3 of the
10 | # License, or (at your option) any later version.
11 | #
12 | # Slamdunk is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 |
20 | from __future__ import print_function
21 | import pysam
22 |
23 | from slamdunk.utils.misc import checkStep, pysamIndex # @UnresolvedImport
24 |
def _writeUniqueReads(duplicateBuffer, outfile):
    """Write all buffered reads not flagged as duplicate, empty the buffer
    and return the number of reads written."""
    written = 0
    for cigarGroups in duplicateBuffer.values():
        for reads in cigarGroups.values():
            for readEntry in reads:
                if not readEntry.is_duplicate:
                    written += 1
                    outfile.write(readEntry)
    duplicateBuffer.clear()
    return written


def Dedup(inputBAM, outputBAM, tcMutations, log, printOnly=False, verbose = True, force=False):
    """Collapse reads that share start position, sequence and CIGAR string.

    Reads are taken in file order and buffered while they share the same
    reference id and reference start. Within such a group, every read
    whose query sequence and CIGAR string were already seen is flagged as
    duplicate. Only reads whose "TC" tag (0 when absent) is at least
    tcMutations are considered at all.

    Parameters:
      inputBAM    -- path of the BAM file to read
      outputBAM   -- path of the BAM file to write; indexed afterwards
      tcMutations -- minimum value of the "TC" tag for a read to be kept
      log         -- file-like object receiving the summary messages
      printOnly, force -- the step runs when printOnly is set or when
                     checkStep([inputBAM], [outputBAM], force) is true
      verbose     -- not used by this function

    NOTE(review): the nesting of this function was reconstructed; confirm
    that duplicates are meant to be dropped from the output (rather than
    written with the duplicate flag set) and that only reads passing the
    tcMutations threshold count as processed.
    """
    if(printOnly or checkStep([inputBAM], [outputBAM], force)):

        samfile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=samfile)

        processedReads = 0
        retainedReads = 0

        # (reference id, reference start) of the previously buffered read
        prevPosition = None

        # query sequence -> CIGAR string -> reads at the current position
        duplicateBuffer = {}

        try:
            for read in samfile:

                tcflag = read.get_tag("TC") if read.has_tag("TC") else 0
                if tcflag < tcMutations:
                    continue

                position = (read.reference_id, read.reference_start)

                # A new position closes the current group of candidates
                if position != prevPosition:
                    retainedReads += _writeUniqueReads(duplicateBuffer, outfile)

                sameCigar = duplicateBuffer.setdefault(read.query_sequence, {}).setdefault(read.cigarstring, [])
                # Everything after the first read of a group is a duplicate
                if len(sameCigar) > 0:
                    read.is_duplicate = True
                sameCigar.append(read)

                prevPosition = position
                processedReads += 1

            # Flush the group of the last position
            retainedReads += _writeUniqueReads(duplicateBuffer, outfile)
        finally:
            # Release both BAM handles, also when reading or writing fails
            outfile.close()
            samfile.close()

        # Avoid a ZeroDivisionError when no read was processed
        if processedReads > 0:
            compressionRate = float(retainedReads) / float(processedReads)
        else:
            compressionRate = 0.0

        print("Retained " + str(retainedReads) + " of " + str(processedReads) + " reads (", file=log, end = "")
        print("{0:.2f}".format(compressionRate),file=log,end="")
        print(" compression rate)", file=log)

        pysamIndex(outputBAM)

    else:
        print("Skipped deduplication for " + inputBAM, file=log)
--------------------------------------------------------------------------------
/slamdunk/plot/compute_context_TC_rates.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | # Script to plot TC context rates of reads
4 |
5 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
6 | #
7 | # This file is part of Slamdunk.
8 | #
9 | # Slamdunk is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU Affero General Public License as
11 | # published by the Free Software Foundation, either version 3 of the
12 | # License, or (at your option) any later version.
13 | #
14 | # Slamdunk is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU Affero General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU Affero General Public License
20 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 |
library(getopt)

# Command line options; one row per option:
# long name, short flag, argument mask, type, description
spec = matrix(c(
  'help'      , 'h', 0, "logical"  , "print the usage of the command",
  'rateTab'   , "f", 2, "character", "tsv table of rate files",
  'outputFile', "O", 2, "character", "output pdf file name"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==1 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Create mismatch plots from rate tabs.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1)
}


if ( is.null(opt$rateTab) ) stop("arg rateTab must be specified")
if ( is.null(opt$outputFile) ) { opt$outputFile = "out.pdf" }

library(ggplot2)
library(gridExtra)

# Two-column table: sample name and path of its context rate file
rates = read.table(opt$rateTab,stringsAsFactors=FALSE,col.names = c("sample","file"),comment.char = "")

pdf(opt$outputFile)
plotList = list()

# seq_len() yields an empty loop for an empty table, unlike 1:nrow()
for (i in seq_len(nrow(rates))) {
  curTab = read.table(rates$file[i],stringsAsFactors=FALSE,header=TRUE)

  # Rows 1-2 hold the first table; row 3 is the header of the second
  # table, whose values are in rows 4-5
  subFront = curTab[1:2,]
  subBack = curTab[4:5,]
  names(subBack) = curTab[3,]

  printTabFront = data.frame(contexts=rep(names(subFront),each=2),strand = factor(rep(c("+","-"),ncol(subFront)),levels=c("+","-")),
                             rate_percent = as.numeric(unlist(subFront)))
  printTabBack = data.frame(contexts=rep(names(subBack),each=2),strand = factor(rep(c("+","-"),ncol(subBack)),levels=c("+","-")),
                            rate_percent = as.numeric(unlist(subBack)))

  # Normalise so that the values of each table sum to 1
  printTabFront$rate_percent = printTabFront$rate_percent / sum(printTabFront$rate_percent)
  printTabBack$rate_percent = printTabBack$rate_percent / sum(printTabBack$rate_percent)

  # Ignore N contexts for now. Logical indexing is used because
  # x[-grep(...), ] would drop every row when no N context is present.
  printTabFront = printTabFront[!grepl("NT",printTabFront$contexts),]
  printTabBack = printTabBack[!grepl("TN",printTabBack$contexts),]

  curPlot = qplot(x=contexts, y=rate_percent, fill=strand,data=printTabFront) + geom_bar(stat="identity") + geom_text(aes(label = round(rate_percent,digits=2)), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") + ylab("TC context percent %") + xlab(rates$sample[i]) +
    theme(text = element_text(size=6),axis.text.x = element_text(size=6), plot.title = element_text(size=10))
  plotList[[length(plotList)+1]] <- curPlot + ylim(0.0,1.0) + ggtitle("5' T->C context")
  curPlot = qplot(x=contexts, y=rate_percent, fill=strand,data=printTabBack) + geom_bar(stat="identity") + geom_text(aes(label = round(rate_percent,digits=2)), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") + ylab("TC context percent %") + xlab(rates$sample[i]) +
    theme(text = element_text(size=6),axis.text.x = element_text(size=6),plot.title = element_text(size=10))
  plotList[[length(plotList)+1]] <- curPlot + ylim(0.0,1.0) + ggtitle("3' T->C context")
}

# Arrange the two plots of every sample on one page
do.call(grid.arrange, plotList)

dev.off()

#signal success and exit.
q(status=0)
90 |
--------------------------------------------------------------------------------
/slamdunk/plot/compute_sample_comparison_statistics.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | # Script to plot pairwise correlations and PCA
4 | #
5 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
6 | #
7 | # This file is part of Slamdunk.
8 | #
9 | # Slamdunk is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU Affero General Public License as
11 | # published by the Free Software Foundation, either version 3 of the
12 | # License, or (at your option) any later version.
13 | #
14 | # Slamdunk is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU Affero General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU Affero General Public License
20 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 |
22 | # Helper
23 |
# Panel function for pairs(): writes the absolute correlation of x and y
# into the middle of the panel, with a text size that grows with the
# correlation. Only pairs where both values are finite and (x|y>0) holds
# enter the correlation.
my_panel_cor <- function(x, y, digits=2, prefix="", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  # use unit coordinates so that (0.5, 0.5) is the panel centre
  par(usr = c(0, 1, 0, 1))

  usable <- which(is.finite(x) & is.finite(y) & (x|y>0))
  absCor <- abs(cor(x[usable], y[usable]))

  # formatting together with a reference number fixes the number of digits shown
  corLabel <- format(c(absCor, 0.123456789), digits=digits)[1]
  corLabel <- paste(prefix, corLabel, sep="")

  # default size: make the label fill 80% of the panel width
  if (missing(cex.cor)) {
    cex.cor <- 0.8/strwidth(corLabel)
  }
  text(0.5, 0.5, corLabel, cex = cex.cor * absCor)
}
39 |
# Panel function for pairs(): adds a smoothed density scatter of x against y
# to the current panel and overlays the identity line in colour lcol.
my_panel_smooth <- function(x, y, lcol="red")
{
  smoothScatter(x, y, add=TRUE)
  # line with intercept 0 and slope 1
  abline(a=0, b=1, col=lcol)
}
46 |
library(getopt)

# Command line options; one row per option:
# long name, short flag, argument mask, type, description
spec = matrix(c(
  'help'        , 'h', 0, "logical"  , "print the usage of the command",
  'sampleTab'   , "i", 2, "character", "csv table of sample counts",
  'outputPrefix', "o", 2, "character", "output file name prefix"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==1 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Compute sample comparison statistics from sample counts.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}


if ( is.null(opt$sampleTab) ) stop("arg sampleTab must be specified")
if ( is.null(opt$outputPrefix) ) { opt$outputPrefix = "sampleCorrelation" }

rates = read.table(opt$sampleTab,header=TRUE,sep=";", comment.char = "")

# Sample values start in column 5, so fewer than 6 columns means a single sample
if (ncol(rates) < 6) {
  print("No need for calculating pairwise statistics for single sample")
  quit(status=0)
}

# Load from the default library paths; this script defines no custom
# library location (the former lib.loc = libLoc referred to an undefined variable)
library(RColorBrewer)
library(lattice)
library(matrixStats)

values = data.matrix(rates[,c(5:ncol(rates))])

##################################################
# PCA
##################################################

rowVariances = rowVars(data.matrix(values))

# Restrict the PCA to the (at most) 500 rows with the highest variance
select = order(rowVariances, decreasing = TRUE)[seq_len(min(500, length(rowVariances)))]

pca = prcomp(t(values[select, ]))

# One colour per sample; brewer.pal() is asked for 3 colours when there are
# only two samples, and more than 12 samples get an interpolated palette
if (ncol(values) == 2) {
  col = brewer.pal(3, "Paired")[1:2]
} else if (ncol(values) > 12) {
  getPalette = colorRampPalette(brewer.pal(9, "Set1"))
  col = getPalette(ncol(values))
} else {
  col = brewer.pal(ncol(values), "Paired")
}

# Get amount of explained variance (see summary(pca))
varianceProportion = pca$sdev ^ 2 / sum(pca$sdev ^ 2)

pdf(paste(opt$outputPrefix,"_PCA.pdf",sep=""))

if (ncol(values) > 12) {

  # Too many samples for a readable key: plot without legend
  xyplot(PC2 ~ PC1, groups = colnames(values), data = as.data.frame(pca$x),
         pch = 20, cex = 2, aspect = "iso", col = col, xlab = paste("PC1 (", round(varianceProportion[1],digits=2), " variance)",sep=""),
         ylab = paste("PC2 (", round(varianceProportion[2],digits=2), " variance)",sep="")
  )

} else {

  xyplot(PC2 ~ PC1, groups = colnames(values), data = as.data.frame(pca$x),
         pch = 20, cex = 2, aspect = "iso", col = col, main = draw.key(key = list(rect = list(col = col),
         text = list(colnames(values)), rep = FALSE)), xlab = paste("PC1 (", round(varianceProportion[1],digits=2), " variance)",sep=""),
         ylab = paste("PC2 (", round(varianceProportion[2],digits=2), " variance)",sep="")
  )

}

dev.off()

##################################################
# Pairwise correlations
##################################################

# The pairs plot is only produced for up to 12 samples
if (ncol(values) <= 12) {

  pdf(paste(opt$outputPrefix,"_pairwiseCorrelation.pdf",sep=""))

  pairs(values,upper.panel=my_panel_smooth,lower.panel=my_panel_cor)

  dev.off()
}

#signal success and exit.
q(status=0)
142 |
--------------------------------------------------------------------------------
/slamdunk/plot/compute_overall_rates.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | # Script to overlap public database file
4 | #
5 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
6 | #
7 | # This file is part of Slamdunk.
8 | #
9 | # Slamdunk is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU Affero General Public License as
11 | # published by the Free Software Foundation, either version 3 of the
12 | # License, or (at your option) any later version.
13 | #
14 | # Slamdunk is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU Affero General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU Affero General Public License
20 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 |
library(getopt)
library(ggplot2)
library(gridExtra)

# Command line options; one row per option:
# long name, short flag, argument mask, type, description
spec = matrix(c(
  'help'      , 'h', 0, "logical"  , "print the usage of the command",
  'rateTab'   , "f", 2, "character", "tsv table of rate files",
  'name'      , "n", 2, "character", "Sample name",
  'outputFile', "O", 2, "character", "output pdf file name"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)<1 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Create mismatch plots from rate tabs.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}


if ( is.null(opt$rateTab) ) stop("arg rateTab must be specified")
# The default sample name belongs in opt$name; storing it in opt$outputFile
# would overwrite the requested output file name and leave the name unset
if ( is.null(opt$name) ) { opt$name = "Sample 1" }
if ( is.null(opt$outputFile) ) { opt$outputFile = "out.pdf" }
49 |
50 | #rates = read.table(opt$rateTab,stringsAsFactors=FALSE,col.names = c("sample","file"), comment.char = "")
51 |
# Input rate table and the sample label used as x axis title.
fileName = opt$rateTab
# file = "/project/libby/slamdunk-analysis/sra_example/rates/ERR1692138_slamdunk_mapped_filtered_overallrates.csv"
# file = "/project/libby/slamdunk-analysis/mareike/rates/AML_41-1_48h_Mll212207_37484.fastq_slamdunk_mapped_filtered_overallrates.csv"
sampleName = opt$name
# sampleName = "Sample 1"

# All plots go into a single PDF.
pdf(opt$outputFile)

plotList = list()

#for (i in 1:nrow(rates)) {
curTab = read.table(fileName,stringsAsFactors=FALSE)

# Convert counts to percentages per row, separately for the upper case columns
# (reported below as "+" strand) and the lower case columns (reported as "-" strand).
curTab[, c("A", "C", "G", "T")] <- curTab[, c("A", "C", "G", "T")]/rowSums(curTab[, c("A", "C", "G", "T")]) * 100
curTab[, c("a", "c", "g", "t")] <- curTab[, c("a", "c", "g", "t")]/rowSums(curTab[, c("a", "c", "g", "t")]) * 100

# Long format table: one row per conversion (row name + column name of curTab) and strand.
# Rows alternate "+", "-" for each conversion; the label placement below relies on that order.
printTab = data.frame(rates=c(rep("AT",2),rep("AC",2),rep("AG",2),
                              rep("TA",2),rep("TC",2),rep("TG",2),
                              rep("CA",2),rep("CT",2),rep("CG",2),
                              rep("GA",2),rep("GT",2),rep("GC",2)), strand = rep(c("+","-"),12),
                      rate_percent = c(curTab["A","T"],curTab["A","t"],curTab["A","C"],curTab["A","c"],curTab["A","G"],curTab["A","g"],
                                       curTab["T","A"],curTab["T","a"],curTab["T","C"],curTab["T","c"],curTab["T","G"],curTab["T","g"],
                                       curTab["C","A"],curTab["C","a"],curTab["C","T"],curTab["C","t"],curTab["C","G"],curTab["C","g"],
                                       curTab["G","A"],curTab["G","a"],curTab["G","T"],curTab["G","t"],curTab["G","C"],curTab["G","c"])
)


#fwdATot = max(1, sum(curTab["A",c("A", "C", "G", "T", "N")]))
#fwdCTot = max(1, sum(curTab["C",c("A", "C", "G", "T", "N")]))
#fwdGTot = max(1, sum(curTab["G",c("A", "C", "G", "T", "N")]))
#fwdTTot = max(1, sum(curTab["T",c("A", "C", "G", "T", "N")]))

#revATot = max(1, sum(curTab["A",c("a", "c", "g", "t", "n")]))
#revCTot = max(1, sum(curTab["C",c("a", "c", "g", "t", "n")]))
#revGTot = max(1, sum(curTab["G",c("a", "c", "g", "t", "n")]))
#revTTot = max(1, sum(curTab["T",c("a", "c", "g", "t", "n")]))

#total = c(rep(c(fwdATot, revATot), 3), rep(c(fwdTTot, revTTot), 3), rep(c(fwdCTot, revCTot), 3), rep(c(fwdGTot, revGTot), 3) )

#printTab$rate_percent = printTab$rate_percent / total * 100

# Upper y axis limit: at least 10, otherwise 10% above the largest single rate.
maxRatePercent = max(10, max(printTab$rate_percent) * 1.1)

# y position of the value labels: "+" labels sit just below zero, "-" labels at the
# sum of both strands' rates (presumably the top of the stacked bar).
printTab$y = -0.3
printTab[printTab$strand == "-", ]$y = printTab[printTab$strand == "-", ]$rate_percent + printTab[printTab$strand == "+", ]$rate_percent

# Bar plot of the rates per conversion, filled by strand and annotated with rounded values.
curPlot = qplot(x=rates, y=rate_percent, fill=strand,data=printTab) + ylim(-0.5,maxRatePercent) + geom_bar(stat="identity") + geom_text(aes(y = printTab$y, label = round(rate_percent,digits=2)), size = 3, hjust = 0.5, vjust = -0.50) + ylab("Rate percent %") + xlab(sampleName) +
  theme(text = element_text(size=12),axis.text.x = element_text(size=12))
#curPlot + xlim(0,35)
plotList[[length(plotList)+1]] <- curPlot #+ ylim(0.0,maxRatePercent)
#}

# Arrange all collected plots (currently a single one) on the page.
do.call(grid.arrange, plotList)

dev.off()

#signal success and exit.
q(status=0)
110 |
--------------------------------------------------------------------------------
/slamdunk/dunks/mapper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
4 | #
5 | # This file is part of Slamdunk.
6 | #
7 | # Slamdunk is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Affero General Public License as
9 | # published by the Free Software Foundation, either version 3 of the
10 | # License, or (at your option) any later version.
11 | #
12 | # Slamdunk is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 |
20 | from __future__ import print_function
21 | import os, re
22 |
23 | from slamdunk.utils.misc import files_exist, checkStep, run, pysamIndex, removeFile, getBinary, replaceExtension, shellerr # @UnresolvedImport
24 | from slamdunk.version import __ngm_version__ # @UnresolvedImport
25 |
26 | projectPath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
27 |
def sort(inputSAM, outputBAM, log, threads=1, keepSam=True, dry=False, verbose=True):
    """Convert ``inputSAM`` into ``outputBAM`` unless the step can be skipped.

    The conversion is delegated to ``runSam2bam`` with indexing and sorting
    switched off; the input SAM file is deleted afterwards when ``keepSam``
    is False. The step runs only if the input exists and ``checkStep``
    reports that ``outputBAM + ".bai"`` has to be (re)built; otherwise a
    note is written to ``log``.
    """
    needsRun = files_exist(inputSAM) and checkStep([inputSAM], [outputBAM + ".bai"])
    if not needsRun:
        print("Skipped sorting for " + inputSAM, file=log)
        return

    runSam2bam(inputSAM, outputBAM, log, index=False, sort=False, delinFile=not keepSam,
               threads=threads, dry=dry, verbose=verbose)
34 |
def checkNextGenMapVersion():
    """Verify that the installed NextGenMap matches the expected version.

    Calls ``ngm`` through ``shellerr`` and extracts a ``major.minor.patch``
    version string from the returned text.

    Raises:
        RuntimeError: if no version string can be found, or if the version
            found differs from ``__ngm_version__``.
    """
    ngmHelp = shellerr("ngm", raiseError = False)
    # The lookbehind forces the captured version to start at the beginning of a
    # number. Without it the greedy ".*" swallows leading digits, so that e.g.
    # "10.5.5" would be reported as "0.5.5".
    matchObj = re.match(r'.*(?<![0-9])([0-9]+\.[0-9]+\.[0-9]+).*', str(ngmHelp), re.M|re.I)
    if matchObj is None:
        raise RuntimeError('Could not get NextGenMap version. Please reinstall slamdunk package.')

    version = matchObj.group(1)
    if version != __ngm_version__:
        raise RuntimeError('NextGenMap version expected: ' + __ngm_version__ + " but found " + version + ". Please reinstall slamdunk package.")
44 |
def runSam2bam(inFile, outFile, log, index=True, sort=True, delinFile=False, onlyUnique=False, onlyProperPaired=False, filterMQ=0, L=None, threads=1, verbose=False, dry=False):
    """Convert a SAM file to BAM with samtools, optionally filtering, sorting and indexing.

    Args:
        inFile: input SAM file.
        outFile: output BAM file.
        log: file-like object receiving log output.
        index: index ``outFile`` with ``pysamIndex`` at the end.
        sort: sort the BAM file with ``samtools sort`` after conversion.
        delinFile: delete ``inFile`` after conversion (never in dry mode).
        onlyUnique: enforce a minimum mapping quality of 1 unless ``filterMQ``
            already requests a filter.
        onlyProperPaired: keep only reads with SAM flag 2 set (``-f 2``).
        filterMQ: minimum mapping quality (``-q``); 0 disables the filter.
        L: optional file passed to ``samtools view -L``.
        threads: number of samtools threads (``-@``).
        verbose, dry: passed through to ``run``.

    The conversion is skipped when the input was already deleted by a previous
    run (``delinFile`` set, ``outFile`` present, ``inFile`` missing).
    """
    if delinFile and files_exist(outFile) and not files_exist(inFile):
        print("Skipping sam2bam for " + outFile, file=log)
    else:
        if onlyUnique and filterMQ == 0:
            filterMQ = 1

        # All options go before the positional input file: options placed after
        # it are only recognised where getopt permutes arguments (GNU libc).
        cmd = ["samtools view", "-@", str(threads), "-Sb"]
        if filterMQ > 0:
            cmd += ["-q", str(filterMQ)]
        if onlyProperPaired:
            cmd += ["-f", "2"]
        if L is not None:
            cmd += ["-L", L]
        cmd += ["-o", outFile, inFile]
        run(" ".join(cmd), log, verbose=verbose, dry=dry)

        if sort:
            # Sort via a temporary copy of the unsorted BAM so that the sorted
            # result ends up under the requested output name.
            tmp = outFile + "_tmp"
            if not dry:
                os.rename(outFile, tmp)
            run(" ".join(["samtools sort", "-@", str(threads), "-o", outFile, tmp]), log, verbose=verbose, dry=dry)
            removeFile(tmp)
        if delinFile and not dry:
            removeFile(inFile)

    if index:
        pysamIndex(outFile)
75 |
76 |
def Map(inputBAM, inputReference, outputSAM, log, quantseqMapping, endtoendMapping, threads=1, parameter="--no-progress --slam-seq 2" , outputSuffix="_ngm_slamdunk", trim5p=0, maxPolyA=-1, topn=1, sampleId=None, sampleName="NA", sampleType="NA", sampleTime=0, printOnly=False, verbose=True, force=False):
    """Map reads against a reference with NextGenMap (``ngm``).

    The ngm option string starts from ``parameter`` (replaced by
    ``--no-progress`` for QuantSeq mapping) and is extended according to the
    remaining arguments: 5' trimming, poly-A limit, end-to-end (``-e``) or
    local (``-l``) alignment, read group id/sample and number of reported
    alignments. ngm writes SAM if ``outputSAM`` ends with ".sam", otherwise
    BAM (``-b``). Mapping is skipped when ``checkStep`` reports the BAM output
    as up to date; ``force`` is handed to ``checkStep``. ``printOnly`` is
    passed to ``run`` as ``dry``. ``outputSuffix`` is not used.
    """
    options = ["--no-progress" if quantseqMapping is True else parameter]

    if trim5p > 0:
        options.append(" -5 " + str(trim5p))

    if maxPolyA > -1:
        options.append(" --max-polya " + str(maxPolyA))

    options.append(" -e " if endtoendMapping is True else " -l ")

    if sampleId is not None:
        options.append(" --rg-id " + str(sampleId))
        if sampleName != "":
            options.append(" --rg-sm " + sampleName + ":" + sampleType + ":" + str(sampleTime))

    if topn > 1:
        options.append(" -n " + str(topn) + " --strata ")

    parameter = "".join(options)

    if not checkStep([inputReference, inputBAM], [replaceExtension(outputSAM, ".bam")], force):
        print("Skipped mapping for " + inputBAM, file=log)
        return

    # "-b" lets ngm write BAM directly when the output is not a SAM file.
    binary = "ngm" if outputSAM.endswith(".sam") else "ngm -b"
    command = binary + " -r " + inputReference + " -q " + inputBAM + " -t " + str(threads) + " " + parameter + " -o " + outputSAM
    run(command, log, verbose=verbose, dry=printOnly)
110 |
--------------------------------------------------------------------------------
/slamdunk/plot/compute_conversion_rate_mle.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
4 | #
5 | # This file is part of Slamdunk.
6 | #
7 | # Slamdunk is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Affero General Public License as
9 | # published by the Free Software Foundation, either version 3 of the
10 | # License, or (at your option) any later version.
11 | #
12 | # Slamdunk is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 |
20 | library(getopt)
21 | library(bbmle)
22 |
# Command line specification for getopt. One row per option:
# long name, short flag, argument mask (0 = no argument, 2 = optional argument),
# data type, description shown in the usage message.
spec = matrix(c(
  'help'  , 'h', 0, "logical",   "print the usage of the command",
  'file'  , "f", 2, "character", "Count tsv file with comma separated ReadCount / TcReadCount values",
  'rate'  , "r", 2, "character", "Conversion rate used as fixed parameter for the estimation",
  'output', "o", 2, "character", "Output tsv"
), ncol = 5, byrow = T)

opt = getopt(spec)

# Print the usage message and exit with a non-zero status.
if ( !is.null(opt$help) || length(opt)==3 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  # The description used to be copied from the rate plotting script.
  cat(basename(self),": Compute maximum likelihood estimates of conversion rates.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}

if ( is.null(opt$file) ) stop("arg file must be specified")
if ( is.null(opt$output) ) stop("arg output must be specified")
if ( is.null(opt$rate) ) stop("arg rate must be specified")
45 |
# Negative log-likelihood of the mixture model for the global data frame `sample`
# (columns n and k).
# a: percentage of converted transcripts
# b: conversion rate
LL <- function(a, b) {
  n = sample$n
  k = sample$k
  # Binomial term weighted by a, plus a point mass at k == 0 weighted by (1 - a).
  likelihood = a * (( 1 - b ) ^ ( n - k )) * (b ^ k) * choose(n, k) + ( 1 - a ) * as.numeric(k == 0)
  -sum(log(likelihood))
}
52 |
# Estimate a with fixed b
# Returns the negative log-likelihood for every candidate value in `as`, using the
# global conversion rate `estb` and the global data frame `sample` (columns n and k).
LL2 <- function(as) {
  b = estb
  negLogLik = function(a) {
    R = a * (( 1 - b ) ^ ( sample$n - sample$k )) * (b ^ sample$k) * choose(sample$n, sample$k) + ( 1 - a ) * as.numeric(sample$k == 0)
    -sum(log(R))
  }
  # unlist() of an empty list is NULL, matching the accumulator of the loop version.
  unname(unlist(lapply(as, negLogLik)))
}
63 |
# Read the sample and annotation meta data from the first two lines of a count file.
# Returns a character vector:
# sample id, sample name, sample type, sample time, annotation name, annotation md5, version
# where version is the first three fields of line one joined by tabs.
readMeatInfo <- function(fileName) {
  # comment.char = "" keeps header lines that start with '#'
  sampleLine = read.table(fileName, nrows = 1, comment.char = "")
  annotationLine = read.table(fileName, nrows = 1, skip = 1, comment.char = "")

  version = paste(lapply(sampleLine[1,1:3], as.character), collapse = '\t')
  sampleName = as.character(sampleLine$V6[1])
  sampleID = as.character(sampleLine$V7[1])
  sampleType = as.character(sampleLine$V8[1])
  sampleTime = as.numeric(sampleLine$V9[1])

  annotationName = as.character(annotationLine$V2[1])
  annotationMD5 = as.character(annotationLine$V3[1])

  c(sampleID, sampleName, sampleType, sampleTime, annotationName, annotationMD5, version)
}
77 |
78 |
79 |
# Input count file, output file and the fixed conversion rate (b) used by LL2.
file = opt$file
output = opt$output
estb = as.numeric(opt$rate)

# Sample meta data from the two header lines of the count file.
meta = readMeatInfo(file)
id = meta[1]
type = meta[3]
time = meta[4]

data = read.table(file, header = T, stringsAsFactors = F)

# Fit one model per row. seq_len() makes a table without data rows a no-op;
# 1:nrow(data) would iterate over 1 and 0 and fail on the missing row.
for(i in seq_len(nrow(data))) {
  # ReadCount / TcReadCount hold comma separated values, one entry per read.
  N = as.numeric(strsplit(data[i,]$ReadCount, ",")[[1]])
  k = as.numeric(strsplit(data[i,]$TcReadCount, ",")[[1]])
  # LL2 reads the data from the global variable `sample`.
  sample = data.frame(n = N, k = k)
  fit = mle2(minuslogl = LL2, start = list(as = 0.29), method = "L-BFGS-B", lower = c(as = 0.000001), upper = c(as = 0.99))
  confinv = confint(fit)

  data[i,]$ConversionRate = fit@coef[[1]]
  # Replace the per-read lists by summary counts: number of reads and
  # number of reads with at least one conversion.
  data[i,]$ReadCount = length(N)
  data[i,]$TcReadCount = sum(k > 0)
  data[i,]$ConversionRateLower = confinv[[1]]
  data[i,]$ConversionRateUpper = confinv[[2]]
}

# Confidence limits that could not be determined default to the full range [0, 1].
if(any(is.na(data$ConversionRateLower))) {
  data[is.na(data$ConversionRateLower), ]$ConversionRateLower = 0
}
if(any(is.na(data$ConversionRateUpper))) {
  data[is.na(data$ConversionRateUpper), ]$ConversionRateUpper = 1
}
120 |
121 | #all = data.frame()
122 | #for(file in files) {
123 | # part = read.table(file, header = T)
124 | # all = rbind(all, part)
125 | #}
126 | #head(all)
127 | # Estimate b from all data
128 | #sample = all
129 | #fit = mle2(minuslogl = LL, start = list(a = 0.1, b = 0.01), method = "L-BFGS-B", lower = c(a = 0.000001, b = 0.000001), upper = c(a = 0.99, b = 0.1))
130 |
131 | #result = c()
132 | #names = as.character(unique(all$utr))
133 |
134 | #for(name in names) {
135 | # #name = names[2]
136 | # sample = all[all$utr == name, ]
137 | #
138 | # fit = mle2(minuslogl = LL2, start = list(as = 0.29), method = "L-BFGS-B", lower = c(as = 0.000001), upper = c(as = 0.99))
139 | # confinv = confint(fit)
140 | ## result = rbind(result, c(id, type, time, name, fit@coef[[1]], confinv[[1]], confinv[[2]]))
141 | #
142 | #}
143 | #
144 |
# Copy the two header lines of the input file to the output, followed by the
# updated table (tab separated, with column names).
header = readLines(file, 2)
con <- file(output, open="wt")
writeLines(header[1:2], con)
write.table(data, con, sep = "\t", quote = F, row.names = F, col.names = T)
close(con)
154 |
--------------------------------------------------------------------------------
/slamdunk/contrib/RNASeqReadSimulator/src/genexplvprofile.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | '''
3 | This file randomly assigns weights for each transcript, and gets the transcript statistics by a given transcript annotation file (BED File).
4 |
5 | USAGE
6 |
7 | genexplvprofile.py {OPTIONS}
8 |
9 | OPTIONS
10 |
11 | -h/--help\tPrint this message
12 |
13 | -e/--lognormal\tmu,sigma Specify the mean and variance of the lognormal distribution used to assign expression levels. Default -4,4
14 | --geometric\tmu Use geometric distribution with parameter mu instead of lognormal distribution to assign expression levels.
15 |
16 | -f/--statonly\tPrint the statistics only; do not assign expression levels.
17 |
18 | NOTE
19 |
20 | 1. To get a good group information, the BED file is suggested to sort according to the chromosome name and start position. In Unix systems, use "sort -k 1,1 -k 2,2n in.BED > out.BED" to get a sorted version (out.BED) of the bed file (in.BED).
21 |
22 | 2. The weight is at the 8th column, if -f option is not specified. The expression level of each transcript (RPKM) can be calculated as column[8]*10^9/column[2]/sum(column[8]).
23 |
24 | HISTORY
25 |
26 | 07/24/2012
27 | Enable geometric distribution for expression level assignments. Require numpy package.
28 |
29 | 02/16/2012
30 | Run on Python 2.7
31 |
32 | 02/08/2012
33 | Initialization.
34 | '''
35 |
36 | from __future__ import print_function
37 | import sys;
38 | import pydoc;
39 | import os;
40 | import re;
41 | import fileinput;
42 | import random;
43 | import numpy;
44 |
def parsebed(lines):
    """Parse one tab separated line with twelve fields (BED12 layout).

    Returns ``[chrom, start, end, name, length, strand, blockcount]`` where
    ``start`` is converted from 0-based to 1-based, ``length`` is the sum of
    the comma separated values in column 11 and ``blockcount`` is column 10
    as a string. Lines that do not have exactly twelve fields yield
    ``['', -1, -1, 0]``.
    """
    fields = lines.strip().split('\t')
    if len(fields) != 12:
        return ['', -1, -1, 0]

    # Columns 11 and 12 may carry a trailing comma.
    for column in (10, 11):
        if fields[column].endswith(','):
            fields[column] = fields[column][:-1]

    blocksizes = [int(token) for token in fields[10].split(',')]
    # Not used further, but still parsed so malformed values raise ValueError.
    blockstarts = [int(token) for token in fields[11].split(',')]

    start = int(fields[1]) + 1  # start is 0-base; increase 1 to convert to 1-base
    end = int(fields[2])
    return [fields[0], start, end, fields[3], sum(blocksizes), fields[5], fields[9]]
65 |
# Defaults, overridden by the command line options parsed below.
argvi = 1
mindist = 50
minscore = 2
mu = -4
sigma = 4
assignexplv = True


allfile = []

distype = "lognormal"


def _optionvalue(index):
    """Return the argument following the option at sys.argv[index], or exit with an error."""
    if index + 1 >= len(sys.argv):
        print('Error: missing parameter for ' + sys.argv[index] + '.', file=sys.stderr)
        sys.exit(1)
    return sys.argv[index + 1]


# Hand-rolled option parsing: everything that is not a known option is
# treated as an input file name.
while argvi < len(sys.argv):
    arg = sys.argv[argvi]
    if arg == "-h" or arg == "--help":
        print(pydoc.render_doc(sys.modules[__name__]), file=sys.stderr)
        sys.exit()
    elif arg == "-f" or arg == "--statonly":
        assignexplv = False
    elif arg == "-e" or arg == "--lognormal":
        distype = "lognormal"
        ms = _optionvalue(argvi).split(",")
        argvi = argvi + 1
        if len(ms) != 2:
            print('Error: incorrect parameter for -e.', file=sys.stderr)
            sys.exit(1)
        try:
            mu = float(ms[0])
            sigma = float(ms[1])
        except ValueError:
            print('Error: incorrect parameter for -e.', file=sys.stderr)
            sys.exit(1)
        print('Mean and variance for lognormal distribution: ' + str(mu) + ',' + str(sigma), file=sys.stderr)
    elif arg == "--geometric":
        distype = "geometric"
        try:
            mu = float(_optionvalue(argvi))
        except ValueError:
            print('Error: incorrect parameter for --geometric.', file=sys.stderr)
            sys.exit(1)
        # The success probability of a geometric distribution must lie in (0, 1].
        if not 0 < mu <= 1:
            print('Error: the parameter for geometric distribution must be between 0 and 1.', file=sys.stderr)
            sys.exit(1)
        print('Mean for geometric distribution: ' + str(mu), file=sys.stderr)
        argvi = argvi + 1
    else:
        allfile.append(arg)
    argvi = argvi + 1
113 |
114 |
115 | allid={};
116 |
117 | prevchr="";
118 | prevrange=[0,0];
119 | rangeid=0;
120 |
121 | nline=0;
122 |
123 | currentgene=[];
124 | groupid=0;
125 |
126 | print('#ID\tLength\tDir\tExons\tPosition\tGroupID\tNIsoformInGroup',end='');
127 | if assignexplv==True:
128 | print('\tExplv');
129 | else:
130 | print();
131 |
132 | for lines in fileinput.input(allfile):
133 | nline=nline+1;
134 | pf=parsebed(lines);
135 | chrname=pf[0];jstart=pf[1];jend=pf[2];id=pf[3];
136 | if len(chrname)==0 and jstart<0:
137 | continue;
138 | length=pf[4];direction=pf[5];nexon=pf[6];
139 | if chrname!=prevchr or jstart-prevrange[1]>0:
140 | if len(prevchr)!=0:
141 | groupid=groupid+1;
142 | for item in currentgene:
143 | print(item[0]+"\t"+str(groupid)+"\t"+str(len(currentgene)),end='');
144 | if assignexplv==True:
145 | if distype=="geometric":
146 | weight=numpy.random.geometric(mu)*item[1];
147 | else:
148 | weight=random.lognormvariate(mu,sigma)*item[1];
149 | print("\t"+str(weight));
150 | else:
151 | print();
152 | prevrange[0]=jstart;
153 | prevrange[1]=jend;
154 | prevchr=chrname;
155 | rangeid=rangeid+1;
156 | currentgene=[];
157 | elif jstartprevrange[1]:
161 | prevrange[1]=jend;
162 | currentgene.append((id+"\t"+str(length)+"\t"+direction+"\t"+str(nexon)+"\t"+chrname+":"+str(jstart)+"-"+str(jend),length));
163 |
164 |
# Emit the last group of overlapping transcripts collected by the loop above.
if len(prevchr) != 0:
    groupid = groupid + 1
    groupsize = str(len(currentgene))
    for description, genelength in currentgene:
        print(description + "\t" + str(groupid) + "\t" + groupsize, end='')
        if assignexplv == True:
            # Random weight scaled by the transcript length.
            if distype == "geometric":
                weight = numpy.random.geometric(mu) * genelength
            else:
                weight = random.lognormvariate(mu, sigma) * genelength
            print("\t" + str(weight))
        else:
            print()
177 |
178 |
--------------------------------------------------------------------------------
/slamdunk/plot/splash_eval_count_files.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Script to evaluate Slamdunk count results
3 | #
4 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
5 | #
6 | # This file is part of Slamdunk.
7 | #
8 | # Slamdunk is free software: you can redistribute it and/or modify
9 | # it under the terms of the GNU Affero General Public License as
10 | # published by the Free Software Foundation, either version 3 of the
11 | # License, or (at your option) any later version.
12 | #
13 | # Slamdunk is distributed in the hope that it will be useful,
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | # GNU Affero General Public License for more details.
17 | #
18 | # You should have received a copy of the GNU Affero General Public License
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 |
21 | library(getopt)
22 |
# Command line specification for getopt. One row per option:
# long name, short flag, argument mask (0 = no argument, 2 = optional argument),
# data type, description shown in the usage message.
spec = matrix(c(
  'help'     , 'h', 0, "logical",   "print the usage of the command",
  'simulated', "s", 2, "character", "Summarized count file",
  'slamdunk' , "d", 2, "character", "Summarized count file",
  'output'   , "o", 2, "character", "Output pdf"
), ncol = 5, byrow = T)

opt = getopt(spec)

# Print the usage message and exit with a non-zero status.
if ( !is.null(opt$help) || length(opt)==1 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  # The description used to be copied from the rate plotting script.
  cat(basename(self),": Evaluate Slamdunk count results.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}


if ( is.null(opt$simulated) ) stop("arg simulated must be specified")
if ( is.null(opt$slamdunk) ) stop("arg slamdunk must be specified")
if ( is.null(opt$output) ) stop("arg output must be specified")
46 |
47 |
# Root mean squared error between two numeric vectors; NA pairs are ignored.
rsme <- function(model, measure) {
  squaredError = (model - measure)^2
  sqrt(mean(squaredError, na.rm = TRUE))
}
51 |
52 | #folder = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_6/"
53 | #version = "slamdunk/"
54 | #tcRatePerPosition = 0.024
55 | #readLength = 50 - 12
56 | #sampleNumber = 21
57 | #cfactor = 1 - dbinom(0, round(readLength / 4), tcRatePerPosition)
58 |
# Input tables and output locations (the RMSE table is written next to the pdf).
simulatedFileRates = opt$simulated
slamDunkFile = opt$slamdunk
outputFile = opt$output
outputFileCSV = paste0(outputFile, ".tsv")

simulatedRates = read.table(simulatedFileRates, header=T, sep="\t", stringsAsFactors = F)
slamDunkRates = read.table(slamDunkFile, header=T, sep="\t", stringsAsFactors = F)

# Should not be necessary, but for large datasets some entries are lost.
# Keep only entries that are found in both tables.
inBoth = intersect(simulatedRates$Name, slamDunkRates$Name)
simulatedRates = simulatedRates[simulatedRates$Name %in% inBoth,]
slamDunkRates = slamDunkRates[slamDunkRates$Name %in% inBoth,]

# Index of the first per-sample column; every column from there on is one sample.
fixedColumns = 11
sampleNumber = ncol(simulatedRates) - fixedColumns + 1
sampleColumns = fixedColumns:ncol(simulatedRates)

sampleNames = colnames(simulatedRates)[sampleColumns]
simulatedSamples = simulatedRates[, sampleColumns]
slamDunkSamples = slamDunkRates[, sampleColumns]
83 |
# Open the output pdf; two boxplots are placed on each page.
pdf(outputFile)
par(mfrow=c(2,1))
# Page 1: per-sample difference and log2 ratio between simulated and Slamdunk values.
boxplot(simulatedSamples - slamDunkSamples, ylim=c(-1,1), names = sampleNames, ylab="Simulated - Slamdunk", xlab="Labeled Transcripts [%]", main="", las=2)
abline(h=0, lty=2, col="grey")

# A pseudo count of 0.001 avoids division by zero and log2(0).
boxplot(log2((simulatedSamples + 0.001) / (slamDunkSamples + 0.001)), names = sampleNames, ylab="log2(Simulated / Slamdunk)", xlab="Labeled Transcripts [%]", main="", las=2)
abline(h=0, lty=2, col="grey")

# Page 2: the same two plots with narrower y axis limits.
boxplot(simulatedSamples - slamDunkSamples, ylim=c(-0.1,0.1), names = sampleNames, ylab="Simulated - Slamdunk", xlab="Labeled Transcripts [%]", main="", las=2)
abline(h=0, lty=2, col="grey")

boxplot(log2((simulatedSamples + 0.001) / (slamDunkSamples + 0.001)), ylim=c(-1,1), names = sampleNames, ylab="log2(Simulated / Slamdunk)", xlab="Labeled Transcripts [%]", main="", las=2)
abline(h=0, lty=2, col="grey")
97 |
# Stack all samples into one long table (annotation columns + Slamdunk value +
# simulated value) and collect the RMSE per sample.
merged = data.frame()
rsmeTab = matrix("", ncol=3, nrow=0)
# Explicit parentheses: "1:fixedColumns - 1" only worked because index 0 is dropped.
annotationColumns = 1:(fixedColumns - 1)
for(currentSample in 0:(sampleNumber - 1)) {
  sampleColumn = fixedColumns + currentSample
  current = cbind(slamDunkRates[, c(annotationColumns, sampleColumn)], simulatedRates[, sampleColumn])
  # The Slamdunk value comes first, the simulated value second. The two labels
  # used to be swapped, which was compensated by swapped arguments in the plots below.
  colnames(current) = c(colnames(slamDunkRates)[annotationColumns], "Slamdunk", "Simulate")
  merged = rbind(merged, current)

  # substring(., 2) drops the first character of the column name.
  rsmeTab = rbind(rsmeTab, c(as.character(simulatedFileRates), as.character(substring(sampleNames[currentSample + 1], 2)), as.character(rsme(current$Simulate, current$Slamdunk))))
}

par(mfrow=c(1,1))
perr = round(rsme(merged$Simulate, merged$Slamdunk), digits = 4)
pcorr = round(cor(merged$Simulate, merged$Slamdunk), digits = 4)
# Scatter plot of simulated (x) against Slamdunk (y) values with the identity line.
plot(merged$Simulate, merged$Slamdunk, xlim=c(0,1), ylim=c(0,1), pch=4, xlab="Simulated", ylab="Slamdunk", main=paste("Cor: ", pcorr, ", RMSE: ", perr))
abline(a = 0, b = 1, col="grey", lty=2)

# Difference (simulated - Slamdunk, the same values as plotted before the label fix)
# against T content, read depth and multimapper count.
plot(merged$avgTcontent, merged$Simulate - merged$Slamdunk, ylim=c(-1,1), pch=4)

plot(merged$avgReadsCPM, merged$Simulate - merged$Slamdunk, ylim=c(-1,1), pch=4)

plot(merged$avgMultimapper, merged$Simulate - merged$Slamdunk, ylim=c(-1,1), pch=4)

dev.off()

# Last row: RMSE over all samples, marked with -1 in the sample column.
rsmeTab = rbind(rsmeTab, c(as.character(simulatedFileRates), as.character(-1), as.character(rsme(merged$Simulate, merged$Slamdunk))))

write.table(rsmeTab, outputFileCSV, sep = "\t", quote = F, row.names = F, col.names = T)
--------------------------------------------------------------------------------
/slamdunk/plot/compute_halflifes.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | # Script to compute half-lifes from SlamSeq data
4 |
5 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder, Bhat Pooja
6 | #
7 | # This file is part of Slamdunk.
8 | #
9 | # Slamdunk is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU Affero General Public License as
11 | # published by the Free Software Foundation, either version 3 of the
12 | # License, or (at your option) any later version.
13 | #
14 | # Slamdunk is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU Affero General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU Affero General Public License
20 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 |
22 | library(getopt)
23 |
# Command line specification for getopt. One row per option:
# long name, short flag, argument mask (0 = no argument, 2 = optional argument),
# data type, description shown in the usage message.
spec = matrix(c(
  'help'      , 'h', 0, "logical",   "print the usage of the command",
  'slamdunk'  , "f", 2, "character", "Comma seperated list of SlamDunk results",
  'timepoints', "t", 2, "character", "Comma seperated list of time points",
  'output'    , "o", 2, "character", "Output tsv"
), ncol = 5, byrow = T)

opt = getopt(spec)

# Print the usage message and exit with a non-zero status.
if ( !is.null(opt$help) || length(opt)==3 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  # The description used to be copied from the rate plotting script.
  cat(basename(self),": Compute half-lifes from SlamSeq data.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}

if ( is.null(opt$slamdunk) ) stop("arg slamdunk must be specified")
if ( is.null(opt$output) ) stop("arg output must be specified")
if ( is.null(opt$timepoints) ) stop("arg timepoints must be specified")
46 |
# Comma separated list of Slamdunk count files, kept in the order given.
slamDunkFiles = opt$slamdunk
filesSlamDunk = as.character(ordered(strsplit(slamDunkFiles, ",")[[1]]))
outputFile = opt$output
# Comma separated time points, divided by 60 for the fit.
timesParameter = opt$timepoints
times = as.numeric(strsplit(timesParameter, ",")[[1]]) / 60
54 |
55 |
# Merge the conversion rates of several count files into one table.
#   times:   vector of time points, one per file (used as column names)
#   files:   count files without header line (12 columns)
#   perRead: if TRUE use convertedReads / readCount instead of conversionRate
# Returns chr, start, stop, name, strand, readsCPM and multiMapCount (both averaged
# over all files) followed by one rate column per time point.
mergeRates <- function(times, files, perRead) {
  columnNames = c("chr", "start", "stop", "name", "strand", "conversionRate", "readsCPM", "tCount", "tcCount", "readCount", "convertedReads", "multiMapCount")
  annotationColumns = c("chr", "start", "stop", "name", "strand")

  mergedRates = data.frame()
  for(i in 1:length(times)) {
    time = times[i]
    print(time)
    simulation = read.table(files[i])
    colnames(simulation) = columnNames

    if(perRead == TRUE) {
      currentRate = simulation$convertedReads / simulation$readCount
    } else {
      currentRate = simulation$conversionRate
    }

    if(nrow(mergedRates) == 0) {
      # First file: set up annotation columns and running sums.
      mergedRates = simulation[, annotationColumns]
      mergedRates$avgReadsCPM = simulation$readsCPM
      mergedRates$avgMultimapper = simulation$multiMapCount
      mergedRates$conversionRate = currentRate
    } else {
      # Further files: add to the running sums and append the rate column.
      mergedRates$avgReadsCPM = mergedRates$avgReadsCPM + simulation$readsCPM
      mergedRates$avgMultimapper = mergedRates$avgMultimapper + simulation$multiMapCount
      mergedRates = cbind(mergedRates, currentRate)
    }
  }

  colnames(mergedRates) = c(annotationColumns, "readsCPM", "multiMapCount", times)
  # Turn the sums into averages.
  mergedRates$readsCPM = mergedRates$readsCPM / length(times)
  mergedRates$multiMapCount = mergedRates$multiMapCount / length(times)
  mergedRates
}
88 |
computeHalfLife <- function(rates, timepoints) {
  # Fit the saturation model rates ~ a * (1 - exp(-k * timepoints)) by
  # non-linear least squares and derive the half-life from the fitted k.
  #
  # Args:
  #   rates:      numeric vector of conversion rates, one per timepoint.
  #   timepoints: numeric vector of times, same length as rates. The
  #               half-life is multiplied by 60, so it comes out in minutes
  #               when timepoints are in hours (assumed from the caller,
  #               which divides its inputs by 60 - confirm).
  #
  # Returns:
  #   Numeric vector of length 4: half-life, plateau a, rate constant k and
  #   the R^2 of the fit. All four entries are NA when nls() cannot fit the
  #   data (e.g. all rates are zero), instead of aborting with an error.

  # Starting values: a is the plateau the curve approaches for large
  # timepoints, so the largest observed rate is a reasonable guess; k starts
  # at the value that corresponds to a half-life of 5 time units.
  a_start = max(rates)
  k_start = log(2) / 5

  fit = tryCatch(
    nls(rates ~ a * (1 - exp(-k * timepoints)), start = list(a = a_start, k = k_start)),
    error = function(e) NULL
  )

  # Everything below needs a fitted model; without one there is nothing to
  # report for this entry.
  if (is.null(fit)) {
    return(rep(NA_real_, 4))
  }

  estimates = coef(fit)
  C = estimates[1]
  k = estimates[2]
  halfLifePred = log(2) / k * 60

  # Coefficient of determination of the fitted curve.
  RSS.p = sum(residuals(fit)^2)
  TSS = sum((rates - mean(rates))^2)
  rsquared = 1 - (RSS.p / TSS)

  c(halfLifePred, C, k, rsquared)
}
112 |
# Use the conversion rate reported in the count files rather than the
# per-read rate (convertedReads / readCount).
perRead = FALSE
slamDunkMergedRates = mergeRates(times, filesSlamDunk, perRead)

# The merged table holds 7 annotation columns followed by one rate column
# per timepoint.
metaColumns = c("chr", "start", "stop", "name", "strand", "readsCPM", "multiMapCount")
rateColumns = 7 + seq_along(times)

# Fit one half-life per row. seq_len() keeps an empty table from being
# iterated as c(1, 0), and collecting the scores in a vector avoids growing a
# data.frame with rbind() on every iteration.
halfLifes = vapply(seq_len(nrow(slamDunkMergedRates)), function(utr) {
  utrRates = as.numeric(unlist(slamDunkMergedRates[utr, rateColumns]))
  # Only the first entry (the half-life) of the result is reported.
  as.numeric(computeHalfLife(utrRates, times)[1])
}, numeric(1))

halfLifeTable = cbind(slamDunkMergedRates[, metaColumns], halfLifes)

# The leading "#" turns the header into a comment line for BED-style readers.
colnames(halfLifeTable) = c("#chr", "start", "stop", "name", "strand", "readsCPM", "multiMapCount", "score")

write.table(halfLifeTable, outputFile, sep = "\t", quote = F, row.names = F, col.names = T)
132 |
--------------------------------------------------------------------------------
/slamdunk/contrib/RNASeqReadSimulator/README:
--------------------------------------------------------------------------------
1 | RNASeqReadSimulator
2 | ==================
3 | Author: Wei Li (li.david.wei AT gmail.com)
4 |
5 | Introduction
6 | ------------
7 | RNASeqReadSimulator is a set of scripts generating simulated RNA-Seq reads. RNASeqReadSimulator provides users a simple tool to generate RNA-Seq reads for research purposes, and a framework to allow experienced users to expand functions. RNASeqReadSimulator offers the following features:
8 |
9 | 1. It allows users to randomly assign expression levels of transcripts and generate simulated single-end or paired-end RNA-Seq reads.
10 |
11 | 2. It is able to generate RNA-Seq reads that have a specified positional bias profile.
12 |
13 | 3. It is able to simulate random read errors from sequencing platforms.
14 |
15 | 4. The simulator consists of a few simple Python scripts. All scripts are command line driven, allowing users to invoke and design more functions.
16 |
17 | Requirements
18 | ------------
19 | RNASeqReadSimulator runs on Python 2.7 and requires the Biopython package.
20 |
21 | Installation
22 | ------------
23 | After download, it is suggested that the path of the scripts (src) be added to the system path. For example, if the scripts are located at /home/me/rnaseqsimulator, then add the following command to your .bashrc profile:
24 |
25 | export PATH="$PATH:/home/me/rnaseqsimulator/src"
26 |
27 | Demo
28 | ----
29 | The demo folder includes a few scripts and sample input files to generate RNA-Seq reads from a simple example. Two bash scripts, gensingleendreads.sh and genpairedendreads.sh, are examples to generate single-end and paired-end reads.
30 |
31 |
32 |
33 | Usage
34 | -----
35 |
36 | RNASeqReadSimulator includes the following essential scripts:
37 |
38 | genexplvprofile.py is used to assign a random expression level of transcripts;
39 |
40 | gensimreads.py simulates RNA-Seq reads in BED format;
41 |
42 | getseqfrombed.py converts reads from BED format to FASTA format;
43 |
44 | Other optional scripts and files include:
45 |
46 | splitfasta.py splits paired-end reads in a FASTA file into two separate files;
47 |
48 | addvariation2splicingbed.py is a supplementary script to generate variations in splicing RNA-Seq reads.
49 |
50 |
51 |
52 | genexplvprofile.py
53 | ------------------
54 |
55 | This script randomly assigns a weight to each transcript and computes the transcript statistics from a given transcript annotation file (BED file).
56 |
57 | USAGE genexplvprofile.py {OPTIONS}
58 |
59 | OPTIONS
60 |
61 | -h/--help Print this message
62 |
63 | -e/--lognormal Specify the mean and variance of the lognormal distribution used to assign expression levels. Default -4,4
64 |
65 | -f/--statonly Print the statistics only; do not assign expression levels.
66 |
67 | NOTE:
68 |
69 | To obtain correct group information, the BED file should be sorted by chromosome name and start position. On Unix systems, use "sort -k 1,1 -k 2,2n in.BED > out.BED" to produce a sorted version (out.BED) of the BED file (in.BED).
70 |
71 | If the -f option is not specified, the weight is in the 8th column. The expression level of each transcript (RPKM) can be calculated as column[8]*10^9/column[2]/sum(column[8]).
72 |
73 |
74 | gensimreads.py
75 | -------------
76 | This script generates simulated RNA-Seq reads (in .bed format) from known gene annotations.
77 |
78 | Usage: gensimreads.py {OPTIONS} <BED-File|->
79 |
80 | BED-File: The gene annotation file (in BED format). Use '-' for STDIN input.
81 |
82 | OPTIONS
83 |
84 | -e/--expression Specify the weight of each transcript. Each line in the file should have at least (NFIELD+1) fields, with field 0 the annotation id, and field NFIELD the weight of this annotation. NFIELD is given by -f/--field option. If this file is not provided, uniform weight is applied. See the output of genexplvprofile.py as an example.
85 |
86 | -n/--nreads Specify the number of reads to be generated. Default 100000.
87 |
88 | -b/--posbias Specify the positional bias file. The file should include at least 100 lines, each containing a single integer that gives the preference of the positional bias at that position. If no positional bias file is specified, a uniform distribution is used.
89 |
90 | -l/--readlen Specify the read length. Default 32.
91 |
92 | -o/--output