├── slamdunk ├── __init__.py ├── dunks │ ├── __init__.py │ ├── dump.py │ ├── snps.py │ ├── deduplicator.py │ ├── mapper.py │ └── filter.py ├── plot │ ├── __init__.py │ ├── checkLibraries.R │ ├── PCAPlotter.R │ ├── conversion_per_read_position.R │ ├── compute_context_TC_rates.R │ ├── compute_sample_comparison_statistics.R │ ├── compute_overall_rates.R │ ├── compute_conversion_rate_mle.R │ ├── splash_eval_count_files.R │ ├── compute_halflifes.R │ ├── SNPeval.R │ ├── globalRatePlotter.R │ ├── eval_halflifes_error_plot.R │ ├── eval_conversion_rate_plots.R │ ├── eval_halflife_per_gene_plots.R │ └── merge_rate_files.R ├── slamseq │ └── __init__.py ├── test │ ├── __init__.py │ ├── data │ │ ├── actb.bed │ │ ├── reads_slamdunk_mapped_filtered_tcount.tsv │ │ └── reads.fq │ ├── test_sample.py │ └── test_sample.sh ├── utils │ ├── __init__.py │ ├── BedReader.py │ ├── SNPtools.py │ └── misc.py ├── contrib │ └── RNASeqReadSimulator │ │ ├── src │ │ ├── getSegs.pyc │ │ ├── splitfasta.py │ │ ├── addvariation2splicingbed.py │ │ ├── getSegs.py │ │ ├── genexplvprofile.py │ │ ├── getseqfrombed.py │ │ └── gensimreads.py │ │ ├── demo │ │ ├── input │ │ │ ├── samplereaderror.txt │ │ │ ├── sampleposbias.txt │ │ │ └── sample.bed │ │ ├── gensingleendreads.sh │ │ ├── genstrandedreads.sh │ │ └── genpairedendreads.sh │ │ └── README └── version.py ├── MANIFEST.in ├── .settings └── .gitignore ├── requirements.txt ├── .gitignore ├── environment.yml ├── hooks └── build ├── bin ├── splash ├── alleyoop ├── slamdunk └── _preamble.py ├── .travis.yml ├── Dockerfile ├── README.md └── setup.py /slamdunk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md -------------------------------------------------------------------------------- /slamdunk/dunks/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slamdunk/plot/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slamdunk/slamseq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slamdunk/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slamdunk/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.settings/.gitignore: -------------------------------------------------------------------------------- 1 | /org.eclipse.core.resources.prefs 2 | -------------------------------------------------------------------------------- /slamdunk/test/data/actb.bed: -------------------------------------------------------------------------------- 1 | chr5 120498 122492 Actb 0 + 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=0.9.4 2 | pybedtools>=0.6.4 3 | intervaltree>=2.1.0 4 | pandas>=0.13.1 5 | biopython>=1.63 6 | pysam>=0.8.3 7 | Cython>=0.20.1 -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/src/getSegs.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/t-neumann/slamdunk/HEAD/slamdunk/contrib/RNASeqReadSimulator/src/getSegs.pyc 
-------------------------------------------------------------------------------- /slamdunk/test/test_sample.py: -------------------------------------------------------------------------------- 1 | # content of test_sample.py 2 | def func(x): 3 | return x + 1 4 | 5 | #def test_run(): 6 | 7 | 8 | def test_answer(): 9 | assert func(4) == 5 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.project 2 | /.pydevproject 3 | /sync.sh 4 | *.pyc 5 | .DS_Store 6 | /doc/build/ 7 | /bin/NextGenMap/ 8 | /bin/ngm 9 | /slamdunk/plot/Rslamdunk 10 | .Rhistory 11 | .cache 12 | *-enc.2.ngm 13 | *-ht-13-2.3.ngm 14 | *.fai 15 | 16 | -------------------------------------------------------------------------------- /slamdunk/test/data/reads_slamdunk_mapped_filtered_tcount.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start End Name Length Strand ConversionRate ReadsCPM Tcontent CoverageOnTs ConversionsOnTs ReadCount TcReadCount multimapCount ConversionRateLower ConversionRateUpper 2 | chr5 120498 122492 Actb 1994 + 0.022222222222222223 666666.6666666666 445 90 2 8 4 0 -1.0 -1.0 3 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: slamdunk 2 | dependencies: 3 | - nextgenmap=0.5.5 4 | - samtools=1.10 5 | - varscan=2.4.4 6 | - r-tidyverse=1.3.0 7 | - r-matrixstats=0.55.0 8 | - r-gridextra=2.3 9 | - r-getopt=1.20.3 10 | - joblib=0.14.0 11 | - pandas=0.25.3 12 | - cython=0.29.14 13 | - biopython=1.74 14 | - pybedtools=0.8.0 15 | - intervaltree=3.0.2 16 | -------------------------------------------------------------------------------- /hooks/build: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | if [ 
"$DOCKER_TAG" = "latest" ]; then 4 | echo "Building :latest, without VERSION_ARG" 5 | TAG=`curl -s https://api.github.com/repos/t-neumann/slamdunk/releases | grep tag_name | head -n 1 | cut -d '"' -f 4` 6 | echo $TAG 7 | docker build --build-arg VERSION_ARG="$TAG" -t ${IMAGE_NAME} . 8 | else 9 | echo "Building :$DOCKER_TAG, with VERSION_ARG=\"--vers $DOCKER_TAG\"" 10 | docker build --build-arg VERSION_ARG="$DOCKER_TAG" -t ${IMAGE_NAME} . 11 | fi -------------------------------------------------------------------------------- /slamdunk/test/test_sample.sh: -------------------------------------------------------------------------------- 1 | 2 | slamdunk all -r slamdunk/test/data/ref.fa -b slamdunk/test/data/actb.bed -o slamdunk/test/data/output -rl 100 -mbq 27 -5 0 slamdunk/test/data/reads.fq 3 | 4 | grep -v "^#" slamdunk/test/data/output/count/reads_slamdunk_mapped_filtered_tcount.tsv > slamdunk/test/data/output/count/reads_slamdunk_mapped_filtered_tcount_noheader.tsv 5 | 6 | diff slamdunk/test/data/reads_slamdunk_mapped_filtered_tcount.tsv slamdunk/test/data/output/count/reads_slamdunk_mapped_filtered_tcount_noheader.tsv 7 | 8 | -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/src/splitfasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Split fasta files including paired-end reads into two separate files 4 | """ 5 | from __future__ import print_function; 6 | import sys; 7 | import re; 8 | 9 | outfile1=""; 10 | outfile2=""; 11 | 12 | for i in range(len(sys.argv)): 13 | if sys.argv[i]=="-o": 14 | outfile1=sys.argv[i+1]+"_1.fa"; 15 | outfile2=sys.argv[i+1]+"_2.fa"; 16 | 17 | if outfile1=="": 18 | sys.exit(-1); 19 | 20 | ofid1=open(outfile1,"w"); 21 | ofid2=open(outfile2,"w"); 22 | 23 | isleft=True; 24 | for lines in sys.stdin: 25 | if lines[0]=='>': 26 | if lines.strip()[-1]=='1': 27 | isleft=True; 28 | else: 29 | 
isleft=False; 30 | lines=re.sub("/[12]","",lines); 31 | if isleft: 32 | print(lines,file=ofid1,end=''); 33 | else: 34 | print(lines,file=ofid2,end=''); 35 | 36 | 37 | 38 | 39 | 40 | ofid1.close(); 41 | ofid2.close(); 42 | -------------------------------------------------------------------------------- /bin/splash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | 20 | import sys 21 | 22 | try: 23 | import _preamble 24 | except ImportError: 25 | sys.exc_clear() 26 | 27 | from slamdunk import splash 28 | splash.run() -------------------------------------------------------------------------------- /bin/alleyoop: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 
11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | 20 | import sys 21 | 22 | try: 23 | import _preamble 24 | except ImportError: 25 | sys.exc_clear() 26 | 27 | from slamdunk import alleyoop 28 | alleyoop.run() -------------------------------------------------------------------------------- /bin/slamdunk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | 20 | import sys 21 | 22 | try: 23 | import _preamble 24 | except ImportError: 25 | sys.exc_clear() 26 | 27 | from slamdunk import slamdunk 28 | slamdunk.run() 29 | -------------------------------------------------------------------------------- /slamdunk/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 
2 | # 3 | # This file is part of Slamdunk. 4 | # 5 | # Slamdunk is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Affero General Public License as 7 | # published by the Free Software Foundation, either version 3 of the 8 | # License, or (at your option) any later version. 9 | # 10 | # Slamdunk is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Affero General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Affero General Public License 16 | # along with this program. If not, see . 17 | 18 | # Overall slamDunk version 19 | __version__ = "0.4.3" 20 | # File format version of BAM files from slamdunk filter 21 | __bam_version__ = "3" 22 | # File format version of count files from slamdunk count 23 | __count_version__ = "3" 24 | # Required NextGenMap version 25 | __ngm_version__ = "0.5.5" 26 | -------------------------------------------------------------------------------- /bin/_preamble.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 2 | # 3 | # This file is part of Slamdunk. 4 | # 5 | # Slamdunk is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Affero General Public License as 7 | # published by the Free Software Foundation, either version 3 of the 8 | # License, or (at your option) any later version. 9 | # 10 | # Slamdunk is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Affero General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Affero General Public License 16 | # along with this program. If not, see . 
17 | 18 | import sys, os 19 | 20 | path = os.path.abspath(sys.argv[0]) 21 | while os.path.dirname(path) != path: 22 | if os.path.exists(os.path.join(path, 'slamdunk', '__init__.py')): 23 | 24 | #sys.path.insert(0, os.path.join(path, 'slamdunk')) 25 | sys.path.insert(0, path) 26 | break 27 | path = os.path.dirname(path) 28 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | # We don't actually use the Travis Python, but this keeps it organized. 4 | - "2.7" 5 | - "3.5" 6 | - "3.6" 7 | - "3.7" 8 | 9 | before_install: 10 | 11 | # Here we just install Miniconda, which you shouldn't have to change. 12 | - if [ "$TRAVIS_OS_NAME" == "osx" ]; then 13 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh; 14 | else 15 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 16 | fi 17 | - chmod +x miniconda.sh 18 | - ./miniconda.sh -b -p $HOME/miniconda 19 | - export PATH=/home/travis/miniconda/bin:$PATH 20 | - conda update --yes conda 21 | - conda config --add channels defaults 22 | - conda config --add channels bioconda 23 | - conda config --add channels conda-forge 24 | 25 | install: 26 | 27 | # We just set up a conda environment with the right Python version. This 28 | # should not need changing. 29 | 30 | - conda env create -f environment.yml 31 | - source activate slamdunk 32 | - pip install pytest 33 | - pip install . 
34 | 35 | # command to run tests 36 | script: 37 | - slamdunk -h 38 | - alleyoop -h 39 | - splash -h 40 | - slamdunk/test/test_sample.sh 41 | - pytest 42 | -------------------------------------------------------------------------------- /slamdunk/plot/checkLibraries.R: -------------------------------------------------------------------------------- 1 | # Helper function to check whether Rslamdunk libraries are available 2 | # Install if libraries are not available 3 | 4 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 5 | # 6 | # This file is part of Slamdunk. 7 | # 8 | # Slamdunk is free software: you can redistribute it and/or modify 9 | # it under the terms of the GNU Affero General Public License as 10 | # published by the Free Software Foundation, either version 3 of the 11 | # License, or (at your option) any later version. 12 | # 13 | # Slamdunk is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU Affero General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU Affero General Public License 19 | # along with this program. If not, see . 
20 | 21 | checkLib <- function(libLoc) { 22 | 23 | list.of.packages <- c("getopt","ggplot2","gridExtra","RColorBrewer","lattice","matrixStats","assertthat","lazyeval","tibble") 24 | new.packages <- list.of.packages[!(list.of.packages %in% installed.packages(lib.loc = libLoc)[,"Package"])] 25 | 26 | if(length(new.packages)) install.packages(new.packages, repos="http://cran.wu.ac.at/", lib = libLoc, dependencies = TRUE) 27 | } -------------------------------------------------------------------------------- /slamdunk/test/data/reads.fq: -------------------------------------------------------------------------------- 1 | @Read1_1_0 2 | CCGTTTGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @Read2_0_0 6 | TTTGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA 7 | + 8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @Read3_1_0 10 | CTTGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA 11 | + 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | @Read4_1_1 14 | TCTGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA 15 | + 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @Read5_2_2 18 | CCGTGTAAGGTAAGGTGTGCACTTTTATTGGTCTCA 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | @Read6_1_0 22 | TTGTGTAAGGTAAGGCGTGCACTTTTATTGGTCTCA 23 | + 24 | FFFFFFFFFFFFFFF. 
17 | 18 | FROM continuumio/miniconda3:4.7.12 19 | 20 | MAINTAINER Tobias Neumann 21 | 22 | ARG VERSION_ARG 23 | 24 | COPY environment.yml /tmp/environment.yml 25 | 26 | RUN apt-get update \ 27 | && apt-get install -y procps \ 28 | && apt-get clean -y \ 29 | && rm -rf /var/lib/apt/lists/* \ 30 | && conda config --add channels defaults \ 31 | && conda config --add channels bioconda \ 32 | && conda config --add channels conda-forge \ 33 | && conda env create --name slamdunk -f /tmp/environment.yml \ 34 | && /opt/conda/envs/slamdunk/bin/pip install git+https://github.com/t-neumann/slamdunk.git@${VERSION_ARG} \ 35 | && rm -rf /opt/conda/pkgs/* 36 | 37 | ENV PATH /opt/conda/envs/slamdunk/bin:$PATH 38 | -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/demo/gensingleendreads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #------------------------------------------- 4 | # Parameters 5 | # Required parameters: 6 | # Transcript annotation (BED file) 7 | BED=input/sample.bed 8 | 9 | # output FASTA prefix 10 | FASTAFILE=output/single.fa 11 | 12 | # reference chromosome 13 | REFERENCE=input/reference.fa 14 | 15 | # Optional parameters 16 | # Read length 17 | READLEN=75 18 | 19 | # Number of reads generated 20 | NREAD=100000 21 | 22 | # positional bias file 23 | POSBIAS=input/sampleposbias.txt 24 | 25 | # Read error position profile 26 | READERR=input/samplereaderror.txt 27 | 28 | # Intermediate files 29 | # File for random expression level assignment 30 | RANDEXPLV=output/explvprofile.txt 31 | 32 | 33 | #----------------------------------------------- 34 | # Add paths if users don't install the script 35 | export PATH=../src/:$PATH 36 | # Commands to randomly assign weights to each transcript 37 | 38 | if [ ! 
-d "output" ]; then 39 | mkdir output 40 | fi 41 | 42 | CMD0="genexplvprofile.py $BED > $RANDEXPLV" 43 | 44 | echo "Commands to randomly assign weights to each transcript:" 45 | echo $CMD0 46 | 47 | genexplvprofile.py $BED > $RANDEXPLV 48 | 49 | # Commands to simulate reads (output to STDOUT in BED format) 50 | # If you want single-end reads, don't use the "-p" option. 51 | CMD1="gensimreads.py -e $RANDEXPLV -n $NREAD -b $POSBIAS -l $READLEN $BED " 52 | 53 | echo "Commands to generate simulated paired-reads in BED format:" 54 | echo $CMD1 55 | 56 | 57 | # Commands to convert BED file to fasta file 58 | CMD2="getseqfrombed.py -b $READERR -f A -r 0.01 -l $READLEN - $REFERENCE" 59 | 60 | echo "Commands to generate FASTA file from the last command:" 61 | echo $CMD2 62 | echo "Output FASTA prefix: $FASTAFILE" 63 | 64 | 65 | 66 | # Execute two commands simultaneously 67 | # If you want single-end reads, do not use the splitfasta.py. Instead, execute: 68 | $CMD1 | $CMD2 > $FASTAFILE 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/demo/genstrandedreads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #------------------------------------------- 4 | # Parameters 5 | # Required parameters: 6 | # Transcript annotation (BED file) 7 | BED=input/sample.bed 8 | 9 | # output FASTA prefix 10 | FASTAFILE=output/single-stranded.fa 11 | 12 | # reference chromosome 13 | REFERENCE=input/reference.fa 14 | 15 | # Optional parameters 16 | # Read length 17 | READLEN=75 18 | 19 | # Number of reads generated 20 | NREAD=100000 21 | 22 | # positional bias file 23 | POSBIAS=input/sampleposbias.txt 24 | 25 | # Read error position profile 26 | READERR=input/samplereaderror.txt 27 | 28 | # Intermediate files 29 | # File for random expression level assignment 30 | RANDEXPLV=output/explvprofile.txt 31 | 32 | 33 | 
#----------------------------------------------- 34 | # Add paths if users don't install the script 35 | export PATH=../src/:$PATH 36 | # Commands to randomly assign weights to each transcript 37 | 38 | if [ ! -d "output" ]; then 39 | mkdir output 40 | fi 41 | 42 | CMD0="genexplvprofile.py $BED > $RANDEXPLV" 43 | 44 | echo "Commands to randomly assign weights to each transcript:" 45 | echo $CMD0 46 | 47 | genexplvprofile.py $BED > $RANDEXPLV 48 | 49 | # Commands to simulate reads (output to STDOUT in BED format) 50 | # If you want single-end reads, don't use the "-p" option. 51 | CMD1="gensimreads.py --stranded -e $RANDEXPLV -n $NREAD -b $POSBIAS -l $READLEN $BED " 52 | 53 | echo "Commands to generate simulated paired-reads in BED format:" 54 | echo $CMD1 55 | 56 | 57 | # Commands to convert BED file to fasta file 58 | CMD2="getseqfrombed.py -b $READERR -f A -r 0.01 -l $READLEN - $REFERENCE" 59 | 60 | echo "Commands to generate FASTA file from the last command:" 61 | echo $CMD2 62 | echo "Output FASTA prefix: $FASTAFILE" 63 | 64 | 65 | 66 | # Execute two commands simultaneously 67 | # If you want single-end reads, do not use the splitfasta.py. Instead, execute: 68 | $CMD1 | $CMD2 > $FASTAFILE 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ### Streamlining SLAM-Seq analysis with ultra-high sensitivity. 
4 | 5 | [![GitHub release](https://img.shields.io/github/release/t-neumann/slamdunk.svg)](https://github.com/t-neumann/slamdunk/releases/latest) 6 | [![Travis CI](https://img.shields.io/travis/t-neumann/slamdunk.svg)](https://travis-ci.org/t-neumann/slamdunk) 7 | 8 | [![Docker Pulls](https://img.shields.io/docker/pulls/tobneu/slamdunk.svg)](https://hub.docker.com/r/tobneu/slamdunk) 9 | [![Docker Automated build](https://img.shields.io/docker/automated/tobneu/slamdunk.svg)](https://hub.docker.com/r/tobneu/slamdunk/builds/) 10 | 11 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square)](http://bioconda.github.io/recipes/slamdunk/README.html) 12 | [![Anaconda build](https://anaconda.org/bioconda/slamdunk/badges/version.svg 13 | )](https://anaconda.org/bioconda/slamdunk) 14 | [![Anaconda downloads](https://anaconda.org/bioconda/slamdunk/badges/downloads.svg 15 | )](https://anaconda.org/bioconda/slamdunk) 16 | 17 | [![PyPI release](https://img.shields.io/pypi/v/slamdunk.svg)](https://pypi.python.org/pypi/slamdunk) 18 | ![Github Stars](https://img.shields.io/github/stars/t-neumann/slamdunk.svg?style=social&label=Star) 19 | 20 | ----- 21 | 22 | ### Slamdunk documentation 23 | 24 | http://t-neumann.github.io/slamdunk 25 | 26 | ### nf-core slamseq workflow 27 | 28 | [![nfcore/slamseq](https://github.com/nf-core/slamseq/raw/master/docs/images/nf-core-slamseq_logo.png)](https://nf-co.re/slamseq) 29 | 30 | ### Please cite 31 | 32 | Neumann, T., Herzog, V. A., Muhar, M., Haeseler, von, A., Zuber, J., Ameres, S. L., & Rescheneder, P. (2019). [Quantification of experimentally induced nucleotide conversions in high-throughput sequencing datasets](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2849-7). BMC Bioinformatics, 20(1), 258. 
http://doi.org/10.1186/s12859-019-2849-7 33 | 34 | -------------------------------------------------------------------------------- /slamdunk/dunks/dump.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 
19 | 20 | from __future__ import print_function 21 | 22 | from slamdunk.utils.misc import checkStep # @UnresolvedImport 23 | from slamdunk.slamseq.SlamSeqFile import SlamSeqBamFile, SlamSeqWriter # @UnresolvedImport 24 | from slamdunk.utils import SNPtools # @UnresolvedImport 25 | 26 | def dumpReadInfo(referenceFile, bam, minQual, outputCSV, snpsFile, log, printOnly=False, verbose=True, force=False): 27 | 28 | if(not checkStep([bam, referenceFile], [outputCSV], force)): 29 | print("Skipped computing T->C per reads position for file " + bam, file=log) 30 | else: 31 | 32 | snps = SNPtools.SNPDictionary(snpsFile) 33 | snps.read() 34 | 35 | outputFile = SlamSeqWriter(outputCSV) 36 | 37 | #Go through one chr after the other 38 | testFile = SlamSeqBamFile(bam, referenceFile, snps) 39 | 40 | chromosomes = testFile.getChromosomes() 41 | 42 | for chromosome in chromosomes: 43 | readIterator = testFile.readsInChromosome(chromosome) 44 | for read in readIterator: 45 | outputFile.write(read) 46 | 47 | 48 | outputFile.close() -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/demo/genpairedendreads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #------------------------------------------- 4 | # Parameters 5 | # Required parameters: 6 | # Transcript annotation (BED file) 7 | BED=input/sample.bed 8 | 9 | # output FASTA prefix 10 | FASTAFILE=output/paired 11 | 12 | # reference chromosome 13 | REFERENCE=input/reference.fa 14 | 15 | # Optional parameters 16 | # Read length 17 | READLEN=75 18 | 19 | # Number of reads generated 20 | NREAD=100000 21 | 22 | # The mean and std of spans for paired-end reads. 
23 | PAIREDEND="200,20" 24 | 25 | # positional bias file 26 | POSBIAS=input/sampleposbias.txt 27 | 28 | # Read error position profile 29 | READERR=input/samplereaderror.txt 30 | 31 | # Intermediate files 32 | # File for random expression level assignment 33 | RANDEXPLV=output/explvprofile.txt 34 | 35 | 36 | #----------------------------------------------- 37 | 38 | # Add paths if users don't install the script 39 | export PATH=../src/:$PATH 40 | 41 | # Commands to randomly assign weights to each transcript 42 | 43 | if [ ! -d "output" ]; then 44 | mkdir output 45 | fi 46 | 47 | CMD0=" genexplvprofile.py $BED > $RANDEXPLV" 48 | 49 | echo "Commands to randomly assign weights to each transcript:" 50 | echo $CMD0 51 | 52 | 53 | genexplvprofile.py $BED > $RANDEXPLV 54 | 55 | # Commands to simulate reads (output to STDOUT in BED format) 56 | # If you want single-end reads, don't use the "-p" option. 57 | CMD1="gensimreads.py -e $RANDEXPLV -n $NREAD -b $POSBIAS -l $READLEN -p $PAIREDEND $BED " 58 | 59 | echo "Commands to generate simulated paired-reads in BED format:" 60 | echo $CMD1 61 | 62 | 63 | # Commands to convert BED file to fasta file 64 | CMD2="getseqfrombed.py -b $READERR -f A -r 0.01 -l $READLEN - $REFERENCE" 65 | 66 | echo "Commands to generate FASTA file from the last command:" 67 | echo $CMD2 68 | echo "Output FASTA prefix: $FASTAFILE" 69 | 70 | 71 | 72 | # Execute two commands simultaneously 73 | # If you want single-end reads, do not use the splitfasta.py. Instead, execute: 74 | # $CMD1 | $CMD2 > $FASTAFILE 75 | $CMD1 | $CMD2 | splitfasta.py -o $FASTAFILE 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/src/addvariation2splicingbed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This script is used to add splicing variations from STDIN .BED file. 
4 | 5 | Usage: addvariation2splicingbed.py {OPTIONS} 6 | 7 | OPTIONS 8 | 9 | ATTENTION: 10 | 11 | HISTORY 12 | 01/09/2012 13 | 14 | """ 15 | from __future__ import print_function; 16 | import sys; 17 | import subprocess; 18 | import pydoc; 19 | import os; 20 | import random; 21 | import bisect; 22 | import math; 23 | from getSegs import *; 24 | 25 | import pdb; 26 | 27 | 28 | errorrate=0.2; 29 | onbedfile="-"; 30 | 31 | 32 | for i in range(len(sys.argv)): 33 | if sys.argv[i]=='-h': 34 | print(pydoc.render_doc(sys.modules[__name__])); 35 | sys.exit(); 36 | if i. 19 | 20 | from __future__ import print_function 21 | import subprocess 22 | import csv 23 | from slamdunk.utils.misc import checkStep, getBinary # @UnresolvedImport 24 | 25 | def SNPs(inputBAM, outputSNP, referenceFile, minVarFreq, minCov, minQual, log, printOnly=False, verbose=True, force=False): 26 | if(checkStep([inputBAM, referenceFile], [outputSNP], force)): 27 | fileSNP = open(outputSNP, 'w') 28 | 29 | mpileupCmd = "samtools mpileup -B -A -f " + referenceFile + " " + inputBAM 30 | if(verbose): 31 | print(mpileupCmd, file=log) 32 | if(not printOnly): 33 | mpileup = subprocess.Popen(mpileupCmd, shell=True, stdout=subprocess.PIPE, stderr=log) 34 | 35 | varscanCmd = "varscan mpileup2snp --strand-filter 0 --output-vcf --min-var-freq " + str(minVarFreq) + " --min-coverage " + str(minCov) + " --variants 1" 36 | if(verbose): 37 | print(varscanCmd, file=log) 38 | if(not printOnly): 39 | varscan = subprocess.Popen(varscanCmd, shell=True, stdin=mpileup.stdout, stdout=fileSNP, stderr=log) 40 | varscan.wait() 41 | 42 | fileSNP.close() 43 | else: 44 | print("Skipping SNP calling", file=log) 45 | 46 | def countSNPsInFile(inputFile): 47 | snpCount = 0 48 | tcSnpCount = 0 49 | with open(inputFile, "r") as snpFile: 50 | snpReader = csv.reader(snpFile, delimiter='\t') 51 | for row in snpReader: 52 | if((row[2].upper() == "T" and row[3].upper() == "C") or (row[2].upper() == "A" and row[3].upper() == "G")): 53 | tcSnpCount 
def bedToIntervallTree(bed):
    """Load a BED file into one IntervalTree per chromosome, interval -> name."""
    trees = {}
    for entry in BedIterator(bed):
        # One tree per chromosome, created on first sight.
        tree = trees.setdefault(entry.chromosome, IntervalTree())
        # Intervals are stored with an inclusive end, hence stop + 1.
        tree[entry.start:(entry.stop + 1)] = entry.name
    return trees


class BedEntry:
    """A single BED record: chromosome interval plus name, score and strand."""

    def __init__(self):
        # Defaults describe an empty record; "." is the conventional
        # BED placeholder for score and strand.
        self.chromosome = ""
        self.start = 0
        self.stop = 0
        self.name = ""
        self.score = "."
        self.strand = "."

    def __repr__(self):
        fields = [self.chromosome, str(self.start), str(self.stop), self.name]
        return "\t".join(fields)

    def getLength(self):
        """Return the interval length (stop - start)."""
        return self.stop - self.start

    def hasStrand(self):
        """True if an explicit '+' or '-' strand was set."""
        return self.strand in ("+", "-")

    def hasNonEmptyName(self):
        """True if the record carries a name."""
        return self.name != ""


class BedIterator:
    """Iterate over a BED file, yielding one BedEntry per line."""

    def __init__(self, filename):
        self._bedFile = open(filename, "r")

    def __iter__(self):
        return self

    def _toBED(self, line):
        # Parse one tab-separated BED line; columns beyond the fourth
        # (score, strand) are optional.
        cols = line.rstrip().split("\t")
        entry = BedEntry()
        entry.chromosome = cols[0]
        entry.start = int(cols[1])
        entry.stop = int(cols[2])
        entry.name = cols[3]
        if len(cols) > 4:
            entry.score = cols[4]
        if len(cols) > 5:
            entry.strand = cols[5]
        return entry

    def __next__(self):
        try:
            return self._toBED(next(self._bedFile))
        except StopIteration:
            # Exhausted: release the file handle before propagating.
            self._bedFile.close()
            raise
class SNPDictionary(object):
    """In-memory lookup of T>C and A>G SNP positions parsed from a VCF file.

    Entries are keyed by (chromosome, 1-based position string) tuples.
    Queries via isTCSnp/isAGSnp take 0-based positions and convert them to
    the VCF's 1-based coordinates before looking them up.
    """

    def __init__(self, vcfFile):
        # vcfFile may be None; read() is then a no-op.
        self._vcfFile = vcfFile
        self._tcSNPs = {}
        self._agSNPs = {}

    def _addSNP(self, snp):
        # snp is a VCF record: [chrom, pos, id, ref, alt, ...].
        # Bug fix: keys used to be plain string concatenation (chrom + pos),
        # so e.g. ("chr1", "234") and ("chr12", "34") collided. Tuple keys
        # keep chromosome and position separate.
        key = (snp[0], str(snp[1]))
        if(snp[3].upper() == "T" and snp[4].upper() == "C"):
            self._tcSNPs[key] = True

        if(snp[3].upper() == "A" and snp[4].upper() == "G"):
            self._agSNPs[key] = True

    def read(self):
        """Populate the SNP dictionaries from the VCF file, if one was given."""
        if (self._vcfFile != None):
            if(os.path.exists(self._vcfFile)):
                vcfReader = BedTool(self._vcfFile)

                if(vcfReader.file_type != "vcf"):
                    print("Wrong file type. Empty or not a vcf file.")

                for snp in vcfReader:
                    self._addSNP(snp)
            else:
                print("Warning: SNP file " + self._vcfFile + " not found.")

    def isAGSnp(self, chromosome, position):
        """True if an A>G SNP is recorded at the 0-based position."""
        key = (chromosome, str(int(position) + 1))
        return key in self._agSNPs

    def isTCSnp(self, chromosome, position):
        """True if a T>C SNP is recorded at the 0-based position."""
        key = (chromosome, str(int(position) + 1))
        return key in self._tcSNPs

    def getAGSNPsInUTR(self, chromosome, start, stop, snpType):
        """Count A>G SNPs in [start, stop). snpType is unused (kept for API compatibility)."""
        count = 0
        for i in range(start, stop):
            if(self.isAGSnp(chromosome, i)):
                count += 1
        return count

    def getTCSNPsInUTR(self, chromosome, start, stop, snpType):
        """Count T>C SNPs in [start, stop). snpType is unused (kept for API compatibility)."""
        count = 0
        for i in range(start, stop):
            if(self.isTCSnp(chromosome, i)):
                count += 1
        return count
3 354,127,1007, 0,721,1529, 3 | chr1 11873 14409 uc010nxr.1 0 + 11873 11873 0 3 354,52,1189, 0,772,1347, 4 | chr1 14362 16765 uc009vis.2 0 - 14362 14362 0 4 467,69,147,159, 0,607,1433,2244, 5 | chr1 14362 19759 uc001aae.3 0 - 14362 14362 0 10 467,69,152,159,198,136,137,147,99,847, 0,607,1433,2244,2495,2870,3243,3552,3905,4550, 6 | chr1 14362 19759 uc009vit.2 0 - 14362 14362 0 9 467,69,152,159,198,510,147,99,847, 0,607,1433,2244,2495,2870,3552,3905,4550, 7 | chr1 14362 19759 uc009viu.2 0 - 14362 14362 0 10 467,69,152,159,198,510,147,102,54,847, 0,607,1433,2244,2495,2870,3552,3905,4138,4550, 8 | chr1 14362 24901 uc001aab.3 0 - 14362 14362 0 10 467,69,152,159,202,136,137,147,112,164, 0,607,1433,2244,2491,2870,3243,3552,3905,10375, 9 | chr1 14362 29370 uc001aac.3 0 - 14362 14362 0 11 467,69,152,159,198,110,137,147,102,154,50, 0,607,1433,2244,2495,2896,3243,3552,3905,10375,14958, 10 | chr1 14362 29370 uc001aah.3 0 - 14362 14362 0 11 467,69,152,159,198,136,137,147,99,154,50, 0,607,1433,2244,2495,2870,3243,3552,3905,10375,14958, 11 | chr1 14362 29370 uc009viq.2 0 - 14362 14362 0 7 467,152,159,198,456,154,50, 0,1433,2244,2495,3243,10375,14958, 12 | chr1 14362 29370 uc009vir.2 0 - 14362 14362 0 10 467,69,152,159,198,510,147,99,154,50, 0,607,1433,2244,2495,2870,3552,3905,10375,14958, 13 | chr1 14406 29370 uc009viv.2 0 - 14406 14406 0 7 2359,198,136,137,147,154,50, 0,2451,2826,3199,3508,10331,14914, 14 | chr1 14406 29370 uc009viw.2 0 - 14406 14406 0 7 2359,198,510,147,99,154,50, 0,2451,2826,3508,3861,10331,14914, 15 | chr1 15602 29370 uc009vix.2 0 - 15602 15602 0 7 345,159,198,136,147,154,50, 0,1004,1255,1630,2312,9135,13718, 16 | chr1 15795 18061 uc009vjd.2 0 - 15795 15795 0 5 152,159,198,136,456, 0,811,1062,1437,1810, 17 | chr1 16606 29370 uc009viy.2 0 - 16606 16606 0 9 159,198,136,137,147,95,58,154,50, 0,251,626,999,1308,1661,1890,8131,12714, 18 | chr1 16606 29370 uc009viz.2 0 - 16606 16606 0 8 159,202,136,137,147,112,154,50, 0,247,626,999,1308,1661,8131,12714, 19 | chr1 
16857 17751 uc009vjc.1 0 - 16857 16857 0 2 198,519, 0,375, 20 | chr1 16857 19759 uc001aai.1 0 - 16857 16857 0 6 198,136,137,147,112,847, 0,375,748,1057,1410,2055, 21 | chr1 16857 29370 uc010nxs.1 0 - 16857 16857 0 8 198,136,137,147,99,227,154,50, 0,375,748,1057,1410,2055,7880,12463, 22 | chr1 16857 29961 uc009vjb.1 0 - 16857 16857 0 7 198,136,137,147,112,154,138, 0,375,748,1057,1410,7880,12966, 23 | chr1 17232 29370 uc009vje.2 0 - 17232 17232 0 4 510,147,99,50, 0,682,1035,12088, 24 | chr1 17605 29370 uc009vjf.2 0 - 17605 17605 0 7 137,147,95,58,227,154,50, 0,309,662,891,1307,7132,11715, 25 | chr1 34611 36081 uc001aak.2 0 - 34611 34611 0 3 563,205,361, 0,665,1109, 26 | chr1 69090 70008 uc001aal.1 0 + 69090 70008 0 1 918, 0, 27 | chr1 137838 139228 uc001aam.3 0 - 137838 137838 0 1 1390, 0, 28 | -------------------------------------------------------------------------------- /slamdunk/plot/PCAPlotter.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Plot PCA based on readcounts in UTRs 4 | 5 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 6 | # 7 | # This file is part of Slamdunk. 8 | # 9 | # Slamdunk is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU Affero General Public License as 11 | # published by the Free Software Foundation, either version 3 of the 12 | # License, or (at your option) any later version. 13 | # 14 | # Slamdunk is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU Affero General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU Affero General Public License 20 | # along with this program. If not, see . 
library(getopt)

spec = matrix(c(
  'help' , 'h', 0, "logical","print the usage of the command",
  'fileTab', "f", 2,"character","tsv table of rate files",
  'outputPDF', "O", 2,"character","output pdf file name",
  'outputPCA', "P", 2,"character","output PCA transformations file name"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==1 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Create mismatch plots from rate tabs.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}

if ( is.null(opt$fileTab) ) stop("arg fileTab must be specified")
# Bug fix: the fallback used to assign opt$outputFile (a field nothing reads),
# leaving opt$outputPDF NULL so that pdf(opt$outputPDF) below failed.
if ( is.null(opt$outputPDF) ) { opt$outputPDF = "out.pdf" }

library(ggplot2)

# Two-column sample sheet: sample name, path to its tcount file.
samples = read.table(opt$fileTab,stringsAsFactors=FALSE,col.names = c("sample","file"), comment.char = "")

# With a single sample a PCA is meaningless: emit a stub PCA file and exit.
if (nrow(samples) <= 1) {
  cat('# slamdunk PCA\n', file=opt$outputPCA)
  cat(paste(samples$sample,0,"0\n",sep="\t"),append=TRUE,file=opt$outputPCA)
  #signal success and exit.
  q(status=0)
}

# Collect the TcReadCount column of every sample.
countsList = list()

for (i in 1:nrow(samples)) {
  curTab = read.delim(samples$file[i],stringsAsFactors=FALSE, comment.char="#")

  countsList[[samples$sample[i]]] = curTab$TcReadCount

}

countMatrix = do.call(cbind, countsList)

# PCA on the (up to) 500 highest-variance rows.
variances = apply(countMatrix, 1, var)

sel = order(variances, decreasing=TRUE)[seq_len(min(500, length(variances)))]

pca = prcomp(t(countMatrix[sel,]))

# Proportion of variance explained per component.
PoV = pca$sdev ^ 2 / sum(pca$sdev ^ 2)

plotTab = data.frame(sample = row.names(pca$x), PC1 = pca$x[,1], PC2 = pca$x[,2])

pdf(opt$outputPDF)

ggplot(plotTab, aes(x=PC1, y=PC2, color = sample)) + geom_point(size = 3) +
  xlab(paste("PC1 (", round(PoV[1],digits=2), " % variance)",sep="")) +
  ylab(paste("PC2 (", round(PoV[2],digits=2), " % variance)",sep="")) +
  theme(legend.position="bottom", legend.title=element_blank()) + ggtitle("Slamdunk PCA")

dev.off()

cat('# slamdunk PCA\n', file=opt$outputPCA)
write.table(plotTab,file=opt$outputPCA,append=TRUE,quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE)

#signal success and exit.
q(status=0)
#
# Slamdunk is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

library(getopt)

spec = matrix(c(
  'help' , 'h', 0, "logical","print the usage of the command",
  'utr' , 'u', 0, "logical","utr plotting",
  'inputFile', "i", 2,"character","tsv table of mutations per position",
  'outputFile', "o", 2,"character","output pdf file name"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==1 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Create mismatches per read/UTR position plots.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}

# Axis labels default to per-read mode; --utr switches to per-UTR wording.
positionLabel = "Position on read"
mutationLabel = "% of reads with mutation"

if( !is.null(opt$utr)) {
  positionLabel = "Position at 3' UTR end (200 bp upstream)"
  mutationLabel = "% of UTRs with mutation"
}

if ( is.null(opt$inputFile) ) stop("arg input must be specified")
if ( is.null(opt$outputFile) ) { opt$outputFile = paste(opt$inputFile, ".pdf", sep="") }

# Headerless table of per-position counts. Column semantics are inferred from
# their use below and should be confirmed against the producer:
# V1/V2 presumably all-mutation counts (fwd/rev), V3/V4 T->C resp. A->G
# counts, V5/V6 coverage totals used as denominators.
mut = read.table(opt$inputFile, comment.char = "#")

# Older inputs lack a reverse-coverage column; reuse the forward one.
if (is.null(mut$V6)) {
  mut$V6 = mut$V5
}

#mut = read.table("test_mut_bowtie.csv")

#totalFwd = mut[1,1]
#totalRev = mut[1,2]
#tcFwd = mut[1,3]
#tcRev = mut[1,4]

#mut = mut[-1,]

# Percentages per position; row 1 = forward strand, row 2 = reverse strand.
counts = rbind(c(mut$V1)/c(mut$V5) * 100, c(mut$V2)/c(mut$V6) * 100)
countsTC = rbind(c(mut$V3)/c(mut$V5) * 100, c(mut$V4)/c(mut$V6) * 100)

##################################################################
# Workaround for 0 counts (need to work out what's going on there
# 0/0 divisions above yield NaN; treat those positions as 0%.

counts[is.nan(counts)] = 0
countsTC[is.nan(countsTC)] = 0

##################################################################
pdf(opt$outputFile, width=10, height=10)
par(mfrow=c(2,1))

# Scale to next 10
barplot(counts, beside=T, names.arg=1:nrow(mut), main="All mutations", ylim=c(0,max(10,ceiling(counts / 10) * 10)), xlab=positionLabel, ylab=mutationLabel, legend=c("forward", "reverse"))
#barplot(counts, beside=T, names.arg=1:nrow(mut), main="All mutations", ylim=c(0,10), xlab=positionLabel, ylab=mutationLabel, legend=c("forward", "reverse"))
# Scale to next 1
barplot(countsTC, beside=T, names.arg=1:nrow(mut), main="T->C on fwd, A->G on rev", ylim=c(0,max(1,ceiling(countsTC))), xlab=positionLabel, ylab=mutationLabel, legend=c("forward", "reverse"))
#barplot(countsTC, beside=T, names.arg=1:nrow(mut), main="T->C on fwd, A->G on rev", ylim=c(0,1), xlab=positionLabel, ylab=mutationLabel, legend=c("forward", "reverse"))

dev.off()
def tpos2pos(tpos,cumlen,exonstart):
    """Map a transcript coordinate onto a reference coordinate.

    cumlen holds the cumulative exon lengths and exonstart the reference
    start of each exon. Positions past the transcript end are clamped to
    the transcript's last base.
    """
    exon_idx = bisect.bisect_right(cumlen, tpos)
    # Clamp out-of-range queries to the final transcript position.
    if exon_idx >= len(cumlen):
        exon_idx = len(cumlen) - 1
        tpos = cumlen[-1] - 1
    # Offset of tpos within its exon: subtract the cumulative length of
    # all preceding exons (zero for the first exon).
    offset_in_exon = tpos if exon_idx == 0 else tpos - cumlen[exon_idx - 1]
    return exonstart[exon_idx] + offset_in_exon
def writeBedline(fid,lineid,chromosome,direction,startrange,lenrange):
    """Write one BED12 record to the open file handle fid.

    startrange/lenrange give absolute segment starts and lengths on the
    reference; direction may be +/-1 (translated to a strand character) or
    an already-formatted strand string. Returns None without writing when
    startrange is empty (malformed input).
    """
    # skip if startrange is malformed
    if not startrange:
        return None
    record_start = startrange[0]
    record_end = startrange[-1] + lenrange[-1]
    # BED block starts are relative to the record start.
    relative_starts = [s - record_start for s in startrange]
    # Translate numeric directions into strand characters.
    if direction == 1:
        strand = '+'
    elif direction == -1:
        strand = '-'
    else:
        strand = direction
    # Comma-separated lists with BED's conventional trailing comma.
    block_sizes = ''.join(str(n) + ',' for n in lenrange)
    block_starts = ''.join(str(n) + ',' for n in relative_starts)
    fields = [chromosome,                               # 0th, chromosome
              str(record_start), str(record_end),       # 1-2th, start and end
              lineid,                                   # 3th, id
              '0', strand,                              # 4-5th, score and strand
              str(record_start), str(record_end),       # 6-7th, same as 1-2
              '0', str(len(relative_starts)),           # 8th, 0; 9th, block count
              block_sizes,                              # 10th, block lengths
              block_starts,                             # 11th, block starts
              '']                                       # keep trailing tab before newline
    fid.write('\t'.join(fields) + '\n')
def Dedup(inputBAM, outputBAM, tcMutations, log, printOnly=False, verbose = True, force=False):
    """Remove duplicate reads from a BAM file.

    Reads are buffered per reference position; within one position, reads with
    an identical sequence and CIGAR string are collapsed to the first
    occurrence. Only reads carrying at least `tcMutations` conversions
    (TC tag; a missing tag counts as 0) are considered — reads below the
    threshold are dropped entirely and excluded from the counts.
    The flush-on-position-change scheme assumes a coordinate-sorted input;
    the result is written to outputBAM and indexed.

    inputBAM    -- path to the input BAM (read with pysam)
    tcMutations -- minimum TC-tag value a read needs to be kept
    log         -- open file handle for the retention summary
    printOnly/verbose/force -- standard step-control flags (verbose is unused here)

    NOTE(review): if no read passes the threshold, processedReads stays 0 and
    the summary ratio below raises ZeroDivisionError — confirm inputs are
    never empty or fully filtered.
    """

    if(printOnly or checkStep([inputBAM], [outputBAM], force)):

        samfile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=samfile)

        processedReads = 0
        retainedReads = 0

        # Position of the previously processed read; "" marks "none seen yet".
        prevChr = ""
        prevStart = ""

        # seq -> cigar -> [reads]: everything buffered at the current position.
        duplicateBuffer = {}

        for read in samfile:

            flag = read.cigarstring
            chr = read.reference_id
            start = read.reference_start
            seq = read.query_sequence
            if (read.has_tag("TC")) :
                tcflag = read.get_tag("TC")
            else :
                # No TC tag present: treat as zero conversions.
                tcflag = 0

            if (tcflag >= tcMutations) :

                # New reference position: flush the buffer of the previous
                # position before collecting reads for this one.
                if (chr != prevChr or start != prevStart) :

                    if (prevChr != "") :
                        for curSeq in duplicateBuffer :
                            for curFlag in duplicateBuffer[curSeq]:
                                for readEntry in duplicateBuffer[curSeq][curFlag]:
                                    if not readEntry.is_duplicate:
                                        retainedReads += 1
                                        outfile.write(readEntry)
                        duplicateBuffer.clear()

                # Any read after the first with the same (seq, cigar) at this
                # position is flagged as a duplicate and skipped on flush.
                if not seq in duplicateBuffer:
                    duplicateBuffer[seq] = {}
                if not flag in duplicateBuffer[seq]:
                    duplicateBuffer[seq][flag] = list()
                if len(duplicateBuffer[seq][flag]) > 0 :
                    read.is_duplicate = True
                duplicateBuffer[seq][flag].append(read)

                prevChr = chr
                prevStart = start

                processedReads += 1

        # Flush whatever is buffered for the final position.
        for seq in duplicateBuffer:
            for flag in duplicateBuffer[seq] :
                for readEntry in duplicateBuffer[seq][flag]:
                    if not readEntry.is_duplicate:
                        retainedReads += 1
                        outfile.write(readEntry)
        duplicateBuffer.clear()

        outfile.close()

        print("Retained " + str(retainedReads) + " of " + str(processedReads) + " reads (", file=log, end = "")
        print("{0:.2f}".format(float(retainedReads) / float(processedReads)),file=log,end="")
        print(" compression rate)", file=log)

        pysamIndex(outputBAM)

    else:
        print("Skipped deduplication for " + inputBAM, file=log)
library(getopt)

spec = matrix(c(
  'help' , 'h', 0, "logical","print the usage of the command",
  'rateTab', "f", 2,"character","tsv table of rate files",
  'outputFile', "O", 2,"character","output pdf file name"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==1 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Create mismatch plots from rate tabs.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1)
}


if ( is.null(opt$rateTab) ) stop("arg rateTab must be specified")
if ( is.null(opt$outputFile) ) { opt$outputFile = "out.pdf" }

library(ggplot2)
library(gridExtra)

# Two-column sample sheet: sample name, path to its context-rate file.
rates = read.table(opt$rateTab,stringsAsFactors=FALSE,col.names = c("sample","file"),comment.char = "")

pdf(opt$outputFile)
plotList = list()

for (i in 1:nrow(rates)) {
  curTab = read.table(rates$file[i],stringsAsFactors=FALSE,header=TRUE)

  # Rows 1-2 hold the 5' context counts (+/- strand); row 3 carries the
  # 3' context labels and rows 4-5 the corresponding counts — presumably a
  # fixed layout of the producer; confirm if the file format changes.
  subFront = curTab[1:2,]
  subBack = curTab[4:5,]
  names(subBack) = curTab[3,]

  #subFront = read.table(rates$file[i],stringsAsFactors=FALSE,header=TRUE, nrow=1)
  #subBack = read.table(rates$file[i],stringsAsFactors=FALSE,header=TRUE, nrow=1,skip=2)

  # Long format: one row per (context, strand) pair.
  printTabFront = data.frame(contexts=rep(names(subFront),each=2),strand = factor(rep(c("+","-"),ncol(subFront)),levels=c("+","-")),
                             rate_percent = as.numeric(unlist(subFront)))
  printTabBack = data.frame(contexts=rep(names(subBack),each=2),strand = factor(rep(c("+","-"),ncol(subBack)),levels=c("+","-")),
                            rate_percent = as.numeric(unlist(subBack)))

  # Normalize counts to fractions of the total per panel.
  printTabFront$rate_percent = printTabFront$rate_percent / sum(printTabFront$rate_percent)
  printTabBack$rate_percent = printTabBack$rate_percent / sum(printTabBack$rate_percent)

  # Ignore N contexts for now
  printTabFront = printTabFront[-grep("NT",printTabFront$contexts),]
  printTabBack = printTabBack[-grep("TN",printTabBack$contexts),]

  # One stacked bar plot per context side, labelled with the fraction.
  curPlot = qplot(x=contexts, y=rate_percent, fill=strand,data=printTabFront) + geom_bar(stat="identity") + geom_text(aes(label = round(rate_percent,digits=2)), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") + ylab("TC context percent %") + xlab(rates$sample[i]) +
    theme(text = element_text(size=6),axis.text.x = element_text(size=6), plot.title = element_text(size=10))
  plotList[[length(plotList)+1]] <- curPlot + ylim(0.0,1.0) + ggtitle("5' T->C context")
  curPlot = qplot(x=contexts, y=rate_percent, fill=strand,data=printTabBack) + geom_bar(stat="identity") + geom_text(aes(label = round(rate_percent,digits=2)), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") + ylab("TC context percent %") + xlab(rates$sample[i]) +
    theme(text = element_text(size=6),axis.text.x = element_text(size=6),plot.title = element_text(size=10))
  plotList[[length(plotList)+1]] <- curPlot + ylim(0.0,1.0) + ggtitle("3' T->C context")
}

do.call(grid.arrange, plotList)

dev.off()

#signal success and exit.
q(status=0)
#
# Slamdunk is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Helper

# Lower panel for pairs(): print the absolute Pearson correlation of the
# finite entries, scaled so stronger correlations print larger.
my_panel_cor <- function(x, y, digits=2, prefix="", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))

  # NOTE(review): (x|y>0) coerces x to logical and only compares y against 0 —
  # possibly meant (x>0 | y>0); kept as-is to preserve existing behavior.
  toUse = which(is.finite(x) & is.finite(y) & (x|y>0))
  r <- abs(cor(x[toUse], y[toUse]))


  txt <- format(c(r, 0.123456789), digits=digits)[1]
  txt <- paste(prefix, txt, sep="")
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor * r)
}

# Upper panel for pairs(): density scatter plus the identity line.
my_panel_smooth <- function(x, y,lcol="red")

{
  smoothScatter(x,y,add=T)
  abline(0,1,col=lcol)
}

library(getopt)

spec = matrix(c(
  'help' , 'h', 0, "logical","print the usage of the command",
  'sampleTab', "i", 2,"character","csv table of sample counts",
  'outputPrefix', "o", 2,"character","output file name prefix"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==1 ) {
  #get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Compute sample comparison statistics from sample counts.\n\n")
  #print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}


if ( is.null(opt$sampleTab) ) stop("arg sampleTab must be specified")
if ( is.null(opt$outputPrefix) ) { opt$outputPrefix = "sampleCorrelation" }

rates = read.table(opt$sampleTab,header=TRUE,sep=";", comment.char = "")

# Columns 5..n hold per-sample counts; fewer than two samples means
# fewer than 6 columns and nothing to compare.
if (ncol(rates) < 6) {
  print("No need for calculating pairwise statistics for single sample")
  quit(status=0)
}

# Bug fix: these calls referenced an undefined variable `libLoc`
# (library(..., lib.loc = libLoc)), which aborted the script whenever more
# than one sample was present. Load from the default library paths instead.
library(RColorBrewer)
library(lattice)
library(matrixStats)

values = data.matrix(rates[,c(5:ncol(rates))])

##################################################
# PCA
##################################################

# Use the (up to) 500 highest-variance rows for the PCA.
rowVariances = rowVars(data.matrix(values))

select = order(rowVariances, decreasing = TRUE)[seq_len(min(500, length(rowVariances)))]

pca = prcomp(t(values[select, ]))

# Pick a palette sized to the number of samples; brewer.pal needs n >= 3
# and supports at most 12 colors, hence the special cases.
if (ncol(values) == 2) {
  col = brewer.pal(3, "Paired")[1:2]
} else if (ncol(values) > 12) {
  getPalette = colorRampPalette(brewer.pal(9, "Set1"))
  col = getPalette(ncol(values))
} else {
  col = brewer.pal(ncol(values), "Paired")
}

# Get amount of explained variance (see summary(pca))
varianceProportion = pca$sdev ^ 2 / sum(pca$sdev ^ 2)

pdf(paste(opt$outputPrefix,"_PCA.pdf",sep=""))

# With many samples the color key would not fit; omit it.
if (ncol(values) > 12) {

  xyplot(PC2 ~ PC1, groups = colnames(values), data = as.data.frame(pca$x),
         pch = 20, cex = 2, aspect = "iso", col = col, xlab = paste("PC1 (", round(varianceProportion[1],digits=2), " variance)",sep=""),
         ylab = paste("PC2 (", round(varianceProportion[2],digits=2), " variance)",sep=""),
  )

} else {

  xyplot(PC2 ~ PC1, groups = colnames(values), data = as.data.frame(pca$x),
         pch = 20, cex = 2, aspect = "iso", col = col, main = draw.key(key = list(rect = list(col = col),
         text = list(colnames(values)), rep = FALSE)), xlab = paste("PC1 (", round(varianceProportion[1],digits=2), " variance)",sep=""),
         ylab = paste("PC2 (", round(varianceProportion[2],digits=2), " variance)",sep=""),
  )

}

dev.off()

##################################################
# Pairwise correlations
##################################################

# A pairs() matrix is only readable for a modest number of samples.
if (ncol(values) <= 12) {

  pdf(paste(opt$outputPrefix,"_pairwiseCorrelation.pdf",sep=""))

  pairs(values,upper.panel=my_panel_smooth,lower.panel=my_panel_cor)

  dev.off()
}

#signal success and exit.
q(status=0)
library(getopt)
library(ggplot2)
library(gridExtra)

# Command line interface:
#   -f/--rateTab     overall mutation-rate table produced by slamdunk (one sample)
#   -n/--name        sample name shown on the x-axis (default "Sample 1")
#   -O/--outputFile  output pdf file name (default "out.pdf")
spec = matrix(c(
  'help',       'h', 0, "logical",   "print the usage of the command",
  'rateTab',    "f", 2, "character", "tsv table of rate files",
  'name',       "n", 2, "character", "Sample name",
  'outputFile', "O", 2, "character", "output pdf file name"
), ncol = 5, byrow = T)

opt = getopt(spec)

if (!is.null(opt$help) || length(opt) < 1) {
  # get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file", cmd)], "=")[[1]][2]
  cat(basename(self), ": Create mismatch plots from rate tabs.\n\n")
  # print a friendly message and exit with a non-zero error code
  cat(getopt(spec, command = self, usage = T))
  q(status = 1)
}

if (is.null(opt$rateTab)) stop("arg rateTab must be specified")
# FIX: the original assigned the fallback to opt$outputFile, silently
# clobbering a user-supplied -O whenever -n was omitted; the "Sample 1"
# default belongs to opt$name.
if (is.null(opt$name)) { opt$name = "Sample 1" }
if (is.null(opt$outputFile)) { opt$outputFile = "out.pdf" }

fileName = opt$rateTab
sampleName = opt$name

pdf(opt$outputFile)

plotList = list()

curTab = read.table(fileName, stringsAsFactors = FALSE)

# Upper-case columns hold plus-strand counts, lower-case columns minus-strand
# counts; normalise each reference-base row to percentages per strand.
curTab[, c("A", "C", "G", "T")] <- curTab[, c("A", "C", "G", "T")] / rowSums(curTab[, c("A", "C", "G", "T")]) * 100
curTab[, c("a", "c", "g", "t")] <- curTab[, c("a", "c", "g", "t")] / rowSums(curTab[, c("a", "c", "g", "t")]) * 100

# One row per (conversion, strand) pair; the order of rate_percent must match
# the rates/strand columns built here.
printTab = data.frame(rates = c(rep("AT", 2), rep("AC", 2), rep("AG", 2),
                                rep("TA", 2), rep("TC", 2), rep("TG", 2),
                                rep("CA", 2), rep("CT", 2), rep("CG", 2),
                                rep("GA", 2), rep("GT", 2), rep("GC", 2)),
                      strand = rep(c("+", "-"), 12),
                      rate_percent = c(curTab["A", "T"], curTab["A", "t"], curTab["A", "C"], curTab["A", "c"], curTab["A", "G"], curTab["A", "g"],
                                       curTab["T", "A"], curTab["T", "a"], curTab["T", "C"], curTab["T", "c"], curTab["T", "G"], curTab["T", "g"],
                                       curTab["C", "A"], curTab["C", "a"], curTab["C", "T"], curTab["C", "t"], curTab["C", "G"], curTab["C", "g"],
                                       curTab["G", "A"], curTab["G", "a"], curTab["G", "T"], curTab["G", "t"], curTab["G", "C"], curTab["G", "c"])
)

# Leave at least 10% head room above the tallest bar so value labels fit.
maxRatePercent = max(10, max(printTab$rate_percent) * 1.1)

# Label positions: plus-strand labels sit just below the baseline,
# minus-strand labels sit on top of the stacked bar.
printTab$y = -0.3
printTab[printTab$strand == "-", ]$y = printTab[printTab$strand == "-", ]$rate_percent + printTab[printTab$strand == "+", ]$rate_percent

curPlot = qplot(x = rates, y = rate_percent, fill = strand, data = printTab) + ylim(-0.5, maxRatePercent) + geom_bar(stat = "identity") + geom_text(aes(y = printTab$y, label = round(rate_percent, digits = 2)), size = 3, hjust = 0.5, vjust = -0.50) + ylab("Rate percent %") + xlab(sampleName) +
  theme(text = element_text(size = 12), axis.text.x = element_text(size = 12))
plotList[[length(plotList) + 1]] <- curPlot

do.call(grid.arrange, plotList)

dev.off()

# signal success and exit.
q(status = 0)
-------------------------------------------------------------------------------- /slamdunk/dunks/mapper.py: --------------------------------------------------------------------------------
#!/usr/bin/env python

# Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
#
# This file is part of Slamdunk.
#
# Slamdunk is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Slamdunk is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
19 | 20 | from __future__ import print_function 21 | import os, re 22 | 23 | from slamdunk.utils.misc import files_exist, checkStep, run, pysamIndex, removeFile, getBinary, replaceExtension, shellerr # @UnresolvedImport 24 | from slamdunk.version import __ngm_version__ # @UnresolvedImport 25 | 26 | projectPath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 27 | 28 | def sort(inputSAM, outputBAM, log, threads=1, keepSam=True, dry=False, verbose=True): 29 | 30 | if(files_exist(inputSAM) and checkStep([inputSAM], [outputBAM + ".bai"])): 31 | runSam2bam(inputSAM, outputBAM, log, False, False, not keepSam, threads=threads, dry=dry, verbose=verbose) 32 | else: 33 | print("Skipped sorting for " + inputSAM, file=log) 34 | 35 | def checkNextGenMapVersion(): 36 | ngmHelp = shellerr("ngm", raiseError = False) 37 | matchObj = re.match( r'.*([0-9]+\.[0-9]+\.[0-9]+).*', str(ngmHelp), re.M|re.I) 38 | if matchObj: 39 | version = matchObj.group(1) 40 | if version != __ngm_version__: 41 | raise RuntimeError('NextGenMap version expected: ' + __ngm_version__ + " but found " + version + ". Please reinstall slamdunk package.") 42 | else: 43 | raise RuntimeError('Could not get NextGenMap version. 
Please reinstall slamdunk package.') 44 | 45 | def runSam2bam(inFile, outFile, log, index=True, sort=True, delinFile=False, onlyUnique=False, onlyProperPaired=False, filterMQ=0, L=None, threads=1, verbose=False, dry=False): 46 | if(delinFile and files_exist(outFile) and not files_exist(inFile)): 47 | print("Skipping sam2bam for " + outFile, file=log) 48 | else: 49 | if(onlyUnique and filterMQ == 0): 50 | filterMQ = 1; 51 | 52 | success = True 53 | cmd = ["samtools view", "-@", str(threads), "-Sb", "-o", outFile, inFile] 54 | if filterMQ > 0: 55 | cmd+=["-q", str(filterMQ)] 56 | if onlyProperPaired: 57 | cmd+=["-f", "2"] 58 | if not L is None: 59 | cmd+=["-L", L] 60 | run(" ".join(cmd), log, verbose=verbose, dry=dry) 61 | 62 | if(sort): 63 | tmp = outFile + "_tmp" 64 | if(not dry): 65 | os.rename(outFile, tmp) 66 | run(" ".join(["samtools sort", "-@", str(threads), "-o", outFile, tmp]), log, verbose=verbose, dry=dry) 67 | if(success): 68 | removeFile(tmp) 69 | if(success and delinFile): 70 | if(not dry): 71 | removeFile(inFile) 72 | 73 | if(index): 74 | pysamIndex(outFile) 75 | 76 | 77 | def Map(inputBAM, inputReference, outputSAM, log, quantseqMapping, endtoendMapping, threads=1, parameter="--no-progress --slam-seq 2" , outputSuffix="_ngm_slamdunk", trim5p=0, maxPolyA=-1, topn=1, sampleId=None, sampleName="NA", sampleType="NA", sampleTime=0, printOnly=False, verbose=True, force=False): 78 | 79 | if(quantseqMapping is True) : 80 | parameter = "--no-progress" 81 | 82 | if(trim5p > 0): 83 | parameter = parameter + " -5 " + str(trim5p) 84 | 85 | if(maxPolyA > -1): 86 | parameter = parameter + " --max-polya " + str(maxPolyA) 87 | 88 | if(endtoendMapping is True): 89 | parameter = parameter + " -e " 90 | else: 91 | parameter = parameter + " -l " 92 | 93 | if(sampleId != None): 94 | parameter = parameter + " --rg-id " + str(sampleId) 95 | if(sampleName != ""): 96 | parameter = parameter + " --rg-sm " + sampleName + ":" + sampleType + ":" + str(sampleTime) 97 | 98 | 
if(topn > 1): 99 | parameter = parameter + " -n " + str(topn) + " --strata " 100 | 101 | if(checkStep([inputReference, inputBAM], [replaceExtension(outputSAM, ".bam")], force)): 102 | if outputSAM.endswith(".sam"): 103 | # Output SAM 104 | run("ngm -r " + inputReference + " -q " + inputBAM + " -t " + str(threads) + " " + parameter + " -o " + outputSAM, log, verbose=verbose, dry=printOnly) 105 | else: 106 | # Output BAM directly 107 | run("ngm -b -r " + inputReference + " -q " + inputBAM + " -t " + str(threads) + " " + parameter + " -o " + outputSAM, log, verbose=verbose, dry=printOnly) 108 | else: 109 | print("Skipped mapping for " + inputBAM, file=log) 110 | -------------------------------------------------------------------------------- /slamdunk/plot/compute_conversion_rate_mle.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 
library(getopt)
library(bbmle)

spec = matrix(c(
  'help',   'h', 0, "logical",   "print the usage of the command",
  'file',   "f", 2, "character", "",
  'rate',   "r", 2, "character", "",
  'output', "o", 2, "character", "Output tsv"
), ncol = 5, byrow = T)

opt = getopt(spec)

# FIX: the help guard compared length(opt) == 3, which printed the usage for
# perfectly valid invocations; trigger help only on -h or an empty option list
# (same convention as the other slamdunk plot scripts).
if (!is.null(opt$help) || length(opt) < 1) {
  # get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file", cmd)], "=")[[1]][2]
  # FIX: the usage line was copy-pasted from the rate-plot script.
  cat(basename(self), ": Estimate per-UTR conversion rates by maximum likelihood.\n\n")
  # print a friendly message and exit with a non-zero error code
  cat(getopt(spec, command = self, usage = T))
  q(status = 1)
}

if (is.null(opt$file)) stop("arg file must be specified")
if (is.null(opt$output)) stop("arg output must be specified")
if (is.null(opt$rate)) stop("arg rate must be specified")

# Two-component likelihood of the read data:
#   a: fraction of converted (labeled) transcripts
#   b: per-position T>C conversion rate
# Each read covers n T positions of which k show a conversion.
LL <- function(a, b) {
  R = a * ((1 - b)^(sample$n - sample$k)) * (b^sample$k) * choose(sample$n, sample$k) + (1 - a) * as.numeric(sample$k == 0)
  -sum(log(R))
}

# Profile likelihood over a with b fixed to the global estimate 'estb'.
LL2 <- function(as) {
  b = estb
  rs = c()
  for (a in as) {
    R = a * ((1 - b)^(sample$n - sample$k)) * (b^sample$k) * choose(sample$n, sample$k) + (1 - a) * as.numeric(sample$k == 0)
    rs = c(rs, -sum(log(R)))
  }
  rs
}

# Parse the two slamdunk header lines (sample info + annotation info).
# (Renamed from the original typo 'readMeatInfo'; internal to this script.)
readMetaInfo <- function(fileName) {
  sampleInfo = read.table(fileName, nrows = 1, comment.char = "")
  version = paste(lapply(sampleInfo[1, 1:3], as.character), collapse = '\t')
  sampleID = as.character(sampleInfo[1, ]$V7)
  sampleName = as.character(sampleInfo[1, ]$V6)
  sampleType = as.character(sampleInfo[1, ]$V8)
  sampleTime = as.numeric(sampleInfo[1, ]$V9)
  sampleInfo = read.table(fileName, nrows = 1, skip = 1, comment.char = "")
  annotationMD5 = as.character(sampleInfo[1, ]$V3)
  annotationName = as.character(sampleInfo[1, ]$V2)
  c(sampleID, sampleName, sampleType, sampleTime, annotationName, annotationMD5, version)
}

file = opt$file
output = opt$output

# Fixed per-position conversion rate b used by LL2.
estb = as.numeric(opt$rate)

meta = readMetaInfo(file)
id = meta[1]
type = meta[3]
time = meta[4]

data = read.table(file, header = T, stringsAsFactors = F)

for (i in 1:nrow(data)) {
  # ReadCount/TcReadCount hold comma-separated per-read T counts and T>C counts.
  N = as.numeric(strsplit(data[i, ]$ReadCount, ",")[[1]])
  k = as.numeric(strsplit(data[i, ]$TcReadCount, ",")[[1]])
  sample = data.frame(n = N, k = k)
  fit = mle2(minuslogl = LL2, start = list(as = 0.29), method = "L-BFGS-B", lower = c(as = 0.000001), upper = c(as = 0.99))
  confinv = confint(fit)

  data[i, ]$ConversionRate = fit@coef[[1]]
  data[i, ]$ReadCount = length(N)
  data[i, ]$TcReadCount = sum(k > 0)
  data[i, ]$ConversionRateLower = confinv[[1]]
  data[i, ]$ConversionRateUpper = confinv[[2]]
}

# Missing confidence bounds default to the widest possible interval [0, 1].
# (FIX: parenthesis placement 'sum(is.na(x) > 0)' -> 'sum(is.na(x)) > 0';
# numerically equivalent, but the original read as a comparison of a sum.)
if (sum(is.na(data$ConversionRateLower)) > 0) {
  data[is.na(data$ConversionRateLower), ]$ConversionRateLower = 0
}
if (sum(is.na(data$ConversionRateUpper)) > 0) {
  data[is.na(data$ConversionRateUpper), ]$ConversionRateUpper = 1
}

# Copy the two slamdunk header lines, then append the updated table.
header = readLines(file, 2)
con <- file(output, open = "wt")
writeLines(header[1], con)
writeLines(header[2], con)
write.table(data, con, sep = "\t", quote = F, row.names = F, col.names = T)
close(con)
-------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/src/genexplvprofile.py: --------------------------------------------------------------------------------
#!/usr/bin/env python
'''
This file randomly assigns weights for each transcript, and gets the transcript statistics by a given transcript annotation file (BED File).

USAGE

genexplvprofile.py {OPTIONS}

OPTIONS

-h/--help\tPrint this message

-e/--lognormal\tmu,sigma Specify the mean and variance of the lognormal distribution used to assign expression levels. Default -4,4
--geometric\tmu Use geometric distribution with parameter mu instead of lognormal distribution to assign expression levels.

-f/--statonly\tPrint the statistics only; do not assign expression levels.

NOTE

1.
To get a good group information, the BED file is suggested to sort according to the chromosome name and start position. In Unix systems, use "sort -k 1,1 -k 2,2n in.BED > out.BED" to get a sorted version (out.BED) of the bed file (in.BED). 21 | 22 | 2. The weight is at the 8th column, if -f option is not specified. The expression level of each transcript (RPKM) can be calculated as column[8]*10^9/column[2]/sum(column[8]). 23 | 24 | HISTORY 25 | 26 | 07/24/2012 27 | Enable geometric distribution for expression level assignments. Require numpy package. 28 | 29 | 02/16/2012 30 | Run on Python 2.7 31 | 32 | 02/08/2012 33 | Initialization. 34 | ''' 35 | 36 | from __future__ import print_function 37 | import sys; 38 | import pydoc; 39 | import os; 40 | import re; 41 | import fileinput; 42 | import random; 43 | import numpy; 44 | 45 | def parsebed(lines): 46 | # Parse one line in count data 47 | fd=lines.strip().split('\t'); 48 | if len(fd)!=12: 49 | return ['',-1,-1,0]; 50 | if fd[10].endswith(','): 51 | fd[10]=fd[10][:-1]; 52 | if fd[11].endswith(','): 53 | fd[11]=fd[11][:-1]; 54 | seglen=[int(x) for x in fd[10].split(',')]; 55 | segstart=[int(x) for x in fd[11].split(',')]; 56 | #jstart=int(fd[1])+seglen[0]+1; 57 | #jend=int(fd[1])+segstart[1]+1; 58 | jstart=int(fd[1])+1; # start is 0-base; increase 1 to convert to 1-base 59 | jend=int(fd[2]); 60 | # jscore=int(fd[4]); 61 | #seg1=[jstart+segstart[i] for i in range(len(segstart))]; 62 | #seg2=[jstart+segstart[i]+seglen[i]-1 for i in range(len(segstart))]; 63 | # [seg1,seg2] are now 1-base inclusive 64 | return [fd[0],jstart,jend,fd[3],sum(seglen),fd[5],fd[9]]; 65 | 66 | argvi=1; 67 | mindist=50; 68 | minscore=2; 69 | mu=-4; 70 | sigma=4; 71 | assignexplv=True; 72 | 73 | 74 | allfile=[]; 75 | 76 | distype="lognormal"; 77 | 78 | while argvi <(len(sys.argv)): 79 | if sys.argv[argvi]=="-h" or sys.argv[argvi]=="--help" : 80 | print(pydoc.render_doc(sys.modules[__name__]),file=sys.stderr); 81 | sys.exit(); 82 | elif 
sys.argv[argvi]=="-f" or sys.argv[argvi]=="--statonly": 83 | assignexplv=False; 84 | elif sys.argv[argvi]=="-e" or sys.argv[argvi]=="--lognormal" : 85 | distype="lognormal"; 86 | ms=sys.argv[argvi+1].split(","); 87 | argvi=argvi+1; 88 | if len(ms)!=2: 89 | print('Error: incorrect parameter for -e.',file=sys.stderr); 90 | sys.exit(); 91 | try: 92 | mu=float(ms[0]); 93 | sigma=float(ms[1]); 94 | except ValueError: 95 | print('Error: incorrect parameter for -e.',file=sys.stderr); 96 | sys.exit(); 97 | print('Mean and variance for lognormal distribution: '+str(mu)+','+str(sigma),file=sys.stderr); 98 | elif sys.argv[argvi]=="--geometric": 99 | distype="geometric"; 100 | try: 101 | mu=float(sys.argv[argvi+1]); 102 | if mu<0 or mu>1: 103 | print('Error: the parameter for geometric distribution must be between 0 and 1.',file=sys.stderr); 104 | sys.exit(); 105 | except ValueError: 106 | print('Error: incorrect parameter for -e.',file=sys.stderr); 107 | sys.exit(); 108 | print('Mean for geometric distribution: '+str(mu),file=sys.stderr); 109 | argvi=argvi+1; 110 | else: 111 | allfile.append(sys.argv[argvi]); 112 | argvi=argvi+1; 113 | 114 | 115 | allid={}; 116 | 117 | prevchr=""; 118 | prevrange=[0,0]; 119 | rangeid=0; 120 | 121 | nline=0; 122 | 123 | currentgene=[]; 124 | groupid=0; 125 | 126 | print('#ID\tLength\tDir\tExons\tPosition\tGroupID\tNIsoformInGroup',end=''); 127 | if assignexplv==True: 128 | print('\tExplv'); 129 | else: 130 | print(); 131 | 132 | for lines in fileinput.input(allfile): 133 | nline=nline+1; 134 | pf=parsebed(lines); 135 | chrname=pf[0];jstart=pf[1];jend=pf[2];id=pf[3]; 136 | if len(chrname)==0 and jstart<0: 137 | continue; 138 | length=pf[4];direction=pf[5];nexon=pf[6]; 139 | if chrname!=prevchr or jstart-prevrange[1]>0: 140 | if len(prevchr)!=0: 141 | groupid=groupid+1; 142 | for item in currentgene: 143 | print(item[0]+"\t"+str(groupid)+"\t"+str(len(currentgene)),end=''); 144 | if assignexplv==True: 145 | if distype=="geometric": 146 | 
weight=numpy.random.geometric(mu)*item[1]; 147 | else: 148 | weight=random.lognormvariate(mu,sigma)*item[1]; 149 | print("\t"+str(weight)); 150 | else: 151 | print(); 152 | prevrange[0]=jstart; 153 | prevrange[1]=jend; 154 | prevchr=chrname; 155 | rangeid=rangeid+1; 156 | currentgene=[]; 157 | elif jstartprevrange[1]: 161 | prevrange[1]=jend; 162 | currentgene.append((id+"\t"+str(length)+"\t"+direction+"\t"+str(nexon)+"\t"+chrname+":"+str(jstart)+"-"+str(jend),length)); 163 | 164 | 165 | if len(prevchr)!=0: 166 | groupid=groupid+1; 167 | for item in currentgene: 168 | print(item[0]+"\t"+str(groupid)+"\t"+str(len(currentgene)),end=''); 169 | if assignexplv==True: 170 | if distype=="geometric": 171 | weight=numpy.random.geometric(mu)*item[1]; 172 | else: 173 | weight=random.lognormvariate(mu,sigma)*item[1]; 174 | print("\t"+str(weight)); 175 | else: 176 | print(); 177 | 178 | -------------------------------------------------------------------------------- /slamdunk/plot/splash_eval_count_files.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Script to evaluate Slamdunk count results 3 | # 4 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 5 | # 6 | # This file is part of Slamdunk. 7 | # 8 | # Slamdunk is free software: you can redistribute it and/or modify 9 | # it under the terms of the GNU Affero General Public License as 10 | # published by the Free Software Foundation, either version 3 of the 11 | # License, or (at your option) any later version. 12 | # 13 | # Slamdunk is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU Affero General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU Affero General Public License 19 | # along with this program. If not, see . 
library(getopt)

spec = matrix(c(
  'help',      'h', 0, "logical",   "print the usage of the command",
  'simulated', "s", 2, "character", "Summarized count file",
  'slamdunk',  "d", 2, "character", "Summarized count file",
  'output',    "o", 2, "character", "Output pdf"
), ncol = 5, byrow = T)

opt = getopt(spec)

# FIX: help was triggered by length(opt) == 1, i.e. whenever exactly one
# option was given; print it only on -h or an empty option list.
if (!is.null(opt$help) || length(opt) < 1) {
  # get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file", cmd)], "=")[[1]][2]
  # FIX: the usage line was copy-pasted from the rate-plot script.
  cat(basename(self), ": Evaluate slamdunk count results against simulated truth.\n\n")
  # print a friendly message and exit with a non-zero error code
  cat(getopt(spec, command = self, usage = T))
  q(status = 1)
}


if (is.null(opt$simulated)) stop("arg simulated must be specified")
if (is.null(opt$slamdunk)) stop("arg slamdunk must be specified")
if (is.null(opt$output)) stop("arg output must be specified")


# Root mean squared error (renamed from the original typo 'rsme';
# internal to this script).
rmse <- function(model, measure) {
  sqrt(mean((model - measure)^2, na.rm = TRUE))
}

simulatedFileRates = opt$simulated
slamDunkFile = opt$slamdunk

outputFile = opt$output
outputFileCSV = paste0(outputFile, ".tsv")

simulatedRates = read.table(simulatedFileRates, header = T, sep = "\t", stringsAsFactors = F)
slamDunkRates = read.table(slamDunkFile, header = T, sep = "\t", stringsAsFactors = F)

# Should not be neccessary, but for large datasets some entries are lost.
# Keep all that is found in both
inBoth = intersect(simulatedRates$Name, slamDunkRates$Name)
simulatedRates = simulatedRates[simulatedRates$Name %in% inBoth, ]
slamDunkRates = slamDunkRates[slamDunkRates$Name %in% inBoth, ]

# Columns 1..(fixedColumns-1) are annotation/meta columns; per-sample rates
# start at column 'fixedColumns'.
fixedColumns = 11
sampleNumber = ncol(simulatedRates) - fixedColumns + 1

sampleNames = colnames(simulatedRates)[fixedColumns:(fixedColumns + sampleNumber - 1)]
simulatedSamples = simulatedRates[, fixedColumns:(fixedColumns + sampleNumber - 1)]
slamDunkSamples = slamDunkRates[, fixedColumns:(fixedColumns + sampleNumber - 1)]

pdf(outputFile)
par(mfrow = c(2, 1))
boxplot(simulatedSamples - slamDunkSamples, ylim = c(-1, 1), names = sampleNames, ylab = "Simulated - Slamdunk", xlab = "Labeled Transcripts [%]", main = "", las = 2)
abline(h = 0, lty = 2, col = "grey")

boxplot(log2((simulatedSamples + 0.001) / (slamDunkSamples + 0.001)), names = sampleNames, ylab = "log2(Simulated / Slamdunk)", xlab = "Labeled Transcripts [%]", main = "", las = 2)
abline(h = 0, lty = 2, col = "grey")

boxplot(simulatedSamples - slamDunkSamples, ylim = c(-0.1, 0.1), names = sampleNames, ylab = "Simulated - Slamdunk", xlab = "Labeled Transcripts [%]", main = "", las = 2)
abline(h = 0, lty = 2, col = "grey")

boxplot(log2((simulatedSamples + 0.001) / (slamDunkSamples + 0.001)), ylim = c(-1, 1), names = sampleNames, ylab = "log2(Simulated / Slamdunk)", xlab = "Labeled Transcripts [%]", main = "", las = 2)
abline(h = 0, lty = 2, col = "grey")

merged = data.frame()
rmseTab = matrix("", ncol = 3, nrow = 0)
for (currentSample in 0:(sampleNumber - 1)) {
  # FIX: the two data columns were labelled the wrong way around (the
  # slamdunk values were named "Simulate" and the simulated values
  # "Slamdunk").  The labels now match the data and every downstream
  # reference is adjusted, so all plots and statistics are unchanged.
  current = cbind(slamDunkRates[, c(1:fixedColumns - 1, fixedColumns + currentSample)], simulatedRates[, fixedColumns + currentSample])
  colnames(current) = c(colnames(slamDunkRates[, c(1:fixedColumns - 1)]), "Slamdunk", "Simulate")
  merged = rbind(merged, current)

  rmseTab = rbind(rmseTab, c(as.character(simulatedFileRates), as.character(substring(sampleNames[currentSample + 1], 2)), as.character(rmse(current$Simulate, current$Slamdunk))))
}

par(mfrow = c(1, 1))
perr = round(rmse(merged$Simulate, merged$Slamdunk), digits = 4)
pcorr = round(cor(merged$Simulate, merged$Slamdunk), digits = 4)
plot(merged$Simulate, merged$Slamdunk, xlim = c(0, 1), ylim = c(0, 1), pch = 4, xlab = "Simulated", ylab = "Slamdunk", main = paste("Cor: ", pcorr, ", RMSE: ", perr))
abline(a = 0, b = 1, col = "grey", lty = 2)

plot(merged$avgTcontent, merged$Simulate - merged$Slamdunk, ylim = c(-1, 1), pch = 4)

plot(merged$avgReadsCPM, merged$Simulate - merged$Slamdunk, ylim = c(-1, 1), pch = 4)

plot(merged$avgMultimapper, merged$Simulate - merged$Slamdunk, ylim = c(-1, 1), pch = 4)

dev.off()

rmseTab = rbind(rmseTab, c(as.character(simulatedFileRates), as.character(-1), as.character(rmse(merged$Simulate, merged$Slamdunk))))

write.table(rmseTab, outputFileCSV, sep = "\t", quote = F, row.names = F, col.names = T)
-------------------------------------------------------------------------------- /slamdunk/plot/compute_halflifes.R: --------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# Script to compute half-lifes from SlamSeq data

# Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder, Bhat Pooja
#
# This file is part of Slamdunk.
#
# Slamdunk is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Slamdunk is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .

library(getopt)

spec = matrix(c(
  'help',       'h', 0, "logical",   "print the usage of the command",
  'slamdunk',   "f", 2, "character", "Comma separated list of SlamDunk results",
  'timepoints', "t", 2, "character", "Comma separated list of time points",
  'output',     "o", 2, "character", "Output tsv"
), ncol = 5, byrow = T)

opt = getopt(spec)

# FIX: help was triggered by length(opt) == 3, rejecting valid invocations;
# print it only on -h or an empty option list.  ("seperated" typos in the
# help strings above fixed as well.)
if (!is.null(opt$help) || length(opt) < 1) {
  # get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file", cmd)], "=")[[1]][2]
  # FIX: the usage line was copy-pasted from the rate-plot script.
  cat(basename(self), ": Compute RNA half-lifes from slamdunk tcount files.\n\n")
  # print a friendly message and exit with a non-zero error code
  cat(getopt(spec, command = self, usage = T))
  q(status = 1)
}

if (is.null(opt$slamdunk)) stop("arg slamdunk must be specified")
if (is.null(opt$output)) stop("arg output must be specified")
if (is.null(opt$timepoints)) stop("arg timepoints must be specified")

slamDunkFiles = opt$slamdunk
filesSlamDunk = as.character(ordered(strsplit(slamDunkFiles, ",")[[1]]))
outputFile = opt$output
timesParameter = opt$timepoints
times = as.numeric(strsplit(timesParameter, ",")[[1]])
times = times / 60   # minutes -> hours


# Read one tcount file per time point and collect the per-UTR conversion
# rates side by side; readsCPM and multimapper counts are averaged over all
# time points.  perRead = TRUE derives the rate from convertedReads/readCount
# instead of the precomputed conversionRate column.
mergeRates <- function(times, files, perRead) {
  mergedRates = data.frame()
  for (i in 1:length(times)) {
    time = times[i]
    print(time)
    simDataFile = files[i]
    simulation = read.table(simDataFile)
    colnames(simulation) = c("chr", "start", "stop", "name", "strand", "conversionRate", "readsCPM", "tCount", "tcCount", "readCount", "convertedReads", "multiMapCount")
    if (nrow(mergedRates) == 0) {
      mergedRates = simulation[, c("chr", "start", "stop", "name", "strand")]
      mergedRates$avgReadsCPM = simulation$readsCPM
      mergedRates$avgMultimapper = simulation$multiMapCount
      if (perRead == TRUE) {
        mergedRates$conversionRate = simulation$convertedReads / simulation$readCount
      } else {
        mergedRates$conversionRate = simulation$conversionRate
      }
    } else {
      mergedRates$avgReadsCPM = mergedRates$avgReadsCPM + simulation$readsCPM
      mergedRates$avgMultimapper = mergedRates$avgMultimapper + simulation$multiMapCount
      if (perRead == TRUE) {
        mergedRates = cbind(mergedRates, simulation$convertedReads / simulation$readCount)
      } else {
        mergedRates = cbind(mergedRates, simulation$conversionRate)
      }
    }
  }
  colnames(mergedRates) = c("chr", "start", "stop", "name", "strand", "readsCPM", "multiMapCount", times)
  mergedRates$readsCPM = mergedRates$readsCPM / length(times)
  mergedRates$multiMapCount = mergedRates$multiMapCount / length(times)
  mergedRates
}

# Fit rates ~ a * (1 - exp(-k * t)) and derive the half-life ln(2)/k (in
# minutes).  Returns c(halfLife, C, k, rsquared); all NA when nls fails.
computeHalfLife <- function(rates, timepoints) {
  a_start <- max(rates)                  # a is the plateau (y value for t -> Inf)
  k_start = log(2, base = exp(1)) / 5

  halfLifePred = NA
  C = NA
  k = NA
  rsquared = NA

  # FIX: the original used 'fit' (summary/residuals) *after* the tryCatch,
  # so a single UTR whose nls fit failed to converge crashed the whole
  # script.  All derived values are now computed inside the guarded block
  # and simply stay NA on failure.
  tryCatch({
    fit = nls(rates ~ a * (1 - exp(-k * (timepoints))), start = list(a = a_start, k = k_start))
    halfLifePred = log(2, base = exp(1)) / coef(fit)[2] * 60
    C = coef(fit)[1]
    k = coef(fit)[2]
    RSS.p <- sum(residuals(fit)^2)
    TSS <- sum((rates - mean(rates))^2)
    rsquared = 1 - (RSS.p / TSS)
  }, error = function(e) {})

  c(halfLifePred, C, k, rsquared)
}

perRead = F
slamDunkMergedRates = mergeRates(times, filesSlamDunk, perRead)

halfLifeTable = data.frame()

for (utr in 1:nrow(slamDunkMergedRates)) {
  # Columns 8..(7 + length(times)) hold the per-time-point conversion rates.
  pulseSlamDunk = data.frame(y = as.numeric(t(slamDunkMergedRates[utr, 8:(7 + length(times))])[, 1]), x = times)

  result = computeHalfLife(pulseSlamDunk$y, pulseSlamDunk$x)
  halfLifeTable = rbind(halfLifeTable, cbind(slamDunkMergedRates[utr, c("chr", "start", "stop", "name", "strand", "readsCPM", "multiMapCount")], result[1]))
}

colnames(halfLifeTable) = c("#chr", "start", "stop", "name", "strand", "readsCPM", "multiMapCount", "score")

write.table(halfLifeTable, outputFile, sep = "\t", quote = F, row.names = F, col.names = T)
-------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/README: --------------------------------------------------------------------------------
RNASeqReadSimulator
==================
Author: Wei Li (li.david.wei AT gmail.com)

Introduction
------------
RNASeqReadSimulator is a set of scripts generating simulated RNA-Seq reads. RNASeqReadSimulator provides users a simple tool to generate RNA-Seq reads for research purposes, and a framework to allow experienced users to expand functions. RNASeqReadSimulator offers the following features:

1. It allows users to randomly assign expression levels of transcripts and generate simulated single-end or paired-end RNA-Seq reads.

2. It is able to generate RNA-Seq reads that have a specified positional bias profile.

3. It is able to simulate random read errors from sequencing platforms.

4. The simulator consists of a few simple Python scripts. All scripts are command line driven, allowing users to invoke and design more functions.

Requirements
------------
RNASeqReadSimulator runs on python 2.7 with biopython package installed.

Installation
------------
After download, it is suggested that the path of the scripts (src) be added to the system path.
For example, if the scripts are located at /home/me/rnaseqsimulator, then add the following command to your .bashrc profile: 24 | 25 | export PATH="$PATH:/home/me/rnaseqsimulator/src" 26 | 27 | Demo 28 | ---- 29 | The demo folder includes a few scripts and sample input files to generate RNA-Seq reads from a simple example. Two bash scripts, gensingleendreads.sh and genpairedendreads.sh, are examples to generate single-end and paired-end reads. 30 | 31 | 32 | 33 | Usage 34 | ----- 35 | 36 | RNASeqReadSimulator includes the following essential scripts: 37 | 38 | genexplvprofile.py is used to assign a random expression level of transcripts; 39 | 40 | gensimreads.py simulates RNA-Seq reads in BED format; 41 | 42 | getseqfrombed.py converts reads from BED format to FASTA format; 43 | 44 | Other optional scripts and files include: 45 | 46 | splitfasta.py splits paired-end reads in a FASTA file into two separate files; 47 | 48 | addvariation2splicingbed.py is a supplementary script to generate variations in splicing RNA-Seq reads. 49 | 50 | 51 | 52 | genexplvprofile.py 53 | ------------------ 54 | 55 | This file randomly assigns weights for each transcript, and gets the transcript statistics from a given transcript annotation file (BED File). 56 | 57 | USAGE genexplvprofile.py {OPTIONS} 58 | 59 | OPTIONS 60 | 61 | -h/--help Print this message 62 | 63 | -e/--lognormal Specify the mean and variance of the lognormal distribution used to assign expression levels. Default -4,4 64 | 65 | -f/--statonly Print the statistics only; do not assign expression levels. 66 | 67 | NOTE: 68 | 69 | To get good group information, the BED file should be sorted according to the chromosome name and start position. In Unix systems, use "sort -k 1,1 -k 2,2n in.BED > out.BED" to get a sorted version (out.BED) of the bed file (in.BED). 70 | 71 | The weight is at the 8th column, if -f option is not specified.
The expression level of each transcript (RPKM) can be calculated as column[8]*10^9/column[2]/sum(column[8]). 72 | 73 | 74 | gensimreads.py 75 | ------------- 76 | This script generates simulated RNA-Seq reads (in .bed format) from known gene annotations. 77 | 78 | Usage: gensimreads.py {OPTIONS} 79 | 80 | BED-File: The gene annotation file (in BED format). Use '-' for STDIN input. 81 | 82 | OPTIONS 83 | 84 | -e/--expression Specify the weight of each transcript. Each line in the file should have at least (NFIELD+1) fields, with field 0 the annotation id, and field NFIELD the weight of this annotation. NFIELD is given by -f/--field option. If this file is not provided, uniform weight is applied. See the output of genexplvprofile.py as an example. 85 | 86 | -n/--nreads Specify the number of reads to be generated. Default 100000. 87 | 88 | -b/--posbias Specify the positional bias file. The file should include at least 100 lines, each contains only one integer number, showing the preference of the positional bias at this position. If no positional bias file is specified, use uniform distribution bias. 89 | 90 | -l/--readlen Specify the read length. Default 32. 91 | 92 | -o/--output Specify the output file. The default is STDOUT 93 | 94 | -f/--field The field of each line as weight input. Default is 7 (beginning from field 0). 95 | 96 | -p/--pairend Generate paired-end reads with specified insert length mean and standard derivation. The default is 200,20. 97 | 98 | --stranded Generate stranded RNA-Seq reads. 99 | 100 | NOTE 101 | 102 | The bed file is required to sort according to the chromosome name and position. In Unix systems, use "sort -k 1,1 -k 2,2n in.BED > out.BED" to get a sorted version (out.BED) of the bed file (in.BED). 103 | 104 | No problem to handle reads spanning multiple exons. 105 | 106 | 107 | getseqfrombed.py 108 | ---------------- 109 | This script is used to extract sequences from bed file. 
110 | 111 | USAGE getseqfrombed.py {OPTIONS} 112 | 113 | OPTIONS 114 | 115 | -b/--seqerror Specify the positional error profile to be used. The file should include at least 100 lines, each containing a positive number. The number at line x is the weight that an error is occured at x% position of the read. If no positional error file specified, uniform weight is assumed. 116 | 117 | -r/--errorrate Specify the overall error rate, a real positive number. The number of errors of each read will follow a Poisson distribution with its mean value specified by --errorrate. Default 0 (no errors). 118 | 119 | -l/--readlen Specify the read length. Default is 75. 120 | 121 | -f/--fill Fill at the end of each read by the sequence seq, if the read is shorter than the read length. Default is 'A' (to simulate poly-A tails in RNA-Seq reads). 122 | 123 | NOTE 124 | 125 | 1. The input .bed file is best to sort according to chromosome names. Use - to input from STDIN. 126 | 127 | 2. Biopython and numpy package are required. 128 | 129 | 3. I assume that all sequences are in the same length. The length information is given by the -l parameter. If the sequence length is greater than the read length, nucleotides outside the read length will not be simulated for error. 130 | -------------------------------------------------------------------------------- /slamdunk/plot/SNPeval.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Script to look at SNP distributions along UTRs ranked by # T>C SNPs 4 | # 5 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 6 | # 7 | # This file is part of Slamdunk. 8 | # 9 | # Slamdunk is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU Affero General Public License as 11 | # published by the Free Software Foundation, either version 3 of the 12 | # License, or (at your option) any later version. 
13 | # 14 | # Slamdunk is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU Affero General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU Affero General Public License 20 | # along with this program. If not, see . 21 | 22 | library(getopt) 23 | 24 | spec = matrix(c( 25 | 'help' , 'h', 0, "logical","print the usage of the command", 26 | 'inputFile', "i", 2,"character","tsv table of snp vs tc count files", 27 | 'coverageCutoff', "c", 2,"numeric","coverage cutoff for calling variants", 28 | 'variantFraction', "v", 2,"numeric","variant fraction cutoff for calling variants", 29 | 'outputFile', "o", 2,"character","output pdf file name" 30 | ),ncol = 5,byrow=T) 31 | 32 | opt = getopt(spec) 33 | 34 | if ( !is.null(opt$help) || length(opt)==1 ) { 35 | #get the script name 36 | cmd = commandArgs(FALSE) 37 | self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2] 38 | cat(basename(self),": Create mismatch plots from rate tabs.\n\n") 39 | #print a friendly message and exit with a non-zero error code 40 | cat(getopt(spec,command = self,usage=T)) 41 | q(status=1); 42 | } 43 | 44 | 45 | if ( is.null(opt$inputFile) ) stop("arg rateTab must be specified") 46 | if ( is.null(opt$outputFile) ) { opt$outputFile = "out.pdf" } 47 | if ( is.null(opt$coverageCutoff) ) { opt$coverageCutoff = 0 } 48 | if ( is.null(opt$variantFraction) ) { opt$variantFraction = 0 } 49 | 50 | tricubeMovingAverage <-function (x, span = 0.5, full.length = TRUE) { 51 | 52 | n <- length(x) 53 | width <- span * n 54 | hwidth <- as.integer(width%/%2L) 55 | if (hwidth <= 0L) 56 | return(x) 57 | width <- 2L * hwidth + 1L 58 | u <- seq(from = -1, to = 1, length = width) * width/(width + 59 | 1) 60 | tricube.weights <- (1 - abs(u)^3)^3 61 | tricube.weights <- tricube.weights/sum(tricube.weights) 62 | if (!full.length) 63 | 
return(as.vector(filter(x, tricube.weights), mode = "numeric")[(hwidth + 1):(n - hwidth)]) 64 | z <- numeric(hwidth) 65 | x <- as.vector(filter(c(z, x, z), tricube.weights), mode = "numeric")[(hwidth + 1):(n + hwidth)] 66 | cw <- cumsum(tricube.weights) 67 | x[1:hwidth] <- x[1:hwidth]/cw[(width - hwidth):(width - 1)] 68 | x[(n - hwidth + 1):n] <- x[(n - hwidth + 1):n]/cw[(width - 1):(width - hwidth)] 69 | x 70 | } 71 | 72 | rescale <- function(x, new, old = range(x)) { 73 | new[1] + (x - old[1])/(old[2] - old[1]) * 74 | (new[2] - new[1]) 75 | } 76 | 77 | GSEAplot <- function(counts, snps, ...) { 78 | 79 | num <- length(counts) 80 | 81 | sel = rep.int(FALSE, num) 82 | 83 | sel[snps] <- TRUE 84 | 85 | countOrder <- order(counts, na.last = TRUE, decreasing = TRUE) 86 | counts <- counts[countOrder] 87 | sel <- sel[countOrder] 88 | 89 | snpLoc <- which(sel) 90 | 91 | col.bars <- "black" 92 | 93 | ylim <- c(-1, 1.5) 94 | 95 | plot(1:num, xlim = c(0, num), ylim = c(0, 2.1), type = "n", 96 | axes = FALSE, ylab = "", ...) 
97 | 98 | lwd <- 50/length(snpLoc) 99 | lwd <- min(1.9, lwd) 100 | lwd <- max(0.2, lwd) 101 | 102 | barlim <- ylim[2] - c(1.5, 0.5) 103 | rect.yb <- 0 104 | rect.yt <- 0.5 105 | rect(0.5, 0, num + 0.5, 0.5, col = "pink", border = NA) 106 | 107 | if (length(snpLoc) > 0) { 108 | 109 | segments(snpLoc, barlim[1], snpLoc, barlim[2]/2, lwd = lwd, col = "black") 110 | segments(snpLoc, barlim[2]/2, snpLoc, barlim[2]/2 * 2, lwd = lwd, col = "black") 111 | 112 | } 113 | 114 | axis(side = 2, at = 0.5, padj = 3.8, cex.axis = 0.85, 115 | labels = "High # T>C reads", tick = FALSE) 116 | axis(side = 4, at = 0.5, padj = -3.8, cex.axis = 0.85, 117 | labels = "Low # T>C reads", tick = FALSE) 118 | prob <- (10:0)/10 119 | axis(at = seq(1, num, len = 11), side = 1, cex.axis = 0.7, 120 | las = 2, labels = format(quantile(counts, p = prob), 121 | digits = 1)) 122 | 123 | ave.enrich1 <- length(snpLoc)/num 124 | worm1 <- tricubeMovingAverage(sel, span = 0.45)/ave.enrich1 125 | 126 | r.worm1 <- c(0, max(worm1)) 127 | worm1.scale <- rescale(worm1, new = c(1.1 , 2.1 ), old = r.worm1) 128 | 129 | lines(x = 1:num, y = worm1.scale, col = "black", lwd = 2) 130 | abline(h = rescale(1, new = c(1.1 , 2.1), old = r.worm1), lty = 2) 131 | axis(side = 2, at = c(1.1 , 2.1 ), cex.axis = 0.8, labels = c(0, format(max(worm1), digits = 2))) 132 | axis(side = 2, labels = "Enrichment", at = 1.6 , padj = -0.6, tick = FALSE, cex.axis = 0.8) 133 | } 134 | 135 | pdf(opt$outputFile) 136 | 137 | minCounts = round(opt$coverageCutoff * opt$variantFraction) 138 | 139 | table = read.delim(opt$inputFile,header=FALSE,col.names = c("name","count","unmasked","masked","snp")) 140 | 141 | table = table[table$unmasked >= minCounts,] 142 | 143 | table = table[table$count >= quantile(table$count, 0.75),] 144 | 145 | par(mfrow=c(2,1)) 146 | 147 | if (length(table(table$snp)) > 1) { 148 | 149 | blindTest = wilcox.test(table$unmasked ~ table$snp == "1", alternative = "less") 150 | maskedTest = wilcox.test(table$masked ~ 
table$snp == "1", alternative = "less") 151 | 152 | blindPvalue = blindTest$p.value 153 | 154 | if (blindPvalue < 0.01) { 155 | blindPvalue = "< 0.01" 156 | } else { 157 | blindPvalue = paste("= ",round(blindPvalue,digits=2),sep="") 158 | } 159 | 160 | maskedPvalue = maskedTest$p.value 161 | 162 | if (maskedPvalue < 0.01) { 163 | maskedPvalue = "< 0.01" 164 | } else { 165 | maskedPvalue = paste("= ",round(maskedPvalue,digits=2),sep="") 166 | } 167 | 168 | GSEAplot(table$unmasked, which(table$snp == 1), main="Blind", xlab = paste("Mann-Whitney-U: p-value ",blindPvalue,sep="")) 169 | GSEAplot(table$masked, which(table$snp == 1), main="SNP-masked", xlab = paste("Mann-Whitney-U: p-value ",maskedPvalue,sep="")) 170 | 171 | } else { 172 | GSEAplot(table$unmasked, which(table$snp == 1), main="Blind", xlab = paste("Mann-Whitney-U: p-value NA",sep="")) 173 | GSEAplot(table$masked, which(table$snp == 1), main="SNP-masked", xlab = paste("Mann-Whitney-U: p-value NA",sep="")) 174 | } 175 | 176 | # wilcox.test(testTab$masked ~ testTab$snp == "1") 177 | # wilcox.test(testTab$unmasked ~ testTab$snp == "1") 178 | # 179 | # ks.test(testTab$masked, testTab$unmasked) 180 | # ks.test(testTab$unmasked[testTab$snp == "0"],testTab$unmasked[testTab$snp == "1"]) 181 | 182 | dev.off() 183 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 2 | # 3 | # This file is part of Slamdunk. 4 | # 5 | # Slamdunk is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Affero General Public License as 7 | # published by the Free Software Foundation, either version 3 of the 8 | # License, or (at your option) any later version. 
9 | # 10 | # Slamdunk is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Affero General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Affero General Public License 16 | # along with this program. If not, see . 17 | 18 | from __future__ import print_function 19 | import os, sys, re 20 | 21 | try: 22 | from setuptools import setup, find_packages 23 | from setuptools.command.install import install as _install 24 | from codecs import open 25 | from os import path 26 | except ImportError: 27 | from distutils.core import setup 28 | from distutils.command.install import install as _install 29 | 30 | here = path.abspath(path.dirname(__file__)) 31 | name = "slamdunk" 32 | 33 | #Get the long description from the README file 34 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 35 | long_description = f.read() 36 | 37 | # now we have a `__version__` variable 38 | exec(open(path.join(here, name, 'version.py')).read()) 39 | 40 | # Copy bin recursively 41 | def package_files(directory): 42 | paths = [] 43 | for (path, directories, filenames) in os.walk(directory): 44 | for filename in filenames: 45 | paths.append(os.path.join("..", "..", path, filename)) 46 | return paths 47 | 48 | bin_files = package_files(name + os.sep + 'contrib') 49 | plot_files = package_files(name + os.sep + 'plot') 50 | 51 | def _runExternalBuilds(dir): 52 | 53 | import subprocess 54 | 55 | print("Building RNASeqReadSimulator.") 56 | syscall = "(cd " + os.path.join(dir, name, "contrib") + " ; ./build-rnaseqreadsimulator.sh)" 57 | subprocess.call([syscall], shell=True) 58 | 59 | class install(_install): 60 | 61 | def initialize_options(self): 62 | _install.initialize_options(self) 63 | 64 | def finalize_options(self): 65 | _install.finalize_options(self) 66 | 67 | def run(self): 68 | _install.run(self) 
69 | self.execute(_runExternalBuilds, (self.install_lib) ,msg="Installing external dependencies") 70 | 71 | setup( 72 | name = name, 73 | 74 | # Versions should comply with PEP440. For a discussion on single-sourcing 75 | # the version across setup.py and the project code, see 76 | # https://packaging.python.org/en/latest/single_source_version.html 77 | version=__version__, 78 | 79 | description='SLAMdunk suite for analyzing SLAM-seq data', 80 | long_description=long_description, 81 | 82 | # The project's main homepage. 83 | url='http://t-neumann.github.io/slamdunk', 84 | 85 | # Author details 86 | author='Tobias Neumann, Philipp Rescheneder', 87 | author_email='tobias.neumann.at@gmail.com, philipp.rescheneder@univie.ac.at', 88 | 89 | # Choose your license 90 | license='GNU Affero General Public License v3 or later (AGPLv3+)', 91 | 92 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 93 | classifiers=[ 94 | # How mature is this project? Common values are 95 | # 3 - Alpha 96 | # 4 - Beta 97 | # 5 - Production/Stable 98 | 'Development Status :: 4 - Beta', 99 | 100 | # Indicate who your project is intended for 101 | 'Intended Audience :: Science/Research', 102 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 103 | 104 | # Pick your license as you wish (should match "license" above) 105 | 'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)', 106 | 107 | # Specify the Python versions you support here. In particular, ensure 108 | # that you indicate whether you support Python 2, Python 3 or both. 
109 | 'Programming Language :: Python :: 2', 110 | 'Programming Language :: Python :: 2.6', 111 | 'Programming Language :: Python :: 2.7', 112 | 'Programming Language :: Python :: 3', 113 | 'Programming Language :: Python :: 3.3', 114 | 'Programming Language :: Python :: 3.4', 115 | 'Programming Language :: Python :: 3.5', 116 | 'Programming Language :: C++', 117 | 'Programming Language :: Java', 118 | ], 119 | 120 | # What does your project relate to? 121 | keywords='Next-Generation-Sequencing NGS QuantSeq SLAMSeq', 122 | 123 | # You can just specify the packages manually here if your project is 124 | # simple. Or you can use find_packages(). 125 | packages=find_packages(exclude=['doc', 'tests']), 126 | 127 | # Alternatively, if you want to distribute just a my_module.py, uncomment 128 | # this: 129 | #py_modules=["slamdunk.main", "slamdunk.toolbox","slamdunk.simulate"], 130 | 131 | # List run-time dependencies here. These will be installed by pip when 132 | # your project is installed. For an analysis of "install_requires" vs pip's 133 | # requirements files see: 134 | # https://packaging.python.org/en/latest/requirements.html 135 | install_requires=['joblib>=0.9.4','pybedtools>=0.6.4','intervaltree>=2.1.0','pandas>=0.13.1','biopython>=1.63','pysam>=0.8.3', 'Cython>=0.20.1'], 136 | 137 | # List additional groups of dependencies here (e.g. development 138 | # dependencies). You can install these using the following syntax, 139 | # for example: 140 | # $ pip install -e .[dev,test] 141 | # extras_require={ 142 | # 'dev': ['check-manifest'], 143 | # 'test': ['coverage'], 144 | # }, 145 | 146 | # If there are data files included in your packages that need to be 147 | # installed, specify them here. If using Python 2.6 or less, then these 148 | # have to be included in MANIFEST.in as well. 
149 | package_data={ 150 | 'slamdunk.contrib': bin_files, 151 | 'slamdunk.plot': plot_files, 152 | }, 153 | 154 | # Although 'package_data' is the preferred approach, in some case you may 155 | # need to place data files outside of your packages. See: 156 | # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa 157 | # In this case, 'data_file' will be installed into '/my_data' 158 | #data_files=[('bin', extra_files)], 159 | 160 | # To provide executable scripts, use entry points in preference to the 161 | # "scripts" keyword. Entry points provide cross-platform support and allow 162 | # pip to create the appropriate form of executable for the target platform. 163 | entry_points={ 164 | 'console_scripts': [ 165 | 'slamdunk=slamdunk.slamdunk:run', 166 | 'alleyoop=slamdunk.alleyoop:run', 167 | 'splash=slamdunk.splash:run', 168 | ], 169 | }, 170 | ) 171 | -------------------------------------------------------------------------------- /slamdunk/plot/globalRatePlotter.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Plot overall conversion rates per UTR 4 | 5 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 6 | # 7 | # This file is part of Slamdunk. 8 | # 9 | # Slamdunk is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU Affero General Public License as 11 | # published by the Free Software Foundation, either version 3 of the 12 | # License, or (at your option) any later version. 13 | # 14 | # Slamdunk is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU Affero General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU Affero General Public License 20 | # along with this program. If not, see . 
21 | 22 | library(getopt) 23 | 24 | spec = matrix(c( 25 | 'help' , 'h', 0, "logical","print the usage of the command", 26 | 'rateTab', "f", 2,"character","tsv table of rate files", 27 | 'outputFile', "O", 2,"character","output pdf file name" 28 | ),ncol = 5,byrow=T) 29 | 30 | opt = getopt(spec) 31 | 32 | if ( !is.null(opt$help) || length(opt)==1 ) { 33 | #get the script name 34 | cmd = commandArgs(FALSE) 35 | self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2] 36 | cat(basename(self),": Create mismatch plots from rate tabs.\n\n") 37 | #print a friendly message and exit with a non-zero error code 38 | cat(getopt(spec,command = self,usage=T)) 39 | q(status=1); 40 | } 41 | 42 | 43 | if ( is.null(opt$rateTab) ) stop("arg rateTab must be specified") 44 | if ( is.null(opt$outputFile) ) { opt$outputFile = "out.pdf" } 45 | 46 | library(ggplot2) 47 | library(gridExtra) 48 | 49 | rates = read.table(opt$rateTab,stringsAsFactors=FALSE,col.names = c("sample","file"), comment.char = "") 50 | 51 | pdf(opt$outputFile) 52 | 53 | plotList = list() 54 | 55 | for (i in 1:nrow(rates)) { 56 | curTab = read.delim(rates$file[i],stringsAsFactors=FALSE,comment.char='#') 57 | 58 | plusTab = curTab[curTab$Strand == "+",] 59 | minusTab = curTab[curTab$Strand == "-",] 60 | 61 | # "Name" "Chr" "Start" "End" "Strand" "ReadCount" 62 | # "A_A" "A_C" "A_G" "A_T" "A_N" "C_A" 63 | # "C_C" "C_G" "C_T" "C_N" "G_A" "G_C" 64 | # "G_G" "G_T" "G_N" "T_A" "T_C" "T_G" 65 | # "T_T" "T_N" "N_A" "N_C" "N_G" "N_T" 66 | # "N_N" 67 | 68 | names(minusTab) = c("Name", "Chr", "Start", "End", "Strand", "ReadCount", 69 | "T_T", "T_G", "T_C", "T_A", "NNN", "G_T", 70 | "G_G", "G_C", "G_A", "NNN", "C_T", "C_G", 71 | "C_C", "C_A", "NNN", "A_T", "A_G", "A_C", 72 | "A_A", "NNN", "NNN", "NNN", "NNN", "NNN", 73 | "NNN") 74 | 75 | plusTab = plusTab[,c(1,grep("N",names(plusTab),invert=TRUE))] 76 | minusTab = minusTab[,grep("NNN",names(minusTab),invert=TRUE)] 77 | 78 | plusTab = plusTab[,c(-1,-2,-3,-4,-5,-6)] 79 | plusTab = 
plusTab[rowSums(plusTab) > 0,] 80 | 81 | plusTab$Asum = plusTab$A_A + plusTab$A_C + plusTab$A_G + plusTab$A_T 82 | plusTab$Csum = plusTab$C_A + plusTab$C_C + plusTab$C_G + plusTab$C_T 83 | plusTab$Gsum = plusTab$G_A + plusTab$G_C + plusTab$G_G + plusTab$G_T 84 | plusTab$Tsum = plusTab$T_A + plusTab$T_C + plusTab$T_G + plusTab$T_T 85 | 86 | plusTab$A_A = plusTab$A_A / plusTab$Asum 87 | plusTab$A_C = plusTab$A_C / plusTab$Asum 88 | plusTab$A_G = plusTab$A_G / plusTab$Asum 89 | plusTab$A_T = plusTab$A_T / plusTab$Asum 90 | 91 | plusTab$C_A = plusTab$C_A / plusTab$Csum 92 | plusTab$C_C = plusTab$C_C / plusTab$Csum 93 | plusTab$C_G = plusTab$C_G / plusTab$Csum 94 | plusTab$C_T = plusTab$C_T / plusTab$Csum 95 | 96 | plusTab$G_A = plusTab$G_A / plusTab$Gsum 97 | plusTab$G_C = plusTab$G_C / plusTab$Gsum 98 | plusTab$G_G = plusTab$G_G / plusTab$Gsum 99 | plusTab$G_T = plusTab$G_T / plusTab$Gsum 100 | 101 | plusTab$T_A = plusTab$T_A / plusTab$Tsum 102 | plusTab$T_C = plusTab$T_C / plusTab$Tsum 103 | plusTab$T_G = plusTab$T_G / plusTab$Tsum 104 | plusTab$T_T = plusTab$T_T / plusTab$Tsum 105 | 106 | plusTab = plusTab[,grep("sum",names(plusTab),invert=TRUE)] 107 | 108 | plusTab = plusTab * 100 109 | 110 | minusTab = minusTab[,c(-1,-2,-3,-4,-5,-6)] 111 | minusTab = minusTab[rowSums(minusTab) > 0,] 112 | 113 | minusTab$Asum = minusTab$A_A + minusTab$A_C + minusTab$A_G + minusTab$A_T 114 | minusTab$Csum = minusTab$C_A + minusTab$C_C + minusTab$C_G + minusTab$C_T 115 | minusTab$Gsum = minusTab$G_A + minusTab$G_C + minusTab$G_G + minusTab$G_T 116 | minusTab$Tsum = minusTab$T_A + minusTab$T_C + minusTab$T_G + minusTab$T_T 117 | 118 | minusTab$A_A = minusTab$A_A / minusTab$Asum 119 | minusTab$A_C = minusTab$A_C / minusTab$Asum 120 | minusTab$A_G = minusTab$A_G / minusTab$Asum 121 | minusTab$A_T = minusTab$A_T / minusTab$Asum 122 | 123 | minusTab$C_A = minusTab$C_A / minusTab$Csum 124 | minusTab$C_C = minusTab$C_C / minusTab$Csum 125 | minusTab$C_G = minusTab$C_G / minusTab$Csum 126 | 
minusTab$C_T = minusTab$C_T / minusTab$Csum 127 | 128 | minusTab$G_A = minusTab$G_A / minusTab$Gsum 129 | minusTab$G_C = minusTab$G_C / minusTab$Gsum 130 | minusTab$G_G = minusTab$G_G / minusTab$Gsum 131 | minusTab$G_T = minusTab$G_T / minusTab$Gsum 132 | 133 | minusTab$T_A = minusTab$T_A / minusTab$Tsum 134 | minusTab$T_C = minusTab$T_C / minusTab$Tsum 135 | minusTab$T_G = minusTab$T_G / minusTab$Tsum 136 | minusTab$T_T = minusTab$T_T / minusTab$Tsum 137 | 138 | minusTab = minusTab[,grep("sum",names(minusTab),invert=TRUE)] 139 | 140 | minusTab = minusTab * 100 141 | 142 | plotTab = rbind(plusTab, minusTab) 143 | 144 | plotTab = plotTab[,c("A_C","A_G","A_T","C_A","C_G","C_T","G_A","G_C","G_T","T_A","T_C","T_G")] 145 | quantiles = lapply(plotTab, function(x) { 146 | return(quantile(x, na.rm=TRUE, p=0.75) + 1.5 * IQR(x, na.rm=TRUE)) 147 | }) 148 | 149 | ymax = ceiling(max(unlist(quantiles))) 150 | 151 | plotTab = rbind( 152 | data.frame(class = "A_C", values = plotTab$A_C), 153 | data.frame(class = "A_G", values = plotTab$A_G), 154 | data.frame(class = "A_T", values = plotTab$A_T), 155 | data.frame(class = "C_A", values = plotTab$C_A), 156 | data.frame(class = "C_G", values = plotTab$C_G), 157 | data.frame(class = "C_T", values = plotTab$C_T), 158 | data.frame(class = "G_A", values = plotTab$G_A), 159 | data.frame(class = "G_C", values = plotTab$G_C), 160 | data.frame(class = "G_T", values = plotTab$G_T), 161 | data.frame(class = "T_A", values = plotTab$T_A), 162 | data.frame(class = "T_C", values = plotTab$T_C), 163 | data.frame(class = "T_G", values = plotTab$T_G) 164 | ) 165 | 166 | plotTab$highlight = "no" 167 | plotTab$highlight[plotTab$class == "T_C"] = "yes" 168 | plotTab$class = sub("_", ">", plotTab$class) 169 | plotTab$group = "A" 170 | plotTab$group[plotTab$class %in% c("C>A","C>G","C>T")] = "C" 171 | plotTab$group[plotTab$class %in% c("G>A","G>C","G>T")] = "G" 172 | plotTab$group[plotTab$class %in% c("T>A","T>C","T>G")] = "T" 173 | 174 | plotTab = 
plotTab[!is.na(plotTab$values),] 175 | 176 | curPlot = ggplot(plotTab, aes(x=class,y=values,fill=highlight,col=highlight)) + stat_boxplot(geom ='errorbar') + geom_boxplot(outlier.shape = NA,lwd=0.8,fatten=2) + facet_grid(~group, scales="free", space="free") + xlab("") + ylab("Mutation rate per UTR base [%]") + 177 | scale_fill_manual(values=c("white","white")) + scale_color_manual(values=c("black", "red")) + theme(axis.ticks.x = element_blank(), legend.position = "none") + coord_cartesian(ylim=c(0, ymax)) 178 | 179 | plotList[[length(plotList)+1]] <- curPlot + ggtitle(rates$sample[i]) 180 | 181 | anovaTest = aov(values ~ class, data = plotTab) 182 | print(paste("Sample: ",rates$sample[i],sep="")) 183 | print(TukeyHSD(x = anovaTest, 'class', conf.level = 0.95)$class) 184 | 185 | } 186 | 187 | do.call(grid.arrange, plotList) 188 | 189 | dev.off() 190 | 191 | #signal success and exit. 192 | q(status=0) 193 | -------------------------------------------------------------------------------- /slamdunk/plot/eval_halflifes_error_plot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 
19 | 20 | library(getopt) 21 | 22 | spec = matrix(c( 23 | 'help' , 'h', 0, "logical","print the usage of the command", 24 | 'simulated', "s", 2,"character","Half-lifes inferred from SlamDunk results", 25 | 'predicted', "p", 2,"character","Simulated Half-Lifes", 26 | 'truth', "t", 2,"character","True Half-Lifes", 27 | 'output', "o", 2,"character","Output pdf", 28 | 'missing', "m", 2,"character","List of utrs with missing half-life" 29 | ),ncol = 5,byrow=T) 30 | 31 | opt = getopt(spec) 32 | 33 | if ( !is.null(opt$help) || length(opt)==5 ) { 34 | #get the script name 35 | cmd = commandArgs(FALSE) 36 | self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2] 37 | cat(basename(self),": Create mismatch plots from rate tabs.\n\n") 38 | #print a friendly message and exit with a non-zero error code 39 | cat(getopt(spec,command = self,usage=T)) 40 | q(status=1); 41 | } 42 | 43 | 44 | if ( is.null(opt$simulated) ) stop("arg simulated must be specified") 45 | if ( is.null(opt$predicted) ) stop("arg slamdunk must be specified") 46 | if ( is.null(opt$truth) ) stop("arg truth must be specified") 47 | if ( is.null(opt$output) ) stop("arg output must be specified") 48 | if ( is.null(opt$missing) ) stop("arg missing must be specified") 49 | 50 | truthFile = opt$truth 51 | #truthFile = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_2/finalAnnotation_test_cut_chrM_correct_100_original_utrs.bed" 52 | #truthFile = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_3/finalAnnotation_test_cut_chrM_correct_original_utrs.bed" 53 | simHLFile = opt$simulated 54 | #simHLFile = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_2/pooja_UTR_annotation_examples_sample_1_0min_utrsummary_halflifes.tsv" 55 | #simHLFile = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_3/finalAnnotation_test_1_0min_utrsummary_halflifes.tsv" 56 | predHLFile = opt$predicted 57 | #predHLFile = 
"/project/ngs/philipp/slamdunk-analysis/simulation/simulation_2/slamdunk/halflifes/pooja_UTR_annotation_examples_sample_1_0min_reads_slamdunk_mapped_filtered_tcount_halflifes.tsv" 58 | #predHLFile = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_3/slamdunk/halflifes/finalAnnotation_test_1_0min_reads_slamdunk_mapped_filtered_tcount_halflifes.tsv" 59 | output = opt$output 60 | missing = opt$missing 61 | 62 | trueHL = read.csv(truthFile, sep="\t", header = F) 63 | colnames(trueHL) = c("chr", "start", "stop", "name", "halflife", "strand") 64 | simHL = read.csv(simHLFile, sep="\t", header = T) 65 | predHL = read.csv(predHLFile, sep="\t") 66 | 67 | predHL$simulated_hl = simHL$score 68 | predHL$true_hl = trueHL$halflife 69 | predHL$multiperc = predHL$multiMapCount / predHL$readsCPM 70 | head(simHL) 71 | 72 | tmp = predHL[is.na(predHL$score) | is.na(predHL$simulated_hl), ] 73 | predHL = predHL[!is.na(predHL$score) & !is.na(predHL$simulated_hl), ] 74 | 75 | predHL$log2DiffSim = log2((predHL$score + 0.1) / (predHL$simulated_hl + 0.1)) 76 | predHL$log2DiffTrue = log2((predHL$score + 0.1) / (predHL$true_hl + 0.1)) 77 | rmseSim = sqrt(sum((predHL$simulated_hl - predHL$score) ^ 2) / nrow(predHL)) 78 | rmseTrue = sqrt(sum((predHL$true_hl - predHL$score) ^ 2) / nrow(predHL)) 79 | avgHL = mean(predHL$true_hl) 80 | 81 | predHLUniq = predHL[predHL$multiMapCount == 0,] 82 | predHLMulti = predHL[predHL$multiMapCount > 0,] 83 | 84 | #ggplot(predHLUniq,aes(x=readsCPM,y=log2Diff)) + stat_binhex() 85 | head(predHLUniq) 86 | 87 | pdf(output, height = 6, width = 9) 88 | plot(0, main=paste0("Unique UTRs (", nrow(predHLUniq), ")\nAvg. 
HL: ", round(avgHL), ", RMSE: ", round(rmseSim)), pch=4, xlim=c(min(predHLUniq$readsCPM), max(predHLUniq$readsCPM)), ylim=c(-6,6), xlab="couts per million (reads)", ylab="log2 FC (half-lifes)", type="n") 89 | points(predHLUniq$readsCPM, predHLUniq$log2DiffSim, pch=1, col="#00000033") 90 | abline(h=0, lty=2, col="grey") 91 | plot(0, main=paste0("Multimapper UTRs (", nrow(predHLMulti) ,")\nAvg. HL: ", round(avgHL), ", RMSE: ", round(rmseSim)), pch=4, xlim=c(min(predHLUniq$readsCPM), max(predHLUniq$readsCPM)), ylim=c(-6,6), xlab="couts per million (reads)", ylab="log2 FC (half-lifes)", type="n") 92 | points(predHLMulti$readsCPM, predHLMulti$log2DiffSim, pch=1, col="#00000033") 93 | abline(h=0, lty=2, col="grey") 94 | 95 | plot(0, main=paste0("Unique UTRs (", nrow(predHLUniq), ")\nAvg. HL: ", round(avgHL), ", RMSE: ", round(rmseSim)), pch=4, xlim=c(min(predHLUniq$true_hl), max(predHLUniq$true_hl)), ylim=c(-6,6), xlab="couts per million (reads)", ylab="log2 FC (half-lifes)", type="n") 96 | points(predHLUniq$true_hl, predHLUniq$log2DiffSim, pch=1, col="#00000033") 97 | abline(h=0, lty=2, col="grey") 98 | plot(0, main=paste0("Multimapper UTRs (", nrow(predHLMulti) ,")\nAvg. HL: ", round(avgHL), ", RMSE: ", round(rmseSim)), pch=4, xlim=c(min(predHLUniq$true_hl), max(predHLUniq$true_hl)), ylim=c(-6,6), xlab="couts per million (reads)", ylab="log2 FC (half-lifes)", type="n") 99 | points(predHLMulti$true_hl, predHLMulti$log2DiffSim, pch=1, col="#00000033") 100 | abline(h=0, lty=2, col="grey") 101 | 102 | 103 | #plot(0, main=paste0("Multimapper UTRs (", nrow(predHLMulti) ,")\nAvg. 
HL: ", round(avgHL), ", RMSE: ", round(rmseSim)), pch=4, xlim=c(min(predHLUniq$multiperc), max(predHLUniq$multiperc)), ylim=c(-6,6), xlab="couts per million (reads)", ylab="log2 FC (half-lifes)", type="n") 104 | #points(predHLMulti$multiperc, predHLMulti$log2DiffSim, pch=1, col="#00000033") 105 | #abline(h=0, lty=2, col="grey") 106 | 107 | 108 | #lim = max(predHLUniq$simulated_hl) * 1.25 109 | lim = 1440 110 | corr = cor(predHLUniq$simulated_hl, predHLUniq$score) 111 | plot(predHLUniq$simulated_hl ~ predHLUniq$score, main=paste0("Simulated vs. SlamDunk (unique)\nPearson: ", round(corr, digits = 4)), xlim=c(0, lim), ylim=c(0, lim), ylab="Half-life (simulated)", xlab="Half-Life (slamDunk)", col="#00000033") 112 | abline(a = 0, b = 1, col="grey", lty=2) 113 | 114 | corr = cor(predHLMulti$simulated_hl, predHLMulti$score) 115 | plot(predHLMulti$simulated_hl ~ predHLMulti$score, main=paste0("Simulated vs. SlamDunk (multimapper)\nPearson: ", round(corr, digits = 4)), xlim=c(0, lim), ylim=c(0, lim), ylab="Half-life (simulated)", xlab="Half-Life (slamDunk)", col="#00000033") 116 | abline(a = 0, b = 1, col="grey", lty=2) 117 | 118 | 119 | corr = cor(predHLUniq$true_hl, predHLUniq$score) 120 | plot(predHLUniq$true_hl ~ predHLUniq$score, main=paste0("True vs. SlamDunk (unique)\nPearson: ", round(corr, digits = 4)), xlim=c(0, lim), ylim=c(0, lim), ylab="Half-life (true)", xlab="Half-Life (slamDunk)", col="#00000033") 121 | abline(a = 0, b = 1, col="grey", lty=2) 122 | 123 | corr = cor(predHLMulti$true_hl, predHLMulti$score) 124 | plot(predHLMulti$true_hl ~ predHLMulti$score, main=paste0("True vs. 
SlamDunk (multimapper)\nPearson: ", round(corr, digits = 4)), xlim=c(0, lim), ylim=c(0, lim), ylab="Half-life (true)", xlab="Half-Life (slamDunk)", col="#00000033") 125 | abline(a = 0, b = 1, col="grey", lty=2) 126 | 127 | 128 | dev.off() 129 | 130 | #ggplot(predHLUniq,aes(x=simulated_hl,y=log2Diff)) + geom_point(alpha = 0.3) 131 | #p = p + stat_binhex(bins=100) 132 | #p = p 133 | #ggplot(predHLMulti,aes(x=simulated_hl,y=log2Diff)) + stat_binhex(bins=100) 134 | 135 | 136 | 137 | write.table(tmp, missing, quote = F, row.names = F, col.names = T) 138 | -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/src/getseqfrombed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This script is used to extract sequences from bed file. 4 | 5 | USAGE 6 | getseqfrombed.py {OPTIONS} <.bed file|-> 7 | 8 | OPTIONS 9 | 10 | -b/--seqerror [error file]\tSpecify the positional error profile to be used. The file should include at least 100 lines, each containing a positive number. The number at line x is the weight that an error is occured at x% position of the read. If no positional error file specified, uniform weight is assumed. 11 | 12 | -r/--errorrate [error rate]\tSpecify the overall error rate, a real positive number. The number of errors of each read will follow a Poisson distribution with its mean value specified by --errorrate. Default 0 (no errors). 13 | 14 | -l/--readlen [read length]\tSpecify the read length. Default 75. 15 | 16 | -f/--fill [seq]\tFill at the end of each read by the sequence seq, if the read is shorter than the read length. Default A (to simulate poly-A tails in RNA-Seq reads). 17 | 18 | NOTE 19 | 20 | 1. The input .bed file is best to sort according to chromosome names. Use - to input from STDIN. 21 | 2. Biopython and numpy package are required. 22 | 23 | 3. 
When applying models, we assume that all sequences are in the same length. The length information is given by the -l parameter. If the sequence length is greater than read length, nucleotides outside the read length will not be simulated for error. 24 | 25 | HISTORY 26 | 27 | 28 | 14/12/2019: 29 | Tobias Neumann: Make python3 compatible. 30 | Fix a bug with error profiles in the minus strand. 31 | 02/01/2013: 32 | Fix a bug with no read errors generated. 33 | Fix a bug with error profiles in the minus strand. 34 | 08/25/2011: 35 | Rename makebedseq.py to getseqfrombed.py. 36 | Print results to stdout. 37 | """ 38 | 39 | import sys; 40 | import pydoc; 41 | import os; 42 | import random; 43 | import bisect; 44 | import math; 45 | import numpy; 46 | from Bio import SeqIO; 47 | from Bio.SeqRecord import SeqRecord; 48 | 49 | # import argparse; 50 | # parser=argparse.ArgumentParser('Extract sequences from bed file'); 51 | # parser.add_argument('-b','--seqerror',help='Specify the positional error profile to be used. The file should include at least 100 lines, each containing a positive number. The number at line x is the weight that an error is occured at x% position of the read. If no positional error file specified, uniform weight is assumed.'); 52 | # parser.add_argument('-r','--errorrate',type=float,default=0.0,help='Specify the overall error rate, a number between 0 and 1. Default 0 (no errors).'); 53 | # parser.add_argument('-l','--readlen',type=int,default=75,help='Specify the read length. Default 75.'); 54 | # parser.add_argument('-f','--fill',default='A',help='Fill at the end of each read by the sequence seq, if the read is shorter than the read length. 
Default A (to simulate poly-A tails in RNA-Seq reads).'); 55 | 56 | if len(sys.argv)<2: 57 | print>>sys.stderr, (pydoc.render_doc(sys.modules[__name__])); 58 | sys.exit(); 59 | 60 | # analyzing parameters 61 | posweight=[]; 62 | errrate=0.00; 63 | readlength=75; 64 | forcelength=False; 65 | filledseq='A'; 66 | 67 | for i in range(len(sys.argv)): 68 | if i100: 76 | break; 77 | tbweight=float(lines.strip()); 78 | posweight.append(tbweight); 79 | if len(posweight)!=100: 80 | print('Error: the bias file should include at least 100 lines.',file=sys.stderr); 81 | sys.exit(); 82 | if sys.argv[i]=='-r' or sys.argv[i]=='--errorrate': 83 | errrate=float(sys.argv[i+1]); 84 | if errrate<0: # or errrate>1: 85 | print('Error: the error rate should be between 0-1.',file=sys.stderr); 86 | sys.exit(); 87 | print('Error rate: '+str(errrate),file=sys.stderr); 88 | if sys.argv[i]=='-l' or sys.argv[i]=='--readlen': 89 | readlength=int(sys.argv[i+1]); 90 | print('Read length:'+str(readlength),file=sys.stderr); 91 | if sys.argv[i]=='-f' or sys.argv[i]=='--fill': 92 | forcelength=True; 93 | filledseq=sys.argv[i+1]; 94 | print('Force same read length with filled :'+(filledseq),file=sys.stderr); 95 | 96 | 97 | 98 | # construct weight probability for read length, if possible 99 | rlenweight=[]; 100 | if len(posweight)!=0: 101 | kweight=0; 102 | for i in range(readlength): 103 | nfrac=i*100.0/readlength; 104 | lower=int(math.floor(nfrac)); 105 | higher=int(math.ceil(nfrac)); 106 | if higher==lower: higher=lower+1; 107 | #print('higher:'+str(higher)+',lower:'+str(lower)); 108 | if higher<100: 109 | val=posweight[lower]*(nfrac-lower)+posweight[higher]*(higher-nfrac); 110 | else: 111 | val=posweight[99]; 112 | kweight+=val; 113 | rlenweight.append(kweight); 114 | 115 | bedfile=sys.argv[-2]; 116 | reffile=sys.argv[-1]; 117 | #ofastafile=sys.argv[-1]; 118 | 119 | # build reference 120 | seqref=SeqIO.index(reffile,'fasta'); 121 | refkeys = list(seqref.keys()) 122 | 123 | # read bed file, and ready 
for writing 124 | if bedfile!="-": 125 | fid=open(bedfile); 126 | else: 127 | fid=sys.stdin; 128 | #ofid=open(ofastafile,'w'); 129 | ofid=sys.stdout 130 | 131 | nlines=0; 132 | 133 | prevchr=''; 134 | previndex=''; 135 | 136 | for lines in fid: 137 | # update line counter 138 | nlines=nlines+1; 139 | if nlines %10000==1: 140 | print('Processing '+str(nlines)+' lines...',file=sys.stderr); 141 | # parse lines 142 | bedfield=lines.strip().split('\t'); 143 | if len(bedfield)!=12: 144 | print('Error: incorrect number of fields at line %d (should be 12, observed %d)' % (nlines, len(bedfield)) ,file=sys.stderr); 145 | continue; 146 | # clustering 147 | fieldrange=[int(bedfield[1]),int(bedfield[2])]; 148 | # parse all exons 149 | exonlen=[int(x) for x in bedfield[10][:-1].split(',')]; 150 | exonstart=[int(x)+fieldrange[0] for x in bedfield[11][:-1].split(',')]; 151 | if not bedfield[0] in refkeys: 152 | print('Warning: '+bedfield[0]+ ' not in the reference. Ignore...' ,file=sys.stderr); 153 | continue; 154 | if bedfield[0]!=prevchr: 155 | print('Switching to %s ...' 
% bedfield[0],file=sys.stderr); 156 | prevchr=bedfield[0]; 157 | previndex=seqref[bedfield[0]]; 158 | # extract sequences 159 | thisseq=SeqRecord(''); 160 | for i in range(len(exonlen)): 161 | thisseq+=previndex[exonstart[i]:(exonstart[i]+exonlen[i])]; 162 | if forcelength: 163 | if sum(exonlen)0: 170 | newseq=thisseq.seq; 171 | for n in range(nmut): 172 | if len(posweight)==0: 173 | # uniform distrib 174 | modifyposition=random.choice(range(len(newseq))); 175 | else: 176 | rchosen=random.random()*kweight; 177 | modifyposition=bisect.bisect_right(posweight,rchosen); 178 | # mutate the position 179 | if len(newseq)>modifyposition: 180 | topos=random.choice('ATGC'); 181 | while topos==newseq[modifyposition]: 182 | topos=random.choice('ATGC'); 183 | print ('MUTATION at position '+str(modifyposition)+','+newseq[modifyposition]+'->'+topos,file=sys.stderr); 184 | # print >>sys.stderr,('SEQ:'+newseq); 185 | newseq=newseq[:modifyposition]+topos+newseq[(modifyposition+1):]; 186 | # print >>sys.stderr,('SEQ:'+newseq); 187 | #print>>sys.stderr,('NMUTATION:'+str(nmut)); 188 | #print>>sys.stderr, (str(thisseq.seq)); 189 | #print>>sys.stderr,(newseq); 190 | thisseq.seq=newseq; 191 | # reverse-complement the sequence if it is on the negative strand 192 | if bedfield[5]=='-': 193 | #print >>sys.stderr,('SEQ:'+thisseq.seq); 194 | thisseq.seq=thisseq.seq.reverse_complement(); 195 | #print >>sys.stderr,('RVCSEQ:'+thisseq.seq); 196 | # write to record 197 | try: 198 | SeqIO.write(thisseq,ofid,'fasta'); 199 | except ValueError: 200 | print('Skip at line '+str(nlines)+', sequence object:',file=sys.stderr); 201 | print(thisseq,file=sys.stderr); 202 | 203 | 204 | 205 | # ofid.close(); 206 | if bedfile!="-": 207 | fid.close(); 208 | -------------------------------------------------------------------------------- /slamdunk/plot/eval_conversion_rate_plots.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 
2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | 20 | library(getopt) 21 | 22 | spec = matrix(c( 23 | 'help' , 'h', 0, "logical","print the usage of the command", 24 | 'simulated', "s", 2,"character","Comma seperated list of simulated files", 25 | 'slamdunk', "f", 2,"character","Comma seperated lost of SlamDunk results", 26 | 'output', "o", 2,"character","Output pdf", 27 | 'conversionrate', "c", 2,"character","Simulated conversion rate" 28 | ),ncol = 5,byrow=T) 29 | 30 | opt = getopt(spec) 31 | 32 | if ( !is.null(opt$help) || length(opt)==3 ) { 33 | #get the script name 34 | cmd = commandArgs(FALSE) 35 | self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2] 36 | cat(basename(self),": Create mismatch plots from rate tabs.\n\n") 37 | #print a friendly message and exit with a non-zero error code 38 | cat(getopt(spec,command = self,usage=T)) 39 | q(status=1); 40 | } 41 | 42 | 43 | if ( is.null(opt$simulated) ) stop("arg simulated must be specified") 44 | if ( is.null(opt$slamdunk) ) stop("arg slamdunk must be specified") 45 | if ( is.null(opt$output) ) stop("arg output must be specified") 46 | if ( is.null(opt$conversionrate) ) { opt$conversionrate = 0.03 } 47 | 48 | #simulatedFiles = 
"/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_1_0min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_2_15min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_3_30min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_4_60min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_5_180min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_6_360min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_7_720min_utrsummary.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/pooja_UTR_annotation_examples_8_1440min_utrsummary.csv" 49 | #slamDunkFiles = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_1_0min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_2_15min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_3_30min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_4_60min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_5_180min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_6_360min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annota
tion_examples_7_720min_reads_slamdunk_mapped_filtered_tcount.csv,/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/slamdunk/count/pooja_UTR_annotation_examples_8_1440min_reads_slamdunk_mapped_filtered_tcount.csv" 50 | #outputFile = "/project/ngs/philipp/slamdunk-analysis/simulation/simulation_1/eval/conversion_rate_per_gene_eval_plots.pdf" 51 | #timesParameter = "0,15,30,60,180,360,720,1440" 52 | #conversionRate = 0.03 53 | 54 | simulatedFiles = opt$simulated 55 | #simulatedFiles = "simulation_1/pooja_UTR_annotation_examples_sample_1_0min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_2_15min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_3_30min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_4_60min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_5_180min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_6_360min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_7_720min_utrsummary.csv,simulation_1/pooja_UTR_annotation_examples_sample_8_1440min_utrsummary.csv" 56 | slamDunkFiles = opt$slamdunk 57 | #slamDunkFiles = 
"simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_1_0min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_2_15min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_3_30min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_4_60min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_5_180min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_6_360min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_7_720min_reads_slamdunk_mapped_filtered_tcount.csv,simulation_1/slamdunk/count/pooja_UTR_annotation_examples_sample_8_1440min_reads_slamdunk_mapped_filtered_tcount.csv" 58 | filesSimulated = as.character(ordered(strsplit(simulatedFiles, ",")[[1]])) 59 | filesSlamDunk = as.character(ordered(strsplit(slamDunkFiles, ",")[[1]])) 60 | outputFile = opt$output 61 | conversionRate = opt$conversionrate 62 | 63 | pdf(outputFile, width = 15, height = 6) 64 | for(timepoint in 1:length(filesSimulated)) { 65 | #timepoint = 5 66 | simDataFile = filesSimulated[timepoint] 67 | slamDunkFile = filesSlamDunk[timepoint] 68 | name = basename(simDataFile) 69 | 70 | simulation = read.table(simDataFile) 71 | colnames(simulation) = c("chr", "start", "stop", "name", "strand", "conversionRate", "readsCPM", "tCount", "tcCount", "readCount", "convertedReads", "multiMapCount") 72 | #simulation$coverage = (simulation$simulatedReads / (simulation$stop - simulation$start) * 50) 73 | simulation$convertedReadsRate = simulation$convertedReads / simulation$readCount 74 | 75 | slamdunk = read.table(slamDunkFile) 76 | colnames(slamdunk) = c("chr", "start", "stop", "name", "strand", "conversionRate", "readsCPM", "tCount", "tcCount", "readCount", "convertedReads", 
"multiMapCount") 77 | slamdunk$readsCPM_sim = simulation$readsCPM 78 | slamdunk$log2diff = log2((simulation$conversionRate + 0.0000001) / (slamdunk$conversionRate + 0.0000001)) 79 | slamdunk$diff = (simulation$conversionRate - slamdunk$conversionRate) 80 | #slamdunk$convertedReadsRate = slamdunk$convertedReads / slamdunk$readCount 81 | #slamdunk$diffconvertedReadsRate = (simulation$convertedReads - slamdunk$convertedReads) 82 | #slamdunk$diffconvertedReadsRatediff = ((simulation$convertedReads / simulation$readCount) - (slamdunk$convertedReads / slamdunk$readCount)) 83 | #plot(simulation$V11 ~ slamdunk$V6, xlim=c(0, 0.01), ylim=c(0, 0.01)) 84 | 85 | par(mfrow=c(1,2)) 86 | #yLim = max(abs(slamdunk$diffconvertedReadsRate)) 87 | yLim = as.numeric(conversionRate) 88 | #boxplot(slamdunk$log2diff) 89 | slamDunkUniq = slamdunk[slamdunk$multiMapCount <= 0, ] 90 | slamDunkMulti = slamdunk[slamdunk$multiMapCount > 0, ] 91 | plot(slamDunkUniq$readsCPM_sim, slamDunkUniq$diff, main=name, pch=4, ylim=c(-yLim, yLim), ylab="conversion (sim) - conversion (slamdunk)", xlab="read counts per million") 92 | points(slamDunkMulti$readsCPM_sim, slamDunkMulti$diff, pch=4, col="red") 93 | abline(h=0, lty=2, col="grey") 94 | 95 | yLim = 4 96 | plot(slamDunkUniq$readsCPM_sim, slamDunkUniq$log2diff, main=name, pch=4, ylim=c(-yLim, yLim), ylab="log2(conversion (sim) / conversion (slamdunk))", xlab="read counts per million") 97 | points(slamDunkMulti$readsCPM_sim, slamDunkMulti$log2diff, pch=4, col="red") 98 | abline(h=0, lty=2, col="grey") 99 | } 100 | dev.off() 101 | -------------------------------------------------------------------------------- /slamdunk/contrib/RNASeqReadSimulator/src/gensimreads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This script generates simulated RNA-Seq reads (in .bed format) from known gene annotations. 
4 | 5 | USAGE 6 | 7 | gensimreads.py {OPTIONS} 8 | 9 | PARAMETER 10 | 11 | BED-File\tThe gene annotation file (in BED format). Use '-' for STDIN input 12 | 13 | OPTIONS 14 | 15 | -e/--expression [expression level file] \tSpecify the weight of each transcript. Each line in the file should have at least (NFIELD+1) fields, with field 0 the annotation id, and field NFIELD the weight of this annoation. If this file is not provided, uniform weight is applied. 16 | 17 | -n/--nreads readcnt \tSpecify the number of reads to be generated. Default 100000. 18 | 19 | -b/--posbias [positional bias file] \tSpecify the positional bias file. The file should include at least 100 lines, each contains only one integer number, showing the preference of the positional bias at this position. If no positional bias file is specified, use uniform distribution bias. 20 | 21 | -l/--readlen [read length] \tSpecify the read length. Default 32. 22 | 23 | -o/--output [output .bed file] \tSpecify the output file. Default STDOUT 24 | 25 | -f/--field [NFIELD] \tThe field of each line as weight input. Default 7 (beginning from field 0) to compatible to genexplvprofile.py. 26 | 27 | -p/--pairend [PELENMEAN,PELENSTD]\t Generate paired-end reads with specified insert length mean and standard derivation. The default is 200,20. 28 | 29 | --stranded \tThe reads are strand specific. 30 | 31 | NOTE 32 | 33 | 1. The bed file is required to sort according to the chromosome name and position. In Unix systems, use "sort -k 1,1 -k 2,2n in.BED > out.BED" to get a sorted version (out.BED) of the bed file (in.BED). 34 | 35 | 2. No problem to handle reads spanning multiple exons. 36 | 37 | HISTORY 38 | 39 | 04/30/2012 40 | Support generating stranded RNA-Seq reads 41 | 42 | 02/16/2012 43 | Now runs on python 2.7 44 | 45 | 02/08/2012 46 | Change default value of NFIELD from 4 to 7 to be compatible with default genexplvprofile values. 47 | 48 | 01/29/2012 49 | Add paired-end support. 
50 | 51 | 01/09/2012 52 | Add -f option. 53 | 54 | AUTHOR 55 | Wei Li (li.david.wei AT gmail.com) 56 | """ 57 | 58 | from __future__ import print_function 59 | import sys; 60 | import subprocess; 61 | import pydoc; 62 | import os; 63 | import random; 64 | import bisect; 65 | import math; 66 | from getSegs import *; 67 | 68 | import pdb; 69 | 70 | # read length 71 | readlen=32; 72 | # number of reads to sample 73 | readcnt=100000; 74 | 75 | nfield=7; 76 | 77 | if len(sys.argv)<2: 78 | print(pydoc.render_doc(sys.modules[__name__])); 79 | sys.exit(); 80 | 81 | allids={}; 82 | allidl=[]; 83 | allexp=[]; 84 | 85 | posweight=[]; 86 | 87 | #onbedfile=sys.argv[-1]+'.reads.bed'; 88 | onbedfile="-"; 89 | 90 | genpereads=False; 91 | pemean=200; 92 | pestd=20; 93 | 94 | stranded=False; 95 | 96 | for i in range(len(sys.argv)): 97 | if i100: 123 | break; 124 | tbweight=float(lines.strip()); 125 | posweight.append(tbweight); 126 | if len(posweight)!=100: 127 | print('Error: the bias file should include at least 100 lines.',file=sys.stderr); 128 | sys.exit(); 129 | if sys.argv[i]=='-n' or sys.argv[i]=='--nreads': 130 | readcnt=int(sys.argv[i+1]); 131 | print('Read count:',readcnt,file=sys.stderr); 132 | if sys.argv[i]=='-l' or sys.argv[i]=='--readlen': 133 | readlen=int(sys.argv[i+1]); 134 | print('Read length:',readlen,file=sys.stderr); 135 | if sys.argv[i]=='-o' or sys.argv[i]=='--output': 136 | onbedfile=sys.argv[i+1]; 137 | print('Output bed file:',onbedfile,file=sys.stderr); 138 | if sys.argv[i]=='-f' or sys.argv[i]=='--field': 139 | nfield=int(sys.argv[i+1]); 140 | print('Field:',nfield,file=sys.stderr); 141 | if sys.argv[i]=='-p' or sys.argv[i]=='--pairend': 142 | genpereads=True; 143 | pef=sys.argv[i+1].split(','); 144 | pemean=int(pef[0]); 145 | pestd=int(pef[1]); 146 | print('Generate paired-end reads with mean and std '+str(pemean)+','+str(pestd),file=sys.stderr); 147 | if sys.argv[i]=='-h' or sys.argv[i]=='--help': 148 | print(pydoc.render_doc(sys.modules[__name__])); 
149 | sys.exit(); 150 | if sys.argv[i]=='--stranded': 151 | stranded=True; 152 | 153 | 154 | 155 | bedfile=sys.argv[-1]; 156 | 157 | # if no annotation file is specified, use uniform distri. 158 | print('Assigning weights...',file=sys.stderr); 159 | if len(allexp)==0: 160 | totalweight=0; 161 | for lines in open(bedfile): 162 | bedfield=lines.strip().split(); 163 | allids[bedfield[3]]=0; 164 | totalweight+=1; 165 | allexp.append(totalweight); 166 | allidl.append(bedfield[3]); 167 | 168 | # sampling process 169 | print('Sampling...',file=sys.stderr); 170 | for j in range(readcnt): 171 | k=random.random()*totalweight; 172 | sel=bisect.bisect_right(allexp,k); 173 | allids[allidl[sel]]=allids[allidl[sel]]+1; 174 | 175 | # if no bias file specified, use uniform distrib 176 | 177 | print('Total assigned reads:',sum(allids.values()),file=sys.stderr); 178 | 179 | 180 | #debug info: 181 | #for k in allidl: 182 | # print (k, allids[k]); 183 | 184 | #sys.exit(); 185 | 186 | if onbedfile!="-": 187 | onfid=open(onbedfile,'w'); 188 | else: 189 | onfid=sys.stdout; 190 | 191 | 192 | nlines=0; 193 | 194 | totalgenreads=0; 195 | # read bed file 196 | for lines in open(bedfile): 197 | # update line counter 198 | nlines=nlines+1; 199 | if nlines %10000==1: 200 | print('Processing '+str(nlines)+' lines...',file=sys.stderr); 201 | # parse lines 202 | bedfield=lines.strip().split(); 203 | if len(bedfield)!=12: 204 | print('Error: incorrect number of fields (should be 12)',file=sys.stderr); 205 | continue; 206 | if bedfield[5]=='+': 207 | direction=1; 208 | elif bedfield[5]=='-': 209 | direction=-1; 210 | else: 211 | print('Error: incorrect field in field[5] %s:' %bedfield[5],file=sys.stderr); 212 | if bedfield[3] not in allids: 213 | # the current id not found, continue 214 | continue; 215 | nreads=allids[bedfield[3]]; 216 | if nreads<1: 217 | continue; 218 | # parse all segments 219 | fieldrange=(int(bedfield[1]),int(bedfield[2])); 220 | if bedfield[10][-1]==',': 221 | 
bedfield[10]=bedfield[10][:-1]; 222 | if bedfield[11][-1]==',': 223 | bedfield[11]=bedfield[11][:-1]; 224 | exonlen=[int(x) for x in bedfield[10].split(',')]; 225 | exonstart=[int(x)+fieldrange[0] for x in bedfield[11].split(',')]; 226 | # old code: for each possible position in the transcript, build its segments 227 | # for ne in range(len(exonlen)): 228 | # for pos in range(exonstart[ne],exonstart[ne]+exonlen[ne]): 229 | # create a position 230 | totallen=sum(exonlen); 231 | # here, we randomly choose one position 232 | if genpereads==False: 233 | selrange=totallen-readlen+1; 234 | else: 235 | selrange=totallen-pemean+2*pestd; 236 | if selrange<1: 237 | if genpereads==False: 238 | print('Ignore annoatation',bedfield[3],'of length',totallen,'Reads:',allids[bedfield[3]],file=sys.stderr); 239 | else: 240 | print('Ignore annoatation',bedfield[3],'of length',totallen,'since its shorter than paired-end mean insert length. Reads:',allids[bedfield[3]],file=sys.stderr); 241 | continue; 242 | totalgenreads+=nreads; 243 | cumlen=[];cumlen.extend(exonlen); 244 | for i in range(1,len(cumlen)): 245 | cumlen[i]=cumlen[i]+cumlen[i-1]; 246 | # for nun-uniform distribution, construct a new array for selection 247 | thistbweight=[]; 248 | if len(posweight)!=0: 249 | kweight=0; 250 | for i in range(selrange): 251 | nfrac=i*100.0/selrange; # a value between 0-100 252 | nlower=int(math.floor(nfrac)); # 0-100 253 | nhigher=int(math.ceil(nfrac)); # 0-100 254 | if nhigher==nlower: nhigher=nlower+1; 255 | if nhigher<100: 256 | val=posweight[nlower]*(nfrac-nlower)+posweight[nhigher]*(nhigher-nfrac); 257 | else: 258 | val=posweight[99]; 259 | kweight+=val; 260 | thistbweight.append(kweight); 261 | for t in range(nreads): 262 | if len(posweight)==0: 263 | tpos=random.choice(range(selrange)); 264 | else: 265 | rd=random.random()*kweight; 266 | bsl=bisect.bisect_right(thistbweight,rd); 267 | # for reverse transcripts: flip the position 268 | if direction==-1: 269 | bsl=selrange-1-bsl; 270 | 
tpos=bsl; 271 | pos=tpos2pos(tpos,cumlen,exonstart); 272 | if genpereads==True: 273 | tpos2=tpos+int(random.normalvariate(pemean-readlen+1,pestd)); 274 | pos2=tpos2pos(tpos2,cumlen,exonstart); 275 | # get the segments 276 | if True: 277 | (startrange,lenrange,status)=getSegs(pos,readlen,1,exonstart,exonlen); 278 | if status!=0: 279 | print('Status:',status,', pos:', pos,'out of',len(cumlen),file=sys.stderr); 280 | #pdb.set_trace(); 281 | continue; 282 | # generate another pair 283 | if genpereads==True: 284 | (startrange2,lenrange2,status2)=getSegs(pos2,readlen,1,exonstart,exonlen); 285 | if status==1: 286 | print('Status:',status,', pos:', pos,'out of',len(cumlen),file=sys.stderr); 287 | if genpereads==False: 288 | lineid="%s_e_%d_%s_%d" % (bedfield[3],t,bedfield[0],pos); 289 | else: 290 | lineid="%s_e_%d_%s_%d/1" % (bedfield[3],t,bedfield[0],pos); 291 | lineid2="%s_e_%d_%s_%d/2" % (bedfield[3],t,bedfield[0],pos); 292 | # random direction 293 | if stranded==False or direction==0: 294 | thisdir=random.choice([1,-1]); 295 | else: 296 | thisdir=direction; 297 | writeBedline(onfid,lineid,bedfield[0],thisdir,startrange,lenrange); 298 | if genpereads==True: 299 | writeBedline(onfid,lineid2,bedfield[0],thisdir*(-1),startrange2,lenrange2); 300 | else: 301 | print(bedfield[0],file=sys.stdout); 302 | 303 | #print('Pospool:'); 304 | #for k in sorted(pospool.keys()): 305 | # print(str(k)+":"+str(pospool[k]),end=","); 306 | #print(); 307 | 308 | 309 | print('Total '+str(nlines)+' lines...',file=sys.stderr); 310 | print('Total '+str(totalgenreads)+' reads...',file=sys.stderr); 311 | if onbedfile!="-": 312 | onfid.close(); 313 | 314 | -------------------------------------------------------------------------------- /slamdunk/utils/misc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 
6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | 20 | # Date located in: - 21 | from __future__ import print_function 22 | import sys, os 23 | import pysam 24 | import subprocess 25 | import collections 26 | import csv 27 | import ast 28 | import hashlib 29 | 30 | ReadStat = collections.namedtuple('ReadStat' , 'SequencedReads MappedReads DedupReads FilteredReads SNPs AnnotationName AnnotationMD5') 31 | SampleInfo = collections.namedtuple('SampleInfo' , 'ID Name Type Time') 32 | 33 | class SlamSeqInfo: 34 | 35 | ID_SequencedRead = "sequenced" 36 | ID_MappedReads = "mapped" 37 | ID_FilteredReads = "filtered" 38 | ID_DedupReads = "dedup" 39 | ID_MQFilteredReads = "mqfiltered" 40 | ID_IdFilteredReads = "idfiltered" 41 | ID_NmFilteredReads = "nmfiltered" 42 | ID_MultimapperReads = "multimapper" 43 | ID_SNPs = "snps" 44 | ID_AnnotationName = "annotation" 45 | ID_AnnotationMD5 = "annotationmd5" 46 | 47 | def getFromReadStat(self, name, stats): 48 | if(name in stats): 49 | return stats[name] 50 | else: 51 | return "NA" 52 | 53 | def __init__(self, bam = None): 54 | if bam is None: 55 | self.SequencedReads = 0 56 | self.MappedReads = 0 57 | self.DedupReads = 0 58 | self.FilteredReads = 0 59 | self.MQFilteredReads = 0 60 | self.IdFilteredReads = 0 61 | self.NmFilteredReads = 0 62 | self.MultimapperReads = 0 63 | self.SNPs = 0 64 | self.AnnotationName = 
"NA" 65 | self.AnnotationMD5 = "NA" 66 | else: 67 | DS = ast.literal_eval(getReadGroup(bam)['DS']) 68 | 69 | self.SequencedReads = self.getFromReadStat(self.ID_SequencedRead, DS) 70 | self.MappedReads = self.getFromReadStat(self.ID_MappedReads, DS) 71 | self.DedupReads = self.getFromReadStat(self.ID_DedupReads, DS) 72 | self.FilteredReads = self.getFromReadStat(self.ID_FilteredReads, DS) 73 | self.MQFilteredReads = self.getFromReadStat(self.ID_MQFilteredReads, DS) 74 | self.IdFilteredReads = self.getFromReadStat(self.ID_IdFilteredReads, DS) 75 | self.NmFilteredReads = self.getFromReadStat(self.ID_NmFilteredReads, DS) 76 | self.MultimapperReads = self.getFromReadStat(self.ID_MultimapperReads, DS) 77 | self.SNPs = self.getFromReadStat(self.ID_SNPs, DS) 78 | self.AnnotationName = self.getFromReadStat(self.ID_AnnotationName, DS) 79 | self.AnnotationMD5 = self.getFromReadStat(self.ID_AnnotationMD5, DS) 80 | 81 | def __repr__(self): 82 | return "{" + "'" + self.ID_SequencedRead + "':" + str(self.SequencedReads) + "," + "'" + self.ID_MappedReads + "':" + str(self.MappedReads) + "," + "'" + self.ID_FilteredReads + "':" + str(self.FilteredReads) + "," + "'" + self.ID_MQFilteredReads + "':" + str(self.MQFilteredReads) + "," + "'" + self.ID_IdFilteredReads + "':" + str(self.IdFilteredReads) + "," + "'" + self.ID_NmFilteredReads + "':" + str(self.NmFilteredReads) + "," + "'" + self.ID_MultimapperReads + "':" + str(self.MultimapperReads) + "," + "'" + self.ID_DedupReads + "':" + str(self.DedupReads) + "," + "'" + self.ID_SNPs + "':" + str(self.SNPs) + "," + "'" + self.ID_AnnotationName + "':'" + str(self.AnnotationName) + "'," + "'" + self.ID_AnnotationMD5 + "':'" + str(self.AnnotationMD5) + "'}" 83 | 84 | def md5(fname): 85 | hash_md5 = hashlib.md5() 86 | with open(fname, "rb") as f: 87 | for chunk in iter(lambda: f.read(4096), b""): 88 | hash_md5.update(chunk) 89 | return hash_md5.hexdigest() 90 | 91 | def estimateMaxReadLength(bam): 92 | 93 | readfile = 
pysam.AlignmentFile(bam, "rb") 94 | 95 | minLength = sys.maxsize 96 | maxLength = 0 97 | 98 | for read in readfile.head(n = 1000) : 99 | minLength = min(minLength, read.query_length + read.get_tag("XA")) 100 | maxLength = max(maxLength, read.query_length + read.get_tag("XA")) 101 | 102 | range = maxLength - minLength 103 | 104 | if (range <= 10) : 105 | return(maxLength + 10) 106 | else: 107 | return(-1) 108 | 109 | #Replaces the file extension of inFile to with and adds a suffix 110 | #Example replaceExtension("reads.fq", ".sam", suffix="_namg") => reads_ngm.sam 111 | def replaceExtension(inFile, newExtension, suffix=""): 112 | return os.path.splitext(inFile)[0] + suffix + newExtension 113 | 114 | #Removes right-most extension from file name 115 | def removeExtension(inFile): 116 | name = os.path.splitext(inFile)[0] 117 | ext = os.path.splitext(inFile)[1] 118 | if(ext == ".gz"): 119 | name = os.path.splitext(name)[0] 120 | return name 121 | 122 | def getchar(): 123 | print("Waiting for input", file=sys.stderr) 124 | sys.stdin.readline() 125 | 126 | def files_exist(files): 127 | if (type(files) is list) : 128 | for f in files: 129 | if not os.path.exists(f): 130 | return False 131 | else: 132 | if not os.path.exists(files): 133 | return False 134 | return True 135 | 136 | # remove a (list of) file(s) (if it/they exists) 137 | def removeFile(files): 138 | if (type(files) is list) : 139 | for f in files: 140 | if os.path.exists(f): 141 | os.remove(f) 142 | else: 143 | if os.path.exists(files): 144 | os.remove(files) 145 | 146 | 147 | def checkStep(inFiles, outFiles, force=False): 148 | if not files_exist(inFiles): 149 | raise RuntimeError("One or more input files don't exist: " + str(inFiles)) 150 | inFileDate = os.path.getmtime(inFiles[0]) 151 | for x in inFiles[1:]: 152 | inFileDate = max(inFileDate, os.path.getmtime(x)) 153 | 154 | if len(outFiles) > 0 and files_exist(outFiles): 155 | outFileDate = os.path.getmtime(outFiles[0]) 156 | for x in outFiles[1:]: 157 | 
outFileDate = min(outFileDate, os.path.getmtime(x)) 158 | if outFileDate > inFileDate: 159 | if(force == True): 160 | return True 161 | else: 162 | return False 163 | 164 | return True 165 | 166 | def getBinary(name): 167 | 168 | projectPath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 169 | 170 | return os.path.join(projectPath, "contrib", name) 171 | 172 | def getRNASeqReadSimulator(name): 173 | 174 | projectPath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 175 | 176 | return os.path.join(projectPath, "contrib", "RNASeqReadSimulator", "src", name) 177 | 178 | def getPlotter(name): 179 | 180 | projectPath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 181 | 182 | return os.path.join(projectPath, "plot", name + ".R") 183 | 184 | def run(cmd, log=sys.stderr, verbose=False, dry=False): 185 | if(verbose or dry): 186 | print(cmd, file=log) 187 | 188 | if(not dry): 189 | #ret = os.system(cmd) 190 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) 191 | lines_iterator = iter(p.stdout.readline, b"") 192 | for line in lines_iterator: 193 | print(line, end="", file=log) # yield line 194 | p.wait(); 195 | if(p.returncode != 0): 196 | raise RuntimeError("Error while executing command: \"" + cmd + "\"") 197 | 198 | def callR(cmd, log=sys.stderr, verbose=False, dry=False): 199 | 200 | if(verbose or dry): 201 | print(cmd, file=log) 202 | 203 | if(not dry): 204 | 205 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) 206 | lines_iterator = iter(p.stdout.readline, b"") 207 | for line in lines_iterator: 208 | print(line, end="", file=log) # yield line 209 | p.wait(); 210 | if(p.returncode != 0): 211 | raise RuntimeError("Error while executing command: \"" + cmd + "\"") 212 | 213 | def pysamIndex(outputBam): 214 | pysam.index(outputBam) # @UndefinedVariable 215 | 216 | def countReads(bam): 217 | bamFile = pysam.AlignmentFile(bam) 218 | mapped = 0 219 | 
unmapped = 0 220 | for read in bamFile.fetch(until_eof=True): 221 | if(not read.is_secondary and not read.is_supplementary): 222 | if(read.is_unmapped): 223 | unmapped += 1 224 | else: 225 | mapped += 1 226 | bamFile.close() 227 | return mapped, unmapped 228 | 229 | def getReadGroup(bam): 230 | bamFile = pysam.AlignmentFile(bam) 231 | header = bamFile.header 232 | bamFile.close() 233 | if('RG' in header and len(header['RG']) > 0): 234 | return header['RG'][0] 235 | else: 236 | raise RuntimeError("Could not get mapped/unmapped/filtered read counts from BAM file. RG is missing. Please rerun slamdunk filter.") 237 | 238 | def getSampleInfo(bam): 239 | sampleInfo = getReadGroup(bam) 240 | sampleInfos = sampleInfo['SM'].split(":") 241 | return SampleInfo(ID = sampleInfo['ID'], Name = sampleInfos[0], Type = sampleInfos[1], Time = sampleInfos[2]) 242 | 243 | def readSampleNames(sampleNames, bams): 244 | samples = None 245 | 246 | if(sampleNames != None and files_exist(sampleNames)): 247 | samples = {} 248 | with open(sampleNames, "r") as sampleFile: 249 | samplesReader = csv.reader(sampleFile, delimiter='\t') 250 | for row in samplesReader: 251 | samples[removeExtension(row[0])] = row[1] 252 | 253 | return samples 254 | 255 | def getSampleName(fileName, samples): 256 | if samples == None: 257 | return removeExtension(fileName) 258 | else: 259 | for key in samples: 260 | if(key in fileName): 261 | return samples[key] 262 | 263 | return 264 | 265 | def matchFile(sample, files): 266 | fileName = None 267 | for item in files: 268 | if(sample in item): 269 | if(fileName == None): 270 | fileName = item 271 | else: 272 | raise RuntimeError("Found more than one matching file in list.") 273 | 274 | return fileName 275 | 276 | def complement(seq): 277 | complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N' : 'N'} 278 | bases = list(seq) 279 | bases = [complement[base] for base in bases] 280 | return ''.join(bases) 281 | 282 | def shell(cmd): 283 | p = subprocess.Popen(cmd, 
stdout=subprocess.PIPE, shell=True) 284 | p.wait() 285 | if(p.returncode != 0): 286 | raise RuntimeError("Error while executing command: " + cmd) 287 | else: 288 | return p.communicate()[0] 289 | 290 | def shellerr(cmd, raiseError=True): 291 | p = subprocess.Popen(cmd, stderr=subprocess.PIPE, shell=True) 292 | p.wait() 293 | if(p.returncode != 0 and raiseError == True): 294 | raise RuntimeError("Error while executing command: " + cmd) 295 | else: 296 | return p.communicate()[1] 297 | -------------------------------------------------------------------------------- /slamdunk/plot/eval_halflife_per_gene_plots.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder. 4 | # 5 | # This file is part of Slamdunk. 6 | # 7 | # Slamdunk is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # Slamdunk is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 

library(getopt)

spec = matrix(c(
  'help' , 'h', 0, "logical","print the usage of the command",
  'simulated', "s", 2,"character","Comma seperated list of simulated files",
  'slamdunk', "f", 2,"character","Comma seperated list of SlamDunk results",
  'timepoints', "t", 2,"character","Comma seperated list of time points",
  'bed', "b", 2,"character","BED file containing half lifes",
  'output', "o", 2,"character","Output pdf",
  'conversionrate', "c", 2,"character","Simulated conversion rate"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==4 ) {
  # Get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Create mismatch plots from rate tabs.\n\n")
  # Print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}

if ( is.null(opt$simulated) ) stop("arg simulated must be specified")
if ( is.null(opt$slamdunk) ) stop("arg slamdunk must be specified")
if ( is.null(opt$output) ) stop("arg output must be specified")
if ( is.null(opt$timepoints) ) stop("arg timepoints must be specified")
# Fixed: error message previously read "arg bed specified"
if ( is.null(opt$bed) ) stop("arg bed must be specified")
if ( is.null(opt$conversionrate) ) { opt$conversionrate = 0.03 }

simulatedFiles = opt$simulated
slamDunkFiles = opt$slamdunk

filesSimulated = as.character(ordered(strsplit(simulatedFiles, ",")[[1]]))
filesSlamDunk = as.character(ordered(strsplit(slamDunkFiles, ",")[[1]]))
outputFile = opt$output
timesParameter = opt$timepoints
# Time points are given in minutes on the command line; convert to hours.
times = as.numeric(strsplit(timesParameter, ",")[[1]])
times = times / 60
bedFile = opt$bed
conversionRate = as.numeric(opt$conversionrate)

# Fit the exponential saturation model a * (1 - exp(-k * t)) to the observed
# conversion rates and derive the RNA half-life (in minutes) from k.
# Returns c(half-life, plateau a, rate constant k); all NA when the fit fails.
computeHalfLife <- function(rates, timepoints) {
  a_start<-max(rates) #param a is the y value when x=0
  k_start = log(2, base = exp(1))/5

  halfLifePred = NA
  C = NA
  k = NA

  tryCatch( {
    fit = nls(rates ~ a*(1-exp(-k*(timepoints))), start=list(a=a_start,k=k_start))
    halfLifePred = log(2, base = exp(1))/coef(fit)[2] * 60
    C = coef(fit)[1]
    k = coef(fit)[2]
  }, error=function(e){})

  c(halfLifePred, C, k)
}

# Read one UTR summary file per time point and merge the conversion rates
# into one data frame (one column per time point). With perRead == TRUE the
# fraction of converted reads is used instead of the conversion rate.
mergeRates <- function(times, files, perRead) {
  mergedRates = data.frame()
  for(i in 1:length(times)) {
    time = times[i]
    simDataFile = files[i]
    simulation = read.table(simDataFile)
    colnames(simulation) = c("chr", "start", "stop", "name", "strand", "conversionRate", "readsCPM", "tCount", "tcCount", "readCount", "convertedReads", "multiMapCount")
    if(nrow(mergedRates) == 0) {
      mergedRates = simulation[, c("chr", "start", "stop", "name", "strand")]
      if(perRead == TRUE) {
        mergedRates$conversionRate = simulation$convertedReads / simulation$readCount
      } else {
        mergedRates$conversionRate = simulation$conversionRate
      }
    } else {
      if(perRead == TRUE) {
        mergedRates = cbind(mergedRates, simulation$convertedReads / simulation$readCount)
      } else {
        mergedRates = cbind(mergedRates, simulation$conversionRate)
      }
    }
  }
  colnames(mergedRates) = c("chr", "start", "stop", "name", "strand", times)
  mergedRates
}

bed = read.table(bedFile)
colnames(bed) = c("char", "start", "stop", "name", "score", "strand")

perRead = F
slamDunkMergedRates = mergeRates(times, filesSlamDunk, perRead)
simMergedRates = mergeRates(times, filesSimulated, perRead)

# One plot per UTR; a new PDF file is started every 100 UTRs.
pageNumber = 1
pdf(paste0(outputFile, "_page_", pageNumber, ".pdf"), height=6, width=9)
for(utr in 1:nrow(slamDunkMergedRates)) {
  pulseSlamDunk = data.frame(y = as.numeric(t(slamDunkMergedRates[utr, 6:(5 + length(times))])[,1]), x = times)
  pulseSimulated = data.frame(y = as.numeric(t(simMergedRates[utr, 6:(5 + length(times))])[,1]), x = times)
  yLim = conversionRate * 1.25
  yLab = "conversion rate"
  if(perRead) {
    yLab = "% of T->C reads"
  }

  # Infer half-life from the fitted saturation curves
  halfLifeResultSlamDunk = computeHalfLife(pulseSlamDunk$y, pulseSlamDunk$x)
  halfLifePred = halfLifeResultSlamDunk[1]
  halfLifeResultSimulated = computeHalfLife(pulseSimulated$y, pulseSimulated$x)
  halfLifeSim = halfLifeResultSimulated[1]
  halfLifeTruth = bed[utr, ]$score

  plot(0, type="n", main=paste0(slamDunkMergedRates[utr, ]$name, "\n half life: ", round(halfLifeTruth, digits = 0), " (truth), ", round(halfLifeSim, digits = 0), " (sim), ", round(halfLifePred, digits = 0)," (slamDunk)"), xlab="Time (hours)", ylab=yLab, ylim=c(0, yLim), xlim=c(times[1], times[length(times)]), pch=4)
  lines(pulseSimulated$x, pulseSimulated$y, type="p", col="green", lty=1, pch=4)
  lines(pulseSlamDunk$x, pulseSlamDunk$y, type="p", col="blue", lty=1, pch=4)
  legend("bottomright", c("rates (slamDunk)", "rates (simulated)", "slamDunk", "simulated", "truth"), col=c("blue", "green", "blue", "green", "grey"), lty=c(1, 1, 2, 2, 2), bty="n")

  t = 0:max(times)
  # Plot ground-truth curve from the half-life in the BED score column
  lambda = log(2) / (halfLifeTruth / 60)
  lines((1 - exp(-lambda*t)) * conversionRate ~ t, type="l", lty=2, col="grey")
  # Plot curve fitted to the simulated rates
  lines((1 - exp(-halfLifeResultSimulated[3]*t)) * halfLifeResultSimulated[2] ~ t, type="l", lty=2, col="green")
  # Plot curve fitted to the slamdunk rates
  lines((1 - exp(-halfLifeResultSlamDunk[3]*t)) * halfLifeResultSlamDunk[2] ~ t, type="l", lty=2, col="blue")

  if(utr %% 100 == 0) {
    dev.off()
    pageNumber = pageNumber + 1
    pdf(paste0(outputFile, "_page_", pageNumber, ".pdf"), height=6, width=9)
    print(paste0(outputFile, "_page_", pageNumber, ".pdf"))
  }

}
dev.off()
--------------------------------------------------------------------------------
/slamdunk/plot/merge_rate_files.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
#
# Script to merge SlamDunk count files
#
# Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
#
# This file is part of Slamdunk.
#
# Slamdunk is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Slamdunk is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .

library(getopt)

spec = matrix(c(
  'help' , 'h', 0, "logical","print the usage of the command",
  'slamdunk', "f", 2,"character","Comma seperated list of SlamDunk results",
  'output', "o", 2,"character","Output tsv",
  'column', "c", 2,"character","Column or Expression used to summarize files",
  'columnname', "n", 2,"character","Index of meta data field to use as column name"
),ncol = 5,byrow=T)

opt = getopt(spec)

if ( !is.null(opt$help) || length(opt)==2 ) {
  # Get the script name
  cmd = commandArgs(FALSE)
  self = strsplit(cmd[grep("--file",cmd)],"=")[[1]][2]
  cat(basename(self),": Create mismatch plots from rate tabs.\n\n")
  # Print a friendly message and exit with a non-zero error code
  cat(getopt(spec,command = self,usage=T))
  q(status=1);
}

if ( is.null(opt$slamdunk) ) stop("arg slamdunk must be specified")
if ( is.null(opt$output) ) stop("arg output must be specified")
if ( is.null(opt$column) ) { opt$column = "TcReadCount / ReadCount" }
# Fixed: this previously re-tested opt$column, so --columnname never received
# its default when only --column was supplied.
if ( is.null(opt$columnname) ) { opt$columnname = 2 }

slamDunkFiles = opt$slamdunk
filesSlamDunk = as.character(ordered(strsplit(slamDunkFiles, ",")[[1]]))
outputFile = opt$output
evalExpression = opt$column
columnName = as.integer(opt$columnname)

# Parse the two slamdunk header lines of a tcount file and return
# c(sampleID, sampleName, sampleType, sampleTime, annotationName,
#   annotationMD5, version).
readMetaInfo <- function(fileName) {
  sampleInfo = read.table(fileName, nrows = 1, comment.char = "")
  version = paste(lapply(sampleInfo[1,1:3], as.character), collapse = '\t')
  sampleID = as.character(sampleInfo[1, ]$V7)
  sampleName = as.character(sampleInfo[1, ]$V6)
  sampleType = as.character(sampleInfo[1, ]$V8)
  sampleTime = as.numeric(sampleInfo[1, ]$V9)
  sampleInfo = read.table(fileName, nrows = 1, skip = 1, comment.char = "")
  annotationMD5 = as.character(sampleInfo[1, ]$V3)
  annotationName = as.character(sampleInfo[1, ]$V2)
  c(sampleID, sampleName, sampleType, sampleTime, annotationName, annotationMD5, version)
}

sampleNumber = length(filesSlamDunk)
mergedRates = data.frame()

annotationName = ""
annotationMD5 = ""
version = ""
IDs = c()

# Merge rates from all samples
for(i in 1:length(filesSlamDunk)) {
  file = filesSlamDunk[i]
  meta = readMetaInfo(file)
  sampleName = meta[columnName]

  if(i == 1) {
    version = meta[7]
    annotationName = meta[5]
    annotationMD5 = meta[6]
  } else {
    if(annotationMD5 != meta[6]) {
      # Previously an empty branch: make a mismatch visible instead of
      # silently merging counts produced with different annotations.
      warning(paste0("Annotation MD5 of ", file, " differs from ", filesSlamDunk[1]))
    }
  }

  IDs = c(IDs, as.numeric(meta[1]))
  data = read.table(file, header = T)
  if(i == 1) {
    mergedRates = data[, c(1:6)]
    mergedRates$avgReadsCPM = data$ReadsCPM
    mergedRates$avgMultimapper = data$multimapCount
    mergedRates$avgTcontent = data$Tcontent
    mergedRates$avgCoverageOnTs = data$CoverageOnTs
  } else {
    mergedRates$avgReadsCPM = mergedRates$avgReadsCPM + data$ReadsCPM
    mergedRates$avgMultimapper = mergedRates$avgMultimapper + data$multimapCount
    mergedRates$avgTcontent = mergedRates$avgTcontent + data$Tcontent
    mergedRates$avgCoverageOnTs = mergedRates$avgCoverageOnTs + data$CoverageOnTs
  }
  # Evaluate the summary expression with the sample's columns in scope
  # (replaces the previous attach()/detach() pattern).
  mergedRates[,sampleName] = eval(parse(text=evalExpression), envir = data)
  # UTRs without reads get an explicit 0 instead of NaN
  mergedRates[data$ReadCount == 0,sampleName] = 0
}

# Compute average CPM, multimapper, T content and coverage per UTR
mergedRates$avgReadsCPM = mergedRates$avgReadsCPM / sampleNumber
mergedRates$avgMultimapper = mergedRates$avgMultimapper / sampleNumber
mergedRates$avgTcontent = mergedRates$avgTcontent / sampleNumber
mergedRates$avgCoverageOnTs = mergedRates$avgCoverageOnTs / sampleNumber

# Sort sample columns by sample ID
colNumber = length(colnames(mergedRates))
firstSampleColumn = (colNumber - sampleNumber + 1)
sampleNames = colnames(mergedRates)[firstSampleColumn:colNumber]
sampleColumnOrder = order(IDs)
mergedRates = mergedRates[, c(1:(firstSampleColumn - 1), (sampleColumnOrder + firstSampleColumn - 1))]

# Write version header, annotation header, expression header and the table
con <- file(outputFile, open="wt")
writeLines(version, con)
writeLines(paste0("#Annotation:\t", annotationName, "\t", annotationMD5), con)
writeLines(paste0("#Expression:\t", evalExpression), con)
write.table(mergedRates, con, sep = "\t", quote = F, row.names = F, col.names = T)
close(con)
--------------------------------------------------------------------------------
/slamdunk/dunks/filter.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Copyright (c) 2015 Tobias Neumann, Philipp Rescheneder.
#
# This file is part of Slamdunk.
#
# Slamdunk is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Slamdunk is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | 20 | # Date located in: - 21 | from __future__ import print_function 22 | import pysam, random, os 23 | 24 | from slamdunk.version import __version__, __bam_version__ # @UnresolvedImport 25 | 26 | from slamdunk.utils.BedReader import bedToIntervallTree # @UnresolvedImport 27 | from slamdunk.utils.misc import checkStep, run, removeFile, getBinary, pysamIndex, SlamSeqInfo, md5 # @UnresolvedImport 28 | 29 | # def Filter_old(inputBAM, outputBAM, log, MQ=2, printOnly=False, verbose=True, force=True): 30 | # if(printOnly or checkStep([inputBAM], [outputBAM], force)): 31 | # run(" ".join([ getBinary("samtools"), "view -q", str(MQ), "-b", inputBAM, ">", outputBAM]), log, verbose=verbose, dry=printOnly) 32 | # else: 33 | # print("Skipped filtering for " + inputBAM, file=log) 34 | # 35 | # runIndexBam(outputBAM, log, verbose=verbose, dry=printOnly) 36 | # runFlagstat(outputBAM, log, verbose=verbose, dry=printOnly) 37 | 38 | def bamSort(outputBAM, log, newHeader, verbose): 39 | 40 | tmp = outputBAM + "_tmp" 41 | if(newHeader != None): 42 | pyOutputBAM = pysam.AlignmentFile(outputBAM, "rb") 43 | pyTmp = pysam.AlignmentFile(tmp, "wb", header=newHeader) 44 | for read in pyOutputBAM: 45 | pyTmp.write(read) 46 | pyOutputBAM.close() 47 | pyTmp.close() 48 | else: 49 | os.rename(outputBAM, tmp) 50 | 51 | #run(" ".join(["samtools", "sort", "-@", str(threads) , tmp, replaceExtension(outFile, "")]), log, verbose=verbose, dry=dry) 52 | run(" ".join(["samtools sort", "-o", outputBAM, tmp]), log, verbose=verbose, dry=False) 53 | #pysam.sort(tmp, outputBAM) # @UndefinedVariable 54 | removeFile(tmp) 55 | 56 | def dumpBufferToBam (buffer, multimapList, outbam, infile): 57 | # Randomly write hit from read 58 | #read = random.choice(buffer.values()).pop() 59 | read = list(buffer.values()).pop().pop() 60 | 61 | # printer = read.query_name + "\t" + 
infile.getrname(read.reference_id) + "\t" + str(read.reference_start) + "\t" + str(read.reference_end) + "\tPRINT\tTrue" 62 | read.set_tag("RD", multimapList.rstrip(" "), "Z") 63 | read.is_secondary = False 64 | read.is_supplementary = False 65 | outbam.write(read) 66 | 67 | # return printer 68 | # for key in buffer.keys(): 69 | # for read in buffer[key]: 70 | # outbam.write(read) 71 | 72 | def multimapUTRRetainment (infile, outfile, bed, minIdentity, NM, log): 73 | 74 | mappedReads = 0 75 | unmappedReads = 0 76 | filteredReads = 0 77 | 78 | mqFiltered = 0 79 | idFiltered = 0 80 | nmFiltered = 0 81 | 82 | utrIntervallTreeDict = bedToIntervallTree(bed) 83 | 84 | # debugLog = os.path.join("multimapdebug.log") 85 | # 86 | # fo = open(debugLog, "w") 87 | 88 | # Buffers for multimappers 89 | multimapBuffer = {} 90 | prevRead = "" 91 | # If read maps to another than previously recorded UTR -> do not dump reads to file 92 | dumpBuffer = True 93 | # This string tracks all multiple alignments 94 | multimapList = "" 95 | # logList = [] 96 | 97 | for read in infile: 98 | if(not read.is_secondary and not read.is_supplementary): 99 | if(read.is_unmapped): 100 | unmappedReads += 1 101 | else: 102 | mappedReads += 1 103 | 104 | # First pass general filters 105 | if(read.is_unmapped): 106 | continue 107 | if(float(read.get_tag("XI")) < minIdentity): 108 | idFiltered += 1 109 | continue 110 | if(NM > -1 and int(read.get_tag("NM")) > NM): 111 | nmFiltered += 1 112 | continue 113 | if (read.mapping_quality == 0) : 114 | # Previous read was also multimapper 115 | if (read.query_name != prevRead and prevRead != "") : 116 | 117 | #if (dumpBuffer and (len(multimapBuffer) > 1 or len(multimapBuffer["nonUTR"]) > 0)) : 118 | if (dumpBuffer and len(multimapBuffer) > 0) : 119 | dumpBufferToBam(multimapBuffer, multimapList, outfile, infile) 120 | filteredReads += 1 121 | 122 | # ret = dumpBufferToBam(multimapBuffer, outfile, infile) 123 | # print(ret,file = fo) 124 | #multimapBuffer = {} 125 | 
#multimapBuffer["nonUTR"] = [] 126 | 127 | # for entry in logList: 128 | # print(prevRead + "\t" + entry + "\t" + str(dumpBuffer), file = fo) 129 | # logList = [] 130 | 131 | dumpBuffer = True 132 | multimapList = "" 133 | multimapBuffer = {} 134 | 135 | # Query Intervall tree for given chromosome for UTs 136 | chr = infile.getrname(read.reference_id) 137 | start = read.reference_start 138 | end = read.reference_end 139 | 140 | if (chr in utrIntervallTreeDict) : 141 | query = utrIntervallTreeDict[chr][start:end] 142 | else : 143 | query = set() 144 | 145 | if len(query) > 0: 146 | # First UTR hit is recorded without checks 147 | if (len(multimapBuffer) == 0) : 148 | for result in query : 149 | if (not result.data in multimapBuffer) : 150 | multimapBuffer[result.data] = [] 151 | multimapBuffer[result.data].append(read) 152 | # Second UTR hit looks at previous UTR hits -> no dump if hit on different UTR 153 | else : 154 | for result in query : 155 | if (not result.data in multimapBuffer) : 156 | multimapBuffer[result.data] = [] 157 | multimapBuffer[result.data].append(read) 158 | dumpBuffer = False 159 | else : 160 | multimapBuffer[result.data].append(read) 161 | 162 | # else : 163 | # # If no overlap -> nonUTR 164 | # multimapBuffer["nonUTR"].append(read) 165 | # for result in query : 166 | # logList.append(chr + "\t" + str(start) + "\t" + str(end) + "\t" + result.data) 167 | # else : 168 | # logList.append(chr + "\t" + str(start) + "\t" + str(end) + "\t" + "OFF") 169 | 170 | multimapList = multimapList + chr + ":" + str(start) + "-" + str(end) + " " 171 | 172 | prevRead = read.query_name 173 | else : 174 | # Dump any multimappers before a unique mapper 175 | #if (len(multimapBuffer) > 1 or len(multimapBuffer["nonUTR"]) > 0) : 176 | if (len(multimapBuffer) > 0) : 177 | if (dumpBuffer) : 178 | dumpBufferToBam(multimapBuffer, multimapList, outfile, infile) 179 | filteredReads += 1 180 | # ret = dumpBufferToBam(multimapBuffer, outfile, infile) 181 | # print(ret,file = 
fo) 182 | multimapBuffer = {} 183 | # for entry in logList: 184 | # print(prevRead + "\t" + entry + "\t" + str(dumpBuffer), file = fo) 185 | # logList = [] 186 | #multimapBuffer["nonUTR"] = [] 187 | dumpBuffer = True 188 | multimapList = "" 189 | 190 | # Record all unique mappers 191 | prevRead = read.query_name 192 | outfile.write(read) 193 | filteredReads += 1 194 | 195 | # Dump last portion if it was multimapper 196 | #if (dumpBuffer and (len(multimapBuffer) > 1 or len(multimapBuffer["nonUTR"]) > 0)) : 197 | if (dumpBuffer and len(multimapBuffer) > 0) : 198 | dumpBufferToBam(multimapBuffer, multimapList, outfile, infile) 199 | filteredReads += 1 200 | 201 | multimapper = mappedReads - filteredReads - idFiltered - nmFiltered 202 | 203 | print("Criterion\tFiltered reads",file=log) 204 | print("MQ < 0\t0",file=log) 205 | print("ID < " + str(minIdentity) + "\t" + str(idFiltered),file=log) 206 | print("NM > " + str(NM) + "\t" + str(nmFiltered),file=log) 207 | print("MM\t" + str(multimapper),file=log) 208 | 209 | # fo.close() 210 | return mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper 211 | 212 | 213 | def Filter(inputBAM, outputBAM, log, bed, MQ=2, minIdentity=0.8, NM=-1, printOnly=False, verbose=True, force=False): 214 | if(printOnly or checkStep([inputBAM], [outputBAM], force)): 215 | 216 | mappedReads = 0 217 | unmappedReads = 0 218 | filteredReads = 0 219 | 220 | mqFiltered = 0 221 | idFiltered = 0 222 | nmFiltered = 0 223 | multimapper = 0 224 | 225 | infile = pysam.AlignmentFile(inputBAM, "rb") 226 | outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile) 227 | 228 | # Default filtering without bed 229 | if (bed == None) : 230 | 231 | print("#No bed-file supplied. 
Running default filtering on " + inputBAM + ".",file=log) 232 | 233 | for read in infile: 234 | 235 | if(not read.is_secondary and not read.is_supplementary): 236 | if(read.is_unmapped): 237 | unmappedReads += 1 238 | else: 239 | mappedReads += 1 240 | 241 | if(read.is_unmapped): 242 | continue 243 | if(read.mapping_quality < MQ): 244 | mqFiltered += 1 245 | continue 246 | if(float(read.get_tag("XI")) < minIdentity): 247 | idFiltered += 1 248 | continue 249 | if(NM > -1 and int(read.get_tag("NM")) > NM): 250 | nmFiltered += 1 251 | continue 252 | 253 | if(not read.is_secondary and not read.is_supplementary): 254 | filteredReads += 1 255 | 256 | outfile.write(read) 257 | 258 | print("Criterion\tFiltered reads",file=log) 259 | print("MQ < " + str(MQ) + "\t" + str(mqFiltered),file=log) 260 | print("ID < " + str(minIdentity) + "\t" + str(idFiltered),file=log) 261 | print("NM > " + str(NM) + "\t" + str(nmFiltered),file=log) 262 | print("MM\t0",file=log) 263 | else : 264 | # Multimap retention strategy filtering when bed is supplied 265 | 266 | random.seed(1) 267 | 268 | print("#Bed-file supplied. 
Running multimap retention filtering strategy on " + inputBAM + ".",file=log) 269 | 270 | mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper = multimapUTRRetainment (infile, outfile, bed, minIdentity, NM, log) 271 | #mappedReads, unmappedReads, filteredReads = multimapUTRRetainment (infile, outfile, bed, minIdentity, NM, log) 272 | 273 | # Add number of sequenced and number of mapped reads to the read group description 274 | # Used for creating summary file 275 | inFileBamHeader = outfile.header 276 | if('RG' in inFileBamHeader and len(inFileBamHeader['RG']) > 0): 277 | slamseqInfo = SlamSeqInfo() 278 | slamseqInfo.SequencedReads = mappedReads + unmappedReads 279 | slamseqInfo.MappedReads = mappedReads 280 | slamseqInfo.FilteredReads = filteredReads 281 | slamseqInfo.MQFilteredReads = mqFiltered 282 | slamseqInfo.IdFilteredReads = idFiltered 283 | slamseqInfo.NmFilteredReads = nmFiltered 284 | slamseqInfo.MultimapperReads = multimapper 285 | 286 | if (bed != None) : 287 | slamseqInfo.AnnotationName = os.path.basename(bed) 288 | slamseqInfo.AnnotationMD5 = md5(bed) 289 | else : 290 | slamseqInfo.AnnotationName = "" 291 | slamseqInfo.AnnotationMD5 = "" 292 | 293 | if not isinstance(inFileBamHeader, dict): 294 | inFileBamHeader = inFileBamHeader.to_dict() 295 | inFileBamHeader['RG'][0]['DS'] = str(slamseqInfo) 296 | #inFileBamHeader['RG'][0]['DS'] = "{'sequenced':" + str(mappedReads + unmappedReads) + "," + "'mapped':" + str(mappedReads) + "," + "'filtered':" + str(filteredReads) + "}" 297 | 298 | slamDunkPG = { 'ID': 'slamdunk', 'PN': 'slamdunk filter v' + __version__, 'VN': __bam_version__ } 299 | if('PG' in inFileBamHeader): 300 | inFileBamHeader['PG'].append(slamDunkPG) 301 | else: 302 | inFileBamHeader['PG'] = [ slamDunkPG ] 303 | 304 | infile.close() 305 | outfile.close() 306 | 307 | # Sort afterwards 308 | bamSort(outputBAM, log, inFileBamHeader, verbose) 309 | 310 | pysamIndex(outputBAM) 311 | #pysamFlagstat(outputBAM) 
312 | #runFlagstat(outputBAM, log, verbose=verbose, dry=printOnly) 313 | 314 | else: 315 | print("Skipped filtering for " + inputBAM, file=log) 316 | --------------------------------------------------------------------------------