├── .gitignore
├── Docker
    ├── Dockerfile
    └── Readme.md
├── Images
    ├── Fig 5.png
    ├── Fig1.png
    ├── Fig2.png
    ├── Fig3.png
    └── Fig4.png
├── LICENSE
├── NPfiles
    ├── editingStats.txt
    ├── editing_sorted.txt
    └── sample_information_file.txt
├── NPscripts
    ├── REDItoolDnaRnav13.py
    ├── collect_editing_candidates.py
    ├── conda_pckg_installer_docker.py
    ├── conda_pckgs_installer.py
    ├── download-prepare-data-NP.py
    ├── download-prepare-data-NP_docker.py
    └── get_Statistics.py
├── PKG-INFO
├── README.md
├── README_1.md
├── README_2.md
├── accessory
    ├── AnnotateTable.py
    ├── FilterTable.py
    ├── GFFtoTabix.py
    ├── Readme.md
    ├── SearchInTable.py
    ├── SortGFF.py
    ├── SortTable.py
    ├── TableToGFF.py
    ├── get_DE_events.py
    ├── readPsl.py
    ├── rediportal2recoding.py
    ├── selectPositions.py
    ├── subCount.py
    ├── subCount2.py
    └── tableToTabix.py
├── main
    ├── REDItoolDenovo.py
    ├── REDItoolDnaRna.py
    └── REDItoolKnown.py
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/Docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | #Download base image centos latest 
 2 | FROM centos
 3 | 
 4 | #Dockerfile Mantainer
 5 | LABEL mantainer="clalogiudice@gmail.com"
 6 | 
 7 | #Update the centos software with yum package-manager
 8 | RUN yum update -y && yum clean all
 9 | 
10 | #Install git, wget and nano package
11 | RUN yum -y install git wget nano && yum clean all
12 | 
13 | #Clone Nature_protocol Git repository
14 | RUN git clone https://github.com/BioinfoUNIBA/REDItools
15 | 
16 | WORKDIR "/REDItools/NPscripts/" 
17 | 
18 | #Install miniconda with conda packages required by the nature_protocol
19 | RUN chmod +x conda_pckg_installer_docker.py
20 | RUN ./conda_pckg_installer_docker.py
21 | ENV PATH /miniconda2/bin:$PATH
22 | RUN echo "source activate nature_protocol" >> ~/.bashrc
23 | 
24 | #PREPARE NATURE_PROTOCOL environment
25 | WORKDIR "/"
26 | RUN echo "python ./REDItools/NPscripts/download-prepare-data-NP_docker.py" >> /root/.bashrc
27 | 


--------------------------------------------------------------------------------
/Docker/Readme.md:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
 2 |   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 3 | <html xmlns="http://www.w3.org/1999/xhtml">
 4 |   <head>
 5 |     <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />  
 6 |   </head>
 7 |   <body>
 8 | <h1>DOCKER BASIC COMMANDS</h1>
 9 | <h5>This Dockerfile and its related image are part of the supplemental material for the paper<br>
10 |   "Investigating RNA editing in deep transcriptome datasets with REDItools and REDIportal"</h5>
11 | <p>
12 |   You can compile an image from this dockerfile with:<br>
13 | <pre>docker build -t [image_name] .
14 | <b>eg.</b> docker build -t rna_editing_protocol .</pre>
15 | <br>
16 | and run a container with:<br>
17 | <pre>docker run -it [image_name] bash
18 | <b>eg.</b> docker run -it rna_editing_protocol bash</pre>
19 |     </p>
20 | <p>
21 |   <b>OR</b>
22 | </p>
23 | <p>
24 |   Download a pre-built image from Docker hub with:
25 |   <pre>docker pull claudiologiudice/rna_editing_protocol:latest</pre>
26 |   <br>
27 |   and run a container with:
28 |   <pre>docker run -it claudiologiudice/rna_editing_protocol:latest bash</pre>
29 |   </p>
30 | </body>
31 | </html>
32 | 


--------------------------------------------------------------------------------
/Images/Fig 5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BioinfoUNIBA/REDItools/822cee595a2584a7bcf6fc4a4da3857ba1c8e8ca/Images/Fig 5.png


--------------------------------------------------------------------------------
/Images/Fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BioinfoUNIBA/REDItools/822cee595a2584a7bcf6fc4a4da3857ba1c8e8ca/Images/Fig1.png


--------------------------------------------------------------------------------
/Images/Fig2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BioinfoUNIBA/REDItools/822cee595a2584a7bcf6fc4a4da3857ba1c8e8ca/Images/Fig2.png


--------------------------------------------------------------------------------
/Images/Fig3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BioinfoUNIBA/REDItools/822cee595a2584a7bcf6fc4a4da3857ba1c8e8ca/Images/Fig3.png


--------------------------------------------------------------------------------
/Images/Fig4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BioinfoUNIBA/REDItools/822cee595a2584a7bcf6fc4a4da3857ba1c8e8ca/Images/Fig4.png


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 BioinfoUNIBA
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NPfiles/editingStats.txt:
--------------------------------------------------------------------------------
 1 | SubType	ALU	REPnonALU	NONREP	ALL
 2 | AC	0.244897959184	0.0	0.0	0.24
 3 | GT	0.571428571429	0.0	0.0	0.56
 4 | AG	92.4897959184	0.0	100.0	92.64
 5 | CA	0.326530612245	0.0	0.0	0.32
 6 | CG	0.244897959184	0.0	0.0	0.24
 7 | GC	0.489795918367	0.0	0.0	0.48
 8 | AT	0.571428571429	0.0	0.0	0.56
 9 | GA	1.22448979592	0.0	0.0	1.2
10 | TG	0.244897959184	0.0	0.0	0.24
11 | CT	1.55102040816	0.0	0.0	1.52
12 | TC	1.87755102041	0.0	0.0	1.84
13 | TA	0.163265306122	0.0	0.0	0.16
14 | 


--------------------------------------------------------------------------------
/NPfiles/sample_information_file.txt:
--------------------------------------------------------------------------------
 1 | Sample,Status
 2 | SRR3306823,DIS
 3 | SRR3306824,DIS
 4 | SRR3306825,DIS
 5 | SRR3306826,DIS
 6 | SRR3306827,DIS
 7 | SRR3306828,DIS
 8 | SRR3306829,DIS
 9 | SRR3306830,CTRL
10 | SRR3306831,CTRL
11 | SRR3306832,CTRL
12 | SRR3306833,CTRL
13 | SRR3306834,CTRL
14 | SRR3306835,CTRL
15 | SRR3306836,CTRL


--------------------------------------------------------------------------------
/NPscripts/collect_editing_candidates.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import sys, os
 3 | import glob
 4 | 
 5 | atab=glob.glob('firstalu/DnaRna_*/outTable_*')[0] #alu refined
 6 | ftab=glob.glob('second/DnaRna_*/outTable_*')[0]
 7 | if not os.path.exists('knownEditing'): sys.exit('knownEditing file not found.')
 8 | if not os.path.exists('pos.txt'): sys.exit('pos.txt file not found.')
 9 | if not os.path.exists('posalu.txt'): sys.exit('posalu.txt file not found.')
10 | 
11 | o=open('editing.txt','w')
12 | f=open('knownEditing')
13 | for i in f: o.write(i)
14 | f.close()
15 | if os.path.exists(ftab):
16 | 	f=open(ftab)
17 | 	d={}
18 | 	for i in f:
19 | 		if i.startswith('Region'): continue
20 | 		l=(i.strip()).split('\t')
21 | 		d[(l[0],l[1])]=0
22 | 	f.close()
23 | 	f=open('pos.txt')
24 | 	for i in f:
25 | 		if i.startswith('Region'): continue
26 | 		l=(i.strip()).split('\t')
27 | 		if d.has_key((l[0],l[1])): o.write(i)
28 | 	f.close()
29 | f=open(atab)
30 | d={}
31 | for i in f:
32 | 	if i.startswith('Region'): continue
33 | 	l=(i.strip()).split('\t')
34 | 	d[(l[0],l[1])]=0
35 | f.close()
36 | f=open('posalu.txt')
37 | for i in f:
38 | 	if i.startswith('Region'): continue
39 | 	l=(i.strip()).split('\t')
40 | 	if d.has_key((l[0],l[1])): o.write(i)
41 | f.close()
42 | 


--------------------------------------------------------------------------------
/NPscripts/conda_pckg_installer_docker.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python 
 2 | # Mantainer clalogiudice@gmail.com
 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | # of this software and associated documentation files (the "Software"), to deal
 5 | # in the Software without restriction, including without limitation the rights
 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | # copies of the Software, and to permit persons to whom the Software is
 8 | # furnished to do so, subject to the following conditions:
 9 | #
10 | # The above copyright notice and this permission notice shall be included in
11 | # all copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | # THE SOFTWARE.
20 | 
21 | import os, commands
22 | 
23 | def install_conda_packages(conda_bin):
24 | 	"""Installs conda packages required by the protocol"""
25 | 	install_cmd = os.system(cmd + ' install -n nature_protocol bcftools==1.9 bedtools==2.28.0 \
26 |         bzip2==1.0.6 bwa==0.7.17 bx-python==0.8.2 fastp==0.20.0 fastqc==0.11.8 \
27 |         fisher==0.1.4 git==2.21.0 gmap==2018.07.04 htslib==1.9 libdeflate==1.0 \
28 |         numpy==1.16.2 pysam==0.15.2 rseqc==2.6.4 samtools==1.9 scipy==1.2.1 \
29 |         star==2.7.0f wget==1.20.1')
30 | 	return install_cmd
31 | 
32 | 
33 | cwd = os.getcwd()
34 | installation_path = cwd + '/opt'
35 | if not os.path.exists(installation_path):
36 | 	os.mkdir(installation_path)
37 | os.chdir('./opt')
38 | conda_url = 'wget https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86_64.sh'
39 | i = 0
40 | while os.system(conda_url) != 0 and i <= 5:
41 | 	os.system(conda_url)
42 | 	i+=1
43 | os.system('chmod +x Miniconda2-latest-Linux-x86_64.sh')
44 | #os.system('bash Miniconda2-latest-Linux-x86_64.sh -b -p /REDItools/NPscripts/miniconda2/')
45 | os.system('bash Miniconda2-latest-Linux-x86_64.sh -b -p /miniconda2/')
46 | os.chdir('../')
47 | os.system('rm -rf opt/')
48 | cmd = '/miniconda2/bin/conda'
49 | os.system(cmd + ' config --add channels defaults')
50 | os.system(cmd + ' config --add channels bioconda')
51 | os.system(cmd + ' config --add channels conda-forge')
52 | os.system(cmd + ' create -n nature_protocol python=2.7 anaconda -y')
53 | 
54 | install_conda_packages(cmd)
55 | 
56 | 
57 | 


--------------------------------------------------------------------------------
/NPscripts/conda_pckgs_installer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python 
 2 | #
 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | # of this software and associated documentation files (the "Software"), to deal
 5 | # in the Software without restriction, including without limitation the rights
 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | # copies of the Software, and to permit persons to whom the Software is
 8 | # furnished to do so, subject to the following conditions:
 9 | #
10 | # The above copyright notice and this permission notice shall be included in
11 | # all copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | # THE SOFTWARE.
20 | 
21 | import os, subprocess
22 | 
23 | def install_conda_packages(conda_bin):
24 | 	"""Installs conda packages required by the protocol"""
25 | 	install_cmd = os.system(cmd + ' install -n nature_protocol bcftools==1.9 bedtools==2.28.0 \
26 |         bzip2==1.0.6 bwa==0.7.17 bx-python==0.8.2 fastp==0.20.0 fastqc==0.11.8 \
27 |         fisher==0.1.4 git==2.21.0 gmap==2018.07.04 htslib==1.9 libdeflate==1.0 \
28 |         numpy==1.16.2 pysam==0.15.2 rseqc==2.6.4 samtools==1.9 scipy==1.2.1 \
29 |         star==2.7.0f wget==1.20.1')
30 | 	return install_cmd
31 | 
32 | if subprocess.getstatusoutput('conda')[0] != 0:
33 | 	cwd = os.getcwd()
34 | 	installation_path = cwd + '/opt'
35 | 	if not os.path.exists(installation_path):
36 | 		os.mkdir(installation_path)
37 | 	os.chdir('./opt')
38 | 	conda_url = 'wget https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86_64.sh'
39 | 	i = 0
40 | 	while os.system(conda_url) != 0 and i <= 5:
41 | 		os.system(conda_url)
42 | 		i+=1
43 | 	os.system('chmod +x Miniconda2-latest-Linux-x86_64.sh')
44 | 	os.system('bash Miniconda2-latest-Linux-x86_64.sh')
45 | 	home_folder = os.path.expanduser('~')
46 | 	cmd = home_folder + '/miniconda2/bin/conda'
47 | 	os.system(cmd + ' config --add channels defaults')
48 | 	os.system(cmd + ' config --add channels bioconda')
49 | 	os.system(cmd + ' config --add channels conda-forge')
50 | 	os.system(cmd + ' create -n nature_protocol python=2.7 anaconda')
51 | 	install_conda_packages(cmd)
52 | 	print("Your conda environment has been succesfully created, now close your terminal and open a new one." + "\n" + \
53 | 	      "Type in order:" + "\n" + \
54 | 	      "source " + home_folder + "/.bashrc" + "\n" + \
55 | 	      "conda activate nature_protocol")
56 | else:
57 |         home_folder = os.path.expanduser('~')
58 |         cmd = home_folder + '/miniconda2/bin/conda'
59 |         os.system(cmd + ' config --add channels defaults')
60 |         os.system(cmd + ' config --add channels bioconda')
61 |         os.system(cmd + ' config --add channels conda-forge')
62 |         os.system(cmd + ' create -n nature_protocol python=2.7 anaconda')
63 |         install_conda_packages(cmd)
64 |         print("Your conda environment has been succesfully created, now close your terminal and open a new one." + "\n" + 
65 | 		"Type in order:" + "\n" + \
66 | 		"source " + home_folder + "/.bashrc" + "\n" + \
67 | 		"conda activate nature_protocol")
68 | 
69 | 


--------------------------------------------------------------------------------
/NPscripts/download-prepare-data-NP.py:
--------------------------------------------------------------------------------
  1 | import sys, os, time
  2 | import commands
  3 | import distutils.spawn
  4 | 
  5 | try:
  6 | 	wdir=sys.argv[1] # working directory
  7 | 	redipath=sys.argv[2] # path to REDItools folder
  8 | 	usepath=sys.argv[3]
  9 | except:
 10 | 	sys.exit('<working directory> <path to REDItools folder> <use file with paths 0/1>')
 11 | 
 12 | def getData(cmd):
 13 | 	tr=0
 14 | 	while 1:
 15 | 		st,out=commands.getstatusoutput(cmd)
 16 | 		if st==0:
 17 | 			return 0
 18 | 		tr+=1
 19 | 		if tr==10: break
 20 | 	if tr>0: return 1
 21 | 
 22 | def is_tool(name):
 23 | 	wn=distutils.spawn.find_executable(name)
 24 | 	if wn==None: return 1
 25 | 	else: return wn
 26 | 
 27 | def get_time(tstart,tend):
 28 | 	telapsed=tend - tstart
 29 | 	t_taken=time.strftime("%H:%M:%S", time.gmtime(telapsed))
 30 | 	return t_taken
 31 | 
 32 | if usepath!='1':
 33 | 	exe=['bwa','STAR','awk','bgzip','tabix','sort','gtf_splicesites','wget','python','gunzip']
 34 | 	nt=[]
 35 | 	prg={}
 36 | 	for i in exe:
 37 | 		p=is_tool(i)
 38 | 		if p==1: nt.append(i)
 39 | 		prg[i]=p
 40 | 	if len(nt)>0:
 41 | 		for i in nt:
 42 | 			sys.stderr.write('Program %s NOT found\n' %(i))
 43 | 		sys.exit('Install required software first.')
 44 | else:
 45 | 	if not os.path.exists('mypaths'): sys.exit('File mypaths does not exists.')
 46 | 	nt=[]
 47 | 	f=open('mypaths')
 48 | 	prg={}
 49 | 	for i in f:
 50 | 		l=(i.strip()).replace(' ','')
 51 | 		l=l.split('=')
 52 | 		prg[l[0]]=l[1]	
 53 | 
 54 | if not os.path.exists(redipath): sys.exit('REDItools path does not exist.')
 55 | redirec=os.path.join(redipath,'accessory','rediportal2recoding.py')
 56 | if not os.path.exists(redipath): sys.exit('rediportal2recoding.py script not found.')
 57 | prg['redirec']=redirec
 58 | 
 59 | cdir=os.getcwd()
 60 | sys.stderr.write('Current directory: %s\n' %(cdir))
 61 | folder=os.path.join(cdir,wdir)
 62 | if not os.path.exists(folder):
 63 | 	os.mkdir(folder)
 64 | 	sys.stderr.write('Directory %s created.\n' %(wdir))
 65 | else:
 66 | 	sys.stderr.write('Found working directory.\n')
 67 | sys.stderr.write('Entering %s\n' %(wdir))
 68 | os.chdir(folder)
 69 | 
 70 | #human genome
 71 | sys.stderr.write('Getting human genome\n')
 72 | tstart = time.time()
 73 | os.mkdir('genome_hg19')
 74 | os.chdir('genome_hg19')
 75 | wcmd='%s ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz' %(prg['wget'])
 76 | ot=getData(wcmd)
 77 | if ot==1: sys.stderr.write('I cannot download the human genome.\n')
 78 | else: sys.stderr.write('Human genome complete.\n')
 79 | tend = time.time()
 80 | sys.stderr.write('Human genome - time taken: %s\n' %(get_time(tstart,tend)))
 81 | os.chdir('..')
 82 | #Gencode
 83 | sys.stderr.write('Getting GENCODE genes\n')
 84 | tstart = time.time()
 85 | os.mkdir('Gencode_annotation')
 86 | os.chdir('Gencode_annotation')
 87 | gcmd='%s ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/GRCh37_mapping/gencode.v30lift37.annotation.gtf.gz' %(prg['wget'])
 88 | ot=getData(gcmd)
 89 | if ot==1: sys.stderr.write('I cannot download GENCODE annotations.\n')
 90 | else: sys.stderr.write('GENCODE annotations ready.\n')
 91 | tend = time.time()
 92 | sys.stderr.write('GENCODE annotations - time taken: %s\n' %(get_time(tstart,tend)))
 93 | os.chdir('..')
 94 | #RefSeq
 95 | sys.stderr.write('Getting RefSeq hg19\n')
 96 | tstart = time.time()
 97 | os.mkdir('Strand_detection')
 98 | os.chdir('Strand_detection')
 99 | gcmd='%s --no-check-certificate https://sourceforge.net/projects/rseqc/files/BED/Human_Homo_sapiens/hg19_RefSeq.bed.gz' %(prg['wget'])
100 | ot=getData(gcmd)
101 | if ot==1: sys.stderr.write('I cannot download REFSEQ annotations.\n')
102 | else: sys.stderr.write('REFSEQ annotations ready.\n')
103 | tend = time.time()
104 | sys.stderr.write('REFSEQ annotations - time taken: %s\n' %(get_time(tstart,tend)))
105 | os.chdir('..')
106 | #RepeatMasker
107 | sys.stderr.write('Getting RepeatMasker\n')
108 | tstart = time.time()
109 | os.mkdir('rmsk')
110 | os.chdir('rmsk')
111 | gcmd='%s http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/rmsk.txt.gz' %(prg['wget'])
112 | ot=getData(gcmd)
113 | if ot==1: sys.stderr.write('I cannot download RepeatMasker annotations.\n')
114 | else: sys.stderr.write('RepeatMasker annotations ready.\n')
115 | tend = time.time()
116 | sys.stderr.write('RepeatMasker annotations - time taken: %s\n' %(get_time(tstart,tend)))
117 | os.chdir('..')
118 | #dbSNP
119 | sys.stderr.write('Getting dbSNP\n')
120 | tstart = time.time()
121 | os.mkdir('snp151')
122 | os.chdir('snp151')
123 | gcmd='%s http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/snp151.txt.gz' %(prg['wget'])
124 | ot=getData(gcmd)
125 | if ot==1: sys.stderr.write('I cannot download dbSNP annotations.\n')
126 | else: sys.stderr.write('dbSNP annotations ready.\n')
127 | tend = time.time()
128 | sys.stderr.write('dbSNP annotations - time taken: %s\n' %(get_time(tstart,tend)))
129 | os.chdir('..')
130 | #REDIportal
131 | sys.stderr.write('Getting REDIportal\n')
132 | tstart = time.time()
133 | os.mkdir('rediportal')
134 | os.chdir('rediportal')
135 | gcmd='%s http://srv00.recas.ba.infn.it/webshare/rediportalDownload/table1_full.txt.gz' %(prg['wget'])
136 | ot=getData(gcmd)
137 | if ot==1: sys.stderr.write('I cannot download REDIportal annotations.\n')
138 | else: sys.stderr.write('REDIportal annotations ready.\n')
139 | tend = time.time()
140 | sys.stderr.write('REDIportal annotations - time taken: %s\n' %(get_time(tstart,tend)))
141 | os.chdir('..')
142 | #NA12878 - WGS
143 | sys.stderr.write('Getting NA12878 data - WGS\n')
144 | tstart = time.time()
145 | os.mkdir('WGS_ERR262997')
146 | os.chdir('WGS_ERR262997')
147 | fq1cmd='%s ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_1.fastq.gz' %(prg['wget'])
148 | fq2cmd='%s ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_2.fastq.gz' %(prg['wget'])
149 | f1=getData(fq1cmd)
150 | f2=getData(fq2cmd)
151 | if f1==1: sys.stderr.write('I cannot download READ1.\n')
152 | else:
153 | 	gu=getData('%s ERR262997_1.fastq.gz' %(prg['gunzip']))
154 | 	sys.stderr.write('READ1 ready.\n')
155 | if f2==1: sys.stderr.write('I cannot download READ2.\n')
156 | else:
157 | 	gu=getData('%s ERR262997_2.fastq.gz' %(prg['gunzip']))
158 | 	sys.stderr.write('READ2 ready.\n')
159 | tend = time.time()
160 | sys.stderr.write('NA12878 data - WGS - time taken: %s\n' %(get_time(tstart,tend)))	
161 | os.chdir('..')
162 | 
163 | #NA12878 - RNAseq
164 | sys.stderr.write('Getting NA12878 data - RNAseq\n')
165 | tstart = time.time()
166 | os.mkdir('RNASeq_SRR1258218')
167 | os.chdir('RNASeq_SRR1258218')
168 | fq1cmd='%s ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR125/008/SRR1258218/SRR1258218_1.fastq.gz' %(prg['wget'])
169 | fq2cmd='%s ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR125/008/SRR1258218/SRR1258218_2.fastq.gz' %(prg['wget'])
170 | f1=getData(fq1cmd)
171 | f2=getData(fq2cmd)
172 | if f1==1: sys.stderr.write('I cannot download READ1.\n')
173 | else:
174 | 	#gu=getData('%s SRR1258218_1.fastq.gz' %(prg['gunzip']))
175 | 	sys.stderr.write('READ1 ready.\n')
176 | if f2==1: sys.stderr.write('I cannot download READ2.\n')
177 | else:
178 | 	#gu=getData('%s SRR1258218_2.fastq.gz' %(prg['gunzip']))
179 | 	sys.stderr.write('READ2 ready.\n')
180 | tend = time.time()
181 | sys.stderr.write('NA12878 data - RNAseq - time taken: %s\n' %(get_time(tstart,tend)))
182 | os.chdir('..')
183 | 
184 | #PRJNA316625
185 | sys.stderr.write('Getting PRJNA316625 data\n')
186 | tstart = time.time()
187 | os.mkdir('PRJNA_316625')
188 | os.chdir('PRJNA_316625')
189 | fqlist=['ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306823/SRR3306823_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306823/SRR3306823_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306824/SRR3306824_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306824/SRR3306824_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306825/SRR3306825_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306825/SRR3306825_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306826/SRR3306826_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306826/SRR3306826_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/007/SRR3306827/SRR3306827_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/007/SRR3306827/SRR3306827_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/008/SRR3306828/SRR3306828_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/008/SRR3306828/SRR3306828_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/009/SRR3306829/SRR3306829_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/009/SRR3306829/SRR3306829_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/000/SRR3306830/SRR3306830_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/000/SRR3306830/SRR3306830_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/001/SRR3306831/SRR3306831_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/001/SRR3306831/SRR3306831_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/002/SRR3306832/SRR3306832_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/002/SRR3306832/SRR3306832_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306833/SRR3306833_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306833/SRR3306833_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306834/SRR3306834_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306834/SRR3306834_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306835/SRR3306835_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306835/SRR3306835_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306836/SRR3306836_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306836/SRR3306836_2.fastq.gz']
190 | for i in fqlist:
191 | 	fq1,fq2=i.split(',')
192 | 	base=(os.path.basename(fq1)).split("_")[0]
193 | 	os.mkdir(base)
194 | 	os.chdir(base)
195 | 	fq1cmd='%s %s' %(prg['wget'],fq1)
196 | 	fq2cmd='%s %s' %(prg['wget'],fq2)
197 | 	fq1_=getData(fq1cmd)
198 | 	fq2_=getData(fq2cmd)
199 | 	if fq1_+fq2_>0:
200 | 		sys.stderr.write('I cannot download all files in %s.\n' %(base))
201 | 		os.chdir('..')
202 | 	else:
203 | 		#gu1=getData('%s %s' %(prg['gunzip'],os.path.basename(fq1))) 
204 | 		#gu2=getData('%s %s' %(prg['gunzip'],os.path.basename(fq2)))
205 | 		sys.stderr.write('Files in %s ready.\n' %(base))
206 | 		os.chdir('..')
207 | tend = time.time()
208 | sys.stderr.write('PRJNA316625 - time taken: %s\n' %(get_time(tstart,tend)))
209 | os.chdir('..')
210 | 
211 | sys.stderr.write('Preparing data ...\n')
212 | sys.stderr.write('BWA indexing...\n')
213 | tstart = time.time()
214 | os.chdir('genome_hg19')
215 | cmd='%s GRCh37.primary_assembly.genome.fa.gz' %(prg['gunzip'])
216 | cmd1='%s index GRCh37.primary_assembly.genome.fa' %(prg['bwa'])
217 | ot=getData(cmd)
218 | ot1=getData(cmd1)
219 | if ot+ot1>0: sys.stderr.write('BWA indexing error.\n')
220 | else: sys.stderr.write('BWA indices ready.\n')
221 | tend = time.time()
222 | sys.stderr.write('BWA indexing - time taken: %s\n' %(get_time(tstart,tend)))
223 | os.chdir('..')
224 | 
225 | sys.stderr.write('STAR indexing...\n')
226 | cmd='%s Gencode_annotation/gencode.v30lift37.annotation.gtf.gz' %(prg['gunzip'])
227 | ot=getData(cmd)
228 | if ot==1: sys.stderr.write('Gunzipping gencode error.\n')
229 | else: sys.stderr.write('Gunzipping gencode ready.\n')
230 | tstart = time.time()
231 | if not os.path.exists('STAR'): os.mkdir('STAR')
232 | os.chdir('STAR')
233 | os.mkdir('STAR_genome_index_ucsc')
234 | cmd='%s --runMode genomeGenerate --genomeDir STAR_genome_index_ucsc --genomeFastaFiles ../genome_hg19/GRCh37.primary_assembly.genome.fa --sjdbGTFfile ../Gencode_annotation/gencode.v30lift37.annotation.gtf --sjdbOverhang 75' %(prg['STAR'])
235 | ot=getData(cmd)
236 | if ot==1: sys.stderr.write('STAR indexing error.\n')
237 | else: sys.stderr.write('STAR indices ready.\n')
238 | tend = time.time()
239 | sys.stderr.write('STAR indexing - time taken: %s\n' %(get_time(tstart,tend)))
240 | os.chdir('..')
241 | 
242 | sys.stderr.write('Prepare RepeatMasker annotations ...\n')
243 | tstart = time.time()
244 | os.chdir('rmsk')
245 | cmd4='%s rmsk.txt.gz' %(prg['gunzip'])
246 | cmd='%s \'OFS="\t"{print $6,"rmsk_hg19",$12,$7+1,$8,".",$10,".","gene_id \""$11"\"; transcript_id \""$13"\";"}\' rmsk.txt > rmsk.gtf' %(prg['awk'])
247 | cmd1='%s -k1,1 -k4,4n rmsk.gtf > rmsk.sorted.gtf' %(prg['sort'])
248 | cmd2='%s rmsk.sorted.gtf' %(prg['bgzip'])
249 | cmd3='%s -p gff rmsk.sorted.gtf.gz' %(prg['tabix'])
250 | ot4=getData(cmd4)
251 | ot=getData(cmd)
252 | ot1=getData(cmd1)
253 | ot2=getData(cmd2)
254 | ot3=getData(cmd3)
255 | if ot4==1: sys.stderr.write('RepeatMasker gunzip error.\n')
256 | if ot==1: sys.stderr.write('RepeatMasker awk error.\n')
257 | if ot1==1: sys.stderr.write('RepeatMasker sort error.\n')
258 | if ot2==1: sys.stderr.write('RepeatMasker bgzip error.\n')
259 | if ot3==1: sys.stderr.write('RepeatMasker tabix error.\n')
260 | if ot+ot1+ot2+ot3+ot4==0: sys.stderr.write('RepeatMasker ready.\n')
261 | tend = time.time()
262 | sys.stderr.write('Prepare RepeatMasker annotations - time taken: %s\n' %(get_time(tstart,tend)))
263 | os.chdir('..')
264 | 
265 | sys.stderr.write('Prepare dbSNP annotations ...\n')
266 | tstart = time.time()
267 | os.chdir('snp151')
268 | cmd4='%s snp151.txt.gz' %(prg['gunzip'])
269 | cmd='%s \'OFS="\t"{if ($11=="genomic" && $12=="single") print $2,"ucsc_snp151_hg19","snp",$4,$4,".",$7,".","gene_id \""$5"\"; transcript_id \""$5"\";"}\' snp151.txt > snp151.gtf' %(prg['awk'])
270 | cmd1='%s -k1,1 -k4,4n snp151.gtf > snp151.sorted.gtf' %(prg['sort'])
271 | cmd2='%s snp151.sorted.gtf' %(prg['bgzip'])
272 | cmd3='%s -p gff snp151.sorted.gtf.gz' %(prg['tabix'])
273 | ot4=getData(cmd4)
274 | ot=getData(cmd)
275 | ot1=getData(cmd1)
276 | ot2=getData(cmd2)
277 | ot3=getData(cmd3)
278 | if ot4==1: sys.stderr.write('dbSNP gunzip error.\n')
279 | if ot==1: sys.stderr.write('dbSNP awk error.\n')
280 | if ot1==1: sys.stderr.write('dbSNP sort error.\n')
281 | if ot2==1: sys.stderr.write('dbSNP bgzip error.\n')
282 | if ot3==1: sys.stderr.write('dbSNP tabix error.\n')
283 | if ot+ot1+ot2+ot3+ot4==0: sys.stderr.write('dbSNP ready.\n')
284 | tend = time.time()
285 | sys.stderr.write('Prepare dbSNP annotations - time taken: %s\n' %(get_time(tstart,tend)))
286 | os.chdir('..')
287 | 
288 | sys.stderr.write('Prepare splice sites annotations ...\n')
289 | tstart = time.time()
290 | os.chdir('Gencode_annotation')
291 | cmd='%s gencode.v30lift37.annotation.gtf > splicesites' %(prg['gtf_splicesites'])
292 | cmd1='%s -F" " \'{split($2,a,":"); split(a[2],b,"."); if (b[1]>b[3]) print a[1],b[3],b[1],toupper(substr($3,1,1)),"-"; else print a[1],b[1],b[3],toupper(substr($3,1,1)),"+"}\' splicesites > gencode.v30lift37.splicesites.txt' %(prg['awk'])
293 | ot=getData(cmd)
294 | ot1=getData(cmd1)
295 | if ot==1: sys.stderr.write('Splice sites gtf_splicesites error.\n')
296 | if ot1==1: sys.stderr.write('Splice sites sort error.\n')
297 | if ot+ot1==0: sys.stderr.write('Splice sites ready.\n')
298 | tend = time.time()
299 | sys.stderr.write('Prepare splice sites annotations - time taken: %s\n' %(get_time(tstart,tend)))
300 | os.chdir('..')
301 | 
302 | sys.stderr.write('Prepare REDIportal annotations ...\n')
303 | tstart = time.time()
304 | os.chdir('rediportal')
305 | cmd7='%s table1_full.txt.gz' %(prg['gunzip'])
306 | cmd='%s \'OFS="\t"{sum+=1; print $1,"rediportal","ed",$2,$2,".",$5,".","gene_id \""sum"\"; transcript_id \""sum"\";"}\' table1_full.txt > atlas.gtf' %(prg['awk'])
307 | cmd1='%s atlas.gtf' %(prg['bgzip'])
308 | cmd2='%s -p gff atlas.gtf.gz' %(prg['tabix'])
309 | cmd3='%s %s table1_full.txt > atlas_recoding.gff' %(prg['python'],redirec)
310 | cmd4='%s -V -k1,1 -k4,4n atlas_recoding.gff > srtd_atlas_recoding.gff' %(prg['sort'])
311 | cmd5='%s srtd_atlas_recoding.gff' %(prg['bgzip'])
312 | cmd6='%s -p gff srtd_atlas_recoding.gff.gz' %(prg['tabix'])
313 | ot7=getData(cmd7)
314 | ot=getData(cmd)
315 | ot1=getData(cmd1)
316 | ot2=getData(cmd2)
317 | ot3=getData(cmd3)
318 | ot4=getData(cmd4)
319 | ot5=getData(cmd5)
320 | ot6=getData(cmd6)
321 | if ot7==1: sys.stderr.write('REDIportal gunzip error.\n')
322 | if ot==1: sys.stderr.write('REDIportal awk error.\n')
323 | if ot1==1: sys.stderr.write('REDIportal bgzip error.\n')
324 | if ot2==1: sys.stderr.write('REDIportal tabix error.\n')
325 | if ot3==1: sys.stderr.write('REDIportal python error.\n')
326 | if ot4==1: sys.stderr.write('REDIportal sort error.\n')
327 | if ot5==1: sys.stderr.write('REDIportal bgzip error.\n')
328 | if ot6==1: sys.stderr.write('REDIportal tabix error.\n')
329 | if ot+ot1+ot2+ot3+ot4+ot5+ot6+ot7==0: sys.stderr.write('REDIportal ready.\n')
330 | tend = time.time()
331 | sys.stderr.write('Prepare REDIportal annotations - time taken: %s\n' %(get_time(tstart,tend)))
332 | os.chdir('..')
333 | 
334 | sys.stderr.write('ALL DONE. ENJOY REDItools.\n')
335 | 


--------------------------------------------------------------------------------
/NPscripts/download-prepare-data-NP_docker.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python 
  2 | import sys, os, time, shutil
  3 | import commands
  4 | import distutils.spawn
  5 | 
  6 | 
  7 | wdir='rna_editing_protocol'
  8 | redipath='./REDItools/'
  9 | 
 10 | def remove_folder(path):
 11 | 	# check if folder exists
 12 | 	if os.path.exists(path):
 13 | 	# remove if exists	
 14 | 		shutil.rmtree(path)
 15 | 
 16 | def getData(cmd):
 17 | 	tr=0
 18 | 	while 1:
 19 | 		st,out=commands.getstatusoutput(cmd)
 20 | 		if st==0:
 21 | 			return 0
 22 | 		tr+=1
 23 | 		if tr==10: break
 24 | 	if tr>0: return 1
 25 | 
 26 | def is_tool(name):
 27 | 	wn=distutils.spawn.find_executable(name)
 28 | 	if wn==None: return 1
 29 | 	else: return wn
 30 | 
 31 | def get_time(tstart,tend):
 32 | 	telapsed=tend - tstart
 33 | 	t_taken=time.strftime("%H:%M:%S", time.gmtime(telapsed))
 34 | 	return t_taken
 35 | 
 36 | 
 37 | exe=['bwa','STAR','awk','bgzip','tabix','sort','gtf_splicesites','wget','python','gunzip']
 38 | nt=[]
 39 | prg={}
 40 | for i in exe:
 41 |   p=is_tool(i)
 42 |   if p==1: nt.append(i)
 43 |   prg[i]=p
 44 | if len(nt)>0:
 45 |   for i in nt:
 46 |     sys.stderr.write('Program %s NOT found\n' %(i))
 47 |   sys.exit('Install required software first.')
 48 | 
 49 | redirec=os.path.join(redipath,'accessory','rediportal2recoding.py')
 50 | if not os.path.exists(redipath): sys.exit('rediportal2recoding.py script not found.')
 51 | prg['redirec']='../../' + redirec.lstrip('./')
 52 | 
 53 | ipkgs = raw_input("Download nature_protocol input data? yes/no ")
 54 | 
 55 | cdir=os.getcwd()
 56 | sys.stderr.write('Current directory: %s\n' %(cdir))
 57 | folder=os.path.join(cdir,wdir)
 58 | 
 59 | if ipkgs.strip().upper() == 'YES':
 60 | 	remove_folder(folder) # from previous installations
 61 | 	os.mkdir(folder)
 62 | 	sys.stderr.write('Directory %s created.\n' %(wdir))
 63 | 	sys.stderr.write('Entering %s\n' %(wdir))
 64 | 	os.chdir(folder)
 65 | 	#human genome
 66 | 	sys.stderr.write('Getting human genome\n')
 67 | 	tstart = time.time()
 68 | 	os.mkdir('genome_hg19')
 69 | 	os.chdir('genome_hg19')
 70 | 	wcmd='%s -c --retry-connrefused --tries=0 --timeout=5  ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz' %(prg['wget'])
 71 | 	ot=getData(wcmd)
 72 | 	if ot==1: sys.stderr.write('I cannot download the human genome.\n')
 73 | 	else: sys.stderr.write('Human genome complete.\n')
 74 | 	tend = time.time()
 75 | 	sys.stderr.write('Human genome - time taken: %s\n' %(get_time(tstart,tend)))
 76 | 	os.chdir('..')
 77 | 	#Gencode
 78 | 	sys.stderr.write('Getting GENCODE genes\n')
 79 | 	tstart = time.time()
 80 | 	os.mkdir('Gencode_annotation')
 81 | 	os.chdir('Gencode_annotation')
 82 | 	gcmd='%s -c --retry-connrefused --tries=0 --timeout=5 ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/GRCh37_mapping/gencode.v30lift37.annotation.gtf.gz' %(prg['wget'])
 83 | 	ot=getData(gcmd)
 84 | 	if ot==1: sys.stderr.write('I cannot download GENCODE annotations.\n')
 85 | 	else: sys.stderr.write('GENCODE annotations ready.\n')
 86 | 	tend = time.time()
 87 | 	sys.stderr.write('GENCODE annotations - time taken: %s\n' %(get_time(tstart,tend)))
 88 | 	os.chdir('..')
 89 | 	#RefSeq
 90 | 	sys.stderr.write('Getting RefSeq hg19\n')
 91 | 	tstart = time.time()
 92 | 	os.mkdir('Strand_detection')
 93 | 	os.chdir('Strand_detection')
 94 | 	gcmd='%s -c --retry-connrefused --tries=0 --timeout=5 --no-check-certificate https://sourceforge.net/projects/rseqc/files/BED/Human_Homo_sapiens/hg19_RefSeq.bed.gz' %(prg['wget'])
 95 | 	ot=getData(gcmd)
 96 | 	if ot==1: sys.stderr.write('I cannot download REFSEQ annotations.\n')
 97 | 	else: sys.stderr.write('REFSEQ annotations ready.\n')
 98 | 	tend = time.time()
 99 | 	sys.stderr.write('REFSEQ annotations - time taken: %s\n' %(get_time(tstart,tend)))
100 | 	os.chdir('..')
101 | 	#RepeatMasker
102 | 	sys.stderr.write('Getting RepeatMasker\n')
103 | 	tstart = time.time()
104 | 	os.mkdir('rmsk')
105 | 	os.chdir('rmsk')
106 | 	gcmd='%s -c --retry-connrefused --tries=0 --timeout=5 http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/rmsk.txt.gz' %(prg['wget'])
107 | 	ot=getData(gcmd)
108 | 	if ot==1: sys.stderr.write('I cannot download RepeatMasker annotations.\n')
109 | 	else: sys.stderr.write('RepeatMasker annotations ready.\n')
110 | 	tend = time.time()
111 | 	sys.stderr.write('RepeatMasker annotations - time taken: %s\n' %(get_time(tstart,tend)))
112 | 	os.chdir('..')
113 | 	#dbSNP
114 | 	sys.stderr.write('Getting dbSNP\n')
115 | 	tstart = time.time()
116 | 	os.mkdir('snp151')
117 | 	os.chdir('snp151')
118 | 	gcmd='%s -c --retry-connrefused --tries=0 --timeout=5  http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/snp151.txt.gz' %(prg['wget'])
119 | 	ot=getData(gcmd)
120 | 	if ot==1: sys.stderr.write('I cannot download dbSNP annotations.\n')
121 | 	else: sys.stderr.write('dbSNP annotations ready.\n')
122 | 	tend = time.time()
123 | 	sys.stderr.write('dbSNP annotations - time taken: %s\n' %(get_time(tstart,tend)))
124 | 	os.chdir('..')
125 | 	#REDIportal
126 | 	sys.stderr.write('Getting REDIportal\n')
127 | 	tstart = time.time()
128 | 	os.mkdir('rediportal')
129 | 	os.chdir('rediportal')
130 | 	gcmd='%s -c --retry-connrefused --tries=0 --timeout=5  http://srv00.recas.ba.infn.it/webshare/rediportalDownload/table1_full.txt.gz' %(prg['wget'])
131 | 	ot=getData(gcmd)
132 | 	if ot==1: sys.stderr.write('I cannot download REDIportal annotations.\n')
133 | 	else: sys.stderr.write('REDIportal annotations ready.\n')
134 | 	tend = time.time()
135 | 	sys.stderr.write('REDIportal annotations - time taken: %s\n' %(get_time(tstart,tend)))
136 | 	os.chdir('..')
137 | 	#NA12878 - WGS
138 | 	sys.stderr.write('Getting NA12878 data - WGS\n')
139 | 	tstart = time.time()
140 | 	os.mkdir('WGS_ERR262997')
141 | 	os.chdir('WGS_ERR262997')
142 | 	fq1cmd='%s -c --retry-connrefused --tries=0 --timeout=5 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_1.fastq.gz' %(prg['wget'])
143 | 	fq2cmd='%s -c --retry-connrefused --tries=0 --timeout=5 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_2.fastq.gz' %(prg['wget'])
144 | 	f1=getData(fq1cmd)
145 | 	f2=getData(fq2cmd)
146 | 	if f1==1: sys.stderr.write('I cannot download READ1.\n')
147 | 	else:
148 | 		gu=getData('%s ERR262997_1.fastq.gz' %(prg['gunzip']))
149 | 		sys.stderr.write('READ1 ready.\n')
150 | 	if f2==1: sys.stderr.write('I cannot download READ2.\n')
151 | 	else:
152 | 		gu=getData('%s ERR262997_2.fastq.gz' %(prg['gunzip']))
153 | 		sys.stderr.write('READ2 ready.\n')
154 | 	tend = time.time()
155 | 	sys.stderr.write('NA12878 data - WGS - time taken: %s\n' %(get_time(tstart,tend)))	
156 | 	os.chdir('..')
157 | 
158 | 	#NA12878 - RNAseq
159 | 	sys.stderr.write('Getting NA12878 data - RNAseq\n')
160 | 	tstart = time.time()
161 | 	os.mkdir('RNASeq_SRR1258218')
162 | 	os.chdir('RNASeq_SRR1258218')
163 | 	fq1cmd='%s -c --retry-connrefused --tries=0 --timeout=5 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR125/008/SRR1258218/SRR1258218_1.fastq.gz' %(prg['wget'])
164 | 	fq2cmd='%s -c --retry-connrefused --tries=0 --timeout=5 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR125/008/SRR1258218/SRR1258218_2.fastq.gz' %(prg['wget'])
165 | 	f1=getData(fq1cmd)
166 | 	f2=getData(fq2cmd)
167 | 	if f1==1: sys.stderr.write('I cannot download READ1.\n')
168 | 	else:
169 | 		#gu=getData('%s SRR1258218_1.fastq.gz' %(prg['gunzip']))
170 | 		sys.stderr.write('READ1 ready.\n')
171 | 	if f2==1: sys.stderr.write('I cannot download READ2.\n')
172 | 	else:
173 | 		#gu=getData('%s SRR1258218_2.fastq.gz' %(prg['gunzip']))
174 | 		sys.stderr.write('READ2 ready.\n')
175 | 	tend = time.time()
176 | 	sys.stderr.write('NA12878 data - RNAseq - time taken: %s\n' %(get_time(tstart,tend)))
177 | 	os.chdir('..')
178 | 
179 | 	#PRJNA316625
180 | 	sys.stderr.write('Getting PRJNA316625 data\n')
181 | 	tstart = time.time()
182 | 	os.mkdir('PRJNA_316625')
183 | 	os.chdir('PRJNA_316625')
184 | 	fqlist=['ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306823/SRR3306823_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306823/SRR3306823_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306824/SRR3306824_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306824/SRR3306824_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306825/SRR3306825_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306825/SRR3306825_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306826/SRR3306826_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306826/SRR3306826_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/007/SRR3306827/SRR3306827_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/007/SRR3306827/SRR3306827_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/008/SRR3306828/SRR3306828_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/008/SRR3306828/SRR3306828_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/009/SRR3306829/SRR3306829_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/009/SRR3306829/SRR3306829_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/000/SRR3306830/SRR3306830_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/000/SRR3306830/SRR3306830_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/001/SRR3306831/SRR3306831_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/001/SRR3306831/SRR3306831_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/002/SRR3306832/SRR3306832_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/002/SRR3306832/SRR3306832_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306833/SRR3306833_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306833/SRR3306833_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306834/SRR3306834_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306834/SRR3306834_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306835/SRR3306835_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306835/SRR3306835_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306836/SRR3306836_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306836/SRR3306836_2.fastq.gz']
185 | 	for i in fqlist:
186 | 		fq1,fq2=i.split(',')
187 | 		base=(os.path.basename(fq1)).split("_")[0]
188 | 		os.mkdir(base)
189 | 		os.chdir(base)
190 | 		fq1cmd='%s -c --retry-connrefused --tries=0 --timeout=5 %s' %(prg['wget'],fq1)
191 | 		fq2cmd='%s -c --retry-connrefused --tries=0 --timeout=5 %s' %(prg['wget'],fq2)
192 | 		fq1_=getData(fq1cmd)
193 | 		fq2_=getData(fq2cmd)
194 | 		if fq1_+fq2_>0:
195 | 			sys.stderr.write('I cannot download all files in %s.\n' %(base))
196 | 			os.chdir('..')
197 | 		else:
198 | 			#gu1=getData('%s %s' %(prg['gunzip'],os.path.basename(fq1))) 
199 | 			#gu2=getData('%s %s' %(prg['gunzip'],os.path.basename(fq2)))
200 | 			sys.stderr.write('Files in %s ready.\n' %(base))
201 | 			os.chdir('..')
202 | 	tend = time.time()
203 | 	sys.stderr.write('PRJNA316625 - time taken: %s\n' %(get_time(tstart,tend)))
204 | 	os.chdir('..')
205 | 
206 | 	sys.stderr.write('Preparing data ...\n')
207 | 	sys.stderr.write('BWA indexing...\n')
208 | 	tstart = time.time()
209 | 	os.chdir('genome_hg19')
210 | 	cmd='%s GRCh37.primary_assembly.genome.fa.gz' %(prg['gunzip'])
211 | 	cmd1='%s index GRCh37.primary_assembly.genome.fa' %(prg['bwa'])
212 | 	ot=getData(cmd)
213 | 	ot1=getData(cmd1)
214 | 	if ot+ot1>0: sys.stderr.write('BWA indexing error.\n')
215 | 	else: sys.stderr.write('BWA indices ready.\n')
216 | 	tend = time.time()
217 | 	sys.stderr.write('BWA indexing - time taken: %s\n' %(get_time(tstart,tend)))
218 | 	os.chdir('..')
219 | 
220 | 	sys.stderr.write('STAR indexing...\n')
221 | 	cmd='%s Gencode_annotation/gencode.v30lift37.annotation.gtf.gz' %(prg['gunzip'])
222 | 	ot=getData(cmd)
223 | 	if ot==1: sys.stderr.write('Gunzipping gencode error.\n')
224 | 	else: sys.stderr.write('Gunzipping gencode ready.\n')
225 | 	tstart = time.time()
226 | 	if not os.path.exists('STAR'): os.mkdir('STAR')
227 | 	os.chdir('STAR')
228 | 	os.mkdir('STAR_genome_index_ucsc')
229 | 	cmd='%s --runMode genomeGenerate --genomeDir STAR_genome_index_ucsc --genomeFastaFiles ../genome_hg19/GRCh37.primary_assembly.genome.fa --sjdbGTFfile ../Gencode_annotation/gencode.v30lift37.annotation.gtf --sjdbOverhang 75' %(prg['STAR'])
230 | 	ot=getData(cmd)
231 | 	if ot==1: sys.stderr.write('STAR indexing error.\n')
232 | 	else: sys.stderr.write('STAR indices ready.\n')
233 | 	tend = time.time()
234 | 	sys.stderr.write('STAR indexing - time taken: %s\n' %(get_time(tstart,tend)))
235 | 	os.chdir('..')
236 | 
237 | 	sys.stderr.write('Prepare RepeatMasker annotations ...\n')
238 | 	tstart = time.time()
239 | 	os.chdir('rmsk')
240 | 	cmd4='%s rmsk.txt.gz' %(prg['gunzip'])
241 | 	cmd='%s \'OFS="\t"{print $6,"rmsk_hg19",$12,$7+1,$8,".",$10,".","gene_id \""$11"\"; transcript_id \""$13"\";"}\' rmsk.txt > rmsk.gtf' %(prg['awk'])
242 | 	cmd1='%s -k1,1 -k4,4n rmsk.gtf > rmsk.sorted.gtf' %(prg['sort'])
243 | 	cmd2='%s rmsk.sorted.gtf' %(prg['bgzip'])
244 | 	cmd3='%s -p gff rmsk.sorted.gtf.gz' %(prg['tabix'])
245 | 	ot4=getData(cmd4)
246 | 	ot=getData(cmd)
247 | 	ot1=getData(cmd1)
248 | 	ot2=getData(cmd2)
249 | 	ot3=getData(cmd3)
250 | 	if ot==4: sys.stderr.write('RepeatMasker gunzip error.\n')
251 | 	if ot==1: sys.stderr.write('RepeatMasker awk error.\n')
252 | 	if ot1==1: sys.stderr.write('RepeatMasker sort error.\n')
253 | 	if ot2==1: sys.stderr.write('RepeatMasker bgzip error.\n')
254 | 	if ot3==1: sys.stderr.write('RepeatMasker tabix error.\n')
255 | 	if ot+ot1+ot2+ot3+ot4==0: sys.stderr.write('RepeatMasker ready.\n')
256 | 	tend = time.time()
257 | 	sys.stderr.write('Prepare RepeatMasker annotations - time taken: %s\n' %(get_time(tstart,tend)))
258 | 	os.chdir('..')
259 | 
260 | 	sys.stderr.write('Prepare dbSNP annotations ...\n')
261 | 	tstart = time.time()
262 | 	os.chdir('snp151')
263 | 	cmd4='%s snp151.txt.gz' %(prg['gunzip'])
264 | 	cmd='%s \'OFS="\t"{if ($11=="genomic" && $12=="single") print $2,"ucsc_snp151_hg19","snp",$4,$4,".",$7,".","gene_id \""$5"\"; transcript_id \""$5"\";"}\' snp151.txt > snp151.gtf' %(prg['awk'])
265 | 	cmd1='%s -k1,1 -k4,4n snp151.gtf > snp151.sorted.gtf' %(prg['sort'])
266 | 	cmd2='%s snp151.sorted.gtf' %(prg['bgzip'])
267 | 	cmd3='%s -p gff snp151.sorted.gtf.gz' %(prg['tabix'])
268 | 	ot4=getData(cmd4)
269 | 	ot=getData(cmd)
270 | 	ot1=getData(cmd1)
271 | 	ot2=getData(cmd2)
272 | 	ot3=getData(cmd3)
273 | 	if ot==4: sys.stderr.write('dbSNP gunzip error.\n')
274 | 	if ot==1: sys.stderr.write('dbSNP awk error.\n')
275 | 	if ot1==1: sys.stderr.write('dbSNP sort error.\n')
276 | 	if ot2==1: sys.stderr.write('dbSNP bgzip error.\n')
277 | 	if ot3==1: sys.stderr.write('dbSNP tabix error.\n')
278 | 	if ot+ot1+ot2+ot3+ot4==0: sys.stderr.write('dbSNP ready.\n')
279 | 	tend = time.time()
280 | 	sys.stderr.write('Prepare dbSNP annotations - time taken: %s\n' %(get_time(tstart,tend)))
281 | 	os.chdir('..')
282 | 
283 | 	sys.stderr.write('Prepare splice sites annotations ...\n')
284 | 	tstart = time.time()
285 | 	os.chdir('Gencode_annotation')
286 | 	cmd='%s gencode.v30lift37.annotation.gtf > splicesites' %(prg['gtf_splicesites'])
287 | 	cmd1='%s -F" " \'{split($2,a,":"); split(a[2],b,"."); if (b[1]>b[3]) print a[1],b[3],b[1],toupper(substr($3,1,1)),"-"; else print a[1],b[1],b[3],toupper(substr($3,1,1)),"+"}\' splicesites > gencode.v30lift37.splicesites.txt' %(prg['awk'])
288 | 	ot=getData(cmd)
289 | 	ot1=getData(cmd1)
290 | 	if ot==1: sys.stderr.write('Splice sites gtf_splicesites error.\n')
291 | 	if ot1==1: sys.stderr.write('Splice sites sort error.\n')
292 | 	if ot+ot1==0: sys.stderr.write('Splice sites ready.\n')
293 | 	tend = time.time()
294 | 	sys.stderr.write('Prepare splice sites annotations - time taken: %s\n' %(get_time(tstart,tend)))
295 | 	os.chdir('..')
296 | 
297 | 	sys.stderr.write('Prepare REDIportal annotations ...\n')
298 | 	tstart = time.time()
299 | 	os.chdir('rediportal')
300 | 	cmd7='%s table1_full.txt.gz' %(prg['gunzip'])
301 | 	ot7=getData(cmd7)
302 | 	if ot7==0:
303 | 		cmd='%s \'OFS="\t"{sum+=1; print $1,"rediportal","ed",$2,$2,".",$5,".","gene_id \""sum"\"; transcript_id \""sum"\";"}\' table1_full.txt > atlas.gtf' %(prg['awk'])
304 | 		cmd1='%s atlas.gtf' %(prg['bgzip'])
305 | 		cmd2='%s -p gff atlas.gtf.gz' %(prg['tabix'])
306 | 		cmd3='%s %s table1_full.txt > atlas_recoding.gff' %(prg['python'],prg['redirec']) #redirec
307 | 		cmd4='%s -V -k1,1 -k4,4n atlas_recoding.gff > srtd_atlas_recoding.gff' %(prg['sort'])
308 | 		cmd5='%s srtd_atlas_recoding.gff' %(prg['bgzip'])
309 | 		cmd6='%s -p gff srtd_atlas_recoding.gff.gz' %(prg['tabix'])
310 | 		ot=getData(cmd)
311 | 		ot1=getData(cmd1)
312 | 		ot2=getData(cmd2)
313 | 		ot3=getData(cmd3)
314 | 		ot4=getData(cmd4)
315 | 		ot5=getData(cmd5)
316 | 		ot6=getData(cmd6)
317 | 		if ot==7: sys.stderr.write('REDIportal gunzip error.\n')
318 | 		if ot==1: sys.stderr.write('REDIportal awk error.\n')
319 | 		if ot1==1: sys.stderr.write('REDIportal bgzip error.\n')
320 | 		if ot2==1: sys.stderr.write('REDIportal tabix error.\n')
321 | 		if ot3==1: sys.stderr.write('REDIportal python error.\n')
322 | 		if ot4==1: sys.stderr.write('REDIportal sort error.\n')
323 | 		if ot5==1: sys.stderr.write('REDIportal bgzip error.\n')
324 | 		if ot6==1: sys.stderr.write('REDIportal tabix error.\n')
325 | 		if ot+ot1+ot2+ot3+ot4+ot5+ot6+ot7==0: sys.stderr.write('REDIportal ready.\n')
326 | 		tend = time.time()
327 | 		sys.stderr.write('Prepare REDIportal annotations - time taken: %s\n' %(get_time(tstart,tend)))
328 | 		os.chdir('..')
329 | 
330 | 		sys.stderr.write('ALL DONE. ENJOY REDItools.\n')
331 | else:
332 | 	sys.stderr.write('Please provide your input data according to nature_protocol or relaunch this script. \n')
333 | 


--------------------------------------------------------------------------------
/NPscripts/get_Statistics.py:
--------------------------------------------------------------------------------
 1 | import sys, os
 2 | 
 3 | def getDistro(lines):
 4 | 	s={}
 5 | 	for i in 'ACGT':
 6 | 		for j in 'ACGT':
 7 | 			if i!=j: s[i+j]=0
 8 | 	n={}
 9 | 	x=0
10 | 	for i in 'ACGT':
11 | 		n[i]=x
12 | 		x+=1
13 | 	all=0
14 | 	for i in lines:
15 | 		sub=i[7].split()[0]
16 | 		nuc=eval(i[6])
17 | 		nv= nuc[n[sub[1]]]
18 | 		s[sub]+=nv
19 | 		all+=nv
20 | 	d={}
21 | 	for i in s:
22 | 		try: v=(s[i]/float(all))*100
23 | 		except: v=0.0
24 | 		d[i]=(s[i],all,v)	
25 | 	return d
26 | 
27 | if not os.path.exists('editing.txt'): sys.exit('editing.txt file not found.')
28 | 
29 | alu,nonalu,nonrep,kn=[],[],[],0
30 | f=open('editing.txt')
31 | for i in f:
32 | 	if i.startswith('Reg'): continue
33 | 	l=(i.strip()).split('\t')
34 | 	if l[18]=='ed': kn+=1
35 | 	if l[14]=='SINE' and l[15][:3]=='Alu': alu.append(l)
36 | 	elif l[14]!='-' and l[15][:3]!='Alu': nonalu.append(l)
37 | 	elif l[14]=='-' and l[15]=='-': nonrep.append(l)
38 | f.close()
39 | 
40 | alust=getDistro(alu)
41 | nonalust=getDistro(nonalu)
42 | nonrepst=getDistro(nonrep)
43 | all=getDistro(alu+nonalu+nonrep)
44 | 
45 | f=open('editingStats.txt','w')
46 | h=['SubType','ALU','REPnonALU','NONREP','ALL']
47 | f.write('\t'.join(h)+'\n')
48 | for i in alust:
49 | 	r=[i,alust[i][2],nonalust[i][2],nonrepst[i][2],all[i][2]]
50 | 	r=[str(x) for x in r]
51 | 	f.write('\t'.join(r)+'\n')
52 | f.close()
53 | 


--------------------------------------------------------------------------------
/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.0
 2 | Name: REDItools
 3 | Version: 1.3
 4 | Summary: Python Scripts for RNA editing detection by RNA-Seq data
 5 | Home-page: https://github.com/BioinfoUNIBA/REDItools
 6 | Author: Ernesto Picardi
 7 | Author-email: ernesto.picardi@gmail.com
 8 | License: LICENSE.txt
 9 | Description: REDItools: python scripts for RNA editing detection by RNA-Seq data
10 |         ===================================================================
11 |         
12 |         Introduction
13 |         ============
14 |         REDItools are python scripts developed with the aim to study RNA editing at genomic scale
15 |         by next generation sequencing data. RNA editing is a post-transcriptional phenomenon
16 |         involving the insertion/deletion or substitution of specific bases in precise RNA localizations.
17 |         In human, RNA editing occurs by deamination of cytosine to uridine (C-to-U) or mostly by the
18 |         adenosine to inosine (A-to-I) conversion through ADAR enzymes. A-to-I substitutions may have
19 |         profound functional consequences and have been linked to a variety of human diseases including
20 |         neurological and neurodegenerative disorders or cancer. Next generation sequencing technologies
21 |         offer the unique opportunity to investigate in depth RNA editing even though no dedicated
22 |         software has been released up to now.
23 |         
24 |         REDItools are simple python scripts conceived to facilitate the investigation of RNA editing
25 |         at large-scale and devoted to research groups that would to explore such phenomenon in own
26 |         data but don’t have sufficient bioinformatics skills.
27 |         They work on main operating systems (although unix/linux-based OS are preferred), can handle reads from whatever
28 |         platform in the standard BAM format and implement a variety of filters.
29 |         
30 |         
31 | Platform: Linux
32 | Platform: Unix
33 | Platform: MacOS
34 | Classifier: Intended Audience :: Computational biologists
35 | Classifier: License :: OSI Approved :: MIT
36 | Classifier: Operating System :: MacOS :: MacOS X
37 | Classifier: Operating System :: POSIX
38 | Classifier: Programming Language :: Python
39 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | REDItools: python scripts for RNA editing detection by RNA-Seq data
 2 | ===================================================================
 3 | 
 4 | Introduction
 5 | ============
 6 | <p align-text="justify"> RNA editing is a post-transcriptional phenomenon
 7 | involving the insertion/deletion or substitution of specific bases in precise RNA localizations.
 8 | In humans, RNA editing occurs by deamination of cytosine to uridine (C-to-U) or mostly by the
 9 | adenosine to inosine (A-to-I) conversion through ADAR enzymes. A-to-I substitutions may have
10 | profound functional consequences and have been linked to a variety of human diseases including
11 | neurological and neurodegenerative disorders or cancer. Next-generation sequencing technologies
12 | offer the unique opportunity to investigate in-depth RNA editing even though no dedicated
13 | software has been released up to now.
14 | 
15 | REDItools are simple Python scripts conceived to facilitate the investigation of RNA editing
16 | at large scale and devoted to research groups that would explore such phenomena in own
17 | data but don’t have sufficient bioinformatics skills.
18 | They work on main operating systems (although unix/linux-based OS are preferred), can handle reads from whatever
19 | platform in the standard BAM format, and implement various filters.</p>
20 | 
21 | <a href="https://github.com/BioinfoUNIBA/REDItools/blob/master/README_1.md">REDItools V1 manual</a>
22 | <br><br>
23 | <a href="https://github.com/BioinfoUNIBA/REDItools2">REDItools V2 manual</a>
24 | <br><br>
25 | <a href="https://github.com/BioinfoUNIBA/REDItools3">REDItools V3 manual</a>
26 | <p><b>Note. </b>REDItools V2 is useful for HPC environments.</p>
27 | </b>REDItools V3 is the latest optimized version for large-scale investigations.</p>
28 | <p><b>Important. </b>Reditool_DNA_RNA.py v1.3 available at <a href="/NPscripts/REDItoolDnaRnav13.py" download="REDItoolDnaRnav13.py">this link </a></p>
29 | 


--------------------------------------------------------------------------------
/README_2.md:
--------------------------------------------------------------------------------
  1 | # REDItools2
  2 | 
  3 | **REDItools2** is the optimized, parallel multi-node version of [<i class="icon-link"></i> REDItools](https://github.com/BioinfoUNIBA/REDItools).
  4 | 
  5 | REDItools takes in input a RNA-Seq (or DNA-Seq BAM) file and outputs a table of RNA-Seq editing events.  Here is an example of REDItools's output:
  6 | <p align="center">
  7 | <img src="Images/Fig1.png" width="700px">
  8 | </p>
  9 | 
 10 | The following image explains the high-level architecture.
 11 | 
 12 | <p align="center">
 13 | <img src="Images/Fig2.png" width="500px">
 14 | </p>
 15 | 
 16 | This version of REDItools shows an average 8x speed improvement over the previous version even when using only the serial-mode:
 17 | 
 18 | <p align="center">
 19 | <img src="Images/Fig3.png" width="800px">
 20 | </p>
 21 | 
 22 | # Index
 23 | 
 24 | - [1. Python setup](#1-python-setup)
 25 | - [2. Environment setup](#2-environment-setup)
 26 | - [3. Cloning / downloading](#3-cloning--downloading)
 27 | - [4. Installing](#4-installing)
 28 | - [5. The two versions of REDItools 2.0](#5-the-two-versions-of-reditools-20)
 29 |   - [5.1 Serial version](#51-serial-version-reditoolspy) 
 30 |   - [5.2 Parallel version](#52-parallel-version--parallel_reditoolspy)
 31 | - [6. Running REDItools 2.0 on your own data](#6-running-reditools-20-on-your-own-data) 
 32 | - [7. REDItools 2.0 options](#7-reditools-20-options) 
 33 | - [8. DNA-Seq annotation with REDItools 2.0](#8-dna-seq-annotation-with-reditools-20)
 34 | - [9. Running REDItools 2.0 in multisample mode](#9-running-reditools-20-in-multisample-mode)
 35 | - [10. Displaying benchmarks in HTML with REDItools 2.0 (parallel version only)](#10-displaying-benchmarks-with-reditools-20-parallel-version-only)
 36 | 
 37 | 
 38 | ## Installation
 39 | 
 40 | ### 1. Python setup
 41 | ---
 42 | This guide assumes you have Python <= 2.7 installed in your system. If you do not have Python, please read the [official Python webpage](https://www.python.org/).
 43 | 
 44 | Make sure to have the following packages installed:
 45 | 
 46 | > sudo apt-get install python-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip zlib-devel zlib zlib1g zlib1g-devel libbz2-dev zlib1g-dev libncurses5-dev libncursesw5-dev liblzma-dev
 47 | 
 48 | Make sure you have you preferred Python version loaded. If you have a single Python version already installed in your system you should do nothing. If you have multiple versions, please be sure to point to a given version; in order to do so check your environmental variables (e.g., PATH).
 49 | 
 50 | If you are running on a cluster (where usually several versions are available) make sure to load a given Python version. For example (if running on CINECA Marconi super computer) the following command would load Python 2.7.12:
 51 | > module load autoload python/2.7.12
 52 | 
 53 | Note: REDItools2.0 has been tested with Python 2.7.12. The software comes with no guarantee of being compatible with other versions of Python (e.g., Python >=3).
 54 | 
 55 | ### 2. Environment setup
 56 | ---
 57 | Make sure the following libraries are installed:
 58 | 
 59 | - htslib (see http://www.htslib.org/download/ and https://www.biostars.org/p/328831/ for instructions)
 60 | - samtools:
 61 | 
 62 | > sudo apt-get install samtools
 63 | 
 64 | - tabix:
 65 | 
 66 | > sudo apt-get install tabix
 67 | 
 68 | - an MPI implementation. We suggest OpenMPI, but you can choose whatever you like the most. For installing OpenMPI, try the following command:
 69 | > sudo apt-get install openmpi-common libopenmpi-dev
 70 | 
 71 | ### 3. Cloning / Downloading
 72 | ---
 73 | 
 74 | The first step is to clone this repository (assumes you have *git* installed in your system - see the [Git official page](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) otherwise):
 75 | > git clone https://github.com/tflati/reditools2.0.git
 76 | 
 77 | (alternatively you can download a ZIP package of REDItools2.0 from [here](https://github.com/tflati/reditools2.0/archive/master.zip) and uncompress the archive).
 78 | 
 79 | Move into the project main directory:
 80 | > cd reditools2.0
 81 | 
 82 | 
 83 | ### 4. Installing
 84 | ---
 85 | 
 86 | REDItools 2.0 requires a few Python modules to be installed in the environment (e.g., pysam, sortedcontainers, mpi4py, etc.). These can be installed in three ways:
 87 | 
 88 | - **System-level**: in this way the dependencies will be installed in your system and all users in your system will see changes. In order to perform this type of installation you need administrator rights.
 89 | To install REDItools2.0 in this modality, just run the following command:
 90 | > sudo pip install -r requirements.txt
 91 | 
 92 | - **User-level**: in this way the dependencies will be installed only for your current user, usually in your home directory. In order to perform this type of installation you need only to be logged as a normal user. Note that this type of installation will install additional software in your local Python directory (usually $HOME/.local/lib/python2.7/site-packages/, but it depends on your operating system and distribution).
 93 | This is the recommended modality if you do not care about altering your user environment. Note that altering your user environment might lead to software corruption. For example, assume you have already the *pysam* package installed (version 0.6); since REDItools 2.0 requires a version for *pysam* >= 0.9, the installation would uninstall the existing version of pysam and would install the version 0.9, thus altering the state of your environment. Any existing software which relied on version pysam 0.6 might break and stop working. In conclusion, choose this modality at your own risk.
 94 | To install REDItools2.0 in this modality, just run the following command:
 95 | > pip install -r requirements.txt --user
 96 |  
 97 | - **Environment-level**: in this type of installation you create an isolated virtual environment (initially empty) which will contain any new required software, without creating conflicts with any existing environment or requiring any particular right.
 98 | This modality will work regardless of the existing packages already installed in your system (both user and system levels) and thus gives the maximum possible freedom to the final-end user.
 99 | This is the recommended modality.
100 | The downside of choosing this modality is a potential duplication of code with respect to other existing environments. For example, assume you already have a given version of *sortedcontainers*; by installing REDItools2.0 at environment-level will download and install a *new* copy of *sortedcontainers* into a new isolated environment (ending up with two copies of the same software present in the system, one inside and one outside the virtual environment).
101 | To install REDItools2.0 in this modality, run the following commands:
102 | 
103 | > virtualenv ENV
104 | > 
105 | > source ENV/bin/activate
106 | >
107 | > pip install -r requirements.txt
108 | > 
109 | >  deactivate
110 | 
111 | These commands will create a new environment called *ENV* (you can choose any name you like) and will install all dependencies listed in the file *requirements.txt* into it). The commands *activate* and *deactivate* respectively activate (i.e., start/open) and deactivate (i.e., end/close) the virtual environment.
112 | When running the real commands, remember to wrap your commands between and activate and deactivate commands:
113 | 
114 | >source ENV/bin/activate
115 | >
116 | >command...
117 | >
118 | >command...
119 | >
120 | >command...
121 | >
122 | >command...
123 | >
124 | >deactivate
125 | 
126 | ## Testing
127 | 
128 | ### 5. The two versions of REDItools 2.0
129 | ---
130 | 
131 | This repo includes test data and a test script for checking that dependencies have been installed properly and the basic REDItools command works.
132 | 
133 | In order to have all the data you need, run the following commands:
134 | 
135 | > cd test
136 | >
137 | > ./prepare_test.sh
138 | 
139 | This will download and index the chromosome 21 of the hg19 version of the human genome (from http://hgdownload.cse.ucsc.edu/downloads.html).
140 | Once the script has finished running, you have all you need to perform the tests.
141 | 
142 | The software comes with two modalities. Feel free to choose the one which best fits your needs.
143 | 
144 | ####  5.1 Serial version (reditools.py)
145 | 
146 | In this modality you benefit only from the optimization introduced after the first version. While being significantly faster (with about a 8x factor), you do not exploit the computational power of having multiple cores. On the other hand the setup and launch of REDItools is much easier.
147 | This might be the first modality you might want to give a try when using REDItools2.0 for the first time.
148 | 
149 | The serial version of REDItools2.0 can be tested by issuing the following command:
150 | 
151 | > serial_test.sh
152 | 
153 | or, if you are in a SLURM-based cluster:
154 | 
155 | > sbatch serial_test_slurm.sh
156 | 
157 | #### 5.2 Parallel version  (parallel_reditools.py)
158 | 
159 | In this modality you benefit both from the serial optimization and from the parallel computation introduced in this brand new version which exploits the existence of multiple cores, also on multiple nodes, making it a perfect tool on High Performance Computing facilities.
160 | Using this modality requires you to perform a little bit more system setup, but it will definitely pay you off.
161 | 
162 | The parallel version leverages on the existence of coverage information which reports for each position the number of supporting reads.
163 | 
164 | We assume you already have installed and correctly configured the following tools:
165 | 
166 | - **samtools** (http://www.htslib.org/)
167 | - **htslib** (http://www.htslib.org/)
168 | 
169 | If you can use *mpi* on your machine (e.g., you are not on a multi-user system and there are no limitations to the jobs you can submit to the system), you can try launching the parallel version of REDItools 2.0 as follows:
170 | 
171 | > ./parallel_test.sh
172 | 
173 | If you are running on a SLURM-based cluster, instead, run the following command:
174 | 
175 | > sbatch ./parallel_test_slurm.sh
176 | 
177 | This script:
178 | - first defines a bunch of variables which point to input, output and accessory files; then
179 | - launches the production of coverage data; then
180 | - REDItools 2.0 is launched in parallel, by using the specified number of cores; finally
181 | - results are gathered and written into a single table (parameter *-o* provided in the command line)
182 | 
183 | ## Running
184 | 
185 | ### 6. Running REDItools 2.0 on your own data
186 | ---
187 | You can now customize the input test scripts to your needs with your input, output and ad-hoc options.
188 | 
189 | ### 7. REDItools 2.0 options
190 | ---
191 | #### 7.1 Basic options
192 | In its most basic form, REDItools 2.0 can be invoked with an input BAM file, a reference genome and an output file:
193 | > python src/cineca/reditools.py -f \$INPUT_BAM_FILE -r $REFERENCE -o \$OUTPUT_FILE
194 | 
195 | If you want, you can restrict the analysis only to a certain region (e.g., only chr1), by means of the **-g** option :
196 | > python src/cineca/reditools.py -f  \$INPUT_BAM_FILE -r $REFERENCE -o \$OUTPUT_FILE -g chr1
197 | > 
198 | or a specific interval:
199 | > python src/cineca/reditools.py -f  \$INPUT_BAM_FILE -r $REFERENCE -o \$OUTPUT_FILE -g chr1:1000-2000
200 | 
201 | For a complete list of options and their usage and meaning, please type:
202 | 
203 | > python src/cineca/reditools.py -h
204 | 
205 | #### 7.2 Other options
206 | 
207 | Here we report the principal options with a detailed explanation for each of them.
208 | The following are the options accepted by the serial version of REDItools:
209 | 
210 | > reditools.py [-h] [-f FILE] [-o OUTPUT_FILE] [-S] [-s STRAND] [-a]
211 |                     [-r REFERENCE] [-g REGION] [-m OMOPOLYMERIC_FILE] [-c]
212 |                     [-os OMOPOLYMERIC_SPAN] [-sf SPLICING_FILE]
213 |                     [-ss SPLICING_SPAN] [-mrl MIN_READ_LENGTH]
214 |                     [-q MIN_READ_QUALITY] [-bq MIN_BASE_QUALITY]
215 |                     [-mbp MIN_BASE_POSITION] [-Mbp MAX_BASE_POSITION]
216 |                     [-l MIN_COLUMN_LENGTH] [-men MIN_EDITS_PER_NUCLEOTIDE]
217 |                     [-me MIN_EDITS] [-Men MAX_EDITING_NUCLEOTIDES] [-d]
218 |                     [-T STRAND_CONFIDENCE] [-C] [-Tv STRAND_CONFIDENCE_VALUE]
219 |                     [-V] [-H] [-D] [-B BED_FILE]
220 |   >                    
221 |   > **-h**, --help
222 |   > show this help message and exit
223 |   >
224 |   >**-f** FILE, --file FILE 
225 |   >The bam file to be analyzed
226 |   >
227 |   >**-o** OUTPUT_FILE, --output-file OUTPUT_FILE
228 |   >The output statistics file
229 |   >
230 |   >**-S**, --strict
231 |   >          Activate strict mode: only sites with edits will be included in the output
232 |   >
233 |   >**-s** STRAND, --strand STRAND
234 |   >Strand: this can be 0 (unstranded), 1 (secondstrand oriented) or 2 (firststrand oriented)
235 |   >
236 |   >**-a**, --append-file
237 |   >Appends results to file (and creates if not existing)
238 |   >
239 |   >**-r** REFERENCE, --reference REFERENCE
240 |   >The reference FASTA file
241 |   >
242 |   >**-g** REGION, --region REGION
243 |   >The region of the bam file to be analyzed
244 |   >
245 |   >**-m** OMOPOLYMERIC_FILE, --omopolymeric-file OMOPOLYMERIC_FILE
246 |   >The file containing the omopolymeric positions
247 |   >
248 |   >**-c**, --create-omopolymeric-file
249 |   >Whether to create the omopolymeric span
250 |   >
251 |   >**-os** OMOPOLYMERIC_SPAN, --omopolymeric-span OMOPOLYMERIC_SPAN
252 |   >The omopolymeric span
253 |   >
254 |   >**-sf** SPLICING_FILE, --splicing-file SPLICING_FILE
255 |   >The file containing the splicing sites positions
256 |   >
257 |   >**-ss** SPLICING_SPAN, --splicing-span SPLICING_SPAN
258 |   >The splicing span
259 |   >
260 |   >**-mrl** MIN_READ_LENGTH, --min-read-length MIN_READ_LENGTH
261 |   >The minimum read length. Reads whose length is below this value will be discarded.
262 |   >
263 |   >**-q** MIN_READ_QUALITY, --min-read-quality MIN_READ_QUALITY
264 |   >The minimum read quality. Reads whose mapping quality is below this value will be discarded.
265 |   >
266 |   >**-bq** MIN_BASE_QUALITY, --min-base-quality MIN_BASE_QUALITY
267 |   >The minimum base quality. Bases whose quality is below this value will not be included in the analysis.
268 |   >
269 |   >**-mbp** MIN_BASE_POSITION, --min-base-position MIN_BASE_POSITION
270 |   >The minimum base position. Bases which reside in a previous position (in the read) will not be included in the analysis.
271 |   >
272 |   >**-Mbp** MAX_BASE_POSITION, --max-base-position MAX_BASE_POSITION
273 |   >The maximum base position. Bases which reside in a further position (in the read) will not be included in the analysis.
274 |   >
275 |   >**-l** MIN_COLUMN_LENGTH, --min-column-length MIN_COLUMN_LENGTH
276 |   >The minimum length of editing column (per position). Positions whose columns have length below this value will not be included in the analysis.
277 |   >
278 |   >**-men** MIN_EDITS_PER_NUCLEOTIDE, --min-edits-per-nucleotide MIN_EDITS_PER_NUCLEOTIDE
279 |   >The minimum number of editing for events each nucleotide (per position). Positions whose columns have bases with less than min-edits-per-base edits will not be included in the analysis.
280 |   >
281 |   >**-me** MIN_EDITS, --min-edits MIN_EDITS
282 |   > The minimum number of editing events (per position). Positions whose columns have bases with less than 'min-edits-per-base edits' will not be included in the analysis.
283 |   >
284 |   >**-Men** MAX_EDITING_NUCLEOTIDES, --max-editing-nucleotides MAX_EDITING_NUCLEOTIDES
285 |   > The maximum number of editing nucleotides, from 0 to 4 (per position). Positions whose columns have more than 'max-editing-nucleotides' will not be included in the analysis.
286 |   >
287 |   >**-d**, --debug
288 |   >REDItools is run in DEBUG mode.
289 |   >
290 |   >**-T** STRAND_CONFIDENCE, --strand-confidence STRAND_CONFIDENCE
291 |   > Strand inference type
292 |   > 1:maxValue
293 |   > 2:useConfidence [1];
294 |   > maxValue: the most prominent strand count will be used;
295 |   > useConfidence: strand is assigned if over a prefixed frequency confidence (-TV option)
296 |   >
297 |   >**-C**, --strand-correction
298 |   > Strand correction. Once the strand has been inferred, only bases according to this strand will be selected.
299 |   >
300 |   >**-Tv** STRAND_CONFIDENCE_VALUE, --strand-confidence-value STRAND_CONFIDENCE_VALUE
301 |   >                        Strand confidence [0.70]
302 |   >
303 |   >**-V**, --verbose
304 |   >         Verbose information in stderr
305 |   >
306 |   >**-H**, --remove-header
307 |   >Do not include header in output file
308 |   >
309 |   >**-N**, --dna
310 |   >Run REDItools 2.0 on DNA-Seq data
311 |   >
312 |   >**-B** BED_FILE, --bed_file BED_FILE
313 |   > Path of BED file containing target regions  
314 | 
315 | The parallel version of REDItools 2.0 has also other 4 additional parameters, namely:
316 |   >**-G**    --coverage-file    The coverage file of the sample to analyze
317 |   >
318 |   >**-D**    --coverage-dir    The coverage directory containing the coverage file of the sample to analyze divided by chromosome
319 |    >
320 |    >**-t**    --temp-dir    The temp directory where to store temporary data for this sample
321 |    >
322 |    >**-Z**    --chromosome-sizes    The file with the chromosome sizes
323 | 
324 | ### 8. DNA-Seq annotation with REDItools 2.0
325 | 
326 | - Analyze your RNA-Seq data (e.g., file *rna.bam*) with any version of REDItools and obtain the corresponding output table (e.g., *rna_table.txt* or *rna_table.txt.gz*);
327 | - Analyze your DNA-Seq data (e.g., *dna.bam*) with REDItools 2.0, providing as input:
328 | 	1. The DNA-Seq file (*dna.bam*) (e.g., option *-f* *dna.bam*);
329 | 	2. The output RNA-table output of the first step (e.g., option *-B* *rna_table.txt*)
330 | This step will produce the output table (e.g., *dna_table.txt*);
331 | - Annotate the RNA-Seq table by means of the DNA-Seq table by running REDItools2.0 annotator (script *src/cineca/annotate_with_DNA.py*) with the two tables as input (e.g., *rna_table.txt* and *dna_table.txt*) which will produce the final annotated table (e.g., *final_table.txt*).
332 | 
333 | <p align="center">
334 | <img src="Images/Fig4.png" width="600px">
335 | </p>
336 | 
337 | When RNA-editing tables are big (e.g., greater than 1GB in gz format) reading the full table in parallel mode can be really a time-consuming task. In order to optimize the loading of target positions, we have provided a script to convert RNA-editing tables to BED files:
338 | 
339 | > python src/cineca/reditools_table_to_bed.py -i RNA_TABLE -o BED_FILE
340 | 
341 | This can be further optimized by creating the final BED in parallel:
342 | 
343 | > extract_bed_dynamic.sh RNA_TABLE TEMP_DIR SIZE_FILE
344 | 
345 | where
346 | - RNA_TABLE is the input RNA-editing table;
347 | - TEMP_DIR is the directory that will contain the output BED file;
348 | - SIZE_FILE is the file containing the chromosome information (e.g., the .fai file of your reference genome).
349 | 
350 | Finally run the script *src/cineca/annotate_with_DNA.py*:
351 | 
352 | > python src/cineca/annotate_with_DNA.py -r RNA_TABLE -d DNA_TABLE [-Z]
353 | 
354 | The option -Z (not mandatory and without arguments) will exclude positions with multiple changes in DNA-Seq.
355 | 
356 | #### 8.1 Useful scripts
357 | 
358 | In order to ease the annotation of RNA-Seq tables with DNA-Seq information, we also provided two sample scripts that you can customize with your own data:
359 | 
360 | - [**WORK IN PROGRESS**] serial_dna_test.sh
361 | - [**WORK IN PROGRESS**] parallel_dna_test.sh
362 | 
363 | ### 9. [**WORK IN PROGRESS**] Running REDItools 2.0 in multisample mode
364 | REDItools also supports the launch on multiple samples at the same time. This modality is extremely useful if you have a dataset (i.e., group of homogeneous samples) and wish to run the same analysis on all of them (i.e., with the same options).
365 | 
366 | In order to do this, we provided a second script analogous to parallel_reditools.py, called *reditools2_multisample.py* which supports the specification of an additional option -F [SAMPLE_FILE]. SAMPLE_FILE is a file containing the (absolute) path of samples to be analyzed.
367 | It can be launched in the following manner:
368 | 
369 | > mpirun src/cineca/reditools2_multisample.py -F $SAMPLE_FILE [OPTIONS]
370 | 
371 | where OPTIONS are the same options accepted by the parallel version of REDItools 2.0.
372 | 
373 |  #### 9.1 Running in multisample mode on a SLURM-based cluster
374 | If you wish to run REDItools 2.0 in multisample mode on a SLURM-based cluster, we provided two scripts that will help you:
375 | 
376 | - [**WORK IN PROGRESS**] *extract_coverage_slurm_multisample.sh*: will calculate the coverage data for all the samples in parallel (by using the script *extract_coverage_dynamic.sh*);
377 | - [**WORK IN PROGRESS**] *multisample_test.sh*: will calculate the RNA-editing events tables for all the samples in parallel using MPI.
378 | 
379 | First run *extract_coverage_slurm_multisample.sh* and then *multisample_test.sh*.
380 | 
381 | ### 10. Displaying benchmarks with REDItools 2.0 (parallel version only)
382 | We also released simple scripts to generate HTML pages containing the snapshot of the amount of time REDItools 2.0 (parallel version) spends on each part of the overall computation for each process (e.g., coverage computation, DIA algorithm, interval analysis, partial results recombination, etc).
383 | 
384 | **Note**: this command will work only when launched *after* the parallel computation has completed.
385 | 
386 | All you have to do to create the HTML page is launching the following command:
387 | > create_html.sh TEMP_DIR
388 | 
389 | where TEMP_DIR is the directory you specified with the -t option; this directory should contain in fact some auxiliary files (e.g., intervals.txt, progress.txt, times.txt and groups.txt) which serve exactly this purpose.
390 | Once created, the HTML page should display time information similar to the following:
391 | 
392 | <p align="center">
393 | <img src="Images/Fig 5.png" width="600px">
394 | </p>
395 | 
396 | By means of this visualization you can *hover* on slices to see more in details the statistics for each interval computation as well as *zoom in* and *zoom out* by using the scroll wheel of your mouse.
397 | 
398 | Issues
399 | ---
400 | No issues are known so far. For any problem, write to t.flati@cineca.it.
401 | <!--stackedit_data:
402 | eyJoaXN0b3J5IjpbLTIxMzkwMjczOTAsNDc1MjkwNjE2LC0xMD
403 | E2NzQ0MjUxLC0yMDk0OTkwNDIzLC05NjkzNjU4MjIsMjc2NzE0
404 | NTA3LDIxMDkxNjI1NDksLTkxNjY3ODgyMSwxODY3MzQ1NjIzLD
405 | IwNDYwMjY1NzYsLTIwOTcwNDQyMDgsMTE1NDk3NTIxNCwtOTEz
406 | OTQ0ODIzXX0=
407 | -->
408 | 


--------------------------------------------------------------------------------
/accessory/AnnotateTable.py:
--------------------------------------------------------------------------------
  1 | #!/home/epicardi/bin/python27/bin/python
  2 | # Copyright (c) 2013-2014 Ernesto Picardi <ernesto.picardi@uniba.it>
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice shall be included in
 12 | # all copies or substantial portions of the Software.
 13 | #
 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | # THE SOFTWARE.
 21 | 
 22 | import sys, os, getopt, random, time
 23 | try: import pysam
 24 | except: sys.exit('Pysam module not found.')
 25 | pid=str(os.getpid()+random.randint(0,999999999))
 26 | 
 27 | pysamVersion=pysam.__version__
 28 | sys.stderr.write('Pysam version used: %s\n' %(pysamVersion))
 29 | 
 30 | def usage():
 31 | 	print """
 32 | USAGE: python AnnotateTable.py [options]
 33 | Options:
 34 | -a		Sorted Annotation file
 35 | -i		Annotate a file of positions [column1=region, column2=coordinate (1 based)]
 36 | 		or a single position [region:coordinate (1 based)]
 37 | -k		skip lines starting with: #
 38 | -r		Add a prefix to chromosome name [] (chr when the name is a number)
 39 | -s		Strand column in annotation file [4]
 40 | -u		Not use table strand info (fix it to 2)
 41 | -c		Add columns separated by comma (feature:1, gene_id:2, transcript_id:3) [1,2]
 42 | -n		Column name [Col]
 43 | -S		Correct strand by annotation
 44 | -C		Columns with base distribution [7,12] (in combination with -S)
 45 | -o		Save lines to a file
 46 | -h		Print this help
 47 | """
 48 | 
 49 | try:
 50 | 	opts, args = getopt.getopt(sys.argv[1:], 'i:a:o:hs:c:n:SC:uk:r:',["help"])
 51 | except getopt.GetoptError, err:
 52 | 	print str(err) 
 53 | 	usage()
 54 | 	sys.exit()
 55 | 
 56 | if len(opts)==0:
 57 | 	usage()
 58 | 	sys.exit()
 59 | tablefile,outfile,annfile='','',''
 60 | save,ap,af,addc,cs,nos=0,0,0,[0,1],0,0
 61 | csc=[6,11]
 62 | strcol=3
 63 | colname='Col'
 64 | skip='Region'
 65 | addchr=''
 66 | for o,a in opts:
 67 | 	if o in ("-h","--help"):
 68 | 		usage()
 69 | 		sys.exit()
 70 | 	elif o == "-n": colname = a
 71 | 	elif o == "-k": skip = a
 72 | 	elif o == "-r": addchr = a	
 73 | 	elif o == "-i":
 74 | 		tablefile = a
 75 | 		if not os.path.exists(tablefile): ap,af=1,0
 76 | 		else: ap,af=0,1
 77 | 	elif o == "-o":
 78 | 		outfile = a
 79 | 		save=1
 80 | 	elif o == "-s": strcol = int(a)-1
 81 | 	elif o == "-S": cs = 1
 82 | 	elif o == "-u": nos = 1
 83 | 	elif o == "-C": csc=[int(x)-1 for x in a.split(',')]
 84 | 	elif o == "-c":
 85 | 		addc = [int(x)-1 for x in a.split(',') if x in ['1','2','3']]
 86 | 		addc.sort()
 87 | 	elif o == "-a":
 88 | 		annfile = a
 89 | 		if annfile=='':
 90 | 			usage()
 91 | 			sys.exit('Sorted annotation file not found.')		
 92 | 	else:
 93 | 		assert False, "unhandled option"
 94 | 
 95 | ##############
 96 | def gstr(v):
 97 | 	if v=='-': return '0'
 98 | 	else: return '1'
 99 | 
100 | def comp(s):
101 | 	a={'A':'T','T':'A','C':'G','G':'C'}
102 | 	ss=''
103 | 	for i in s.upper():
104 | 		if a.has_key(i): ss+=a[i]
105 | 		elif i==' ': ss+=' '
106 | 		elif i=='-': ss+='-'
107 | 		else: ss+='N'
108 | 	return ss
109 | 
110 | def bcomp(b):
111 | 	bb=eval(b)
112 | 	return str([bb[3],bb[2],bb[1],bb[0]])
113 | 
114 | def checkstr(stringa):
115 | 	strand='+-'
116 | 	if stringa=='0': strand='-'
117 | 	elif stringa=='1': strand='+'
118 | 	elif stringa=='2': strand='+-'
119 | 	elif stringa=='-': strand='-'
120 | 	elif stringa=='+': strand='+'
121 | 	return strand
122 | 
123 | def parse(res):
124 | 	d={'+':{},'-':{}}
125 | 	anns='+'
126 | 	for i in res:
127 | 		if i[3]=='+':
128 | 			if d['+'].has_key(i[1]):
129 | 				if i[0] not in d['+'][i[1]][0]: d['+'][i[1]][0]=d['+'][i[1]][0]+','+i[0]
130 | 				if i[2]+'-'+i[0] not in d['+'][i[1]][1]: d['+'][i[1]][1]=d['+'][i[1]][1]+','+i[2]+'-'+i[0]
131 | 			else:
132 | 				d['+'][i[1]]=[i[0],i[2]+'-'+i[0]]
133 | 		elif i[3]=='-':
134 | 			if d['-'].has_key(i[1]):
135 | 				if i[0] not in d['-'][i[1]][0]: d['-'][i[1]][0]=d['-'][i[1]][0]+','+i[0]
136 | 				if i[2]+'-'+i[0] not in d['-'][i[1]][1]: d['-'][i[1]][1]=d['-'][i[1]][1]+','+i[2]+'-'+i[0]
137 | 			else:
138 | 				d['-'][i[1]]=[i[0],i[2]+'-'+i[0]]
139 | 	gip='$'.join(d['+'].keys())
140 | 	featp='$'.join([d['+'][x][0] for x in d['+'].keys()])
141 | 	tip='$'.join([d['+'][x][1] for x in d['+'].keys()])
142 | 	gim='$'.join(d['-'].keys())
143 | 	featm='$'.join([d['-'][x][0] for x in d['-'].keys()])
144 | 	tim='$'.join([d['-'][x][1] for x in d['-'].keys()])
145 | 	p=[featp,gip,tip]
146 | 	m=[featm,gim,tim]
147 | 	pm=[(featp+'&'+featm).strip('&'),(gip+'&'+gim).strip('&'),(tip+'&'+tim).strip('&')]
148 | 	if len(d['+'])==0 and len(d['-'])!=0: anns='-'
149 | 	if len(d['+'])==0: p=['-','-','-']
150 | 	if len(d['-'])==0: m=['-','-','-']
151 | 	if len(d['+'])==0 and len(d['-'])==0:
152 | 		pm=['-','-','-']
153 | 		anns='+-'
154 | 	if len(d['+'])!=0 and len(d['-'])!=0: anns='+-'
155 | 	return (p,m,pm,anns)
156 | 
157 | #chr17:7590770
158 | 	
159 | ###############
160 | if ap and af:
161 | 	usage()
162 | 	sys.exit('You can annotate a file of positions or a single positions but not both in one run.')
163 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
164 | sys.stderr.write("Script time --> START: %s\n" %(script_time))
165 | 
166 | 
167 | if not os.path.exists(annfile+'.tbi'):
168 | 	sys.stderr.write('Indexing %s file.\n' %(annfile))
169 | 	annfile=pysam.tabix_index(annfile, preset='gff')
170 | 
171 | tabix=pysam.Tabixfile(annfile)
172 | contig=tabix.contigs
173 | 
174 | if ap:
175 | 	prinfo=['Feature --> ','Gid --> ','Tid --> ']
176 | 	try:
177 | 		query=tablefile.split(':')
178 | 		chr,pos=addchr+query[0],int(query[1])-1
179 | 		try: strand=checkstr(query[2])
180 | 		except: strand=checkstr('')
181 | 		if nos: strand='+-'
182 | 		sres=[]
183 | 		if chr in contig:
184 | 			sres=[(kk.feature,kk.gene_id,kk.transcript_id,kk.strand) for kk in tabix.fetch(reference=chr,start=pos,end=pos+1,parser=pysam.asGTF())]	
185 | 		ann=parse(sres)
186 | 		if strand=='+': res=ann[0]
187 | 		elif strand=='-': res=ann[1]
188 | 		else: res=ann[2]		
189 | 		for i in addc:
190 | 			print prinfo[i]+ res[i]
191 | 	except: sys.exit('Error: not correct position.')
192 | 	
193 | if af:
194 | 	if save: o=open(outfile,'w')
195 | 	f=open(tablefile)
196 | 	hinfo=['%s_feat' %(colname),'%s_gid' %(colname),'%s_tid' %(colname)]
197 | 	for i in f:
198 | 		if i.strip()=='': continue
199 | 		if i.startswith('Region'):
200 | 			h=[i.strip()]
201 | 			for k in addc: h.append(hinfo[k])
202 | 			if save: o.write('\t'.join(h)+'\n')
203 | 			else: print '\t'.join(h)
204 | 			continue
205 | 		if i.startswith(skip): continue
206 | 		l=(i.strip()).split('\t')
207 | 		chr,pos=addchr+l[0],int(l[1])-1
208 | 		try: strand=checkstr(l[strcol])
209 | 		except: strand='+-'
210 | 		if nos: strand='+-'
211 | 		sres=[]
212 | 		#print chr,pos,pos+1
213 | 		if chr in contig:
214 | 			sres=[(kk.feature,kk.gene_id,kk.transcript_id,kk.strand) for kk in tabix.fetch(reference=chr,start=pos,end=pos+1,parser=pysam.asGTF())]	
215 | 		ann=parse(sres) #(p,m,pm,anns)
216 | 		if cs:
217 | 			if ann[3]=='+-': pass
218 | 			elif ann[3]==strand: pass
219 | 			elif ann[3]!=strand:
220 | 				l[2]=comp(l[2])
221 | 				l[strcol]=gstr(ann[3])
222 | 				strand=l[strcol]
223 | 				for j in csc:
224 | 					try:
225 | 						l[j]=bcomp(l[j])
226 | 						l[j+1]=comp(l[j+1])
227 | 					except: pass
228 | 		if strand=='+': res=ann[0]
229 | 		elif strand=='-': res=ann[1]
230 | 		else: res=ann[2]
231 | 		for j in addc: l.append(res[j])
232 | 		if save: o.write('\t'.join(l)+'\n')
233 | 		else: print '\t'.join(l)
234 | tabix.close()
235 | if save:
236 | 	o.close()
237 | 	sys.stderr.write("Table saved on %s\n" %(outfile))
238 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
239 | sys.stderr.write("Script time --> END: %s\n" %(script_time))
240 | 


--------------------------------------------------------------------------------
/accessory/FilterTable.py:
--------------------------------------------------------------------------------
  1 | #!/home/epicardi/bin/python27/bin/python
  2 | # Copyright (c) 2013-2014 Ernesto Picardi <ernesto.picardi@uniba.it>
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice shall be included in
 12 | # all copies or substantial portions of the Software.
 13 | #
 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | # THE SOFTWARE.
 21 | 
 22 | """
 23 | To do: filtering according to strand of positions in table file
 24 | """
 25 | 
 26 | import sys, time, getopt, string, os, random
 27 | try: import pysam
 28 | except: sys.exit('Pysam module not found.')
 29 | 
 30 | pid=str(os.getpid()+random.randint(0,999999999))
 31 | 
 32 | def usage():
 33 | 	print """
 34 | USAGE: python FilterTable.py [options]
 35 | Options:
 36 | -i		Table file
 37 | -f		Sorted file with positions to filter in
 38 | -s		Sorted file with positions to filter out
 39 | -F		Features to filter in (separated by comma)
 40 | -S		Features to filter out (separated by comma)
 41 | -E		Exclude positions filtered out
 42 | -o		Save filtered lines to a file [stdout]
 43 | -p		Print simple statistics
 44 | -h		Print this help
 45 | 
 46 | """
 47 | 
 48 | try:
 49 | 	opts, args = getopt.getopt(sys.argv[1:], 'i:o:f:hs:F:S:Ep',["help"])
 50 | except getopt.GetoptError, err:
 51 | 	print str(err) 
 52 | 	usage()
 53 | 	sys.exit()
 54 | 
 55 | if len(opts)==0:
 56 | 	usage()
 57 | 	sys.exit()
 58 | tablefile,outfile='',''
 59 | ffile,ofile='',''
 60 | save,ff,fo,exp,ps=0,0,0,0,0
 61 | infeat,outfeat=[],[]
 62 | for o,a in opts:
 63 | 	if o in ("-h","--help"):
 64 | 		usage()
 65 | 		sys.exit()
 66 | 	elif o == "-i":
 67 | 		tablefile = a
 68 | 		if not os.path.exists(tablefile):
 69 | 			usage()
 70 | 			sys.exit('Table file not found')
 71 | 	elif o == "-o":
 72 | 		outfile = a
 73 | 		save=1
 74 | 	elif o == "-s":
 75 | 		ofile = a
 76 | 		fo=1
 77 | 		if ofile=='':
 78 | 			usage()
 79 | 			sys.exit('Sorted file with positions to filter out not found.')		
 80 | 	elif o == "-f":
 81 | 		ffile = a
 82 | 		ff=1
 83 | 		if ffile=='':
 84 | 			usage()
 85 | 			sys.exit('Sorted file with positions to filter in not found.')		
 86 | 	elif o == "-F":
 87 | 		infeat=[x.lower() for x in a.split(',')]
 88 | 	elif o == "-S":
 89 | 		outfeat=[x.lower() for x in a.split(',')]
 90 | 	elif o == "-E": exp=1
 91 | 	elif o == "-p": ps=1	
 92 | 	else:
 93 | 		assert False, "unhandled option"
 94 | 
 95 | # Funzioni
 96 | def filterIn(chr,exfeat,pos):
 97 | 	if len(exfeat)==0: return 1
 98 | 	if ff and not chr in contigf: return 0 
 99 | 	elif not ff: return 1
100 | 	res=[(kk.feature).lower() for kk in tabixf.fetch(reference=chr,start=pos,end=pos+1,parser=pysam.asGTF())]
101 | 	for i in exfeat:
102 | 		if i in res: return 1
103 | 	return 0
104 | 
105 | def filterOut(chr,exfeat,pos):
106 | 	if len(exfeat)==0: return 0
107 | 	if fo and not chr in contigo: return 0
108 | 	elif not fo: return 0 
109 | 	res=[(kk.feature).lower() for kk in tabixo.fetch(reference=chr,start=pos,end=pos+1,parser=pysam.asGTF())]	
110 | 	for i in exfeat:
111 | 		if i in res: return 1
112 | 	return 0
113 | 	
114 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
115 | sys.stderr.write("Script time --> START: %s\n" %(script_time))
116 | 
117 | if fo:
118 | 	if not os.path.exists(ofile+'.tbi'):
119 | 		sys.stderr.write('Indexing %s file.\n' %(ofile))
120 | 		ofile=pysam.tabix_index(ofile, preset='gff')
121 | if ff:
122 | 	if not os.path.exists(ffile+'.tbi'):
123 | 		sys.stderr.write('Indexing %s file.\n' %(ffile))
124 | 		ffile=pysam.tabix_index(ffile, preset='gff')
125 | 
126 | if fo:
127 | 	tabixo=pysam.Tabixfile(ofile)
128 | 	contigo=tabixo.contigs
129 | if ff:
130 | 	tabixf=pysam.Tabixfile(ffile)
131 | 	contigf=tabixf.contigs
132 | 	
133 | sys.stderr.write('Reading Table file...\n')
134 | if save: o=open(outfile,'w')
135 | f=open(tablefile)
136 | y,x,xx=0,0,0
137 | for i in f:
138 | 	if i.strip()=='': continue
139 | 	if i.startswith('#'): continue
140 | 	if i.startswith('Region'):
141 | 		if save: o.write(i.strip()+'\n')
142 | 		else: sys.stdout.write(i)
143 | 		continue
144 | 	l=(i.strip('\n')).split('\t')
145 | 	xx+=1
146 | 	reg,pos = l[0],int(l[1]) # sottrarre -1 per la ricerca nella tabella
147 | 	fin=filterIn(reg,infeat,pos-1)
148 | 	fout=filterOut(reg,outfeat,pos-1)
149 | 	if fin:
150 | 		if fout:
151 | 			x+=1
152 | 			if exp: continue
153 | 			if save: o.write('#'+i)
154 | 			else: sys.stdout.write('#'+i)
155 | 		else:
156 | 			y+=1
157 | 			if save: o.write(i)
158 | 			else: sys.stdout.write(i)				
159 | 	else:
160 | 		x+=1
161 | 		if exp: continue
162 | 		if save: o.write('#'+i)
163 | 		else: sys.stdout.write('#'+i)
164 | 	
165 | f.close()
166 | if save: o.close()
167 | if ff: tabixf.close()
168 | if fo: tabixo.close()
169 | if ps:
170 | 	sys.stdout.write("All positions: %i\n" %(xx))
171 | 	sys.stdout.write("Positions filtered in: %i\n" %(y))
172 | 	sys.stdout.write("Positions filtered out: %i\n" %(x))	
173 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
174 | sys.stderr.write("Script time --> END: %s\n" %(script_time))
175 | 
176 | 


--------------------------------------------------------------------------------
/accessory/GFFtoTabix.py:
--------------------------------------------------------------------------------
  1 | #!/home/epicardi/bin/python27/bin/python
  2 | # Copyright (c) 2013-2014 Ernesto Picardi <ernesto.picardi@uniba.it>
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice shall be included in
 12 | # all copies or substantial portions of the Software.
 13 | #
 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | # THE SOFTWARE.
 21 | 
 22 | import sys, os, getopt, time, random, heapq, shutil
 23 | from tempfile import gettempdir
 24 | from itertools import islice, cycle
 25 | from collections import namedtuple
 26 | from operator import itemgetter
 27 | try: import pysam
 28 | except: sys.exit('Pysam module not found.')
 29 | 
 30 | version='1.0'
 31 | pid=str(os.getpid()+random.randint(0,999999999))
 32 | 
 33 | def usage():
 34 | 	print """
 35 | USAGE: python GFFtoTabix.py [options]
 36 | Options:
 37 | -i		GFF file
 38 | -S		Do not sort GFF (sort by default)
 39 | -b		Buffer size (as number of lines) [32000]
 40 | -t		Temporary directory to use (multiple -t may be used)
 41 | -u		Save an uncompressed GFF copy (add _copy suffix)
 42 | -h		Print this help
 43 | 
 44 | """
 45 | 
 46 | try:
 47 | 	opts, args = getopt.getopt(sys.argv[1:], "i:Sb:t:hu",["help"])
 48 | 	if len(opts)==0:
 49 | 		usage()
 50 | 		sys.exit(2)
 51 | except getopt.GetoptError as err:
 52 | 	print str(err) # will print something like "option -a not recognized"
 53 | 	usage()
 54 | 	sys.exit(2)
 55 | 
 56 | GFFfile=''
 57 | buffer_size=32000
 58 | tempdirs=[]
 59 | sort=1
 60 | mc=0 # save an uncompressed GFF copy, default no
 61 | for o, a in opts:
 62 | 	if o in ("-h","--help"):
 63 | 		usage()
 64 | 		sys.exit()
 65 | 	elif o == "-i":
 66 | 		GFFfile=a
 67 | 		outfile='.'.join(GFFfile.split('.')[:-1])+'.sorted.gff'
 68 | 		if not os.path.exists(GFFfile):
 69 | 			usage()
 70 | 			sys.exit('GFF file not found')
 71 | 	elif o == "-b": buffer_size=int(a)
 72 | 	elif o == "-t": tempdirs.append(a)
 73 | 	elif o == "-S": sort=0
 74 | 	elif o == "-u": mc=1
 75 | 	else:
 76 | 		assert False, "Unhandled Option"
 77 | 
 78 | Keyed = namedtuple("Keyed", ["key", "obj"])
 79 | key_=eval('lambda line : (%s)' %('line[:]'))
 80 | 
 81 | def gk(key,obj):
 82 | 	ik=itemgetter(0,3,4)(obj.split('\t'))
 83 | 	return key((ik[0],int(ik[1]),int(ik[2])))
 84 | 
 85 | def merge(key=None, *iterables):
 86 | 	# based on code posted by Scott David Daniels in c.l.p.
 87 | 	# http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d
 88 | 	#print iterables
 89 | 	if key is None:
 90 | 		keyed_iterables = iterables
 91 | 	else:
 92 | 		keyed_iterables = [(Keyed(gk(key,obj), obj) for obj in iterable) for iterable in iterables]
 93 | 		#print keyed_iterables
 94 | 	for element in heapq.merge(*keyed_iterables):
 95 | 		yield element.obj
 96 | 
 97 | def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None):
 98 | 	if tempdirs is None:
 99 | 		tempdirs = []
100 | 	if not tempdirs:
101 | 		tempdirs.append(gettempdir())
102 | 	chunks = []
103 | 	xx=0
104 | 	try:
105 | 		with open(input,'rb',64*1024) as input_file:
106 | 			input_iterator = iter(input_file)
107 | 			for tempdir in cycle(tempdirs):
108 | 				current_chunk2=[]
109 | 				for j in islice(input_iterator,buffer_size):
110 | 					l=(j.strip()).split('\t')
111 | 					l[3]=int(l[3])
112 | 					l[4]=int(l[4])
113 | 					current_chunk2.append(l)
114 | 				current_chunk3=[]
115 | 				for j in sorted(current_chunk2, key=itemgetter(0,3,4)):
116 | 					j[3]=str(j[3])
117 | 					j[4]=str(j[4])
118 | 					current_chunk3.append('\t'.join(j)+'\n')
119 | 				xx+=len(current_chunk3)
120 | 				if not current_chunk3: break
121 | 				sys.stdout.write("Loaded and sorted %i lines.\n"%(xx))
122 | 				output_chunk = open(os.path.join(tempdir,'%06i_%s'%(len(chunks),pid)),'w+b',64*1024)
123 | 				chunks.append(output_chunk)
124 | 				output_chunk.writelines(current_chunk3)
125 | 				output_chunk.flush()
126 | 				output_chunk.seek(0)
127 | 		sys.stdout.write("Merging from %i files.\n"%(len(chunks)))
128 | 		with open(output,'wb',64*1024) as output_file:
129 | 			output_file.writelines(merge(key, *chunks))
130 | 	finally:
131 | 		for chunk in chunks:
132 | 			try:
133 | 				chunk.close()
134 | 				os.remove(chunk.name)
135 | 			except Exception:
136 | 				pass
137 | 
138 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
139 | sys.stdout.write("Script time --> START: %s\n"%(script_time))
140 | if sort:
141 | 	sys.stdout.write("Sorting GFF file...\n")
142 | 	batch_sort(GFFfile,outfile,key_,buffer_size,tempdirs)
143 | 	GFFfile=outfile
144 | if mc:
145 | 	copyfile=GFFfile+'_copy'
146 | 	shutil.copyfile(GFFfile,copyfile)
147 | 	sys.stdout.write("A copy of uncompressed GFF file has been saved on %s.\n" %(copyfile))
148 | sys.stdout.write("Indexing GFF file...\n")
149 | GFFfile=pysam.tabix_index(GFFfile, preset='gff')
150 | sys.stdout.write("Tabix file saved on %s.\n" %(GFFfile))
151 | sys.stdout.write("Indices saved on %s.tbi.\n" %(GFFfile))
152 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
153 | sys.stdout.write("Script time --> END: %s\n"%(script_time))


--------------------------------------------------------------------------------
/accessory/Readme.md:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
 2 |   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 3 | <html xmlns="http://www.w3.org/1999/xhtml">
 4 |   <head>
 5 |     <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />  
 6 |   </head>
 7 |   <body>
 8 | <h1>get_DE_events.py</h1>
 9 | <h5>This scripts and its related files are part of the supplemental material for the paper<br>
10 |   "Investigating RNA editing in deep transcriptome datasets with REDItools and REDIportal"</h5>
11 | <p class-text="justify">
12 | For control case studies by launching the get_DE_events.py script the user can filter REDItoolDnaRna.py outputs according to the following criteria:
13 | <ul>
14 | <li>RNAseq coverage per position (default <b>10 reads</b>)</li>
15 | <li>Minimum editing frequency per position (default <b>10%</b>)</li>
16 | For each editing candidate, the script applies the Mann–Whitney test to check the significance between the two conditions, 
17 | control and HD. By default the test is carried out only if the number of editing events per position is equal to 50% of the samples per group. 
18 | Optionally, p-values can be corrected using Benjamini–Hochberg or Bonferroni tests. 
19 | </ul>
20 | <p>Usage:</p> 
21 | <pre>
22 | get_DE_events.py [-h] [-c MIN_COVERAGE] [-cpval PVALUE_CORRECTION]
23 |                         [-input_file SAMPLES_INFORMATIONS_FILE]
24 |                         [-f MIN_EDIT_FREQUENCY] [-mts MIN_SAMPLE_TESTING]
25 |                         [-sig ONLY_SIGNIFICANT] [-linear]
26 |   
27 | optional arguments:
28 |   -h, --help                             show this help message and exit
29 |   -c MIN_COVERAGE                        Coverage-q30
30 |   -cpval PVALUE_CORRECTION 1 -->         Bonferroni correction / 2 --> Benjamini hochberg
31 |   -input_file SAMPLES_INFORMATIONS_FILE  Comma separated file e.g: <b>Sample,Status</b>
32 |   -f MIN_EDIT_FREQUENCY                  Editing Frequency
33 |   -mts MIN_SAMPLE_TESTING                min percentage of each sample category
34 |   -sig ONLY_SIGNIFICANT                  Return only significant editing events 
35 |                                          (if -cpval flag is activated)
36 |   -linear                                Calculate differential RNA editing according to Tran et al. (2019)
37 |                                                                                         
38 | <b>e.g.</b> python ../REDItools/accessory/get_DE_events.py -cpval 2 -input_file  sample_information.csv -sig yes
39 | <p class-text="justify">The script will filter REDItoolDnaRna.py outputs for each sample contained in the 
40 | SAMPLES_INFORMATIONS_FILE returning only significant editing events (pval <= 0.05)
41 | in accordance with Benjamini hochberg correction.</p>
42 | 
43 | </pre>
44 | </body>
45 | </html> 
46 | 


--------------------------------------------------------------------------------
/accessory/SearchInTable.py:
--------------------------------------------------------------------------------
  1 | #!/home/epicardi/bin/python27/bin/python
  2 | # Copyright (c) 2013-2014 Ernesto Picardi <ernesto.picardi@uniba.it>
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice shall be included in
 12 | # all copies or substantial portions of the Software.
 13 | #
 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | # THE SOFTWARE.
 21 | 
 22 | import sys, os, getopt, time
 23 | try: import pysam
 24 | except: sys.exit('Pysam module not found.')
 25 | #pid=str(os.getpid()+random.randint(0,999999999))
 26 | 
 27 | def usage():
 28 | 	print """
 29 | USAGE: python SearchInTable.py [options]
 30 | Options:
 31 | -i		Sorted table file (first col=reference; second col=coordinate 1 based)
 32 | 		or tabix indexed table (ending with .gz)
 33 | -q		Query (file or single positions: chr21:123456)
 34 | -C		Sequence name column [1]
 35 | -S		Start column [2]
 36 | -E		End column; can be identical to '-S' [2]
 37 | -P		Print to stdout found lines
 38 | -p		Print position header (like a fasta header >chr21:123456)
 39 | -n		Print "Not found"
 40 | -s		Print simple statistics on standard error
 41 | -k		Skip lines starting with in query file
 42 | -o		Save found/not found positions on file
 43 | -h		Print this help
 44 | 
 45 | """
 46 | #-k		skip first INT lines [0]
 47 | 
 48 | try:
 49 | 	opts, args = getopt.getopt(sys.argv[1:], "i:q:k:pso:hnC:S:E:O:P",["help"])
 50 | 	if len(opts)==0:
 51 | 		usage()
 52 | 		sys.exit(2)
 53 | except getopt.GetoptError as err:
 54 | 	print str(err) # will print something like "option -a not recognized"
 55 | 	usage()
 56 | 	sys.exit(2)
 57 | 
 58 | tablefile=''
 59 | query=''
 60 | outfile=''
 61 | outfile2=''
 62 | pr,prn,prf=0,0,0
 63 | ps=0
 64 | sv,sv2=0,0
 65 | sk=0
 66 | ski=''
 67 | skil=0
 68 | 
 69 | scol,stcol,ecol=0,1,1
 70 | for o, a in opts:
 71 | 	if o in ("-h","--help"):
 72 | 		usage()
 73 | 		sys.exit()
 74 | 	elif o == "-i":
 75 | 		tablefile=a
 76 | 		if not os.path.exists(tablefile):
 77 | 			usage()
 78 | 			sys.exit('Table file not found')
 79 | 	elif o == "-q":
 80 | 		query=a
 81 | 		if query=='':
 82 | 			usage()
 83 | 			sys.exit('Query not found.')
 84 | 	elif o == "-p": pr=1
 85 | 	elif o == "-C": scol=int(a)-1
 86 | 	elif o == "-S": stcol=int(a)-1
 87 | 	elif o == "-E": ecol=int(a)-1	
 88 | 	elif o == "-n": prn=1
 89 | 	elif o == "-P": prf=1
 90 | 	elif o == "-k":
 91 | 		ski=a
 92 | 		skil=1
 93 | 	elif o == "-s": ps=1
 94 | 	elif o == "-o":
 95 | 		outfile=a
 96 | 		sv=1
 97 | 	elif o == "-O":
 98 | 		outfile2=a
 99 | 		sv2=1
100 | 	else:
101 | 		assert False, "Unhandled Option"
102 | 
103 | 
104 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
105 | sys.stdout.write("Script time --> START: %s\n"%(script_time))
106 | if not os.path.exists(tablefile):
107 | 	#sys.stderr.write('Compressing table file.\n')
108 | 	#pysam.tabix_index(tablefile, tablefile+'.gz')
109 | 	sys.stderr.write('Indexing table file.\n')
110 | 	tablefile=pysam.tabix_index(tablefile, seq_col=scol, start_col=stcol, end_col=ecol)
111 | #if tablefile.endswith('.gz') and not tablefile.endswith('.tbi'):
112 | #	tablefile=pysam.tabix_index(tablefile, seq_col=scol, start_col=stcol, end_col=ecol)
113 | 
114 | tabix=pysam.Tabixfile(tablefile)
115 | allref=tabix.contigs
116 | positions=[]
117 | if os.path.exists(query):
118 | 	f=open(query)
119 | 	for i in f:
120 | 		if i.strip()=='': continue
121 | 		if i.startswith('#'): continue
122 | 		if i.startswith('Region'): continue
123 | 		if skil:
124 | 			if i.startswith(ski): continue
125 | 		l=(i.strip()).split()
126 | 		positions.append((l[0],int(l[1])-1))
127 | 	f.close()
128 | elif query.count(":")==1:
129 | 	l=(query.strip()).split(':')
130 | 	positions.append((l[0],int(l[1])-1))
131 | else: sys.exit('I cannot read the query.')
132 | 
133 | if sv:
134 | 	outf=open(outfile+'_found','w')
135 | 	outnf=open(outfile+'_notfound','w')
136 | if sv2:
137 | 	outf2=open(outfile2+'_foundInSortedTable','w')
138 | xx=0
139 | for pos in positions:
140 | 	res=[]
141 | 	if pos[0] in allref:
142 | 		res=[kk for kk in tabix.fetch(reference=pos[0],start=pos[1],end=pos[1]+1)]
143 | 	if pr: sys.stdout.write('>%s:%i\n' %(pos[0],pos[1]+1))
144 | 	if len(res)==0:
145 | 		if prn: sys.stdout.write('Not Found\n')
146 | 		if sv: outnf.write('%s\t%i\n' %(pos[0],pos[1]+1))
147 | 	else:
148 | 		#if sv: outf.write(res[0]+'\n')
149 | 		if sv: outf.write(res[0]+'\n')
150 | 		if prf: sys.stdout.write(res[0]+'\n')
151 | 		xx+=1	
152 | tabix.close()
153 | if sv:
154 | 	outf.close()
155 | 	outnf.close()
156 | if ps:
157 | 	sys.stdout.write('Positions in query: %i\n' %(len(positions)))
158 | 	sys.stdout.write('Positions found: %i\n' %(xx))
159 | 	sys.stdout.write('Positions not found: %i\n' %(len(positions)-xx))
160 | if sv:
161 | 	sys.stdout.write('Found line(s) saved on: %s\n' %(outfile+'_found'))
162 | 	sys.stdout.write('Not found line(s) saved on: %s\n' %(outfile+'_notfound'))
163 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
164 | sys.stdout.write("Script time --> END: %s\n"%(script_time))


--------------------------------------------------------------------------------
/accessory/SortGFF.py:
--------------------------------------------------------------------------------
  1 | #!/home/epicardi/bin/python27/bin/python
  2 | # Copyright (c) 2013-2014 Ernesto Picardi <ernesto.picardi@uniba.it>
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice shall be included in
 12 | # all copies or substantial portions of the Software.
 13 | #
 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | # THE SOFTWARE.
 21 | 
 22 | ## {{{ http://code.activestate.com/recipes/576755/ (r3)
 23 | # based on Recipe 466302: Sorting big files the Python 2.4 way
 24 | # by Nicolas Lehuen
 25 | 
 26 | # Works on python 2.7+ no 3.x
 27 | 
 28 | import sys, os, getopt, heapq, time, random
 29 | from tempfile import gettempdir
 30 | from itertools import islice, cycle
 31 | from collections import namedtuple
 32 | from operator import itemgetter
 33 | 
 34 | version='1.0'
 35 | pid=str(os.getpid()+random.randint(0,999999999))
 36 | 
 37 | def usage():
 38 | 	print """
 39 | USAGE: python SortGFF.py [options]
 40 | Options:
 41 | -i		GFF file
 42 | -o		Sorted output file [GFF_sorted_%s]
 43 | -b		Buffer size (as number of lines) [32000]
 44 | -t		Temporary directory to use (multiple -t may be used)
 45 | -h		Print this help
 46 | 
 47 | """%(pid)
 48 | 
 49 | try:
 50 | 	opts, args = getopt.getopt(sys.argv[1:], "i:o:b:t:h",["help"])
 51 | 	if len(opts)==0:
 52 | 		usage()
 53 | 		sys.exit(2)
 54 | except getopt.GetoptError as err:
 55 | 	print str(err) # will print something like "option -a not recognized"
 56 | 	usage()
 57 | 	sys.exit(2)
 58 | 
 59 | GFFfile=''
 60 | outfile='GFF_sorted_%s' %(pid)
 61 | buffer_size=32000
 62 | tempdirs=[]
 63 | for o, a in opts:
 64 | 	if o in ("-h","--help"):
 65 | 		usage()
 66 | 		sys.exit()
 67 | 	elif o == "-i":
 68 | 		GFFfile=a
 69 | 		if not os.path.exists(GFFfile):
 70 | 			usage()
 71 | 			sys.exit('GFF file not found')
 72 | 	elif o == "-o": outfile=a
 73 | 	elif o == "-b": buffer_size=int(a)
 74 | 	elif o == "-t": tempdirs.append(a)	
 75 | 	else:
 76 | 		assert False, "Unhandled Option"
 77 | 
 78 | Keyed = namedtuple("Keyed", ["key", "obj"])
 79 | key_=eval('lambda line : (%s)' %('line[:]'))
 80 | 
 81 | def gk(key,obj):
 82 | 	ik=itemgetter(0,3,4)(obj.split('\t'))
 83 | 	return key((ik[0],int(ik[1]),int(ik[2])))
 84 | 
 85 | def merge(key=None, *iterables):
 86 | 	# based on code posted by Scott David Daniels in c.l.p.
 87 | 	# http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d
 88 | 	#print iterables
 89 | 	if key is None:
 90 | 		keyed_iterables = iterables
 91 | 	else:
 92 | 		keyed_iterables = [(Keyed(gk(key,obj), obj) for obj in iterable) for iterable in iterables]
 93 | 		#print keyed_iterables
 94 | 	for element in heapq.merge(*keyed_iterables):
 95 | 		yield element.obj
 96 | 
 97 | def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None):
 98 | 	if tempdirs is None:
 99 | 		tempdirs = []
100 | 	if not tempdirs:
101 | 		tempdirs.append(gettempdir())
102 | 	chunks = []
103 | 	xx=0
104 | 	try:
105 | 		with open(input,'rb',64*1024) as input_file:
106 | 			input_iterator = iter(input_file)
107 | 			for tempdir in cycle(tempdirs):
108 | 				current_chunk2=[]
109 | 				for j in islice(input_iterator,buffer_size):
110 | 					l=(j.strip()).split('\t')
111 | 					l[3]=int(l[3])
112 | 					l[4]=int(l[4])
113 | 					current_chunk2.append(l)
114 | 				current_chunk3=[]
115 | 				for j in sorted(current_chunk2, key=itemgetter(0,3,4)):
116 | 					j[3]=str(j[3])
117 | 					j[4]=str(j[4])
118 | 					current_chunk3.append('\t'.join(j)+'\n')
119 | 				xx+=len(current_chunk3)
120 | 				if not current_chunk3: break
121 | 				sys.stdout.write("Loaded and sorted %i lines.\n"%(xx))
122 | 				output_chunk = open(os.path.join(tempdir,'%06i_%s'%(len(chunks),pid)),'w+b',64*1024)
123 | 				chunks.append(output_chunk)
124 | 				output_chunk.writelines(current_chunk3)
125 | 				output_chunk.flush()
126 | 				output_chunk.seek(0)
127 | 		sys.stdout.write("Merging from %i files.\n"%(len(chunks)))
128 | 		with open(output,'wb',64*1024) as output_file:
129 | 			output_file.writelines(merge(key, *chunks))
130 | 	finally:
131 | 		for chunk in chunks:
132 | 			try:
133 | 				chunk.close()
134 | 				os.remove(chunk.name)
135 | 			except Exception:
136 | 				pass
137 | 
138 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
139 | sys.stdout.write("Script time --> START: %s\n"%(script_time))
140 | batch_sort(GFFfile,outfile,key_,buffer_size,tempdirs)
141 | sys.stdout.write("Sorted GFF saved on %s\n"%(outfile))
142 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
143 | sys.stdout.write("Script time --> END: %s\n"%(script_time))


--------------------------------------------------------------------------------
/accessory/SortTable.py:
--------------------------------------------------------------------------------
  1 | #!/home/epicardi/bin/python27/bin/python
  2 | # Copyright (c) 2013-2014 Ernesto Picardi <ernesto.picardi@uniba.it>
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice shall be included in
 12 | # all copies or substantial portions of the Software.
 13 | #
 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | # THE SOFTWARE.
 21 | 
 22 | ## {{{ http://code.activestate.com/recipes/576755/ (r3)
 23 | # based on Recipe 466302: Sorting big files the Python 2.4 way
 24 | # by Nicolas Lehuen
 25 | 
 26 | # Works on python 2.7+ no 3.x
 27 | 
 28 | import sys, os, getopt, heapq, time, random
 29 | from tempfile import gettempdir
 30 | from itertools import islice, cycle
 31 | from collections import namedtuple
 32 | from operator import itemgetter
 33 | 
 34 | version='1.0'
 35 | pid=str(os.getpid()+random.randint(0,999999999))
 36 | 
 37 | def usage():
 38 | 	print """
 39 | USAGE: python SortTable.py [options]
 40 | Options:
 41 | -i		Table file
 42 | -d		Delimiter character [\\t] (default TAB)
 43 | -s		Sequence name column [1]
 44 | -c		Start column [4]
 45 | -e		End column (can be identical to -c) [5]
 46 | -m		Skip lines starting with [#]
 47 | -o		Sorted output file [sortedTable_%s]
 48 | -O		Output as TAB-delimited
 49 | -b		Buffer size (as number of lines) [32000]
 50 | -t		Temporary directory to use (multiple -t may be used)
 51 | -h		Print this help
 52 | 
 53 | """%(pid)
 54 | 
 55 | try:
 56 | 	opts, args = getopt.getopt(sys.argv[1:], "i:o:b:t:hd:s:c:e:m:O",["help"])
 57 | 	if len(opts)==0:
 58 | 		usage()
 59 | 		sys.exit(2)
 60 | except getopt.GetoptError as err:
 61 | 	print str(err) # will print something like "option -a not recognized"
 62 | 	usage()
 63 | 	sys.exit(2)
 64 | 
 65 | GFFfile=''
 66 | outfile='sortedTable_%s' %(pid)
 67 | buffer_size=32000
 68 | tempdirs=[]
 69 | scol=0 # sequence column name
 70 | bcol=3 # start column
 71 | ecol=4 # end column
 72 | schar='#' # skip lines starting with this character
 73 | dchar='\t' # delimiter
 74 | odel=0 # tab-delimited as output
 75 | 
 76 | for o, a in opts:
 77 | 	if o in ("-h","--help"):
 78 | 		usage()
 79 | 		sys.exit()
 80 | 	elif o == "-i":
 81 | 		GFFfile=a
 82 | 		if not os.path.exists(GFFfile):
 83 | 			usage()
 84 | 			sys.exit('GFF file not found')
 85 | 	elif o == "-o": outfile=a
 86 | 	elif o == "-b": buffer_size=int(a)
 87 | 	elif o == "-t": tempdirs.append(a)
 88 | 	elif o == "-m": schar=a
 89 | 	elif o == "-d": dchar=a
 90 | 	elif o == "-s": scol=int(a)-1
 91 | 	elif o == "-c": bcol=int(a)-1
 92 | 	elif o == "-e": ecol=int(a)-1
 93 | 	elif o == "-O": odel=1
 94 | 	else:
 95 | 		assert False, "Unhandled Option"
 96 | 
 97 | Keyed = namedtuple("Keyed", ["key", "obj"])
 98 | key_=eval('lambda line : (%s)' %('line[:]'))
 99 | 
100 | def gk(key,obj):
101 | 	if odel: ik=itemgetter(scol,bcol,ecol)(obj.split('\t'))
102 | 	else: ik=itemgetter(scol,bcol,ecol)(obj.split(dchar))
103 | 	return key((ik[0],int(ik[1]),int(ik[2])))
104 | 
105 | def merge(key=None, *iterables):
106 | 	# based on code posted by Scott David Daniels in c.l.p.
107 | 	# http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d
108 | 	#print iterables
109 | 	if key is None:
110 | 		keyed_iterables = iterables
111 | 	else:
112 | 		keyed_iterables = [(Keyed(gk(key,obj), obj) for obj in iterable) for iterable in iterables]
113 | 		#print keyed_iterables
114 | 	for element in heapq.merge(*keyed_iterables):
115 | 		yield element.obj
116 | 
117 | def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None):
118 | 	if tempdirs is None:
119 | 		tempdirs = []
120 | 	if not tempdirs:
121 | 		tempdirs.append(gettempdir())
122 | 	chunks = []
123 | 	xx=0
124 | 	try:
125 | 		with open(input,'rb',64*1024) as input_file:
126 | 			input_iterator = iter(input_file)
127 | 			for tempdir in cycle(tempdirs):
128 | 				current_chunk2=[]
129 | 				for j in islice(input_iterator,buffer_size):
130 | 					if j.startswith('Region'): continue
131 | 					if j.startswith(schar): continue
132 | 					l=(j.strip()).split(dchar)
133 | 					l[bcol]=int(l[bcol])
134 | 					l[ecol]=int(l[ecol])
135 | 					current_chunk2.append(l)
136 | 				current_chunk3=[]
137 | 				for j in sorted(current_chunk2, key=itemgetter(scol,bcol,ecol)):
138 | 					j[bcol]=str(j[bcol])
139 | 					j[ecol]=str(j[ecol])
140 | 					if odel: current_chunk3.append('\t'.join(j)+'\n')
141 | 					else: current_chunk3.append(dchar.join(j)+'\n')
142 | 				xx+=len(current_chunk3)
143 | 				if not current_chunk3: break
144 | 				sys.stdout.write("Loaded and sorted %i lines.\n"%(xx))
145 | 				output_chunk = open(os.path.join(tempdir,'%06i_%s'%(len(chunks),pid)),'w+b',64*1024)
146 | 				chunks.append(output_chunk)
147 | 				output_chunk.writelines(current_chunk3)
148 | 				output_chunk.flush()
149 | 				output_chunk.seek(0)
150 | 		sys.stdout.write("Merging from %i files.\n"%(len(chunks)))
151 | 		with open(output,'wb',64*1024) as output_file:
152 | 			output_file.writelines(merge(key, *chunks))
153 | 	finally:
154 | 		for chunk in chunks:
155 | 			try:
156 | 				chunk.close()
157 | 				os.remove(chunk.name)
158 | 			except Exception:
159 | 				pass
160 | 
161 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
162 | sys.stdout.write("Script time --> START: %s\n"%(script_time))
163 | batch_sort(GFFfile,outfile,key_,buffer_size,tempdirs)
164 | sys.stdout.write("Sorted GFF saved on %s\n"%(outfile))
165 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
166 | sys.stdout.write("Script time --> END: %s\n"%(script_time))


--------------------------------------------------------------------------------
/accessory/TableToGFF.py:
--------------------------------------------------------------------------------
  1 | #!/home/epicardi/bin/python27/bin/python
  2 | # Copyright (c) 2013-2014 Ernesto Picardi <ernesto.picardi@uniba.it>
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice shall be included in
 12 | # all copies or substantial portions of the Software.
 13 | #
 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | # THE SOFTWARE.
 21 | 
 22 | import sys, os, getopt, time, random, heapq, gzip
 23 | from tempfile import gettempdir
 24 | from itertools import islice, cycle
 25 | from collections import namedtuple
 26 | from operator import itemgetter
 27 | 
 28 | version='1.0'
 29 | pid=str(os.getpid()+random.randint(0,999999999))
 30 | 
 31 | def usage():
 32 | 	print """
 33 | USAGE: python TableToGFF.py [options]
 34 | Options:
 35 | -i		Table file from REDItools
 36 | -s		Sort output GFF
 37 | -t		Tabix output GFF (requires Pysam module)
 38 | -b		Buffer size (as number of lines) [32000] (requires -s)
 39 | -T		Temporary directory (requires -s)
 40 | -o		Outfile [outTable_%s.gff]
 41 | -h		Print this help
 42 | 
 43 | """%(pid)
 44 | 
 45 | try:
 46 | 	opts, args = getopt.getopt(sys.argv[1:], "i:o:sthT:b:",["help"])
 47 | 	if len(opts)==0:
 48 | 		usage()
 49 | 		sys.exit(2)
 50 | except getopt.GetoptError as err:
 51 | 	print str(err) # will print something like "option -a not recognized"
 52 | 	usage()
 53 | 	sys.exit(2)
 54 | 
 55 | tablefile=''
 56 | outfile='outTable_%s.gff' %(pid)
 57 | sort=0
 58 | tabix=0
 59 | buffer_size=32000
 60 | tempdirs=[]
 61 | for o, a in opts:
 62 | 	if o in ("-h","--help"):
 63 | 		usage()
 64 | 		sys.exit()
 65 | 	elif o == "-i":
 66 | 		tablefile=a
 67 | 		if not os.path.exists(tablefile):
 68 | 			usage()
 69 | 			sys.exit('Table file not found')
 70 | 	elif o == "-o": outfile=a
 71 | 	elif o == "-s": sort=1
 72 | 	elif o == "-t": tabix=1
 73 | 	elif o == "-b": buffer_size=int(a)
 74 | 	elif o == "-T": tempdirs.append(a)
 75 | 	else:
 76 | 		assert False, "Unhandled Option"
 77 | 
 78 | #Sorting code from SortGFF.py
 79 | 
 80 | Keyed = namedtuple("Keyed", ["key", "obj"])
 81 | key_=eval('lambda line : (%s)' %('line[:]'))
 82 | 
 83 | def gk(key,obj):
 84 | 	ik=itemgetter(0,3,4)(obj.split('\t'))
 85 | 	return key((ik[0],int(ik[1]),int(ik[2])))
 86 | 
 87 | def merge(key=None, *iterables):
 88 | 	# based on code posted by Scott David Daniels in c.l.p.
 89 | 	# http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d
 90 | 	#print iterables
 91 | 	if key is None:
 92 | 		keyed_iterables = iterables
 93 | 	else:
 94 | 		keyed_iterables = [(Keyed(gk(key,obj), obj) for obj in iterable) for iterable in iterables]
 95 | 		#print keyed_iterables
 96 | 	for element in heapq.merge(*keyed_iterables):
 97 | 		yield element.obj
 98 | 
 99 | def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None):
100 | 	if tempdirs is None:
101 | 		tempdirs = []
102 | 	if not tempdirs:
103 | 		tempdirs.append(gettempdir())
104 | 	chunks = []
105 | 	xx=0
106 | 	try:
107 | 		with open(input,'rb',64*1024) as input_file:
108 | 			input_iterator = iter(input_file)
109 | 			for tempdir in cycle(tempdirs):
110 | 				current_chunk2=[]
111 | 				for j in islice(input_iterator,buffer_size):
112 | 					l=(j.strip()).split('\t')
113 | 					l[3]=int(l[3])
114 | 					l[4]=int(l[4])
115 | 					current_chunk2.append(l)
116 | 				current_chunk3=[]
117 | 				for j in sorted(current_chunk2, key=itemgetter(0,3,4)):
118 | 					j[3]=str(j[3])
119 | 					j[4]=str(j[4])
120 | 					current_chunk3.append('\t'.join(j)+'\n')
121 | 				xx+=len(current_chunk3)
122 | 				if not current_chunk3: break
123 | 				sys.stdout.write("Loaded and sorted %i lines.\n"%(xx))
124 | 				output_chunk = open(os.path.join(tempdir,'%06i_%s'%(len(chunks),pid)),'w+b',64*1024)
125 | 				chunks.append(output_chunk)
126 | 				output_chunk.writelines(current_chunk3)
127 | 				output_chunk.flush()
128 | 				output_chunk.seek(0)
129 | 		sys.stdout.write("Merging from %i files.\n"%(len(chunks)))
130 | 		with open(output,'wb',64*1024) as output_file:
131 | 			output_file.writelines(merge(key, *chunks))
132 | 	finally:
133 | 		for chunk in chunks:
134 | 			try:
135 | 				chunk.close()
136 | 				os.remove(chunk.name)
137 | 			except Exception:
138 | 				pass
139 | #END sorting code
140 | 
141 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
142 | sys.stdout.write("Script time --> START: %s\n"%(script_time))
143 | sys.stdout.write("Reading table...\n")
144 | if tablefile.endswith('.gz'): f=gzip.open(tablefile,'rb')
145 | else: f=open(tablefile)
146 | o=open(outfile,'w')
147 | xx=0
148 | #chr21	10205589	C	0	12	34.75	[0, 3, 0, 9]	CT	0.75	16	28.56	[0, 16, 0, 0]	-	0.00	-
149 | for i in f:
150 | 	if i.startswith('Region'): continue
151 | 	if i.strip()=='': continue
152 | 	l=(i.strip()).split('\t')
153 | 	strand='+'
154 | 	if l[3]=='0': strand='-'
155 | 	gffLine=[l[0],'reditoolTable','pos',l[1],l[1],'.',strand,'.',l[0]+'-'+l[1]]
156 | 	o.write('\t'.join(gffLine)+'\n')
157 | 	xx+=1
158 | f.close()
159 | o.close()
160 | sys.stdout.write("Converted %i lines.\n"%(xx))
161 | sys.stdout.write("GFF saved on %s\n"%(outfile))
162 | if sort:
163 | 	sys.stdout.write("Sorting GFF file...\n")
164 | 	outfileS='.'.join(outfile.split('.')[:-1])+'.sorted.gff'
165 | 	batch_sort(outfile,outfileS,key_,buffer_size,tempdirs)
166 | 	outfile=outfileS
167 | 	sys.stdout.write("Sorted GFF saved on %s\n"%(outfileS))
168 | if tabix:
169 | 	try:
170 | 		import pysam
171 | 		sys.stdout.write("Indexing GFF file...\n")
172 | 		outfileS=pysam.tabix_index(outfile, preset='gff')
173 | 		sys.stdout.write("Tabix file saved on %s.\n" %(outfileS))
174 | 		sys.stdout.write("Indices saved on %s.tbi.\n" %(outfileS))
175 | 	except: sys.exit('Pysam module not found.\nTabix indexing not available.')
176 | 
177 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
178 | sys.stdout.write("Script time --> END: %s\n"%(script_time))
179 | 
180 | 


--------------------------------------------------------------------------------
/accessory/get_DE_events.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #################################### REDI OUT TABLE ########################################################
  3 | #Region		Position	Reference	Strand	Coverage-q30	MeanQ	BaseCount[A,C,G,T]	   #
  4 | #AllSubs	Frequency	gCoverage-q30	gMeanQ	gBaseCount[A,C,G,T]	gAllSubs	gFrequency #
  5 | ############################################################################################################
  6 | 
  7 | ###################################GET_DE_events_table###########################################################
  8 | #chromosome	position	type_editing	SRR3306830_CTRL		SRR3306831_CTRL		SRR3306832_CTRL #	
  9 | #SRR3306833_CTRL	SRR3306834_CTRL	SRR3306835_CTRL   SRR3306836_CTRL	SRR3306823_DIS	SRR3306824_DIS	#
 10 | #SRR3306825_DIS	SRR3306826_DIS	SRR3306827_DIS	SRR3306828_DIS	SRR3306829_DIS	[num_controls/num_disease]	#
 11 | #delta_diff	pvalue (Mannwhitney)       									#
 12 | #################################################################################################################
 13 | 
 14 | import os, sys, argparse
 15 | from scipy import stats
 16 | from scipy.stats import wilcoxon, mannwhitneyu, fisher_exact
 17 | import numpy as np
 18 | import pandas as pd
 19 | import math
 20 | 
 21 | parser = argparse.ArgumentParser()
 22 | parser.add_argument("-c", action = 'store', dest = 'min_coverage', 
 23 | 		type = int, default=10,  help='Coverage-q30')
 24 | parser.add_argument("-cpval", action = 'store', dest = 'pvalue_correction',
 25 |                 type = int, default = 0, help = '1 --> Bonferroni correction / 2 --> Benjamini hochberg')
 26 | parser.add_argument("-input_file", action = 'store', dest = 'samples_informations_file',
 27 | 		type = str, default= 'empty', help = 'Comma separated file  e.g: SRR3306830,Control \
 28 | 		SRR3306829,Healthy...etc')
 29 | parser.add_argument("-f", action = 'store', dest = 'min_edit_frequency',
 30 | 		type = float, default=0.1, help='Editing Frequency')
 31 | parser.add_argument("-mts", action = 'store', dest = 'min_sample_testing',
 32 | 		type = float, default=50.0, help="min percentage of each sample category")
 33 | parser.add_argument("-sig", action = 'store', dest = 'only_significant',
 34 | 		type = str, default = 'no', help = 'Return only significant editing events')
 35 | parser.add_argument("-linear", action = 'store_true', help = 'Enable linear model')
 36 | 
 37 | args = parser.parse_args()
 38 | min_coverage = args.min_coverage
 39 | min_edit_frequency = args.min_edit_frequency
 40 | min_sample_testing = args.min_sample_testing
 41 | only_significants = args.only_significant
 42 | pvalue_correction = args.pvalue_correction
 43 | samples_informations_file = args.samples_informations_file
 44 | enable_linear_model = args.linear
 45 | 
 46 | if args.samples_informations_file == 'empty':
 47 | 	parser.error('sample_informations_file is MISSING!' + '\n' + \
 48 | 	'Please type "python get_DE_events.py -h" for more details on usage of this script')
 49 | 
 50 | 
 51 | def call_differential_editing_sites(config_file):
 52 | 	stability_value = 0.03 #value below which you may use a lower coverage for adding more samples to increase power
 53 | 	min_disease_people = 5 #min number people supporting higher coverage for whch you may base stability off measurements off of
 54 | 	min_control_people = 5  #min number control poeple supporting higher coverage for which you may base stability off of
 55 | 	min_disease_people_5_cov = 10 #min disease number of people of 5 coverage you must have if needing to use unstable 5x coverage
 56 | 	min_control_people_5_cov = 10 #min control number of people of 5 coverage you must have if needing to use unstable 5x coverage
 57 | 	editing_file= './temp.csv'
 58 | 	output_file = './editing_sites.with_stats_converted_disease.csv'
 59 | 	#read in files
 60 | 	editing_table = pd.read_csv(editing_file,sep='\t')
 61 | 	#config_table = pd.read_csv(config_file,sep=',',header=None)
 62 | 	config_table = pd.read_csv(config_file,sep=',',skiprows=1,header=None)
 63 | 	all_people = config_table[0]
 64 | 	disease_people = config_table[0][config_table[1] == "DIS"].reset_index(drop = True) #TODO Change do disease!!!
 65 | 	control_people = config_table[0][config_table[1] == "CTRL"].reset_index(drop = True) #TODO Change to control!!!
 66 | 
 67 | 	#now get just an editing table and coverage table
 68 | 	edit_level_table = editing_table[all_people]
 69 | 	#edit_level_table = editing_table[np.r_[all_people]]
 70 | 
 71 | 	def get_editing_levels_for_cov_table(i):
 72 | 	  info = i.astype(str).str.split(pat="\\^")
 73 | 	  editing_levels = info.apply(lambda x: float('nan') if x[0] == "nan" else x[2])
 74 | 	  return editing_levels
 75 | 	cov_table = edit_level_table.apply(get_editing_levels_for_cov_table)
 76 | 	cov_table = cov_table.apply(lambda x: pd.to_numeric(x)) #TODO check if as.numeric and pandas to_numeric do the same.
 77 | 
 78 | 	def get_editing_levels(i):
 79 | 	  info = i.astype(str).str.split(pat="\\^")
 80 | 	  editing_levels = info.apply(lambda x: float('nan') if x[0] == "nan" else x[0])
 81 | 	  return editing_levels
 82 | 	edit_level_table = edit_level_table.apply(get_editing_levels)
 83 | 	edit_level_table = edit_level_table.apply(lambda x: pd.to_numeric(x)) #TODO check precision on R and python
 84 | 
 85 | 	#go down line by line and get the prevalence info and mean editing levels based off of stable coverages
 86 | 	#WARNING I'm using float here, not integer allowing NaN values. Is ok?
 87 | 	coverage_threshold_used = np.repeat(0.,edit_level_table.shape[0]) #will hold the coverage threshold required for this editing site
 88 | 	stability_based_on = np.repeat(0.,edit_level_table.shape[0]) #will hold what coverage stability requirements were determined
 89 | 	stable_mean_disease_editing_level = np.repeat(0.,edit_level_table.shape[0]) #mean autistic editing level using individuals passing coverage threshold
 90 | 	stable_std_dev_disease_editing_level = np.repeat(0.,edit_level_table.shape[0]) #standard deviation of autistic editing level using individuals passing coverage threshold
 91 | 	stable_mean_control_editing_level = np.repeat(0.,edit_level_table.shape[0]) #mean control editing level using individuals passing coverage threshold
 92 | 	stable_std_dev_control_editing_level = np.repeat(0.,edit_level_table.shape[0]) #standard deviation of control editing level using individuals passing coverage threshold
 93 | 	stable_number_disease_with_at_least_min_coverage = np.repeat(0.,edit_level_table.shape[0]) #number of autistic individuals passing the coverage threshold
 94 | 	stable_number_disease_nonzero_editing_and_min_coverage = np.repeat(0.,edit_level_table.shape[0]) #number of autistic individuals without non zero editing level and passing coverage threshold
 95 | 	stable_disease_prevalence = np.repeat(0.,edit_level_table.shape[0]) #proportion autistic individuals with nonzero editing
 96 | 	stable_number_control_with_at_least_min_coverage = np.repeat(0.,edit_level_table.shape[0]) #same as disease but for control subjects
 97 | 	stable_number_control_nonzero_editing_and_min_coverage = np.repeat(0.,edit_level_table.shape[0])
 98 | 	stable_control_prevalence = np.repeat(0.,edit_level_table.shape[0])
 99 | 	stable_total_number_individuals_nonzero_editing_and_min_coverage = np.repeat(0.,edit_level_table.shape[0]) #total number of disease and control subjects passing the coverage threshold and having nonzero editing level
100 | 	stable_mann_whitney_p_value = np.repeat(0.,edit_level_table.shape[0]) #wilcoxon rank sum test p value using individuals passing the coverage threshold
101 | 	stable_editing_level_effect_size = np.repeat(0.,edit_level_table.shape[0]) #difference between mean disease and mean control
102 | 	stable_frequency_fishers_p_value = np.repeat(0.,edit_level_table.shape[0]) #prevalence p value determined using two-tailed fisher's exact test
103 | 	stable_frequency_OR = np.repeat(0.,edit_level_table.shape[0]) #odds ratio of the fisher's exact teest
104 | 	stable_prevalence_effect_size = np.repeat(0.,edit_level_table.shape[0]) #difference in editing level prevalences between disease and control subjects
105 | 	#WARNING those are np arrays.
106 | 
107 | 	for i in range(0,edit_level_table.shape[0]):
108 | 	  print i  #keep track of progress
109 | 	  disease_edit_row = edit_level_table.loc[i, disease_people]
110 | 	  control_edit_row = edit_level_table.loc[i, control_people]
111 | 	  disease_cov_row = cov_table.loc[i, disease_people]
112 | 	  control_cov_row = cov_table.loc[i, control_people]
113 | 	  #find what coverage we can base stability off of
114 | 	  number_disease_20_cov = disease_cov_row[disease_cov_row >= 20].count()
115 | 	  number_control_20_cov = control_cov_row[control_cov_row >=20].count()
116 | 	  number_disease_15_cov = disease_cov_row[disease_cov_row >= 15].count()
117 | 	  number_control_15_cov = control_cov_row[control_cov_row >= 15].count()
118 | 	  number_disease_10_cov = disease_cov_row[disease_cov_row >= 10].count()
119 | 	  number_control_10_cov = control_cov_row[control_cov_row >= 10].count()
120 | 	  number_disease_5_cov = disease_cov_row[disease_cov_row >= 5].count()
121 | 	  number_control_5_cov = control_cov_row[control_cov_row >= 5].count()
122 | 	  if number_disease_20_cov >= min_disease_people and number_control_20_cov >= min_control_people:
123 | 		stability_based_on[i] = 20
124 | 	  elif number_disease_15_cov >= min_disease_people and number_control_15_cov >= min_control_people:
125 | 		stability_based_on[i] = 15
126 | 	  elif number_disease_10_cov >= min_disease_people and number_control_10_cov >= min_control_people:
127 | 		stability_based_on[i] = 10
128 | 	  elif number_disease_5_cov >= min_disease_people_5_cov and number_control_5_cov >= min_control_people_5_cov:
129 | 		stability_based_on[i] = 5
130 | 	  else:
131 | 		#stability_based_on[i] = -99999 # there's no np.nan integer representation, only float. We use an invalid value.
132 | 		stability_based_on[i] = float('nan')
133 | 
134 | 	  #need to deal with cases where there just are not enough disease individuals or control individuals to calculate mean
135 | 	  if np.isnan(stability_based_on[i]):
136 | 
137 | 		coverage_threshold_used[i] = 5 #I warn users not to use editing sites that don't have any stability_based_on measurement. We include min coverage of 5 just to get statistical information anyways
138 | 		#stable_min_cov=5
139 | 		#otherwise we can now try to find the stable_min_cov that'll be used for calculation of all statistics'
140 | 
141 | 	  else:
142 | 		current_stability_cov =  stability_based_on[i]
143 | 		stability_disease_mean = disease_edit_row[disease_cov_row >= current_stability_cov].mean()
144 | 		stability_control_mean = control_edit_row[control_cov_row >= current_stability_cov].mean()
145 | 		#print np.arange(5,stability_based_on[i]+1e-4,5)
146 | 		for j in np.arange(5,stability_based_on[i]+1e-4,5): #WARNING using 1e-4 allowing to include stop
147 | 		  disease_mean = disease_edit_row[disease_cov_row >= j].mean()
148 | 		  control_mean = control_edit_row[control_cov_row >= j].mean()
149 | 		  if np.absolute(disease_mean-stability_disease_mean) <=stability_value and np.absolute(control_mean-stability_control_mean) <=stability_value :
150 | 		    coverage_threshold_used[i] = j
151 | 		    break
152 | 	  #now let's calculate all our statics based on the stable coverage threshold
153 | 	  stable_min_cov = coverage_threshold_used[i]
154 | 	  disease_adju_edit_row = disease_edit_row[np.logical_and(np.logical_and((~np.isnan(disease_edit_row)), (~np.isnan(disease_cov_row))), (disease_cov_row >= stable_min_cov))]
155 | 	  disease_adju_cov_row = disease_cov_row[np.logical_and((~np.isnan(disease_cov_row)), (disease_cov_row >= stable_min_cov))]
156 | 	  control_adju_edit_row = control_edit_row[ np.logical_and(np.logical_and((~np.isnan(control_edit_row)), (~np.isnan(control_cov_row))), (control_cov_row >= stable_min_cov))]
157 | 	  control_adju_cov_row = control_cov_row[np.logical_and((~np.isnan(control_cov_row)), (control_cov_row >= stable_min_cov))]
158 | 	  stable_mean_disease_editing_level[i] = disease_adju_edit_row.mean()
159 | 	  stable_std_dev_disease_editing_level[i] = disease_adju_edit_row.std()
160 | 	  stable_mean_control_editing_level[i] = control_adju_edit_row.mean()
161 | 	  stable_std_dev_control_editing_level[i] = control_adju_edit_row.std()
162 | 	  stable_number_disease_with_at_least_min_coverage[i] = disease_adju_cov_row[disease_adju_cov_row >=stable_min_cov].count()
163 | 	  stable_number_disease_nonzero_editing_and_min_coverage[i] = disease_adju_cov_row[ (~np.isnan(disease_adju_cov_row)) & (disease_adju_cov_row >= stable_min_cov) & (disease_adju_edit_row > 0) ].count()
164 | 	  stable_disease_prevalence[i] = stable_number_disease_nonzero_editing_and_min_coverage[i]/stable_number_disease_with_at_least_min_coverage[i]
165 | 	  stable_number_control_with_at_least_min_coverage[i] = control_adju_cov_row[control_adju_cov_row >=stable_min_cov].count()
166 | 	  stable_number_control_nonzero_editing_and_min_coverage[i] = control_adju_cov_row[(~np.isnan(control_adju_cov_row)) & (control_adju_cov_row >= stable_min_cov) & (control_adju_edit_row > 0)].count()
167 | 	  stable_control_prevalence[i] = stable_number_control_nonzero_editing_and_min_coverage[i]/stable_number_control_with_at_least_min_coverage[i]
168 | 	  stable_total_number_individuals_nonzero_editing_and_min_coverage[i] = (stable_number_disease_nonzero_editing_and_min_coverage[i] + stable_number_control_nonzero_editing_and_min_coverage[i]).sum()
169 | 	  if (len(disease_adju_edit_row) >=1) & (len(control_adju_edit_row) >=1):
170 | 		if (np.all(disease_adju_edit_row.values == control_adju_edit_row.values)):
171 | 		  stable_mann_whitney_p_value[i] = float('nan')
172 | 		else:
173 | 		  temp, stable_mann_whitney_p_value[i] = mannwhitneyu(disease_adju_edit_row,control_adju_edit_row, alternative='two-sided')
174 | 	  else:
175 | 		stable_mann_whitney_p_value[i] = float('nan')
176 | 	  stable_editing_level_effect_size[i] =  np.absolute(stable_mean_disease_editing_level[i] - stable_mean_control_editing_level[i])
177 | 	  fisher_matrix = np.matrix([[stable_number_disease_nonzero_editing_and_min_coverage[i], stable_number_disease_with_at_least_min_coverage[i]-stable_number_disease_nonzero_editing_and_min_coverage[i]], [stable_number_control_nonzero_editing_and_min_coverage[i], stable_number_control_with_at_least_min_coverage[i]-stable_number_control_nonzero_editing_and_min_coverage[i]]])
178 | 	  stable_frequency_OR[i], stable_frequency_fishers_p_value[i] = fisher_exact(fisher_matrix)  
179 | 	  #print stable_frequency_OR[i]
180 | 	  #print stable_frequency_fishers_p_value[i]
181 | 	  stable_prevalence_effect_size[i] = np.absolute(stable_disease_prevalence[i] - stable_control_prevalence[i])
182 | 
183 | 	#now put everything back together as a table
184 | 	header_info = editing_table[['chromosome','position','type_editing']]
185 | 	stats_table = pd.DataFrame(coverage_threshold_used)
186 | 	stats_table = stats_table.rename(columns={stats_table.columns[0]: 'coverage_threshold_used'})
187 | 	stats_table['stability_based_on'] = pd.DataFrame(stability_based_on)
188 | 	stats_table['stable_mean_disease_editing_level'] = pd.DataFrame(stable_mean_disease_editing_level)
189 | 	stats_table['stable_std_dev_disease_editing_level'] = pd.DataFrame(stable_std_dev_disease_editing_level)
190 | 	stats_table['stable_mean_control_editing_level'] = pd.DataFrame(stable_mean_control_editing_level)
191 | 	stats_table['stable_std_dev_control_editing_level'] = pd.DataFrame(stable_std_dev_control_editing_level)
192 | 	stats_table['stable_number_disease_with_at_least_min_coverage'] = pd.DataFrame(stable_number_disease_with_at_least_min_coverage)
193 | 	stats_table['stable_number_disease_nonzero_editing_and_min_coverage'] = pd.DataFrame(stable_number_disease_nonzero_editing_and_min_coverage)
194 | 	stats_table['stable_disease_prevalence'] = pd.DataFrame(stable_disease_prevalence)
195 | 	stats_table['stable_number_control_with_at_least_min_coverage'] = pd.DataFrame(stable_number_control_with_at_least_min_coverage)
196 | 	stats_table['stable_number_control_nonzero_editing_and_min_coverage'] = pd.DataFrame(stable_number_control_nonzero_editing_and_min_coverage)
197 | 	stats_table['stable_control_prevalence'] = pd.DataFrame(stable_control_prevalence)
198 | 	stats_table['stable_total_number_individuals_nonzero_editing_and_min_coverage'] = pd.DataFrame(stable_total_number_individuals_nonzero_editing_and_min_coverage)
199 | 	stats_table['stable_mann_whitney_p_value'] = pd.DataFrame(stable_mann_whitney_p_value)
200 | 	stats_table['stable_editing_level_effect_size'] = pd.DataFrame(stable_editing_level_effect_size)
201 | 	stats_table['stable_frequency_fishers_p_value'] = pd.DataFrame(stable_frequency_fishers_p_value)
202 | 	stats_table['stable_frequency_OR'] = pd.DataFrame(stable_frequency_OR)
203 | 	stats_table['stable_prevalence_effect_size'] = pd.DataFrame(stable_prevalence_effect_size)
204 | 
205 | 	full_table = pd.concat([header_info, stats_table, editing_table[all_people]], axis=1)
206 | 
207 | 	#write the full_table to output
208 | 	full_table.to_csv(output_file, sep='\t', index=False)
209 | 
210 | 	print "job completed\n"
211 | 
212 | 
213 | 
214 | 
215 | def Set_Chr_Nr(Chr):
216 |     """ Sort by chromosome """
217 |     if Chr: 
218 | 	New = Chr.lstrip('chr').split('_')[0]
219 |         if New == 'X': New = 23
220 |         elif New == 'Y': New = 24
221 |         elif New == 'M': New = 25
222 |         else: New = int(New)
223 |     else:
224 |         New = 0
225 |     return New
226 | 
227 | def Sample_percentage(row):
228 | 	"""Percentage of samples from each type"""
229 | 	percentage = (len(filter(lambda x: x!= '-', row))/float(len(row)))*100
230 | 	return round(percentage)
231 | 
232 | def Sample_count(row):
233 | 	"""Number of samples from each type"""
234 | 	count = len(filter(lambda x: x!= '-', row))
235 | 	return count 
236 | 
237 | def get_bh(pvalue,siglevel):
238 | 	"""B-H correction """
239 | 	pvalue.sort()
240 | 	x=1
241 | 	y=0
242 | 	p=0
243 | 	for i in pvalue:
244 | 		nf=i[0]*len(pvalue)
245 | 		fdr=nf/x
246 | 		if fdr<=siglevel:
247 | 			i[1].append('True')
248 | 			p=i[0]
249 | 			y+=1
250 | 		else: i[1].append('False')
251 | 		x+=1
252 | 	return pvalue,y,p
253 | 	
254 | def get_b(pvalue,siglevel): 
255 | 	"""Bonferroni correction"""
256 | 	pvalue.sort()
257 | 	y=0
258 | 	pp=1.0
259 | 	for i in pvalue:
260 | 		p=i[0]*len(pvalue)
261 | 		if p<=siglevel:
262 | 			i[1].append('True')
263 | 			y+=1
264 | 			if p<pp: pp=p
265 | 		else: i[1].append('False')
266 | 	return pvalue,y,pp
267 | 
268 | def only_sig(row_a,row):
269 | 	"""Returns only significant events"""
270 | 	if(row_a[-1] != '-' and row_a[-1] != 0.0 and row_a[-1] <= 0.05):
271 | 		row =  row[0].split('_') + row[2:]
272 | 		row.insert(2, 'A.to.G')
273 | 		print '\t'.join(map(str,row))
274 | 
275 | def tuple_replace(i):
276 | 	if type(i) == tuple:
277 | 		return i[0]
278 | 	else:
279 | 		return i
280 | 
281 | def tuple_replace_bis(k):
282 |         if type(k) == tuple:
283 |                 return k[1]
284 |         else:
285 |              	return k
286 | 
287 | def remove_underscore(lis):
288 | 	lis = lis[:lis.index('_')]
289 | 	return lis
290 | 
291 | 
292 | sample_informations = {}
293 | with open(samples_informations_file, 'r') as f:
294 |     for line in f:
295 |         if line.startswith('SRR'):
296 |             line = map(str.strip, line.split(','))
297 |             sample_informations.setdefault(line[0], line[1])
298 | 
299 | 
300 | cwd = filter(os.path.isdir, os.listdir(os.getcwd()))
301 | all_available_sites = []
302 | sample_edited_sites = {}
303 | for directory in cwd:
304 |     if directory.startswith('SRR'):
305 |         path = list(os.walk(directory + '/editing/'))
306 |         table = path[1][0] + '/' + path[1][-1][-1] 
307 |         with open(table,'r') as a:
308 |             for line in a:
309 |                 if line.startswith('chr'):
310 |                     s = map(str.strip, line.split("\t"))
311 | 		    if s[7] == 'AG':
312 | 	                site, freq, coverage = s[0] + "_" + s[1], s[8], s[4]
313 | 			freq_gnum_cov = '%s^%s^%s' %(s[8],eval(s[6])[2],s[4]) 
314 | 			if site not in all_available_sites: all_available_sites.append(site)
315 | 			if (int(coverage) >= min_coverage) and (float(freq) >= min_edit_frequency):
316 |                 		sample_edited_sites.setdefault((directory, site), []).append((freq, freq_gnum_cov))
317 | 
318 | table_columns = map(lambda x: x + '_' + sample_informations[x], sorted(sample_informations.keys()))
319 | 
320 | disease = [i for i in table_columns if i.upper().find('DIS') != -1]
321 | controls = [i for i in table_columns if i.upper().find('CTRL') != -1]
322 | 
323 | if enable_linear_model:
324 | 	outtable=''
325 |         header = ['chromosome', 'position', 'type_editing'] + map(remove_underscore, controls) + map(remove_underscore, disease)
326 |         outtable += '\t'.join(header)
327 | 	outtable += '\n'
328 |         #print '\t'.join(header)
329 |         for chrom in sorted(all_available_sites, key = lambda x: Set_Chr_Nr(x)):
330 |                 row = [chrom]
331 |                 for col in header[2:]:#header.index('[num_controls/num_disease]')]:
332 |                         row.append(sample_edited_sites.get((col.split('_')[0],chrom), ['-'])[0])
333 |                 ctrls = zip(*(zip(controls,row[1:])))[1]
334 |                 dss = zip(*(zip(disease,row[len(ctrls)+1:])))[1]
335 |                 ctrls_freq = map(tuple_replace, ctrls)
336 |                 dss_freq = map(tuple_replace, dss)
337 |                 row.append(str([Sample_count(ctrls), Sample_count(dss)]))
338 | 
339 |                 row_b = map(tuple_replace_bis, row)
340 |                 row_b = row_b[0].split('_') + row_b[2:]
341 |                 row_b.insert(2, 'A.to.G')
342 | 		final_list = row_b[:-1]
343 |                 #print '\t'.join(map(str,final_list))
344 | 		outtable += '\t'.join(map(str,final_list)).replace('-','NA')
345 | 		outtable += '\n'
346 | 
347 | 	with open('temp.csv','w') as t:
348 | 		t.write(outtable)
349 | 		t.close()
350 | 
351 | 	# call linear model script
352 | 	call_differential_editing_sites(samples_informations_file) 
353 | 	
354 | 
355 | else:
356 | 	header = ['chromosome', 'position', 'type_editing'] + controls + disease + ['[num_controls/num_disease]'] + ['delta_diff'] + ['pvalue (Mannwhitney)']
357 | 
358 | 	if pvalue_correction == 1:
359 | 		header += ['pvalue Bonferroni corrected']
360 | 	if pvalue_correction == 2:
361 | 		header += ['pvalue BH corrected']
362 | 		
363 | 	print '\t'.join(header)
364 | 	
365 | 	for chrom in sorted(all_available_sites, key = lambda x: Set_Chr_Nr(x)):
366 | 		row = [chrom]
367 | 		for col in header[3:header.index('[num_controls/num_disease]')]:
368 | 			row.append(sample_edited_sites.get((col.split('_')[0],chrom), ['-'])[0])
369 | 		ctrls = zip(*(zip(controls,row[1:])))[1]
370 | 		dss = zip(*(zip(disease,row[len(ctrls)+1:])))[1] 
371 | 		ctrls_freq = map(tuple_replace, ctrls)
372 | 		dss_freq = map(tuple_replace, dss)
373 | 		row.append(str([Sample_count(ctrls), Sample_count(dss)]))
374 | 		if (Sample_percentage(ctrls) >= min_sample_testing) and (Sample_percentage(dss) >= min_sample_testing):
375 | 			ctrls_mean = sum(map(float, filter(lambda x: x!= '-', ctrls_freq)))/len(filter(lambda x: x!= '-', ctrls_freq))
376 | 	                dss_mean = sum(map(float, filter(lambda x: x!= '-', dss_freq)))/len(filter(lambda x : x!= '-', dss_freq))
377 | 			delta_diff =  abs(ctrls_mean - dss_mean)
378 | 			pvalue=stats.mannwhitneyu(ctrls_freq, dss_freq, alternative='two-sided')
379 | 			row.append(round(delta_diff, 3))
380 | 			row.append(str(round(pvalue[1], 3)))
381 | 			correction_argmnt = [(pvalue[1], ctrls_freq+dss_freq)]
382 | 		
383 | 			if pvalue_correction == 1:
384 | 				row.append(round(get_b(correction_argmnt, 0.05)[-1], 6))
385 | 			elif pvalue_correction == 2:
386 | 				row.append(round(get_bh(correction_argmnt, 0.05)[-1], 6))
387 | 		else:
388 | 			if pvalue_correction == 0:
389 | 				row += ['-', '-']
390 | 			else:
391 | 				row += ['-', '-', '-']
392 | 		row_a = map(tuple_replace, row)
393 | 		row_b = map(tuple_replace_bis, row)
394 | 		if pvalue_correction != 0 and only_significants == 'yes':
395 | 			only_sig(row_a,row_b)
396 | 		else:
397 | 			row_b =  row_b[0].split('_') + row_b[2:]
398 | 	                row_b.insert(2, 'A.to.G')
399 | 			print '\t'.join(map(str,row_b))
400 | 
401 | 


--------------------------------------------------------------------------------
/accessory/readPsl.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import sys, os
  3 | import math
  4 | 
  5 | try:
  6 | 	pslfile=sys.argv[1]
  7 | 	outfile=sys.argv[2]
  8 | except:
  9 | 	sys.exit('USAGE: <psl file> <output file>')
 10 | 	
 11 | #### for blat
 12 | def getPS(line):
 13 | 	pid = (100.0 - (pslCalcMilliBad(line) * 0.1))
 14 | 	score = pslScore(line)
 15 | 	#print "The percentage:",pid
 16 | 	#print "Score:",score
 17 | 	return pid,score
 18 | 
 19 | def pslScore(cols):
 20 | 	sizeMul =  1
 21 | 	return sizeMul * (int(cols[0]) + (int(cols[2]))) - sizeMul * int(cols[1]) - int(cols[4]) - int(cols[6])
 22 | 
 23 | def round(number):
 24 | 	return int(number + .5);
 25 | 
 26 | def pslCalcMilliBad(cols):
 27 | 	sizeMul = 1
 28 | 	# cols[0]  matches
 29 | 	# cols[1]  misMatches
 30 | 	# cols[2]  repMaches
 31 | 	# cols[4]  qNumInsert
 32 | 	# cols[6]  tNumInsert
 33 | 	# cols[11] qStart
 34 | 	# cols[12] qEnd
 35 | 	# cols[15] tStart
 36 | 	# cols[16] tEnd
 37 | 	qAliSize = sizeMul * (int(cols[12]) - int(cols[11]))
 38 | 	tAliSize = int(cols[16]) - int(cols[15])
 39 | 	# I want the minimum of qAliSize and tAliSize
 40 | 	if qAliSize < tAliSize: aliSize = qAliSize #? $aliSize = $qAliSize : $aliSize =  $tAliSize;
 41 | 	else: aliSize = tAliSize
 42 | 	# return 0 is AliSize == 0
 43 | 	if aliSize <= 0: return 0
 44 | 	# size diff
 45 | 	sizeDiff = qAliSize - tAliSize
 46 | 	if sizeDiff < 0: sizeDiff = 0  
 47 | 	# insert Factor
 48 | 	insertFactor = int(cols[4]) 
 49 | 	# $insertFactor += $cols[6];
 50 | 	milliBad = (1000 * (int(cols[1])*sizeMul + insertFactor + round(3*math.log( 1 + sizeDiff)))) / (sizeMul * (int(cols[0]) + int(cols[2])+ int(cols[1])))
 51 | 	return milliBad
 52 | 
 53 | def com(num,list):
 54 | 	for i in list:
 55 | 		if i[0]<=num<=i[1]: return 1
 56 | 	return 0
 57 | 
 58 | def min95(val,score):
 59 | 	#if val < (score*95.0)/100: return 1
 60 | 	if val < (score*0.95): return 1
 61 | 	return 0
 62 | 
 63 | def readLines(lines):
 64 | 	res=[]
 65 | 	for line in lines:
 66 | 		pidd,score=getPS(line)
 67 | 		#print pidd,score
 68 | 		sp=[int(x) for x in (line[18].strip(',')).split(',')]
 69 | 		tstarts=[int(x) for x in (line[20].strip(',')).split(',')]
 70 | 		ex=[(tstarts[x]+1,tstarts[x]+sp[x]) for x in range(len(sp))]
 71 | 		nl=[line[9],score,str(int(line[11])+1),line[12],str(line[10]),pidd,line[13],line[8],int(line[15])+1,int(line[16]),ex,int(line[0])]
 72 | 		res.append((int(line[0]),nl)) #score
 73 | 		#if d.has_key(line[9]): d[line[9]].append((score,nl))
 74 | 		#else: d[line[9]]=[(score,nl)]
 75 | 	return res
 76 | 
 77 | def comp(ri,hits):
 78 | 	g,ng=0,0
 79 | 	hits.sort()
 80 | 	hits.reverse()
 81 | 	if len(hits)==1: #unique hit with editing candidate position included
 82 | 		if hits[0][1][6]==ri[2] and com(ri[1],hits[0][1][10]): g+=1 #float(hits[0][1][5])>=90.0
 83 | 		else: ng+=1
 84 | 	elif len(hits)>1: #multiple hits
 85 | 		if hits[0][1][6]==ri[2] and min95(hits[1][0],hits[0][0]): # if second best score less than 95% of first best score
 86 | 			if com(ri[1],hits[0][1][10]): g+=1 # if first best hit include editing position
 87 | 			else: ng+=1
 88 | 		else: ng+=1
 89 | 	if g>ng: return 1
 90 | 	return 0
 91 | 
 92 | def readPSL(infile,outfile):
 93 | 	f=open(infile)
 94 | 	o=open(outfile,'w')
 95 | 	name,lines,xx='',[],0
 96 | 	while 1:
 97 | 		line=f.readline()
 98 | 		if not line:
 99 | 			if name=='': break
100 | 			nn=name.split('$')
101 | 			oread=(name,int(nn[2]),nn[1])
102 | 			bread=readLines(lines)
103 | 			badr=''
104 | 			if len(bread)==0: badr=name
105 | 			else: 
106 | 				if not comp(oread,bread): badr=name
107 | 			if badr!='':
108 | 				#o.write(name[:-2]+' '+name[-1]+'\n')
109 | 				o.write(name.split('_')[0]+' '+name.split('$')[0][-1]+'\n')
110 | 				xx+=1
111 | 			break
112 | 		if line.strip()=='': continue	
113 | 		if line.startswith('psL'): continue
114 | 		if (line.strip()).startswith('match'): continue
115 | 		if line.startswith('-'): continue
116 | 		l=(line.strip()).split('\t')
117 | 		if l[9]!=name:
118 | 			if len(lines)!=0:
119 | 				nn=name.split('$')
120 | 				#(rname,pileupcolumn.pos+1,chr)
121 | 				oread=(name,int(nn[2]),nn[1]) #dread[name]
122 | 				bread=readLines(lines)
123 | 				badr=''
124 | 				if len(bread)==0: badr=name
125 | 				else: 
126 | 					if not comp(oread,bread): badr=name
127 | 				if badr!='':
128 | 					#o.write(name[:-2]+' '+name[-1]+'\n')
129 | 					o.write(name.split('_')[0]+' '+name.split('$')[0][-1]+'\n')
130 | 					xx+=1
131 | 			lines=[l]
132 | 			name=l[9]
133 | 		else: lines.append(l)
134 | 	f.close()
135 | 	o.close()
136 | 	return xx
137 | 
138 | def readgf(infile):
139 | 	f=open(infile)
140 | 	for i in f:
141 | 		if 'Server ready for queries!' in i:
142 | 			f.close()
143 | 			return 1
144 | 	f.close()
145 | 	return 0
146 | 
147 | def parse(line):
148 | 	l=(line.strip()).split('\t')
149 | 	cc=(int(l[3]),int(l[4]))
150 | 	return cc
151 | 
152 | readPSL(pslfile,outfile)
153 | 
154 | 


--------------------------------------------------------------------------------
/accessory/rediportal2recoding.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #
 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | # of this software and associated documentation files (the "Software"), to deal
 5 | # in the Software without restriction, including without limitation the rights
 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | # copies of the Software, and to permit persons to whom the Software is
 8 | # furnished to do so, subject to the following conditions:
 9 | #
10 | # The above copyright notice and this permission notice shall be included in
11 | # all copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | # THE SOFTWARE.
20 | 
21 | 
22 | #GFF structure
23 | #chr/tvalue/tfeature/tstart/tend/t./tstrand/./t gene_id ed_numb; transcript_id ed_numb;
24 | 
25 | import sys
26 | 
27 | try:
28 | 	in_table = sys.argv[1]
29 | except:
30 | 	sys.exit('<REDIPORTAL_table>') 
31 | 
32 | i=0
33 | with open(in_table,'r') as e:
34 | 	e.readline()
35 | 	for line in e:
36 | 		line = line.split('\t')
37 | 		if line[6] == 'NONREP' and line[9] == 'exonic':
38 | 			if ('\t'.join(line).count('nonsynonymous')) == 3:
39 | 				i+=1
40 | 				valore = line[12].split(':')[0] + '_' +  line[12].split('.')[-1]
41 | 				gff_row = line[0] + '\t'+ valore + '\t' + 'ed' + '\t' + line[1] + \
42 | 				'\t' + line[1] + '\t' + '.' + '\t' + line[4] + '\t' + '.' + '\t' + \
43 | 				'gene_id' + ' '  + '"ed_%s";' %(i) + ' ' + 'transcript_id' + ' ' + '"ed_%s";' %(i)
44 | 				print gff_row
45 | 


--------------------------------------------------------------------------------
/accessory/selectPositions.py:
--------------------------------------------------------------------------------
  1 | #!/home/epicardi/bin/python27/bin/python
  2 | # Copyright (c) 2013-2014 Ernesto Picardi <ernesto.picardi@uniba.it>
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice shall be included in
 12 | # all copies or substantial portions of the Software.
 13 | #
 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | # THE SOFTWARE.
 21 | 
 22 | import sys, getopt, os, time, random, gzip
 23 | 
 24 | version='1.0'
 25 | pid=str(os.getpid()+random.randint(0,999999999))
 26 | 
 27 | def usage():
 28 | 	print """
 29 | USAGE: python selectPositions.py [options]
 30 | Options:
 31 | -i		Table file from REDItools
 32 | -d		Base distribution column for DNA-Seq (-1: no DNA-Seq) [-1]
 33 | -c		Coverage RNA-Seq [5]
 34 | -C		Coverage DNA-Seq [5]
 35 | -v		Bases supporting RNA-Seq variation [1]
 36 | -V		Bases supporting DNA-Seq variation [0]
 37 | -s		Substitutions to select in RNA-Seq (separated by comma AG,CT) [all]
 38 | -f		Frequency of variation in RNA-Seq [0.1]
 39 | -F		Frequency of non-variation in DNA-Seq [0.95]
 40 | -e		Exclude multiple substitutions in RNA-Seq
 41 | -r		Exclude invariant sites in RNA-Seq
 42 | -R		Exclude variant sites in DNA-Seq #
 43 | -u		Use only positions supported by DNA-Seq
 44 | -o		Save selected positions on outTable_%s
 45 | -h		Print this help
 46 | 
 47 | """%(pid)
 48 | 
 49 | try:
 50 | 	opts, args = getopt.getopt(sys.argv[1:], "i:c:C:v:s:f:F:euo:hrd:RV:",["help"])
 51 | 	if len(opts)==0:
 52 | 		usage()
 53 | 		sys.exit(2)
 54 | except getopt.GetoptError as err:
 55 | 	print str(err) # will print something like "option -a not recognized"
 56 | 	usage()
 57 | 	sys.exit(2)
 58 | 
 59 | tablefile=''
 60 | outfile='outTable_%s' %(pid)
 61 | #rna-seq
 62 | cov=5
 63 | bvar=1
 64 | sfreq=0.1
 65 | expos=0
 66 | upos=0
 67 | exinv=0
 68 | subs=[x+y for x in 'ACGT' for y in 'ACGT' if x!=y]
 69 | #dna-seq
 70 | dnacol=11
 71 | dnacols=[x for x in range(dnacol-2,dnacol+3,1)]
 72 | isdna=0
 73 | gcov=5
 74 | gsfreq=0.95
 75 | gexvar=0
 76 | gbvar=0
 77 | 
 78 | for o, a in opts:
 79 | 	if o in ("-h","--help"):
 80 | 		usage()
 81 | 		sys.exit()
 82 | 	elif o == "-i":
 83 | 		tablefile=a
 84 | 		if not os.path.exists(tablefile):
 85 | 			usage()
 86 | 			sys.exit('Table file not found')
 87 | 	elif o == "-c": cov=int(a)
 88 | 	elif o == "-C": gcov=int(a)
 89 | 	elif o == "-v": bvar=int(a)
 90 | 	elif o == "-V": gbvar=int(a)
 91 | 	elif o == "-s": subs=[x.upper() for x in a.split(',') if x.strip()!='']
 92 | 	elif o == "-f": sfreq=float(a)
 93 | 	elif o == "-F": gsfreq=float(a)	
 94 | 	elif o == "-e": expos=1
 95 | 	elif o == "-u": upos=1
 96 | 	elif o == "-r": exinv=1
 97 | 	elif o == "-R": gexvar=1	
 98 | 	elif o == "-d":
 99 | 		dnacol=int(a)
100 | 		if dnacol>3:
101 | 			isdna=1
102 | 			dnacols=[x-1 for x in range(dnacol-2,dnacol+3,1)]
103 | 	elif o == "-o": outfile=a
104 | 	else:
105 | 		assert False, "Unhandled Option"
106 | 
107 | def isnvar(nuc,idx,val):
108 | 	n=eval(nuc)
109 | 	x=0
110 | 	for j in range(4):
111 | 		if j!=idx and n[j]>=val:
112 | 			x+=1
113 | 	if x>0: return 1
114 | 	return 0
115 | 
116 | def isnvar2(nuc,idx,val):
117 | 	n=eval(nuc)
118 | 	x=0
119 | 	for j in range(4):
120 | 		if j!=idx: x+=n[j]
121 | 	if x<=val: return 1
122 | 	return 0
123 | 
124 | def issub(osubs,esubs):
125 | 	x=0
126 | 	for i in osubs:
127 | 		if i in esubs: x+=1
128 | 	if x>0: return 1
129 | 	return 0
130 | 
131 | def vinv(nuc,idx,val):
132 | 	n=eval(nuc)
133 | 	try: v=float(n[idx])/sum(n)
134 | 	except: v=0.0
135 | 	if v>=val: return 1
136 | 	return 0
137 | 	
138 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
139 | sys.stdout.write("Script time --> START: %s\n"%(script_time))
140 | sys.stdout.write("Reading table...\n")
141 | 
142 | if tablefile.endswith('.gz'): f=gzip.open(tablefile,'rb')
143 | else: f=open(tablefile)
144 | db={'A':0,'C':1,'G':2,'T':3}
145 | o=open(outfile,'w')
146 | xx,yy=0,0
147 | for i in f:
148 | 	if i.startswith('Region'):
149 | 		o.write(i)
150 | 		continue
151 | 	if i.strip()=='': continue
152 | 	l=(i.strip()).split('\t')
153 | 	xx+=1
154 | 	if l[2] not in 'ACGTacgt': continue
155 | 	if exinv and l[7]=='-': continue
156 | 	if int(l[4])<cov: continue
157 | 	if not isnvar(l[6],db[l[2]],bvar): continue
158 | 	if l[7]!='-':
159 | 		osubs=[x.upper() for x in l[7].split()]
160 | 		if expos and len(osubs)>1: continue
161 | 		if not issub(osubs,subs): continue
162 | 	if float(l[8])<sfreq: continue
163 | 	#DNA-Seq
164 | 	if upos and not isdna: continue
165 | 	if upos and isdna and l[dnacols[0]]=='-': continue
166 | 	if isdna and l[dnacols[0]]!='-':
167 | 		if int(l[dnacols[0]])<gcov: continue
168 | 		if not isnvar2(l[dnacols[2]],db[l[2]],gbvar): continue
169 | 		if not vinv(l[dnacols[2]],db[l[2]],gsfreq): continue
170 | 		else: l[dnacols[3]]='$'
171 | 		if gexvar and l[dnacols[3]]!='-': continue #rivedere questa opzione
172 | 	o.write(i)
173 | 	yy+=1	
174 | f.close()
175 | o.close()
176 | sys.stdout.write("Total lines: %i\n"%(xx))
177 | sys.stdout.write("Filtered in lines: %i\n"%(yy))
178 | sys.stdout.write("Selected lines saved on %s\n"%(outfile))
179 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
180 | sys.stdout.write("Script time --> END: %s\n"%(script_time))
181 | 
182 | 


--------------------------------------------------------------------------------
/accessory/subCount.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import sys
 3 | 
 4 | try:
 5 | 	infile=sys.argv[1]
 6 | except:
 7 | 	sys.exit('USAGE: <REDItool table>')
 8 | 
 9 | s={}
10 | for i in 'ACGT':
11 | 	for j in 'ACGT':
12 | 		if i!=j: s[i+j]=0
13 | n={}
14 | x=0
15 | for i in 'ACGT':
16 | 	n[i]=x
17 | 	x+=1
18 | all=0
19 | f=open(infile)
20 | for i in f:
21 | 	if i.startswith('Reg'): continue
22 | 	l=(i.strip()).split('\t')
23 | 	if l[7]=='-': continue
24 | 	sub=l[7].split()[0]
25 | 	nuc=eval(l[6])
26 | 	nv= nuc[n[sub[1]]]
27 | 	s[sub]+=nv
28 | 	all+=nv
29 | f.close()
30 | 
31 | for i in s:
32 | 	try: v=(s[i]/float(all))*100
33 | 	except: v=0.0
34 | 	print i,s[i],all,v
35 | 
36 | 


--------------------------------------------------------------------------------
/accessory/subCount2.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import sys
 3 | 
 4 | try:
 5 | 	infile=sys.argv[1]
 6 | except:
 7 | 	sys.exit('USAGE: <REDItool table>')
 8 | 
 9 | s={}
10 | for i in 'ACGT':
11 | 	for j in 'ACGT':
12 | 		if i!=j: s[i+j]=0
13 | n={}
14 | x=0
15 | for i in 'ACGT':
16 | 	n[i]=x
17 | 	x+=1
18 | all=0
19 | f=open(infile)
20 | for i in f:
21 | 	if i.startswith('Reg'): continue
22 | 	l=(i.strip()).split('\t')
23 | 	if l[7]=='-': continue
24 | 	sub=l[7].split()[0]
25 | 	s[sub]+=1
26 | 	all+=1
27 | f.close()
28 | 
29 | for i in s:
30 | 	try: v=(s[i]/float(all))*100
31 | 	except: v=0.0
32 | 	print i,s[i],all,v
33 | 
34 | 


--------------------------------------------------------------------------------
/accessory/tableToTabix.py:
--------------------------------------------------------------------------------
  1 | #!/home/epicardi/bin/python27/bin/python
  2 | # Copyright (c) 2013-2014 Ernesto Picardi <ernesto.picardi@uniba.it>
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice shall be included in
 12 | # all copies or substantial portions of the Software.
 13 | #
 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | # THE SOFTWARE.
 21 | 
 22 | import sys, os, getopt, time, random, heapq, shutil
 23 | from tempfile import gettempdir
 24 | from itertools import islice, cycle
 25 | from collections import namedtuple
 26 | from operator import itemgetter
 27 | try: import pysam
 28 | except: sys.exit('Pysam module not found.')
 29 | 
 30 | version='1.0'
 31 | pid=str(os.getpid()+random.randint(0,999999999))
 32 | 
 33 | def usage():
 34 | 	print """
 35 | USAGE: python tableToTabix.py [options]
 36 | Options:
 37 | -i		TAB-delimited file
 38 | -s		Sequence name column [1]
 39 | -c		Start column [4]
 40 | -e		End column (can be identical to -c) [5]
 41 | -m		Skip lines starting with [#]
 42 | -0		Zero based coordinates
 43 | -S		Do not sort input file (sort by default)
 44 | -b		Buffer size (as number of lines) [32000]
 45 | -t		Temporary directory to use (multiple -t may be used)
 46 | -u		Save an uncompressed GFF copy (add _copy suffix)
 47 | -h		Print this help
 48 | 
 49 | """
 50 | 
 51 | try:
 52 | 	opts, args = getopt.getopt(sys.argv[1:], "i:Sb:t:hus:c:e:m:0",["help"])
 53 | 	if len(opts)==0:
 54 | 		usage()
 55 | 		sys.exit(2)
 56 | except getopt.GetoptError as err:
 57 | 	print str(err) # will print something like "option -a not recognized"
 58 | 	usage()
 59 | 	sys.exit(2)
 60 | 
 61 | GFFfile=''
 62 | buffer_size=32000
 63 | tempdirs=[]
 64 | sort=1
 65 | mc=0 # save an uncompressed GFF copy, default no
 66 | scol=0 # sequence column name
 67 | bcol=3 # start column
 68 | ecol=4 # end column
 69 | schar='#' # skip lines starting with this character
 70 | zcoord=False # zero based coordinated 
 71 | for o, a in opts:
 72 | 	if o in ("-h","--help"):
 73 | 		usage()
 74 | 		sys.exit()
 75 | 	elif o == "-i":
 76 | 		GFFfile=a
 77 | 		outfile='.'.join(GFFfile.split('.')[:-1])+'.sorted.gff'
 78 | 		if not os.path.exists(GFFfile):
 79 | 			usage()
 80 | 			sys.exit('GFF file not found')
 81 | 	elif o == "-b": buffer_size=int(a)
 82 | 	elif o == "-t": tempdirs.append(a)
 83 | 	elif o == "-S": sort=0
 84 | 	elif o == "-u": mc=1
 85 | 	elif o == "-m": schar=a
 86 | 	elif o == "-s": scol=int(a)-1
 87 | 	elif o == "-c": bcol=int(a)-1
 88 | 	elif o == "-e": ecol=int(a)-1
 89 | 	elif o == "-0": zcoord=True
 90 | 	else:
 91 | 		assert False, "Unhandled Option"
 92 | 
 93 | Keyed = namedtuple("Keyed", ["key", "obj"])
 94 | key_=eval('lambda line : (%s)' %('line[:]'))
 95 | 
 96 | def gk(key,obj):
 97 | 	ik=itemgetter(scol,bcol,ecol)(obj.split('\t'))
 98 | 	return key((ik[0],int(ik[1]),int(ik[2])))
 99 | 
100 | def merge(key=None, *iterables):
101 | 	# based on code posted by Scott David Daniels in c.l.p.
102 | 	# http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d
103 | 	#print iterables
104 | 	if key is None:
105 | 		keyed_iterables = iterables
106 | 	else:
107 | 		keyed_iterables = [(Keyed(gk(key,obj), obj) for obj in iterable) for iterable in iterables]
108 | 		#print keyed_iterables
109 | 	for element in heapq.merge(*keyed_iterables):
110 | 		yield element.obj
111 | 
112 | def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None):
113 | 	if tempdirs is None:
114 | 		tempdirs = []
115 | 	if not tempdirs:
116 | 		tempdirs.append(gettempdir())
117 | 	chunks = []
118 | 	xx=0
119 | 	try:
120 | 		with open(input,'rb',64*1024) as input_file:
121 | 			input_iterator = iter(input_file)
122 | 			for tempdir in cycle(tempdirs):
123 | 				current_chunk2=[]
124 | 				for j in islice(input_iterator,buffer_size):
125 | 					if j.startswith('Region'): continue
126 | 					if j.startswith(schar): continue
127 | 					l=(j.strip()).split('\t')
128 | 					l[bcol]=int(l[bcol])
129 | 					l[ecol]=int(l[ecol])
130 | 					current_chunk2.append(l)
131 | 				current_chunk3=[]
132 | 				for j in sorted(current_chunk2, key=itemgetter(scol,bcol,ecol)):
133 | 					j[bcol]=str(j[bcol])
134 | 					j[ecol]=str(j[ecol])
135 | 					current_chunk3.append('\t'.join(j)+'\n')
136 | 				xx+=len(current_chunk3)
137 | 				if not current_chunk3: break
138 | 				sys.stdout.write("Loaded and sorted %i lines.\n"%(xx))
139 | 				output_chunk = open(os.path.join(tempdir,'%06i_%s'%(len(chunks),pid)),'w+b',64*1024)
140 | 				chunks.append(output_chunk)
141 | 				output_chunk.writelines(current_chunk3)
142 | 				output_chunk.flush()
143 | 				output_chunk.seek(0)
144 | 		sys.stdout.write("Merging from %i files.\n"%(len(chunks)))
145 | 		with open(output,'wb',64*1024) as output_file:
146 | 			output_file.writelines(merge(key, *chunks))
147 | 	finally:
148 | 		for chunk in chunks:
149 | 			try:
150 | 				chunk.close()
151 | 				os.remove(chunk.name)
152 | 			except Exception:
153 | 				pass
154 | 
155 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
156 | sys.stdout.write("Script time --> START: %s\n"%(script_time))
157 | if sort:
158 | 	sys.stdout.write("Sorting GFF file...\n")
159 | 	batch_sort(GFFfile,outfile,key_,buffer_size,tempdirs)
160 | 	GFFfile=outfile
161 | if mc:
162 | 	copyfile=GFFfile+'_copy'
163 | 	shutil.copyfile(GFFfile,copyfile)
164 | 	sys.stdout.write("A copy of uncompressed GFF file has been saved on %s.\n" %(copyfile))
165 | sys.stdout.write("Indexing GFF file...\n")
166 | GFFfile=pysam.tabix_index(GFFfile,seq_col=scol, start_col=bcol, end_col=ecol, zerobased=zcoord)
167 | sys.stdout.write("Tabix file saved on %s.\n" %(GFFfile))
168 | sys.stdout.write("Indices saved on %s.tbi.\n" %(GFFfile))
169 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
170 | sys.stdout.write("Script time --> END: %s\n"%(script_time))


--------------------------------------------------------------------------------
/main/REDItoolKnown.py:
--------------------------------------------------------------------------------
  1 | #!/home/epicardi/bin/python27/bin/python
  2 | # Copyright (c) 2013-2014 Ernesto Picardi <ernesto.picardi@uniba.it>
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice shall be included in
 12 | # all copies or substantial portions of the Software.
 13 | #
 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 | # THE SOFTWARE.
 21 | 
 22 | import sys, os, time, math, random, getopt, operator, string, errno
 23 | try: import pysam
 24 | except: sys.exit('Pysam module not found.')
 25 | from multiprocessing import Process, Queue
 26 | from Queue import Empty
 27 | 
 28 | pysamVersion=pysam.__version__
 29 | 
 30 | sys.stderr.write('Pysam version used: %s\n' %(pysamVersion))
 31 | 
 32 | version='1.3'
 33 | 
 34 | pid=str(os.getpid()+random.randint(0,999999999))
 35 | 
 36 | def usage():
 37 | 	print """
 38 | USAGE: python REDItoolKnown.py [options]
 39 | Options:
 40 | -i		BAM file
 41 | -I		Sort input BAM file
 42 | -f		Reference in fasta file
 43 | -l		List of known RNA editing events
 44 | -C		Base interval to explore [100000]
 45 | -k		List of chromosomes to skip separated by comma or file
 46 | -t		Number of threads [1]
 47 | -o		Output folder [rediFolder_%s]
 48 | -F		Internal folder name [null]
 49 | -c		Min. read coverage [10]
 50 | -q		Min. quality score [30]
 51 | -m		Min. mapping quality score [30]*
 52 | -O		Min. homoplymeric length [5]
 53 | -s		Infer strand (for strand oriented reads) [1]
 54 | -g		Strand inference type 1:maxValue 2:useConfidence [1]
 55 | -x		Strand confidence [0.70]
 56 | -S		Strand correction
 57 | -G		Infer strand by gff annotation (must be sorted, otherwise use -X)
 58 | -X		Sort annotation files
 59 | -K		File with positions to exclude
 60 | -e		Exclude multi hits
 61 | -d		Exclude duplicates
 62 | -p		Use paired concardant reads only
 63 | -u		Consider mapping quality
 64 | -T		Trim x bases up and y bases down per read [0-0]
 65 | -B		Blat file for correction
 66 | -U		Remove substitutions in homopolymeric regions
 67 | -v		Min. num. of reads supporting the variation [3]
 68 | -n		Min. editing frequency [0.1]
 69 | -E		Exclude positions with multiple changes
 70 | -P		File containing splice sites annotations
 71 | -r		Num. of bases near splice sites to explore [4]
 72 | -H		No Table Header
 73 | -h		Print this help
 74 | 
 75 | *This value may change according to the aligner:
 76 | 	- For Bowtie use 255
 77 | 	- For Bowtie2 use 40
 78 | 	- For BWA use 30
 79 | 	- For RNA-STAR use 255
 80 | 	- For HiSAT2 use 60
 81 | 	- For Tophat1 use 255
 82 | 	- For Tophat2 use 50
 83 | 	- For GSNAP use 30
 84 | 
 85 | """%(pid)
 86 | 
 87 | try:
 88 | 	opts, args = getopt.getopt(sys.argv[1:], "i:f:k:t:o:c:q:m:O:s:edpuT:B:Sv:n:EP:r:hHIXG:K:l:C:F:x:g:U")
 89 | except getopt.GetoptError as err:
 90 | 	print str(err) # will print something like "option -a not recognized"
 91 | 	usage()
 92 | 	sys.exit(2)
 93 | 
 94 | MAX_DEPTH=100000
 95 | corrstr=0
 96 | strconf=0.70 #confidenza strand
 97 | useconf=0
 98 | bamfile=''
 99 | fastafile=''
100 | sortbam=0
101 | kfile=''
102 | nochrs=[]
103 | NCPU=1
104 | infolder=''
105 | outfolder_='rediFolder_%s' %(pid)
106 | MINCOV=10
107 | QVAL=33 #NOT USED
108 | MQUAL=30
109 | MAPQ=30
110 | homo=5
111 | rmpv = '0-0'
112 | rmp = [int(x) for x in rmpv.split('-')]
113 | getstrand=0 # considera la strand
114 | exh=0 # escludi multi hits
115 | exd=0 # escludi duplicati
116 | conc=0 # se presenti paired-end, usa solo quelle concordanti
117 | mq=0 # considera il map quality
118 | rmnuc=0 # rimuovi nucleotide a monte ed a valle delle read; connesso a rmp e rmpv
119 | blatr=0 # applica la correzione blat
120 | blatfolder=''
121 | rmsh=0 # rimuovi sostituzioni in omopolimeri di lunghezza maggiore o uguale a homo
122 | vnuc=3 # numero minimo di basi che supportano la variazione
123 | mmf=0.1 # frequenza minima della variazione
124 | exms=0 # escludi sostituzioni multiple
125 | exss=0 # escludi posizioni introniche nei pressi dei siti di splicing a nss nucleotidi 
126 | nss=4 # basi introniche da esplorare per ogni sito si splicing
127 | splicefile='' #'splicesites.hg18.sorted.txt'
128 | #custsub=0  # use custom distribution 
129 | #custfile='' # custom distribution file
130 | #sigsites=0 # select significant sites
131 | #test = 'bh' # select statistical test
132 | usubs=[x+y for x in 'ACGT' for y in 'ACGT' if x!=y] # use these substitutions [default all]
133 | #sval=0.05 # significant value
134 | annfile='' # use annotation file for strand correction and features
135 | sortann=0 # sort annotation file
136 | uann=0 # use annotation
137 | exfile='' # use annotations to exclude positions
138 | expos=0 #
139 | chunckval=100000
140 | unchange1=1
141 | unchange2=0
142 | noheader=0
143 | 
144 | for o, a in opts:
145 | 	if o in ("-h","--help"):
146 | 		usage()
147 | 		sys.exit()
148 | 	elif o == "-H": noheader=1
149 | 	elif o == "-i": bamfile=a
150 | 	elif o == "-f": fastafile=a
151 | 	elif o == "-l": kfile=a
152 | 	elif o == "-k":
153 | 		if os.path.exists(a):
154 | 			f=open(a)
155 | 			nochrs=[x.strip() for x in f if x.strip()!='']
156 | 			f.close()
157 | 		else: nochrs=[x for x in a.split(',') if x.strip()!='']
158 | 	elif o == "-t": NCPU=int(a)
159 | 	elif o == "-F": infolder=a	
160 | 	elif o == "-o": outfolder_=a
161 | 	elif o == "-c": MINCOV=int(a)
162 | 	#elif o == "-Q": QVAL=int(a)
163 | 	elif o == "-q": MQUAL=int(a)
164 | 	elif o == "-m": MAPQ=int(a)	
165 | 	elif o == "-O": homo=int(a)
166 | 	elif o == "-x": strconf=float(a)
167 | 	elif o == "-g":
168 | 		if a=='2': useconf=1
169 | 	elif o == "-s":
170 | 		getstrand=1
171 | 		if int(a)==1: unchange1,unchange2=1,0
172 | 		elif int(a)==0: unchange1,unchange2=0,0
173 | 		elif int(a)==2: unchange1,unchange2=0,1
174 | 		elif int(a)==12: unchange1,unchange2=1,1
175 | 	elif o == "-U": usubs=[x.upper() for x in a.split(',') if a.strip()!='']
176 | 	elif o == "-e": exh=1
177 | 	elif o == "-d": exd=1
178 | 	elif o == "-p": conc=1
179 | 	elif o == "-I": sortbam=1
180 | 	elif o == "-X": sortann=1
181 | 	elif o == "-C": chunckval=int(a)
182 | 	elif o == "-u": mq=1
183 | 	elif o == "-T":
184 | 		rmpv = a
185 | 		try:
186 | 			rmp = [int(x) for x in rmpv.split('-')]
187 | 			rmnuc=1
188 | 		except: rmnuc=0
189 | 	elif o == "-B":
190 | 		blatfolder=a
191 | 		if os.path.exists(blatfolder): blatr=1
192 | 	elif o == "-S": corrstr=1
193 | 	elif o == "-U": rmsh=1
194 | 	elif o == "-v": vnuc=int(a)
195 | 	elif o == "-n": mmf=float(a)
196 | 	elif o == "-E": exms=1
197 | 	elif o == "-P":
198 | 		splicefile=a
199 | 		if os.path.exists(splicefile): exss=1
200 | 	elif o == "-K":
201 | 		exfile=a
202 | 		if os.path.exists(exfile): expos=1	
203 | 	elif o == "-r": nss=int(a)
204 | 	elif o == "-G":
205 | 		annfile=a
206 | 		uann=1
207 | 	else:
208 | 		assert False, "Unhandled Option"
209 | 
210 | #######
211 | commandLine=' '.join(sys.argv[1:])
212 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
213 | params=[]
214 | #Input parameters
215 | params.append('REDItoolKnown version %s\n' %(version))
216 | params.append('User command line: %s\n' %(commandLine))
217 | params.append('Analysis ID: %s\n' %(pid))
218 | params.append('Analysis time: %s\n' %(script_time))
219 | params.append('-i --> BAM file: %s\n' %(bamfile))
220 | params.append('-f --> Reference file: %s\n' %(fastafile))
221 | params.append('-I --> Sort input BAM file: %i\n' %(sortbam))
222 | params.append('-l --> File with known RNA editing positions: %s\n' %(kfile))
223 | params.append('-X --> Sort annotation files: %i\n' %(sortann))
224 | params.append('-k --> Regions to exclude: %s\n' %(','.join(nochrs)))
225 | params.append('-t --> Number of working threads: %i\n' %(NCPU))
226 | params.append('-C --> Base interval to explore: %i\n' %(chunckval))
227 | params.append('-o --> Output folder: %s\n' %(outfolder_))
228 | params.append('-F --> Infolder folder: %s\n' %(infolder))
229 | params.append('-c --> Min. per base coverage: %i\n' %(MINCOV))
230 | #params.append('-Q --> FastQ offset value: %i\n' %(QVAL))
231 | params.append('-q --> Min. per base quality: %i\n' %(MQUAL))
232 | params.append('-m --> Min. mapping quality: %i\n' %(MAPQ))
233 | params.append('-O --> Min. homoplymeric length: %i\n' %(homo))
234 | params.append('-s --> Infer strand: %i - %i-%i\n' %(getstrand,unchange1,unchange2))
235 | params.append('-g --> Use confidence: %i\n' %(useconf))
236 | params.append('-x --> Strand confidence: %.2f\n' %(strconf))
237 | params.append('-S --> Strand correction : %i\n' %(corrstr))
238 | params.append('-G --> GFF annotation to infer strand: %s\n' %(annfile))
239 | params.append('-e --> Exclude multi hits: %i\n' %(exh))
240 | params.append('-d --> Exclude duplicates: %i\n' %(exd))
241 | params.append('-p --> Use paired concardant reads only: %i\n' %(conc))
242 | params.append('-u --> Consider mapping quality: %i\n' %(mq))
243 | params.append('-T --> Trim x bases up and y bases down per read: %i - %i-%i\n' %(rmnuc,rmp[0],rmp[1]))
244 | params.append('-B --> Blat folder for correction: %s\n' %(blatfolder))
245 | params.append('-S --> Remove substitutions in homopolymeric regions: %i\n' %(rmsh))
246 | params.append('-v --> Min. num. of reads supporting the variation: %i\n' %(vnuc))
247 | params.append('-n --> Min. editing frequency: %.2f\n' %(mmf))
248 | params.append('-E --> Exclude positions with multiple changes: %i\n' %(exms))
249 | params.append('-P --> File containing splice sites annotations: %s\n' %(splicefile))
250 | params.append('-r --> Num. of bases near splice sites to explore: %i\n' %(nss))
251 | params.append('-K --> File with positions to exclude: %s\n' %(exfile))
252 | #######
253 | 
254 | def pid_exists(pid):
255 |     """Check whether pid exists in the current process table."""
256 |     if pid < 0:
257 |         return False
258 |     try:
259 |         os.kill(pid, 0)
260 |     except OSError, e:
261 |         return e.errno == errno.EPERM
262 |     else:
263 |         return True
264 |         
265 | def get_no(pvalue,siglevel,ngenes): # No Correction
266 | 	lista=[]
267 | 	pp=siglevel
268 | 	y=0
269 | 	for i in pvalue:
270 | 		p=i[0]
271 | 		if p<=siglevel:
272 | 			lista.append(i)
273 | 			y+=1
274 | 	return lista,y,pp
275 | 
276 | def get_b(pvalue,siglevel,ngenes): # Bonferroni
277 | 	pvalue.sort()
278 | 	lista=[]
279 | 	y=0
280 | 	#bcorr=siglevel/ngenes
281 | 	pp=1.0
282 | 	for i in pvalue:
283 | 		p=i[0]*ngenes
284 | 		if p<=siglevel:
285 | 			lista.append(i)
286 | 			#lista[i[1]]=i[0]
287 | 			y+=1
288 | 			if p<pp: pp=p
289 | 	#print "Passed:",y,pp
290 | 	return lista,y,pp
291 | 
292 | def get_bh(pvalue,siglevel,ngenes): # B-H
293 | 	pvalue.sort()
294 | 	#print ngenes
295 | 	lista=[]
296 | 	x=1
297 | 	y=0
298 | 	p=0
299 | 	for i in pvalue:
300 | 		nf=i[0]*ngenes
301 | 		fdr=nf/x
302 | 		if fdr<=siglevel:
303 | 			#dic[i[1]]=i[0]
304 | 			lista.append(i)
305 | 			p=i[0]
306 | 			y+=1
307 | 		x+=1
308 | 	#print "Passed:",y,p
309 | 	return lista,y,p
310 | 
311 | def getTail(pp):
312 | 	if ftail=='l': return pp.left_tail
313 | 	elif ftail=='r': return pp.right_tail
314 | 	elif ftail=='t': return pp.two_tail
315 | 	
316 | def getDicSS(dicp): # dicp = dizionario con le frequenze di sostituzione
317 | 	dicpp={}
318 | 	for i in dicp:
319 | 		if i[0]!=i[1]:
320 | 			dicpp[i]=1-dicp[i]
321 | 	return dicpp
322 | 
323 | def getFreads(bases):
324 | 	fread={'A':0,'C':0,'G':0,'T':0}
325 | 	for i in range(4):
326 | 		if i==0: fread['A']=bases[i]
327 | 		elif i==1: fread['C']=bases[i]
328 | 		elif i==2: fread['G']=bases[i]
329 | 		elif i==3: fread['T']=bases[i]
330 | 	return fread
331 | 
332 | def getSub(ref,fread,dics):
333 | 	#fread={A,C,G,T}
334 | 	nref=fread[ref.upper()]
335 | 	sub=[(ref.upper()+i,nref,fread[i]) for i in fread if i!=ref.upper() and fread[i]!=0]
336 | 	allsub=' '.join([x[0] for x in sub])
337 | 	# lista del tipo [('AT', 50, 10), ('AG', 50, 2)]
338 | 	res=[] #[(int(dics[i[0]]*(i[1]+i[2])),((i[1]+i[2])-exp1),pvalue(i[1],i[2],int(dics[i[0]]*(i[1]+i[2])),((i[1]+i[2])-exp1))) for i in sub]
339 | 	for i in sub:
340 | 		#if binomial:
341 | 		#	pval=bdtrc(i[2],i[1]+i[2],(1.0-dics[i[0]]))
342 | 		#	#pval=Bprob(i[2],i[1]+i[2],(1.0-dics[i[0]]))
343 | 		#	#print i[2],i[1]+i[2],(1.0-dics[i[0]]),pval
344 | 		#	obs1,obs2,exp1,exp2=0,0,0,0
345 | 		obs1=i[1]
346 | 		obs2=i[2]
347 | 		exp1=int(dics[i[0]]*(i[1]+i[2]))
348 | 		exp2=((i[1]+i[2]) - exp1)
349 | 		pval=pvalue(obs1,obs2,exp1,exp2)
350 | 		pval=getTail(pval)
351 | 		res.append((i[0],obs1,obs2,exp1,exp2,str(pval)))
352 | 	if len(res)==1: return res[0][5] #,allsub,fread
353 | 	elif len(res) > 1:
354 | 		rr=[float(x[-1]) for x in res]
355 | 		idx=rr.index(min(rr))
356 | 		return res[idx][5] #,allsub,fread
357 | 	else: return '1.0' #,0,0
358 | 	
359 | def BaseCount(seq,ref):
360 | 	b={'A':0,'C':0,'G':0,'T':0}
361 | 	subs=[]
362 | 	subv=[]
363 | 	for i in seq.upper():
364 | 		if b.has_key(i): b[i]+=1
365 | 	for i in b:
366 | 		if not b.has_key(ref): continue
367 | 		if b[i]!=0 and i!=ref:
368 | 			vv=float(b[i])/(b[i]+b[ref])
369 | 			subv.append((b[i],vv,ref+i))
370 | 	subv.sort()
371 | 	subv.reverse()
372 | 	for i in subv:
373 | 		if i[0]>=vnuc and i[1]>=mmf: subs.append(i[2])
374 | 	freq=0.0
375 | 	if len(subs)==0: subs.append('-')
376 | 	else: freq=subv[0][1]	
377 | 	return sum(b.values()),[b['A'],b['C'],b['G'],b['T']],' '.join(subs),'%.2f'%(freq)
378 | 
379 | def meanq(v,n):
380 | 	try:m=float(v)/n
381 | 	except: m=0.0
382 | 	return '%.2f'%(m)
383 | 	
384 | def rmHomo(sequp,seqdw,gh,ref):
385 | 	if len(sequp)==0 and len(seqdw)==0: return 0
386 | 	up,dw=0,0
387 | 	for i in seqdw:
388 | 		if i==ref:dw+=1
389 | 		else:break
390 | 	for i in sequp[::-1]:
391 | 		if i==ref:up+=1
392 | 		else:break
393 | 	if up+dw+1 >= gh : return 1
394 | 	return 0
395 | 
396 | def prop(tot,va):
397 | 	try: av=float(va)/tot
398 | 	except: av=0.0
399 | 	return av
400 | 
401 | def vstand(strand):
402 | 	vv=[(strand.count('+'),'+'),(strand.count('-'),'-'),(strand.count('*'),'*')]
403 | 	if vv[0][0]==0 and vv[1][0]==0: return '*'
404 | 	if useconf:
405 | 		totvv=sum([x[0] for x in vv[:2]])
406 | 		if prop(totvv,vv[0][0])>=strconf: return '+'
407 | 		if prop(totvv,vv[1][0])>=strconf: return '-'
408 | 		return '*'
409 | 	else:
410 | 		if vv[0][0]==vv[1][0] and vv[2][0]==0: return '+'
411 | 		return max(vv)[1]
412 | 
413 | def comp(s):
414 | 	a={'A':'T','T':'A','C':'G','G':'C'}
415 | 	ss=''
416 | 	for i in s.upper():
417 | 		if a.has_key(i): ss+=a[i]
418 | 		else: ss+='N'
419 | 	return ss	
420 | 
421 | def whereis(program):
422 | 	for path in os.environ.get('PATH', '').split(':'):
423 | 		if os.path.exists(os.path.join(path, program)) and not os.path.isdir(os.path.join(path, program)): return 1
424 | 	return 0
425 | 
426 | def vstrand(lista):
427 | 	if len(lista)==0: return '2'
428 | 	p=lista.count('+')
429 | 	m=lista.count('-')
430 | 	if p==len(lista): return '1'
431 | 	elif m==len(lista): return '0'
432 | 	else: return '2'
433 | 
434 | def getd(lines): #fixed error in reading strand 6/3/2014
435 | 	d={}
436 | 	for i in lines:
437 | 		l=(i.strip('\n\r')).split('\t')
438 | 		if len(l)>=3:
439 | 			if l[2]=='+': strand='1'
440 | 			elif l[2]=='-': strand='0'
441 | 			elif l[2] in '012': strand=l[2]
442 | 			else: strand='2'
443 | 		else: strand='2'
444 | 		d[int(l[1])]=strand
445 | 	return d
446 | 
447 | def normByStrand(seq_,strand_,squal_,mystrand_):
448 | 	st='+'
449 | 	if mystrand_=='0': st='-'
450 | 	seq,qual,squal='',0,[]
451 | 	for i in range(len(seq_)):
452 | 		if strand_[i]==st:
453 | 			seq+=seq_[i]
454 | 			qual+=squal_[i] #-QVAL
455 | 			squal.append(squal_[i])
456 | 	return seq,qual,squal
457 | 
458 | def normByBlat(seq_,strand_,squal_,blatc_):
459 | 	seq,qual,squal,strand='',0,[],''
460 | 	for i in range(len(seq_)):
461 | 		if blatc_[i]=='1':
462 | 			seq+=seq_[i]
463 | 			qual+=squal_[i]
464 | 			squal.append(squal_[i])
465 | 			strand+=strand_[i]
466 | 	return seq,qual,squal,strand
467 | 
468 | def testBlat(blc):
469 | 	if blc.count('1') > blc.count('0'): return 1
470 | 	return 0
471 | 		
472 | #######
473 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
474 | sys.stderr.write("Script time --> START: %s\n"%(script_time))
475 | sys.stderr.write("Analysis ID: %s\n"%(pid))
476 | 
477 | if not os.path.exists(bamfile):
478 | 	usage()
479 | 	sys.exit('BAM file %s not found.' %(bamfile))
480 | if sortbam:
481 | 	sys.stderr.write('Sorting BAM file.\n')
482 | 	pysam.sort(bamfile,'sorted_%s'%(pid))
483 | 	os.rename(bamfile,bamfile+'_old')
484 | 	os.rename('sorted_%s.bam'%(pid),bamfile)
485 | 	sys.stderr.write('Indexing BAM file.\n')
486 | 	pysam.index(bamfile)
487 | if not os.path.exists(bamfile+'.bai') and not sortbam:
488 | 	sys.stderr.write('Indexing BAM file.\n')
489 | 	pysam.index(bamfile)	
490 | if not os.path.exists(fastafile):
491 | 	usage()
492 | 	sys.exit('Fasta file %s not found.' %(fastafile))
493 | if not os.path.exists(fastafile+'.fai'):
494 | 	sys.stderr.write('Indexing Fasta file.\n')
495 | 	pysam.faidx(fastafile)
496 | if not os.path.exists(kfile): sys.exit('File containing RNA editing positions not found.')
497 | if sortann:
498 | 		if not whereis('grep'): sys.exit('grep command not found.')
499 | 		if not whereis('sort'): sys.exit('sort command not found.')
500 | 		sys.stderr.write('Sorting file with known editing positions.\n')
501 | 		scmd='grep -v ^"chrom" %s | grep -v "^[[:space:]]*$" | sort -k1,1 -k2,2n > %s' %(kfile,'positions_%s'%(pid))
502 | 		os.system(scmd)
503 | 		os.rename(kfile,kfile+'_old')
504 | 		os.rename('positions_%s'%(pid),kfile)
505 | if not os.path.exists(kfile+'.tbi'):
506 | 		sys.stderr.write('Indexing file with known positions.\n')
507 | 		kfile=pysam.tabix_index(kfile, seq_col=0, start_col=1, end_col=1)
508 | 		# Format for tabfile with positions:
509 | 		# chr start strand
510 | ##################################
511 | # check reference names
512 | rrefs={}
513 | ridxinfo=pysam.idxstats(bamfile)
514 | for j in ridxinfo.split('\n'): #MOD
515 | 	l=(j.strip()).split('\t')
516 | 	if l[0] in ['*','']: continue #MOD
517 | 	if int(l[2])+int(l[3]) > 0: rrefs[l[0]]=int(l[1])
518 | frefs=[]
519 | fidxinfo=open(fastafile+'.fai')
520 | for j in fidxinfo:
521 | 	l=(j.strip()).split('\t')
522 | 	if l[0]=='': continue
523 | 	frefs.append(l[0])
524 | fidxinfo.close()
525 | # in rna-seq
526 | rnof=[]
527 | for i in rrefs.keys():
528 | 	if i not in frefs: sys.stderr.write('WARNING: Region %s in RNA-Seq not found in reference file.\n' %(i))
529 | ##################################
530 | 
531 | if uann:
532 | 	getstrand=0
533 | 	if not os.path.exists(annfile):
534 | 		usage()
535 | 		sys.exit('Annotation file %s not found.' %(annfile))
536 | 	if sortann:
537 | 		if not whereis('grep'): sys.exit('grep command not found.')
538 | 		if not whereis('sort'): sys.exit('sort command not found.')
539 | 		sys.stderr.write('Sorting annotation file.\n')
540 | 		scmd='grep ^"#" %s; grep -v ^"#" %s | sort -k1,1 -k4,4n > %s' %(annfile,annfile,'annotation_%s'%(pid))
541 | 		os.system(scmd)
542 | 		os.rename(annfile,annfile+'_old')
543 | 		os.rename('annotation_%s'%(pid),annfile)
544 | 	if not os.path.exists(annfile+'.tbi'):
545 | 		sys.stderr.write('Indexing annotation file.\n')
546 | 		annfile=pysam.tabix_index(annfile, preset='gff')
547 | ###########################################################
548 | # Annotation file to exclude positions
549 | if expos:
550 | 	if not os.path.exists(exfile):
551 | 		usage()
552 | 		sys.exit('File %s not found.' %(exfile))
553 | 	if sortann:
554 | 		if not whereis('grep'): sys.exit('grep command not found.')
555 | 		if not whereis('sort'): sys.exit('sort command not found.')
556 | 		sys.stderr.write('Sorting file.\n')
557 | 		scmd='grep ^"#" %s; grep -v ^"#" %s | sort -k1,1 -k4,4n > %s' %(exfile,exfile,'exfile_%s'%(pid))
558 | 		os.system(scmd)
559 | 		os.rename(exfile,exfile+'_old')
560 | 		os.rename('exfile_%s'%(pid),exfile)
561 | 	if not os.path.exists(exfile+'.tbi'):
562 | 		sys.stderr.write('Indexing %s file.\n' %(exfile))
563 | 		exfile=pysam.tabix_index(exfile, preset='gff')	
564 | ###########################################################
565 | #mainbam=pysam.Samfile(bamfile,"rb")
566 | #regions=mainbam.references
567 | #regionslens=mainbam.lengths
568 | #mainbam.close()
569 | #dicregions=dict([(regions[x],regionslens[x]) for x in range(len(regions))])
570 | #chrs=[x for x in regions if x not in nochrs]
571 | dicregions=dict(rrefs.items())
572 | chrs=[x for x in dicregions.keys() if x not in nochrs]
573 | sys.stderr.write('Analysis on %i regions.\n' %(len(chrs)))
574 | 
575 | if infolder!='': outfolder=os.path.join(outfolder_,'known_%s_%s' %(infolder,pid))
576 | else: outfolder=os.path.join(outfolder_,'known_%s' %(pid))
577 | if not os.path.exists(outfolder):
578 | 	splitfolder=os.path.split(outfolder)
579 | 	if not os.path.exists(splitfolder[0]): os.mkdir(splitfolder[0])
580 | 	os.mkdir(outfolder)	
581 | outtable=os.path.join(outfolder,'outTable_%s' %(pid))
582 | #write command line and input parameters
583 | f=open(os.path.join(outfolder,'parameters.txt'),'w')
584 | f.writelines(params)
585 | f.close()
586 | 
587 | #############################################
588 | d={}
589 | if blatr:
590 | 	badblat=blatfolder #os.path.join(blatfolder,'blatseqs_%s.bad'%(chr))
591 | 	if os.path.exists(badblat):
592 | 		sys.stderr.write('Using Blat mapping for RNAseq...\n')
593 | 		f=open(badblat)
594 | 		for i in f:
595 | 			l=(i.strip()).split()
596 | 			d[l[0]+'_'+l[1]]=int(l[1])
597 | 		f.close()
598 | 		sys.stderr.write('Found %i reads.\n'%(len(d)))
599 | 
600 | def exploreBAM(myinput):
601 | 	inputs=myinput.split('$')
602 | 	chr,bamfile=inputs[0],inputs[1]
603 | 	outfile=os.path.join(outfolder,'table_%s_%s'%(chr,pid))
604 | 	#outfile2=os.path.join(outfolder,'subs_%s_%s'%(chr,pid))
605 | 	d,di={},{}
606 | 	bam=pysam.Samfile(bamfile,"rb")
607 | 	fasta=pysam.Fastafile(fastafile)
608 | 	ktabix=pysam.Tabixfile(kfile)
609 | 	lenregion=dicregions[chr]
610 | 	if uann: tabix=pysam.Tabixfile(annfile)
611 | 	if expos: extabix=pysam.Tabixfile(exfile)
612 | 	out=open(outfile,'w')
613 | 	#if not custsub:
614 | 	#	dsubs=dict([(x+y, 0) for x in 'ACGT' for y in 'ACGT'])
615 | 	#	out2=open(outfile2,'w')
616 | 	#header='Region\tPosition\tReference\tCoverage\tMeanQuality\tBaseCount\tSubs\tFrequency\n'
617 | 	#out.write(header)
618 | 	sys.stderr.write('Started analysis on region: %s\n'%(chr))
619 | 	#if blatr:
620 | 	#	badblat=os.path.join(blatfolder,'blatseqs_%s.bad'%(chr))
621 | 	#	if os.path.exists(badblat):
622 | 	#		sys.stderr.write('Using Blat mapping for region %s\n'%(chr))
623 | 	#		f=open(badblat)
624 | 	#		for i in f:
625 | 	#			l=(i.strip()).split()
626 | 	#			d[l[0]+'_'+l[1]]=int(l[1])
627 | 	#		f.close()
628 | 	#		sys.stderr.write('Found %i reads for region %s\n'%(len(d),chr))
629 | 	if exss:
630 | 		if os.path.exists(splicefile):
631 | 			sys.stderr.write('Loading known splice sites for region %s\n'%(chr))
632 | 			f=open(splicefile)
633 | 			for i in f:
634 | 				l=(i.strip()).split()
635 | 				if l[0]!=chr: continue
636 | 				st,tp,cc=l[4],l[3],int(l[1])
637 | 				if st=='+' and tp=='D':
638 | 					for j in range(nss): di[cc+(j+1)]=0
639 | 				if st=='+' and tp=='A':
640 | 					for j in range(nss): di[cc-(j+1)]=0
641 | 				if st=='-' and tp=='D': 	
642 | 					for j in range(nss): di[cc-(j+1)]=0
643 | 				if st=='-' and tp=='A':
644 | 					for j in range(nss): di[cc+(j+1)]=0	
645 | 			f.close()
646 | 			sys.stderr.write('Loaded %i positions for %s\n'%(len(di),chr))
647 | 	if chr in ktabix.contigs:
648 | 		for kpos in range(0,lenregion,chunckval):
649 | 			startk,endk=kpos,(kpos+chunckval)-1
650 | 			kres=[kk for kk in ktabix.fetch(reference=chr,start=startk,end=endk)]
651 | 			if len(kres)==0: continue
652 | 			kdic=getd(kres)
653 | 			#print kdic
654 | 			# else explore bam to find exact positions
655 | 			for pileupcolumn in bam.pileup(chr,startk,endk,stepper='nofilter', max_depth=MAX_DEPTH):
656 | 				if not startk<=pileupcolumn.reference_pos<=endk: continue
657 | 				if not kdic.has_key(pileupcolumn.reference_pos+1): continue
658 | 				ref=fasta.fetch(chr,pileupcolumn.reference_pos,pileupcolumn.reference_pos+1).upper()
659 | 				seq,qual,strand,squal,blatc='',0,'',[],''
660 | 				if rmsh:
661 | 					if ((pileupcolumn.reference_pos+1)-homo)-1 < 0: sequp=''
662 | 					else: sequp=(fasta.fetch(chr,((pileupcolumn.reference_pos+1)-homo)-1,(pileupcolumn.reference_pos+1)-1)).upper()
663 | 					seqdw=(fasta.fetch(chr,pileupcolumn.reference_pos+1,(pileupcolumn.reference_pos+1)+homo)).upper()
664 | 				for pileupread in pileupcolumn.pileups: # per ogni base dell'allineamento multiplo
665 | 					#s,q,t,qq=pileupread.alignment.seq[pileupread.qpos].upper(),ord(pileupread.alignment.qual[pileupread.qpos])-QVAL,'*',pileupread.alignment.qual[pileupread.qpos]
666 | 					if pileupread.is_del: continue
667 | 					if pileupread.alignment.is_qcfail: continue
668 | 					if pileupread.alignment.is_supplementary: continue
669 | 					if pileupread.alignment.has_tag('SA'): continue
670 | 					# escludi posizioni introniche nei pressi di splice sites
671 | 					if exss and di.has_key(pileupcolumn.reference_pos+1): continue
672 | 					# multiple hit
673 | 					if exh:
674 | 						if pileupread.alignment.is_secondary: continue
675 | 						if pileupread.alignment.has_tag('NH'):
676 | 							if pileupread.alignment.get_tag('NH') > 1: continue
677 | 					# duplicates
678 | 					if exd and pileupread.alignment.is_duplicate: continue
679 | 					# se paired end
680 | 					if conc: # se devi usare solo le paired
681 | 						# se non sono paired
682 | 						if not pileupread.alignment.is_paired: continue
683 | 						# se non sono concordanti
684 | 						if not pileupread.alignment.is_proper_pair: continue
685 | 						# se concordanti ma nello stesso orientamento
686 | 						flag=pileupread.alignment.flag
687 | 						if pileupread.alignment.is_duplicate: flag=flag-1024
688 | 						if pileupread.alignment.is_secondary: flag=flag-256
689 | 						if flag in [67,131,115,179]: continue
690 | 					# mapping quality
691 | 					if mq and pileupread.alignment.mapping_quality < MAPQ: continue
692 | 					#se la qualita' >= alla qualita' minima
693 | 					if not pileupread.alignment.query_qualities: pileupread.alignment.query_qualities=[30 for vn in range(len(pileupread.alignment.query_sequence))]
694 | 					#
695 | 					#print pileupread.alignment.query_sequence
696 | 					#print pileupread.query_position
697 | 					s,q,t,qq=pileupread.alignment.query_sequence[pileupread.query_position].upper(),pileupread.alignment.query_qualities[pileupread.query_position],'*',pileupread.alignment.query_qualities[pileupread.query_position]
698 | 					if q >= MQUAL and pileupcolumn.reference_pos in pileupread.alignment.get_reference_positions():
699 | 						#tags=dict(pileupread.alignment.tags)
700 | 						#deduci la strand per ogni posizione
701 | 						if getstrand:
702 | 							#usa le info del mapping se strand oriented 
703 | 							if pileupread.alignment.is_read1:
704 | 								if unchange1:
705 | 									if pileupread.alignment.is_reverse: t='-'
706 | 									else: t='+'
707 | 								else:
708 | 									if pileupread.alignment.is_reverse: t='+'
709 | 									else: t='-'
710 | 							elif pileupread.alignment.is_read2:
711 | 								if unchange2:
712 | 									if pileupread.alignment.is_reverse: t='-'
713 | 									else: t='+'
714 | 								else:
715 | 									if pileupread.alignment.is_reverse: t='+'
716 | 									else: t='-'
717 | 							else: # for single ends
718 | 								if unchange1:
719 | 									if pileupread.alignment.is_reverse: t='-'
720 | 									else: t='+'
721 | 								else:
722 | 									if pileupread.alignment.is_reverse: t='+'
723 | 									else: t='-'	
724 | 						if rmnuc:
725 | 							#rlen=pileupread.alignment.rlen #pileupread.alignment.qlen #lunghezza della specifica read
726 | 							#print rlen,pileupread.qpos,pileupread.alignment.qstart,pileupread.alignment.qend
727 | 							# verifica se il nuc deve essere rimosso alle estremita' nel range x-y
728 | 							# testare il forward
729 | 							#qp=pileupread.qpos #pileupread.qpos-pileupread.alignment.qstart
730 | 							#print pileupread.qpos,pileupread.alignment.rlen,len(pileupread.alignment.seq)
731 | 							#if pileupread.alignment.is_reverse:
732 | 							#	if (rlen-qp)-1 < rmp[0]:continue
733 | 							#	if (rlen-qp)-1 > ((rlen)-rmp[1])-1: continue
734 | 							#else:
735 | 							#	if qp<rmp[0]:continue
736 | 							#	if qp>(rlen-rmp[1])-1: continue
737 | 							rlen=pileupread.alignment.query_length #pileupread.alignment.qlen #lunghezza della specifica read
738 | 							qp=pileupread.query_position #pileupread.qpos-pileupread.alignment.qstart
739 | 							if rmp[0]>0: #rimuovi posizioni al 5'
740 | 								if pileupread.alignment.is_reverse:
741 | 									if (pileupread.alignment.query_alignment_end-rmp[1]) <=qp<= pileupread.alignment.query_alignment_end-1: continue
742 | 								else: 
743 | 									if pileupread.alignment.query_alignment_start <=qp<= (pileupread.alignment.query_alignment_start+rmp[0])-1: continue
744 | 							if rmp[1]>0: #rimuovi posizioni al 3'
745 | 								if pileupread.alignment.is_reverse:
746 | 									if pileupread.alignment.query_alignment_start <=qp<= (pileupread.alignment.query_alignment_start+rmp[0])-1: continue
747 | 								else: 
748 | 									if (pileupread.alignment.query_alignment_end-rmp[1]) <=qp<= pileupread.alignment.query_alignment_end-1: continue
749 | 							#print qp, rmp								
750 | 						# se la read di appartenenza non mappa in modo univoco con Blat
751 | 						if blatr:
752 | 							rt=0
753 | 							if pileupread.alignment.is_read1: rt=1
754 | 							elif pileupread.alignment.is_read2: rt=2
755 | 							else: rt=0
756 | 							rname=pileupread.alignment.query_name+'_%i'%(rt)
757 | 							if d.has_key(rname): blatc+='0' #continue
758 | 							else: blatc+='1'
759 | 						# se la base e' diversa dal reference
760 | 						# se in regione omopolimerica scarta
761 | 						if rmsh and rmHomo(sequp,seqdw,homo,ref): continue
762 | 						seq+=s
763 | 						qual+=q
764 | 						strand+=t
765 | 						squal.append(qq)
766 | 				if seq.strip()!='':
767 | 					if blatr:
768 | 						if testBlat(blatc): seq,qual,squal,strand=normByBlat(seq,strand,squal,blatc)
769 | 						else: continue
770 | 					#print pileupcolumn.reference_pos+1,seq,squal
771 | 					#mystrand=kdic[pileupcolumn.reference_pos+1]
772 | 					#print mystrand
773 | 					try: mystrand=kdic[pileupcolumn.reference_pos+1]
774 | 					except: mystrand='2'
775 | 					#print chr,pileupcolumn.reference_pos+1,seq,strand, mystrand
776 | 					if uann and not getstrand:
777 | 						if chr in tabix.contigs:
778 | 							sres=[kk.strand for kk in tabix.fetch(reference=chr,start=(pileupcolumn.reference_pos),end=(pileupcolumn.reference_pos+1),parser=pysam.asGTF())]
779 | 							mystrand=vstrand(sres)
780 | 					if getstrand and not uann:
781 | 						mystr=vstand(strand)
782 | 						if mystr=='-': mystrand='0'
783 | 						elif mystr=='+': mystrand='1'
784 | 						else: mystrand='2'
785 | 					if mystrand=='0':
786 | 						seq=comp(seq)
787 | 						ref=comp(ref)
788 | 					#if getstrand and mystrand in ['1','0'] and not useconf: seq,qual,squal=normByStrand(seq,strand,squal,mystrand)
789 | 					if getstrand and mystrand in ['1','0'] and corrstr: seq,qual,squal=normByStrand(seq,strand,squal,mystrand)
790 | 					if uann and mystrand in ['1','0'] and corrstr: seq,qual,squal=normByStrand(seq,strand,squal,mystrand)
791 | 					#if not getstrand and not uann and mystrand in ['1','0']: seq,qual,squal=normByStrand(seq,strand,squal,mystrand)
792 | 					#print chr,pileupcolumn.reference_pos+1,seq,strand,mystrand
793 | 					cov,bcomp,subs,freq=BaseCount(seq,ref)
794 | 					if cov < MINCOV: continue
795 | 					if exms and subs.count(' ')>0: continue
796 | 					mqua=meanq(qual,len(seq))
797 | 					if expos:
798 | 						if chr in extabix.contigs:
799 | 							exres=[kk for kk in extabix.fetch(reference=chr,start=(pileupcolumn.reference_pos),end=(pileupcolumn.reference_pos+1))]
800 | 							if len(exres)>0: continue			
801 | 					line='\t'.join([chr,str(pileupcolumn.reference_pos+1),ref,mystrand,str(cov),(mqua),str(bcomp),subs,freq])+'\n'
802 | 					out.write(line)
803 | 	bam.close()
804 | 	fasta.close()
805 | 	ktabix.close()
806 | 	out.close()
807 | 	if uann: tabix.close()
808 | 	if expos: extabix.close()
809 | 	sys.stderr.write('Job completed for region: %s\n'%(chr))
810 | 
811 | def addPvalue(myinput2): # not used here 
812 | 	inputs=myinput2.split('$')
813 | 	f=open(inputs[0])
814 | 	subs=eval((f.readline()).strip())
815 | 	f.close()
816 | 	dsubs={}
817 | 	for i in subs: dsubs[i]=float(subs[i])/sum(subs.values())
818 | 	dsubss=getDicSS(dsubs)
819 | 	#print dsubss
820 | 	o=open(inputs[2],'w')
821 | 	f=open(inputs[1])
822 | 	for i in f:
823 | 		l=(i.strip()).split('\t')
824 | 		if i.strip()=='': continue
825 | 		#if i.startswith('Region'):
826 | 		#	l.append('Pvalue')
827 | 		#	o.write('\t'.join(l)+'\n')
828 | 		#	continue
829 | 		if l[6]!='-': pval=getSub(l[2],getFreads(eval(l[6])),dsubss)
830 | 		else: pval='1.0'
831 | 		l.append(pval)
832 | 		o.write('\t'.join(l)+'\n')
833 | 	o.close()
834 | 	
835 | def do_work(q):
836 | 	while True:
837 | 		try:
838 | 			x=q.get(block=False)
839 | 			exploreBAM(x)
840 | 		except Empty:
841 | 			break
842 | 
843 | work_queue = Queue()
844 | for i in chrs:
845 | 	strinput=i+'$'+bamfile
846 | 	work_queue.put(strinput)
847 | processes=[Process(target=do_work, args=(work_queue,)) for i in range(NCPU)]
848 | for t in processes:
849 | 	t.start()
850 | for t in processes:
851 | 	t.join()
852 | time.sleep(0.5)
853 | #
854 | head='Region\tPosition\tReference\tStrand\tCoverage-q%i\tMeanQ\tBaseCount[A,C,G,T]\tAllSubs\tFrequency\n' %(MQUAL)
855 | sys.stderr.write('Merging Tables.\n')
856 | o=open(outtable,'w')
857 | if noheader==0: o.write(head)
858 | for i in chrs:
859 | 	#tabfile=os.path.join(outfolder,'outTable_%s_%s' %(i,pid))
860 | 	tabfile=os.path.join(outfolder,'table_%s_%s' %(i,pid))
861 | 	if os.path.exists(tabfile):
862 | 		f=open(tabfile)
863 | 		for j in f: o.write(j)
864 | 		f.close()
865 | 		os.remove(tabfile)
866 | 		#os.remove(intabfile)
867 | o.close()
868 | 
869 | #if sigsites:
870 | #	sys.stderr.write('Selecting significant sites.\n')
871 | #	outsig=os.path.join(outfolder,'outTableSig_%s' %(pid))
872 | #	f=open(outtable)
873 | #	o=open(outsig,'w')
874 | #	o.write(head)
875 | #	allv=[]
876 | #	for i in f:
877 | #		if i.startswith('Region'): continue
878 | #		if i.strip()=='': continue
879 | #		l=(i.strip()).split('\t')
880 | #		if l[7]=='-': continue
881 | #		if l[7] not in usubs: continue
882 | #		pp=float(l[9])
883 | #		allv.append((pp,i))
884 | #	if test=='bh': rr=get_bh(allv,sval,len(allv))
885 | #	elif test=='bo': rr=get_b(allv,sval,len(allv))
886 | #	else: rr=get_no(allv,sval,len(allv))
887 | #	for i in rr[0]: o.write(i[1])		
888 | #	f.close()
889 | #	o.close()
890 | 
891 | sys.stderr.write('Results saved on %s\n'%(outtable))
892 | #if sigsites: sys.stderr.write('Significant sites saved on %s\n'%(outsig))
893 | 
894 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
895 | sys.stderr.write("Script time --> END: %s\n"%(script_time))
896 | 
897 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from distutils.core import setup
 4 | 
 5 | setup(name='REDItools',
 6 | 	version='1.3',
 7 | 	description='Python Scripts for RNA editing detection by RNA-Seq data',
 8 | 	author='Ernesto Picardi',
 9 | 	author_email='ernesto.picardi@gmail.com',
10 | 	url='https://github.com/BioinfoUNIBA/REDItools',
11 | 	scripts=['main/REDItoolDenovo.py',
12 | 	'main/REDItoolDnaRna.py',
13 | 	'main/REDItoolKnown.py',
14 | 	'accessory/AnnotateTable.py',
15 | 	'accessory/FilterTable.py',
16 | 	'accessory/SearchInTable.py',
17 | 	'accessory/selectPositions.py',
18 | 	'accessory/GFFtoTabix.py',
19 | 	'accessory/SortGFF.py',
20 | 	'accessory/SortTable.py',
21 | 	'accessory/TableToGFF.py',
22 | 	'accessory/tableToTabix.py',
23 | 	'accessory/readPsl.py',
24 | 	'accessory/subCount.py',
25 | 	'accessory/subCount2.py',
26 | 	'accessory/rediportal2recoding.py'
27 | 	],
28 | 	license='LICENSE.txt',
29 | 	classifiers=[
30 |           'Intended Audience :: Computational biologists',
31 |           'License :: OSI Approved :: MIT',
32 |           'Operating System :: MacOS :: MacOS X',
33 |           'Operating System :: POSIX',
34 |           'Programming Language :: Python',
35 |           ],
36 | 	long_description=open('README_1.md').read(),
37 | 	platforms=['Linux','Unix','MacOS']
38 | )
39 | 
40 | 


--------------------------------------------------------------------------------