├── .gitignore
├── Docker
│   ├── Dockerfile
│   └── Readme.md
├── Images
│   ├── Fig 5.png
│   ├── Fig1.png
│   ├── Fig2.png
│   ├── Fig3.png
│   └── Fig4.png
├── LICENSE
├── NPfiles
│   ├── editingStats.txt
│   ├── editing_sorted.txt
│   └── sample_information_file.txt
├── NPscripts
│   ├── REDItoolDnaRnav13.py
│   ├── collect_editing_candidates.py
│   ├── conda_pckg_installer_docker.py
│   ├── conda_pckgs_installer.py
│   ├── download-prepare-data-NP.py
│   ├── download-prepare-data-NP_docker.py
│   └── get_Statistics.py
├── PKG-INFO
├── README.md
├── README_1.md
├── README_2.md
├── accessory
│   ├── AnnotateTable.py
│   ├── FilterTable.py
│   ├── GFFtoTabix.py
│   ├── Readme.md
│   ├── SearchInTable.py
│   ├── SortGFF.py
│   ├── SortTable.py
│   ├── TableToGFF.py
│   ├── get_DE_events.py
│   ├── readPsl.py
│   ├── rediportal2recoding.py
│   ├── selectPositions.py
│   ├── subCount.py
│   ├── subCount2.py
│   └── tableToTabix.py
├── main
│   ├── REDItoolDenovo.py
│   ├── REDItoolDnaRna.py
│   └── REDItoolKnown.py
└── setup.py
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /Docker/Dockerfile: -------------------------------------------------------------------------------- 1 | #Download base image centos latest 2 | FROM centos 3 | 4 | #Dockerfile Maintainer 5 | LABEL maintainer="clalogiudice@gmail.com" 6 | 7 | #Update the centos software with yum package-manager 8 | RUN yum update -y && yum clean all 9 | 10 | #Install git, wget and nano packages 11 | RUN yum -y install git wget nano && yum clean all 12 | 13 | #Clone Nature_protocol Git repository 14 | RUN git clone https://github.com/BioinfoUNIBA/REDItools 15 | 16 | WORKDIR "/REDItools/NPscripts/" 17 | 18 | #Install miniconda with conda packages required by the nature_protocol 19 | RUN chmod +x conda_pckg_installer_docker.py 20 | RUN ./conda_pckg_installer_docker.py 21 | ENV PATH /miniconda2/bin:$PATH
22 | RUN echo "source activate nature_protocol" >> ~/.bashrc 23 | 24 | #PREPARE NATURE_PROTOCOL environment 25 | WORKDIR "/" 26 | RUN echo "python ./REDItools/NPscripts/download-prepare-data-NP_docker.py" >> /root/.bashrc 27 | -------------------------------------------------------------------------------- /Docker/Readme.md: --------------------------------------------------------------------------------

DOCKER BASIC COMMANDS

This Dockerfile and its related image are part of the supplemental material for the paper
"Investigating RNA editing in deep transcriptome datasets with REDItools and REDIportal".

You can build an image from this Dockerfile with:

docker build -t [image_name] .
e.g. docker build -t rna_editing_protocol .

and run a container with:

docker run -it [image_name] bash
e.g. docker run -it rna_editing_protocol bash

OR

Download a pre-built image from Docker Hub with:

docker pull claudiologiudice/rna_editing_protocol:latest

and run a container with:

docker run -it claudiologiudice/rna_editing_protocol:latest bash
-------------------------------------------------------------------------------- /Images/Fig 5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BioinfoUNIBA/REDItools/822cee595a2584a7bcf6fc4a4da3857ba1c8e8ca/Images/Fig 5.png -------------------------------------------------------------------------------- /Images/Fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BioinfoUNIBA/REDItools/822cee595a2584a7bcf6fc4a4da3857ba1c8e8ca/Images/Fig1.png -------------------------------------------------------------------------------- /Images/Fig2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BioinfoUNIBA/REDItools/822cee595a2584a7bcf6fc4a4da3857ba1c8e8ca/Images/Fig2.png -------------------------------------------------------------------------------- /Images/Fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BioinfoUNIBA/REDItools/822cee595a2584a7bcf6fc4a4da3857ba1c8e8ca/Images/Fig3.png -------------------------------------------------------------------------------- /Images/Fig4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BioinfoUNIBA/REDItools/822cee595a2584a7bcf6fc4a4da3857ba1c8e8ca/Images/Fig4.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 BioinfoUNIBA 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /NPfiles/editingStats.txt: -------------------------------------------------------------------------------- 1 | SubType ALU REPnonALU NONREP ALL 2 | AC 0.244897959184 0.0 0.0 0.24 3 | GT 0.571428571429 0.0 0.0 0.56 4 | AG 92.4897959184 0.0 100.0 92.64 5 | CA 0.326530612245 0.0 0.0 0.32 6 | CG 0.244897959184 0.0 0.0 0.24 7 | GC 0.489795918367 0.0 0.0 0.48 8 | AT 0.571428571429 0.0 0.0 0.56 9 | GA 1.22448979592 0.0 0.0 1.2 10 | TG 0.244897959184 0.0 0.0 0.24 11 | CT 1.55102040816 0.0 0.0 1.52 12 | TC 1.87755102041 0.0 0.0 1.84 13 | TA 0.163265306122 0.0 0.0 0.16 14 | -------------------------------------------------------------------------------- /NPfiles/sample_information_file.txt: -------------------------------------------------------------------------------- 1 | Sample,Status 2 | SRR3306823,DIS 3 | SRR3306824,DIS 4 | SRR3306825,DIS 5 | SRR3306826,DIS 6 | SRR3306827,DIS 7 | SRR3306828,DIS 8 | SRR3306829,DIS 9 | SRR3306830,CTRL 10 | SRR3306831,CTRL 11 | SRR3306832,CTRL 12 | SRR3306833,CTRL 13 | SRR3306834,CTRL 14 | SRR3306835,CTRL 15 | SRR3306836,CTRL -------------------------------------------------------------------------------- /NPscripts/collect_editing_candidates.py: -------------------------------------------------------------------------------- 1 | 2 | import sys, os 3 | import glob 4 | 5 | atab=glob.glob('firstalu/DnaRna_*/outTable_*')[0] #alu refined 6 | ftab=glob.glob('second/DnaRna_*/outTable_*')[0] 7 | if not os.path.exists('knownEditing'): sys.exit('knownEditing file not found.') 8 | if not os.path.exists('pos.txt'): sys.exit('pos.txt file not found.') 9 | if not os.path.exists('posalu.txt'): sys.exit('posalu.txt file not found.') 10 | 11 | o=open('editing.txt','w') 12 | f=open('knownEditing') 13 | for i in f: o.write(i) 14 | f.close() 15 | if os.path.exists(ftab): 16 | f=open(ftab) 17 | d={} 18 | for i in f: 19 | if i.startswith('Region'): continue 20 | l=(i.strip()).split('\t') 21 | d[(l[0],l[1])]=0 22 | f.close() 23 | f=open('pos.txt') 24 | for i in f: 25 | if i.startswith('Region'): continue 26 | l=(i.strip()).split('\t') 27 | if d.has_key((l[0],l[1])): o.write(i) 28 | f.close() 29 | f=open(atab) 30 | d={} 31 | for i in f: 32 | if i.startswith('Region'): continue 33 | l=(i.strip()).split('\t') 34 | d[(l[0],l[1])]=0 35 | f.close() 36 | f=open('posalu.txt') 37 | for i in f: 38 | if i.startswith('Region'): continue 39 | l=(i.strip()).split('\t') 40 | if d.has_key((l[0],l[1])): o.write(i) 41 | f.close() 42 | -------------------------------------------------------------------------------- /NPscripts/conda_pckg_installer_docker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Mantainer clalogiudice@gmail.com 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 
12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 20 | 21 | import os, commands 22 | 23 | def install_conda_packages(conda_bin): 24 | """Installs conda packages required by the protocol""" 25 | install_cmd = os.system(cmd + ' install -n nature_protocol bcftools==1.9 bedtools==2.28.0 \ 26 | bzip2==1.0.6 bwa==0.7.17 bx-python==0.8.2 fastp==0.20.0 fastqc==0.11.8 \ 27 | fisher==0.1.4 git==2.21.0 gmap==2018.07.04 htslib==1.9 libdeflate==1.0 \ 28 | numpy==1.16.2 pysam==0.15.2 rseqc==2.6.4 samtools==1.9 scipy==1.2.1 \ 29 | star==2.7.0f wget==1.20.1') 30 | return install_cmd 31 | 32 | 33 | cwd = os.getcwd() 34 | installation_path = cwd + '/opt' 35 | if not os.path.exists(installation_path): 36 | os.mkdir(installation_path) 37 | os.chdir('./opt') 38 | conda_url = 'wget https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86_64.sh' 39 | i = 0 40 | while os.system(conda_url) != 0 and i <= 5: 41 | os.system(conda_url) 42 | i+=1 43 | os.system('chmod +x Miniconda2-latest-Linux-x86_64.sh') 44 | #os.system('bash Miniconda2-latest-Linux-x86_64.sh -b -p /REDItools/NPscripts/miniconda2/') 45 | os.system('bash Miniconda2-latest-Linux-x86_64.sh -b -p /miniconda2/') 46 | os.chdir('../') 47 | os.system('rm -rf opt/') 48 | cmd = '/miniconda2/bin/conda' 49 | os.system(cmd + ' config --add channels defaults') 50 | os.system(cmd + ' config --add channels bioconda') 51 | os.system(cmd + ' config --add channels conda-forge') 52 | os.system(cmd + ' create -n nature_protocol python=2.7 anaconda -y') 53 | 54 | install_conda_packages(cmd) 55 | 56 | 57 | -------------------------------------------------------------------------------- /NPscripts/conda_pckgs_installer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 
20 | 21 | import os, subprocess 22 | 23 | def install_conda_packages(conda_bin): 24 | """Installs conda packages required by the protocol""" 25 | install_cmd = os.system(conda_bin + ' install -n nature_protocol bcftools==1.9 bedtools==2.28.0 \ 26 | bzip2==1.0.6 bwa==0.7.17 bx-python==0.8.2 fastp==0.20.0 fastqc==0.11.8 \ 27 | fisher==0.1.4 git==2.21.0 gmap==2018.07.04 htslib==1.9 libdeflate==1.0 \ 28 | numpy==1.16.2 pysam==0.15.2 rseqc==2.6.4 samtools==1.9 scipy==1.2.1 \ 29 | star==2.7.0f wget==1.20.1') 30 | return install_cmd 31 | 32 | if subprocess.getstatusoutput('conda')[0] != 0: 33 | cwd = os.getcwd() 34 | installation_path = cwd + '/opt' 35 | if not os.path.exists(installation_path): 36 | os.mkdir(installation_path) 37 | os.chdir('./opt') 38 | conda_url = 'wget https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86_64.sh' 39 | i = 0 40 | while os.system(conda_url) != 0 and i <= 5: 41 | os.system(conda_url) 42 | i+=1 43 | os.system('chmod +x Miniconda2-latest-Linux-x86_64.sh') 44 | os.system('bash Miniconda2-latest-Linux-x86_64.sh') 45 | home_folder = os.path.expanduser('~') 46 | cmd = home_folder + '/miniconda2/bin/conda' 47 | os.system(cmd + ' config --add channels defaults') 48 | os.system(cmd + ' config --add channels bioconda') 49 | os.system(cmd + ' config --add channels conda-forge') 50 | os.system(cmd + ' create -n nature_protocol python=2.7 anaconda') 51 | install_conda_packages(cmd) 52 | print("Your conda environment has been successfully created, now close your terminal and open a new one." + "\n" + \ 53 | "Type in order:" + "\n" + \ 54 | "source " + home_folder + "/.bashrc" + "\n" + \ 55 | "conda activate nature_protocol") 56 | else: 57 | home_folder = os.path.expanduser('~') 58 | cmd = home_folder + '/miniconda2/bin/conda' 59 | os.system(cmd + ' config --add channels defaults') 60 | os.system(cmd + ' config --add channels bioconda') 61 | os.system(cmd + ' config --add channels conda-forge') 62 | os.system(cmd + ' create -n nature_protocol python=2.7 anaconda') 63 | install_conda_packages(cmd) 64 | print("Your conda environment has been successfully created, now close your terminal and open a new one."
+ "\n" + 65 | "Type in order:" + "\n" + \ 66 | "source " + home_folder + "/.bashrc" + "\n" + \ 67 | "conda activate nature_protocol") 68 | 69 | -------------------------------------------------------------------------------- /NPscripts/download-prepare-data-NP.py: -------------------------------------------------------------------------------- 1 | import sys, os, time 2 | import commands 3 | import distutils.spawn 4 | 5 | try: 6 | wdir=sys.argv[1] # working directory 7 | redipath=sys.argv[2] # path to REDItools folder 8 | usepath=sys.argv[3] # 1 to read program paths from a "mypaths" file 9 | except: 10 | sys.exit('Usage: python download-prepare-data-NP.py <working_directory> <REDItools_path> <use_mypaths_file 0/1>') 11 | 12 | def getData(cmd): 13 | tr=0 14 | while 1: 15 | st,out=commands.getstatusoutput(cmd) 16 | if st==0: 17 | return 0 18 | tr+=1 19 | if tr==10: break 20 | if tr>0: return 1 21 | 22 | def is_tool(name): 23 | wn=distutils.spawn.find_executable(name) 24 | if wn==None: return 1 25 | else: return wn 26 | 27 | def get_time(tstart,tend): 28 | telapsed=tend - tstart 29 | t_taken=time.strftime("%H:%M:%S", time.gmtime(telapsed)) 30 | return t_taken 31 | 32 | if usepath!='1': 33 | exe=['bwa','STAR','awk','bgzip','tabix','sort','gtf_splicesites','wget','python','gunzip'] 34 | nt=[] 35 | prg={} 36 | for i in exe: 37 | p=is_tool(i) 38 | if p==1: nt.append(i) 39 | prg[i]=p 40 | if len(nt)>0: 41 | for i in nt: 42 | sys.stderr.write('Program %s NOT found\n' %(i)) 43 | sys.exit('Install required software first.') 44 | else: 45 | if not os.path.exists('mypaths'): sys.exit('File mypaths does not exist.') 46 | nt=[] 47 | f=open('mypaths') 48 | prg={} 49 | for i in f: 50 | l=(i.strip()).replace(' ','') 51 | l=l.split('=') 52 | prg[l[0]]=l[1] 53 | 54 | if not os.path.exists(redipath): sys.exit('REDItools path does not exist.') 55 | redirec=os.path.join(redipath,'accessory','rediportal2recoding.py') 56 | if not os.path.exists(redirec): sys.exit('rediportal2recoding.py script not found.') 57 | prg['redirec']=redirec 58 | 59 | cdir=os.getcwd() 60 | sys.stderr.write('Current directory: %s\n' %(cdir)) 61 | folder=os.path.join(cdir,wdir) 62 | if not os.path.exists(folder): 63 | os.mkdir(folder) 64 | sys.stderr.write('Directory %s created.\n' %(wdir)) 65 | else: 66 | sys.stderr.write('Found working directory.\n') 67 | sys.stderr.write('Entering %s\n' %(wdir)) 68 | os.chdir(folder) 69 | 70 | #human genome 71 | sys.stderr.write('Getting human genome\n') 72 | tstart = time.time() 73 | os.mkdir('genome_hg19') 74 | os.chdir('genome_hg19') 75 | wcmd='%s ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz' %(prg['wget']) 76 | ot=getData(wcmd) 77 | if ot==1: sys.stderr.write('I cannot download the human genome.\n') 78 | else: sys.stderr.write('Human genome complete.\n') 79 | tend = time.time() 80 | sys.stderr.write('Human genome - time taken: %s\n' %(get_time(tstart,tend))) 81 | os.chdir('..') 82 | #Gencode 83 | sys.stderr.write('Getting GENCODE genes\n') 84 | tstart = time.time() 85 | os.mkdir('Gencode_annotation') 86 | os.chdir('Gencode_annotation') 87 | gcmd='%s ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/GRCh37_mapping/gencode.v30lift37.annotation.gtf.gz' %(prg['wget']) 88 | ot=getData(gcmd) 89 | if ot==1: sys.stderr.write('I cannot download GENCODE annotations.\n') 90 | else: sys.stderr.write('GENCODE annotations ready.\n') 91 | tend = time.time() 92 | sys.stderr.write('GENCODE annotations - time taken: %s\n' %(get_time(tstart,tend))) 93 | os.chdir('..') 94 | #RefSeq 95 | sys.stderr.write('Getting RefSeq hg19\n') 96 | tstart = time.time() 97 |
os.mkdir('Strand_detection') 98 | os.chdir('Strand_detection') 99 | gcmd='%s --no-check-certificate https://sourceforge.net/projects/rseqc/files/BED/Human_Homo_sapiens/hg19_RefSeq.bed.gz' %(prg['wget']) 100 | ot=getData(gcmd) 101 | if ot==1: sys.stderr.write('I cannot download REFSEQ annotations.\n') 102 | else: sys.stderr.write('REFSEQ annotations ready.\n') 103 | tend = time.time() 104 | sys.stderr.write('REFSEQ annotations - time taken: %s\n' %(get_time(tstart,tend))) 105 | os.chdir('..') 106 | #RepeatMasker 107 | sys.stderr.write('Getting RepeatMasker\n') 108 | tstart = time.time() 109 | os.mkdir('rmsk') 110 | os.chdir('rmsk') 111 | gcmd='%s http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/rmsk.txt.gz' %(prg['wget']) 112 | ot=getData(gcmd) 113 | if ot==1: sys.stderr.write('I cannot download RepeatMasker annotations.\n') 114 | else: sys.stderr.write('RepeatMasker annotations ready.\n') 115 | tend = time.time() 116 | sys.stderr.write('RepeatMasker annotations - time taken: %s\n' %(get_time(tstart,tend))) 117 | os.chdir('..') 118 | #dbSNP 119 | sys.stderr.write('Getting dbSNP\n') 120 | tstart = time.time() 121 | os.mkdir('snp151') 122 | os.chdir('snp151') 123 | gcmd='%s http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/snp151.txt.gz' %(prg['wget']) 124 | ot=getData(gcmd) 125 | if ot==1: sys.stderr.write('I cannot download dbSNP annotations.\n') 126 | else: sys.stderr.write('dbSNP annotations ready.\n') 127 | tend = time.time() 128 | sys.stderr.write('dbSNP annotations - time taken: %s\n' %(get_time(tstart,tend))) 129 | os.chdir('..') 130 | #REDIportal 131 | sys.stderr.write('Getting REDIportal\n') 132 | tstart = time.time() 133 | os.mkdir('rediportal') 134 | os.chdir('rediportal') 135 | gcmd='%s http://srv00.recas.ba.infn.it/webshare/rediportalDownload/table1_full.txt.gz' %(prg['wget']) 136 | ot=getData(gcmd) 137 | if ot==1: sys.stderr.write('I cannot download REDIportal annotations.\n') 138 | else: sys.stderr.write('REDIportal annotations ready.\n') 139 | tend = time.time() 140 | sys.stderr.write('REDIportal annotations - time taken: %s\n' %(get_time(tstart,tend))) 141 | os.chdir('..') 142 | #NA12878 - WGS 143 | sys.stderr.write('Getting NA12878 data - WGS\n') 144 | tstart = time.time() 145 | os.mkdir('WGS_ERR262997') 146 | os.chdir('WGS_ERR262997') 147 | fq1cmd='%s ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_1.fastq.gz' %(prg['wget']) 148 | fq2cmd='%s ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_2.fastq.gz' %(prg['wget']) 149 | f1=getData(fq1cmd) 150 | f2=getData(fq2cmd) 151 | if f1==1: sys.stderr.write('I cannot download READ1.\n') 152 | else: 153 | gu=getData('%s ERR262997_1.fastq.gz' %(prg['gunzip'])) 154 | sys.stderr.write('READ1 ready.\n') 155 | if f2==1: sys.stderr.write('I cannot download READ2.\n') 156 | else: 157 | gu=getData('%s ERR262997_2.fastq.gz' %(prg['gunzip'])) 158 | sys.stderr.write('READ2 ready.\n') 159 | tend = time.time() 160 | sys.stderr.write('NA12878 data - WGS - time taken: %s\n' %(get_time(tstart,tend))) 161 | os.chdir('..') 162 | 163 | #NA12878 - RNAseq 164 | sys.stderr.write('Getting NA12878 data - RNAseq\n') 165 | tstart = time.time() 166 | os.mkdir('RNASeq_SRR1258218') 167 | os.chdir('RNASeq_SRR1258218') 168 | fq1cmd='%s ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR125/008/SRR1258218/SRR1258218_1.fastq.gz' %(prg['wget']) 169 | fq2cmd='%s ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR125/008/SRR1258218/SRR1258218_2.fastq.gz' %(prg['wget']) 170 | f1=getData(fq1cmd) 171 | f2=getData(fq2cmd) 172 | if f1==1: sys.stderr.write('I 
cannot download READ1.\n') 173 | else: 174 | #gu=getData('%s SRR1258218_1.fastq.gz' %(prg['gunzip'])) 175 | sys.stderr.write('READ1 ready.\n') 176 | if f2==1: sys.stderr.write('I cannot download READ2.\n') 177 | else: 178 | #gu=getData('%s SRR1258218_2.fastq.gz' %(prg['gunzip'])) 179 | sys.stderr.write('READ2 ready.\n') 180 | tend = time.time() 181 | sys.stderr.write('NA12878 data - RNAseq - time taken: %s\n' %(get_time(tstart,tend))) 182 | os.chdir('..') 183 | 184 | #PRJNA316625 185 | sys.stderr.write('Getting PRJNA316625 data\n') 186 | tstart = time.time() 187 | os.mkdir('PRJNA_316625') 188 | os.chdir('PRJNA_316625') 189 | fqlist=['ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306823/SRR3306823_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306823/SRR3306823_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306824/SRR3306824_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306824/SRR3306824_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306825/SRR3306825_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306825/SRR3306825_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306826/SRR3306826_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306826/SRR3306826_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/007/SRR3306827/SRR3306827_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/007/SRR3306827/SRR3306827_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/008/SRR3306828/SRR3306828_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/008/SRR3306828/SRR3306828_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/009/SRR3306829/SRR3306829_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/009/SRR3306829/SRR3306829_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/000/SRR3306830/SRR3306830_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/000/SRR3306830/SRR3306830_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/001/SRR3306831/SRR3306831_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/001/SRR3306831/SRR3306831_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/002/SRR3306832/SRR3306832_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/002/SRR3306832/SRR3306832_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306833/SRR3306833_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306833/SRR3306833_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306834/SRR3306834_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306834/SRR3306834_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306835/SRR3306835_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306835/SRR3306835_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306836/SRR3306836_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306836/SRR3306836_2.fastq.gz'] 190 | for i in fqlist: 191 | fq1,fq2=i.split(',') 192 | base=(os.path.basename(fq1)).split("_")[0] 193 | os.mkdir(base) 194 | os.chdir(base) 195 | fq1cmd='%s %s' %(prg['wget'],fq1) 196 | fq2cmd='%s %s' %(prg['wget'],fq2) 197 | fq1_=getData(fq1cmd) 198 | fq2_=getData(fq2cmd) 199 | if fq1_+fq2_>0: 200 | sys.stderr.write('I cannot download all files in %s.\n' %(base)) 201 | os.chdir('..') 202 | else: 203 | #gu1=getData('%s %s' %(prg['gunzip'],os.path.basename(fq1))) 204 | #gu2=getData('%s %s' %(prg['gunzip'],os.path.basename(fq2))) 205 | sys.stderr.write('Files in %s ready.\n' %(base)) 206 | os.chdir('..') 207 | tend = time.time() 208 | sys.stderr.write('PRJNA316625 - time taken: %s\n' %(get_time(tstart,tend))) 209 | os.chdir('..') 210 | 211 | sys.stderr.write('Preparing data ...\n') 212 | sys.stderr.write('BWA indexing...\n') 213 
| tstart = time.time() 214 | os.chdir('genome_hg19') 215 | cmd='%s GRCh37.primary_assembly.genome.fa.gz' %(prg['gunzip']) 216 | cmd1='%s index GRCh37.primary_assembly.genome.fa' %(prg['bwa']) 217 | ot=getData(cmd) 218 | ot1=getData(cmd1) 219 | if ot+ot1>0: sys.stderr.write('BWA indexing error.\n') 220 | else: sys.stderr.write('BWA indices ready.\n') 221 | tend = time.time() 222 | sys.stderr.write('BWA indexing - time taken: %s\n' %(get_time(tstart,tend))) 223 | os.chdir('..') 224 | 225 | sys.stderr.write('STAR indexing...\n') 226 | cmd='%s Gencode_annotation/gencode.v30lift37.annotation.gtf.gz' %(prg['gunzip']) 227 | ot=getData(cmd) 228 | if ot==1: sys.stderr.write('Gunzipping gencode error.\n') 229 | else: sys.stderr.write('Gunzipping gencode ready.\n') 230 | tstart = time.time() 231 | if not os.path.exists('STAR'): os.mkdir('STAR') 232 | os.chdir('STAR') 233 | os.mkdir('STAR_genome_index_ucsc') 234 | cmd='%s --runMode genomeGenerate --genomeDir STAR_genome_index_ucsc --genomeFastaFiles ../genome_hg19/GRCh37.primary_assembly.genome.fa --sjdbGTFfile ../Gencode_annotation/gencode.v30lift37.annotation.gtf --sjdbOverhang 75' %(prg['STAR']) 235 | ot=getData(cmd) 236 | if ot==1: sys.stderr.write('STAR indexing error.\n') 237 | else: sys.stderr.write('STAR indices ready.\n') 238 | tend = time.time() 239 | sys.stderr.write('STAR indexing - time taken: %s\n' %(get_time(tstart,tend))) 240 | os.chdir('..') 241 | 242 | sys.stderr.write('Prepare RepeatMasker annotations ...\n') 243 | tstart = time.time() 244 | os.chdir('rmsk') 245 | cmd4='%s rmsk.txt.gz' %(prg['gunzip']) 246 | cmd='%s \'OFS="\t"{print $6,"rmsk_hg19",$12,$7+1,$8,".",$10,".","gene_id \""$11"\"; transcript_id \""$13"\";"}\' rmsk.txt > rmsk.gtf' %(prg['awk']) 247 | cmd1='%s -k1,1 -k4,4n rmsk.gtf > rmsk.sorted.gtf' %(prg['sort']) 248 | cmd2='%s rmsk.sorted.gtf' %(prg['bgzip']) 249 | cmd3='%s -p gff rmsk.sorted.gtf.gz' %(prg['tabix']) 250 | ot4=getData(cmd4) 251 | ot=getData(cmd) 252 | ot1=getData(cmd1) 253 | ot2=getData(cmd2) 254 | ot3=getData(cmd3) 255 | if ot4==1: sys.stderr.write('RepeatMasker gunzip error.\n') 256 | if ot==1: sys.stderr.write('RepeatMasker awk error.\n') 257 | if ot1==1: sys.stderr.write('RepeatMasker sort error.\n') 258 | if ot2==1: sys.stderr.write('RepeatMasker bgzip error.\n') 259 | if ot3==1: sys.stderr.write('RepeatMasker tabix error.\n') 260 | if ot+ot1+ot2+ot3+ot4==0: sys.stderr.write('RepeatMasker ready.\n') 261 | tend = time.time() 262 | sys.stderr.write('Prepare RepeatMasker annotations - time taken: %s\n' %(get_time(tstart,tend))) 263 | os.chdir('..') 264 | 265 | sys.stderr.write('Prepare dbSNP annotations ...\n') 266 | tstart = time.time() 267 | os.chdir('snp151') 268 | cmd4='%s snp151.txt.gz' %(prg['gunzip']) 269 | cmd='%s \'OFS="\t"{if ($11=="genomic" && $12=="single") print $2,"ucsc_snp151_hg19","snp",$4,$4,".",$7,".","gene_id \""$5"\"; transcript_id \""$5"\";"}\' snp151.txt > snp151.gtf' %(prg['awk']) 270 | cmd1='%s -k1,1 -k4,4n snp151.gtf > snp151.sorted.gtf' %(prg['sort']) 271 | cmd2='%s snp151.sorted.gtf' %(prg['bgzip']) 272 | cmd3='%s -p gff snp151.sorted.gtf.gz' %(prg['tabix']) 273 | ot4=getData(cmd4) 274 | ot=getData(cmd) 275 | ot1=getData(cmd1) 276 | ot2=getData(cmd2) 277 | ot3=getData(cmd3) 278 | if ot4==1: sys.stderr.write('dbSNP gunzip error.\n') 279 | if ot==1: sys.stderr.write('dbSNP awk error.\n') 280 | if ot1==1: sys.stderr.write('dbSNP sort error.\n') 281 | if ot2==1: sys.stderr.write('dbSNP bgzip error.\n') 282 | if ot3==1: sys.stderr.write('dbSNP tabix error.\n') 283 | if 
ot+ot1+ot2+ot3+ot4==0: sys.stderr.write('dbSNP ready.\n') 284 | tend = time.time() 285 | sys.stderr.write('Prepare dbSNP annotations - time taken: %s\n' %(get_time(tstart,tend))) 286 | os.chdir('..') 287 | 288 | sys.stderr.write('Prepare splice sites annotations ...\n') 289 | tstart = time.time() 290 | os.chdir('Gencode_annotation') 291 | cmd='%s gencode.v30lift37.annotation.gtf > splicesites' %(prg['gtf_splicesites']) 292 | cmd1='%s -F" " \'{split($2,a,":"); split(a[2],b,"."); if (b[1]>b[3]) print a[1],b[3],b[1],toupper(substr($3,1,1)),"-"; else print a[1],b[1],b[3],toupper(substr($3,1,1)),"+"}\' splicesites > gencode.v30lift37.splicesites.txt' %(prg['awk']) 293 | ot=getData(cmd) 294 | ot1=getData(cmd1) 295 | if ot==1: sys.stderr.write('Splice sites gtf_splicesites error.\n') 296 | if ot1==1: sys.stderr.write('Splice sites sort error.\n') 297 | if ot+ot1==0: sys.stderr.write('Splice sites ready.\n') 298 | tend = time.time() 299 | sys.stderr.write('Prepare splice sites annotations - time taken: %s\n' %(get_time(tstart,tend))) 300 | os.chdir('..') 301 | 302 | sys.stderr.write('Prepare REDIportal annotations ...\n') 303 | tstart = time.time() 304 | os.chdir('rediportal') 305 | cmd7='%s table1_full.txt.gz' %(prg['gunzip']) 306 | cmd='%s \'OFS="\t"{sum+=1; print $1,"rediportal","ed",$2,$2,".",$5,".","gene_id \""sum"\"; transcript_id \""sum"\";"}\' table1_full.txt > atlas.gtf' %(prg['awk']) 307 | cmd1='%s atlas.gtf' %(prg['bgzip']) 308 | cmd2='%s -p gff atlas.gtf.gz' %(prg['tabix']) 309 | cmd3='%s %s table1_full.txt > atlas_recoding.gff' %(prg['python'],redirec) 310 | cmd4='%s -V -k1,1 -k4,4n atlas_recoding.gff > srtd_atlas_recoding.gff' %(prg['sort']) 311 | cmd5='%s srtd_atlas_recoding.gff' %(prg['bgzip']) 312 | cmd6='%s -p gff srtd_atlas_recoding.gff.gz' %(prg['tabix']) 313 | ot7=getData(cmd7) 314 | ot=getData(cmd) 315 | ot1=getData(cmd1) 316 | ot2=getData(cmd2) 317 | ot3=getData(cmd3) 318 | ot4=getData(cmd4) 319 | ot5=getData(cmd5) 320 | ot6=getData(cmd6) 321 | if ot7==1: sys.stderr.write('REDIportal gunzip error.\n') 322 | if ot==1: sys.stderr.write('REDIportal awk error.\n') 323 | if ot1==1: sys.stderr.write('REDIportal bgzip error.\n') 324 | if ot2==1: sys.stderr.write('REDIportal tabix error.\n') 325 | if ot3==1: sys.stderr.write('REDIportal python error.\n') 326 | if ot4==1: sys.stderr.write('REDIportal sort error.\n') 327 | if ot5==1: sys.stderr.write('REDIportal bgzip error.\n') 328 | if ot6==1: sys.stderr.write('REDIportal tabix error.\n') 329 | if ot+ot1+ot2+ot3+ot4+ot5+ot6+ot7==0: sys.stderr.write('REDIportal ready.\n') 330 | tend = time.time() 331 | sys.stderr.write('Prepare REDIportal annotations - time taken: %s\n' %(get_time(tstart,tend))) 332 | os.chdir('..') 333 | 334 | sys.stderr.write('ALL DONE. 
ENJOY REDItools.\n') 335 | -------------------------------------------------------------------------------- /NPscripts/download-prepare-data-NP_docker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os, time, shutil 3 | import commands 4 | import distutils.spawn 5 | 6 | 7 | wdir='rna_editing_protocol' 8 | redipath='./REDItools/' 9 | 10 | def remove_folder(path): 11 | # check if folder exists 12 | if os.path.exists(path): 13 | # remove if exists 14 | shutil.rmtree(path) 15 | 16 | def getData(cmd): 17 | tr=0 18 | while 1: 19 | st,out=commands.getstatusoutput(cmd) 20 | if st==0: 21 | return 0 22 | tr+=1 23 | if tr==10: break 24 | if tr>0: return 1 25 | 26 | def is_tool(name): 27 | wn=distutils.spawn.find_executable(name) 28 | if wn==None: return 1 29 | else: return wn 30 | 31 | def get_time(tstart,tend): 32 | telapsed=tend - tstart 33 | t_taken=time.strftime("%H:%M:%S", time.gmtime(telapsed)) 34 | return t_taken 35 | 36 | 37 | exe=['bwa','STAR','awk','bgzip','tabix','sort','gtf_splicesites','wget','python','gunzip'] 38 | nt=[] 39 | prg={} 40 | for i in exe: 41 | p=is_tool(i) 42 | if p==1: nt.append(i) 43 | prg[i]=p 44 | if len(nt)>0: 45 | for i in nt: 46 | sys.stderr.write('Program %s NOT found\n' %(i)) 47 | sys.exit('Install required software first.') 48 | 49 | redirec=os.path.join(redipath,'accessory','rediportal2recoding.py') 50 | if not os.path.exists(redirec): sys.exit('rediportal2recoding.py script not found.') 51 | prg['redirec']='../../' + redirec.lstrip('./') 52 | 53 | ipkgs = raw_input("Download nature_protocol input data? yes/no ") 54 | 55 | cdir=os.getcwd() 56 | sys.stderr.write('Current directory: %s\n' %(cdir)) 57 | folder=os.path.join(cdir,wdir) 58 | 59 | if ipkgs.strip().upper() == 'YES': 60 | remove_folder(folder) # from previous installations 61 | os.mkdir(folder) 62 | sys.stderr.write('Directory %s created.\n' %(wdir)) 63 | sys.stderr.write('Entering %s\n' %(wdir)) 64 | os.chdir(folder) 65 | #human genome 66 | sys.stderr.write('Getting human genome\n') 67 | tstart = time.time() 68 | os.mkdir('genome_hg19') 69 | os.chdir('genome_hg19') 70 | wcmd='%s -c --retry-connrefused --tries=0 --timeout=5 ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz' %(prg['wget']) 71 | ot=getData(wcmd) 72 | if ot==1: sys.stderr.write('I cannot download the human genome.\n') 73 | else: sys.stderr.write('Human genome complete.\n') 74 | tend = time.time() 75 | sys.stderr.write('Human genome - time taken: %s\n' %(get_time(tstart,tend))) 76 | os.chdir('..') 77 | #Gencode 78 | sys.stderr.write('Getting GENCODE genes\n') 79 | tstart = time.time() 80 | os.mkdir('Gencode_annotation') 81 | os.chdir('Gencode_annotation') 82 | gcmd='%s -c --retry-connrefused --tries=0 --timeout=5 ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/GRCh37_mapping/gencode.v30lift37.annotation.gtf.gz' %(prg['wget']) 83 | ot=getData(gcmd) 84 | if ot==1: sys.stderr.write('I cannot download GENCODE annotations.\n') 85 | else: sys.stderr.write('GENCODE annotations ready.\n') 86 | tend = time.time() 87 | sys.stderr.write('GENCODE annotations - time taken: %s\n' %(get_time(tstart,tend))) 88 | os.chdir('..') 89 | #RefSeq 90 | sys.stderr.write('Getting RefSeq hg19\n') 91 | tstart = time.time() 92 | os.mkdir('Strand_detection') 93 | os.chdir('Strand_detection') 94 | gcmd='%s -c --retry-connrefused --tries=0 --timeout=5 --no-check-certificate
https://sourceforge.net/projects/rseqc/files/BED/Human_Homo_sapiens/hg19_RefSeq.bed.gz' %(prg['wget']) 95 | ot=getData(gcmd) 96 | if ot==1: sys.stderr.write('I cannot download REFSEQ annotations.\n') 97 | else: sys.stderr.write('REFSEQ annotations ready.\n') 98 | tend = time.time() 99 | sys.stderr.write('REFSEQ annotations - time taken: %s\n' %(get_time(tstart,tend))) 100 | os.chdir('..') 101 | #RepeatMasker 102 | sys.stderr.write('Getting RepeatMasker\n') 103 | tstart = time.time() 104 | os.mkdir('rmsk') 105 | os.chdir('rmsk') 106 | gcmd='%s -c --retry-connrefused --tries=0 --timeout=5 http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/rmsk.txt.gz' %(prg['wget']) 107 | ot=getData(gcmd) 108 | if ot==1: sys.stderr.write('I cannot download RepeatMasker annotations.\n') 109 | else: sys.stderr.write('RepeatMasker annotations ready.\n') 110 | tend = time.time() 111 | sys.stderr.write('RepeatMasker annotations - time taken: %s\n' %(get_time(tstart,tend))) 112 | os.chdir('..') 113 | #dbSNP 114 | sys.stderr.write('Getting dbSNP\n') 115 | tstart = time.time() 116 | os.mkdir('snp151') 117 | os.chdir('snp151') 118 | gcmd='%s -c --retry-connrefused --tries=0 --timeout=5 http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/snp151.txt.gz' %(prg['wget']) 119 | ot=getData(gcmd) 120 | if ot==1: sys.stderr.write('I cannot download dbSNP annotations.\n') 121 | else: sys.stderr.write('dbSNP annotations ready.\n') 122 | tend = time.time() 123 | sys.stderr.write('dbSNP annotations - time taken: %s\n' %(get_time(tstart,tend))) 124 | os.chdir('..') 125 | #REDIportal 126 | sys.stderr.write('Getting REDIportal\n') 127 | tstart = time.time() 128 | os.mkdir('rediportal') 129 | os.chdir('rediportal') 130 | gcmd='%s -c --retry-connrefused --tries=0 --timeout=5 http://srv00.recas.ba.infn.it/webshare/rediportalDownload/table1_full.txt.gz' %(prg['wget']) 131 | ot=getData(gcmd) 132 | if ot==1: sys.stderr.write('I cannot download REDIportal annotations.\n') 133 | else: sys.stderr.write('REDIportal annotations ready.\n') 134 | tend = time.time() 135 | sys.stderr.write('REDIportal annotations - time taken: %s\n' %(get_time(tstart,tend))) 136 | os.chdir('..') 137 | #NA12878 - WGS 138 | sys.stderr.write('Getting NA12878 data - WGS\n') 139 | tstart = time.time() 140 | os.mkdir('WGS_ERR262997') 141 | os.chdir('WGS_ERR262997') 142 | fq1cmd='%s -c --retry-connrefused --tries=0 --timeout=5 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_1.fastq.gz' %(prg['wget']) 143 | fq2cmd='%s -c --retry-connrefused --tries=0 --timeout=5 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_2.fastq.gz' %(prg['wget']) 144 | f1=getData(fq1cmd) 145 | f2=getData(fq2cmd) 146 | if f1==1: sys.stderr.write('I cannot download READ1.\n') 147 | else: 148 | gu=getData('%s ERR262997_1.fastq.gz' %(prg['gunzip'])) 149 | sys.stderr.write('READ1 ready.\n') 150 | if f2==1: sys.stderr.write('I cannot download READ2.\n') 151 | else: 152 | gu=getData('%s ERR262997_2.fastq.gz' %(prg['gunzip'])) 153 | sys.stderr.write('READ2 ready.\n') 154 | tend = time.time() 155 | sys.stderr.write('NA12878 data - WGS - time taken: %s\n' %(get_time(tstart,tend))) 156 | os.chdir('..') 157 | 158 | #NA12878 - RNAseq 159 | sys.stderr.write('Getting NA12878 data - RNAseq\n') 160 | tstart = time.time() 161 | os.mkdir('RNASeq_SRR1258218') 162 | os.chdir('RNASeq_SRR1258218') 163 | fq1cmd='%s -c --retry-connrefused --tries=0 --timeout=5 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR125/008/SRR1258218/SRR1258218_1.fastq.gz' %(prg['wget']) 164 | fq2cmd='%s -c 
--retry-connrefused --tries=0 --timeout=5 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR125/008/SRR1258218/SRR1258218_2.fastq.gz' %(prg['wget']) 165 | f1=getData(fq1cmd) 166 | f2=getData(fq2cmd) 167 | if f1==1: sys.stderr.write('I cannot download READ1.\n') 168 | else: 169 | #gu=getData('%s SRR1258218_1.fastq.gz' %(prg['gunzip'])) 170 | sys.stderr.write('READ1 ready.\n') 171 | if f2==1: sys.stderr.write('I cannot download READ2.\n') 172 | else: 173 | #gu=getData('%s SRR1258218_2.fastq.gz' %(prg['gunzip'])) 174 | sys.stderr.write('READ2 ready.\n') 175 | tend = time.time() 176 | sys.stderr.write('NA12878 data - RNAseq - time taken: %s\n' %(get_time(tstart,tend))) 177 | os.chdir('..') 178 | 179 | #PRJNA316625 180 | sys.stderr.write('Getting PRJNA316625 data\n') 181 | tstart = time.time() 182 | os.mkdir('PRJNA_316625') 183 | os.chdir('PRJNA_316625') 184 | fqlist=['ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306823/SRR3306823_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306823/SRR3306823_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306824/SRR3306824_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306824/SRR3306824_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306825/SRR3306825_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306825/SRR3306825_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306826/SRR3306826_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306826/SRR3306826_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/007/SRR3306827/SRR3306827_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/007/SRR3306827/SRR3306827_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/008/SRR3306828/SRR3306828_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/008/SRR3306828/SRR3306828_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/009/SRR3306829/SRR3306829_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/009/SRR3306829/SRR3306829_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/000/SRR3306830/SRR3306830_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/000/SRR3306830/SRR3306830_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/001/SRR3306831/SRR3306831_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/001/SRR3306831/SRR3306831_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/002/SRR3306832/SRR3306832_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/002/SRR3306832/SRR3306832_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306833/SRR3306833_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/003/SRR3306833/SRR3306833_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306834/SRR3306834_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/004/SRR3306834/SRR3306834_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306835/SRR3306835_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/005/SRR3306835/SRR3306835_2.fastq.gz','ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306836/SRR3306836_1.fastq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR330/006/SRR3306836/SRR3306836_2.fastq.gz'] 185 | for i in fqlist: 186 | fq1,fq2=i.split(',') 187 | base=(os.path.basename(fq1)).split("_")[0] 188 | os.mkdir(base) 189 | os.chdir(base) 190 | fq1cmd='%s -c --retry-connrefused --tries=0 --timeout=5 %s' %(prg['wget'],fq1) 191 | fq2cmd='%s -c --retry-connrefused --tries=0 --timeout=5 %s' %(prg['wget'],fq2) 192 | fq1_=getData(fq1cmd) 193 | fq2_=getData(fq2cmd) 194 | if fq1_+fq2_>0: 195 | sys.stderr.write('I cannot download all files in %s.\n' %(base)) 196 | os.chdir('..') 197 | else: 198 | #gu1=getData('%s %s' %(prg['gunzip'],os.path.basename(fq1))) 199 | #gu2=getData('%s %s' 
%(prg['gunzip'],os.path.basename(fq2))) 200 | sys.stderr.write('Files in %s ready.\n' %(base)) 201 | os.chdir('..') 202 | tend = time.time() 203 | sys.stderr.write('PRJNA316625 - time taken: %s\n' %(get_time(tstart,tend))) 204 | os.chdir('..') 205 | 206 | sys.stderr.write('Preparing data ...\n') 207 | sys.stderr.write('BWA indexing...\n') 208 | tstart = time.time() 209 | os.chdir('genome_hg19') 210 | cmd='%s GRCh37.primary_assembly.genome.fa.gz' %(prg['gunzip']) 211 | cmd1='%s index GRCh37.primary_assembly.genome.fa' %(prg['bwa']) 212 | ot=getData(cmd) 213 | ot1=getData(cmd1) 214 | if ot+ot1>0: sys.stderr.write('BWA indexing error.\n') 215 | else: sys.stderr.write('BWA indices ready.\n') 216 | tend = time.time() 217 | sys.stderr.write('BWA indexing - time taken: %s\n' %(get_time(tstart,tend))) 218 | os.chdir('..') 219 | 220 | sys.stderr.write('STAR indexing...\n') 221 | cmd='%s Gencode_annotation/gencode.v30lift37.annotation.gtf.gz' %(prg['gunzip']) 222 | ot=getData(cmd) 223 | if ot==1: sys.stderr.write('Gunzipping gencode error.\n') 224 | else: sys.stderr.write('Gunzipping gencode ready.\n') 225 | tstart = time.time() 226 | if not os.path.exists('STAR'): os.mkdir('STAR') 227 | os.chdir('STAR') 228 | os.mkdir('STAR_genome_index_ucsc') 229 | cmd='%s --runMode genomeGenerate --genomeDir STAR_genome_index_ucsc --genomeFastaFiles ../genome_hg19/GRCh37.primary_assembly.genome.fa --sjdbGTFfile ../Gencode_annotation/gencode.v30lift37.annotation.gtf --sjdbOverhang 75' %(prg['STAR']) 230 | ot=getData(cmd) 231 | if ot==1: sys.stderr.write('STAR indexing error.\n') 232 | else: sys.stderr.write('STAR indices ready.\n') 233 | tend = time.time() 234 | sys.stderr.write('STAR indexing - time taken: %s\n' %(get_time(tstart,tend))) 235 | os.chdir('..') 236 | 237 | sys.stderr.write('Prepare RepeatMasker annotations ...\n') 238 | tstart = time.time() 239 | os.chdir('rmsk') 240 | cmd4='%s rmsk.txt.gz' %(prg['gunzip']) 241 | cmd='%s \'OFS="\t"{print $6,"rmsk_hg19",$12,$7+1,$8,".",$10,".","gene_id \""$11"\"; transcript_id \""$13"\";"}\' rmsk.txt > rmsk.gtf' %(prg['awk']) 242 | cmd1='%s -k1,1 -k4,4n rmsk.gtf > rmsk.sorted.gtf' %(prg['sort']) 243 | cmd2='%s rmsk.sorted.gtf' %(prg['bgzip']) 244 | cmd3='%s -p gff rmsk.sorted.gtf.gz' %(prg['tabix']) 245 | ot4=getData(cmd4) 246 | ot=getData(cmd) 247 | ot1=getData(cmd1) 248 | ot2=getData(cmd2) 249 | ot3=getData(cmd3) 250 | if ot4==1: sys.stderr.write('RepeatMasker gunzip error.\n') 251 | if ot==1: sys.stderr.write('RepeatMasker awk error.\n') 252 | if ot1==1: sys.stderr.write('RepeatMasker sort error.\n') 253 | if ot2==1: sys.stderr.write('RepeatMasker bgzip error.\n') 254 | if ot3==1: sys.stderr.write('RepeatMasker tabix error.\n') 255 | if ot+ot1+ot2+ot3+ot4==0: sys.stderr.write('RepeatMasker ready.\n') 256 | tend = time.time() 257 | sys.stderr.write('Prepare RepeatMasker annotations - time taken: %s\n' %(get_time(tstart,tend))) 258 | os.chdir('..') 259 | 260 | sys.stderr.write('Prepare dbSNP annotations ...\n') 261 | tstart = time.time() 262 | os.chdir('snp151') 263 | cmd4='%s snp151.txt.gz' %(prg['gunzip']) 264 | cmd='%s \'OFS="\t"{if ($11=="genomic" && $12=="single") print $2,"ucsc_snp151_hg19","snp",$4,$4,".",$7,".","gene_id \""$5"\"; transcript_id \""$5"\";"}\' snp151.txt > snp151.gtf' %(prg['awk']) 265 | cmd1='%s -k1,1 -k4,4n snp151.gtf > snp151.sorted.gtf' %(prg['sort']) 266 | cmd2='%s snp151.sorted.gtf' %(prg['bgzip']) 267 | cmd3='%s -p gff snp151.sorted.gtf.gz' %(prg['tabix']) 268 | ot4=getData(cmd4) 269 | ot=getData(cmd) 270 | ot1=getData(cmd1) 271 |
ot2=getData(cmd2) 272 | ot3=getData(cmd3) 273 | if ot4==1: sys.stderr.write('dbSNP gunzip error.\n') 274 | if ot==1: sys.stderr.write('dbSNP awk error.\n') 275 | if ot1==1: sys.stderr.write('dbSNP sort error.\n') 276 | if ot2==1: sys.stderr.write('dbSNP bgzip error.\n') 277 | if ot3==1: sys.stderr.write('dbSNP tabix error.\n') 278 | if ot+ot1+ot2+ot3+ot4==0: sys.stderr.write('dbSNP ready.\n') 279 | tend = time.time() 280 | sys.stderr.write('Prepare dbSNP annotations - time taken: %s\n' %(get_time(tstart,tend))) 281 | os.chdir('..') 282 | 283 | sys.stderr.write('Prepare splice sites annotations ...\n') 284 | tstart = time.time() 285 | os.chdir('Gencode_annotation') 286 | cmd='%s gencode.v30lift37.annotation.gtf > splicesites' %(prg['gtf_splicesites']) 287 | cmd1='%s -F" " \'{split($2,a,":"); split(a[2],b,"."); if (b[1]>b[3]) print a[1],b[3],b[1],toupper(substr($3,1,1)),"-"; else print a[1],b[1],b[3],toupper(substr($3,1,1)),"+"}\' splicesites > gencode.v30lift37.splicesites.txt' %(prg['awk']) 288 | ot=getData(cmd) 289 | ot1=getData(cmd1) 290 | if ot==1: sys.stderr.write('Splice sites gtf_splicesites error.\n') 291 | if ot1==1: sys.stderr.write('Splice sites sort error.\n') 292 | if ot+ot1==0: sys.stderr.write('Splice sites ready.\n') 293 | tend = time.time() 294 | sys.stderr.write('Prepare splice sites annotations - time taken: %s\n' %(get_time(tstart,tend))) 295 | os.chdir('..') 296 | 297 | sys.stderr.write('Prepare REDIportal annotations ...\n') 298 | tstart = time.time() 299 | os.chdir('rediportal') 300 | cmd7='%s table1_full.txt.gz' %(prg['gunzip']) 301 | ot7=getData(cmd7) 302 | if ot7==0: 303 | cmd='%s \'OFS="\t"{sum+=1; print $1,"rediportal","ed",$2,$2,".",$5,".","gene_id \""sum"\"; transcript_id \""sum"\";"}\' table1_full.txt > atlas.gtf' %(prg['awk']) 304 | cmd1='%s atlas.gtf' %(prg['bgzip']) 305 | cmd2='%s -p gff atlas.gtf.gz' %(prg['tabix']) 306 | cmd3='%s %s table1_full.txt > atlas_recoding.gff' %(prg['python'],prg['redirec']) #redirec 307 | cmd4='%s -V -k1,1 -k4,4n atlas_recoding.gff > srtd_atlas_recoding.gff' %(prg['sort']) 308 | cmd5='%s srtd_atlas_recoding.gff' %(prg['bgzip']) 309 | cmd6='%s -p gff srtd_atlas_recoding.gff.gz' %(prg['tabix']) 310 | ot=getData(cmd) 311 | ot1=getData(cmd1) 312 | ot2=getData(cmd2) 313 | ot3=getData(cmd3) 314 | ot4=getData(cmd4) 315 | ot5=getData(cmd5) 316 | ot6=getData(cmd6) 317 | if ot7==1: sys.stderr.write('REDIportal gunzip error.\n') 318 | if ot==1: sys.stderr.write('REDIportal awk error.\n') 319 | if ot1==1: sys.stderr.write('REDIportal bgzip error.\n') 320 | if ot2==1: sys.stderr.write('REDIportal tabix error.\n') 321 | if ot3==1: sys.stderr.write('REDIportal python error.\n') 322 | if ot4==1: sys.stderr.write('REDIportal sort error.\n') 323 | if ot5==1: sys.stderr.write('REDIportal bgzip error.\n') 324 | if ot6==1: sys.stderr.write('REDIportal tabix error.\n') 325 | if ot+ot1+ot2+ot3+ot4+ot5+ot6+ot7==0: sys.stderr.write('REDIportal ready.\n') 326 | tend = time.time() 327 | sys.stderr.write('Prepare REDIportal annotations - time taken: %s\n' %(get_time(tstart,tend))) 328 | os.chdir('..') 329 | 330 | sys.stderr.write('ALL DONE. ENJOY REDItools.\n') 331 | else: 332 | sys.stderr.write('Please provide your input data according to nature_protocol or relaunch this script.
\n') 333 | -------------------------------------------------------------------------------- /NPscripts/get_Statistics.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | 3 | def getDistro(lines): 4 | s={} 5 | for i in 'ACGT': 6 | for j in 'ACGT': 7 | if i!=j: s[i+j]=0 8 | n={} 9 | x=0 10 | for i in 'ACGT': 11 | n[i]=x 12 | x+=1 13 | all=0 14 | for i in lines: 15 | sub=i[7].split()[0] 16 | nuc=eval(i[6]) 17 | nv= nuc[n[sub[1]]] 18 | s[sub]+=nv 19 | all+=nv 20 | d={} 21 | for i in s: 22 | try: v=(s[i]/float(all))*100 23 | except: v=0.0 24 | d[i]=(s[i],all,v) 25 | return d 26 | 27 | if not os.path.exists('editing.txt'): sys.exit('editing.txt file not found.') 28 | 29 | alu,nonalu,nonrep,kn=[],[],[],0 30 | f=open('editing.txt') 31 | for i in f: 32 | if i.startswith('Reg'): continue 33 | l=(i.strip()).split('\t') 34 | if l[18]=='ed': kn+=1 35 | if l[14]=='SINE' and l[15][:3]=='Alu': alu.append(l) 36 | elif l[14]!='-' and l[15][:3]!='Alu': nonalu.append(l) 37 | elif l[14]=='-' and l[15]=='-': nonrep.append(l) 38 | f.close() 39 | 40 | alust=getDistro(alu) 41 | nonalust=getDistro(nonalu) 42 | nonrepst=getDistro(nonrep) 43 | all=getDistro(alu+nonalu+nonrep) 44 | 45 | f=open('editingStats.txt','w') 46 | h=['SubType','ALU','REPnonALU','NONREP','ALL'] 47 | f.write('\t'.join(h)+'\n') 48 | for i in alust: 49 | r=[i,alust[i][2],nonalust[i][2],nonrepst[i][2],all[i][2]] 50 | r=[str(x) for x in r] 51 | f.write('\t'.join(r)+'\n') 52 | f.close() 53 | -------------------------------------------------------------------------------- /PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: REDItools 3 | Version: 1.3 4 | Summary: Python Scripts for RNA editing detection by RNA-Seq data 5 | Home-page: https://github.com/BioinfoUNIBA/REDItools 6 | Author: Ernesto Picardi 7 | Author-email: ernesto.picardi@gmail.com 8 | License: LICENSE.txt 9 | Description: REDItools: python scripts for RNA editing detection by RNA-Seq data 10 | =================================================================== 11 | 12 | Introduction 13 | ============ 14 | REDItools are python scripts developed to study RNA editing at genomic scale 15 | using next-generation sequencing data. RNA editing is a post-transcriptional phenomenon 16 | involving the insertion/deletion or substitution of specific bases at precise RNA locations. 17 | In humans, RNA editing occurs by deamination of cytosine to uridine (C-to-U) or mostly by the 18 | adenosine to inosine (A-to-I) conversion through ADAR enzymes. A-to-I substitutions may have 19 | profound functional consequences and have been linked to a variety of human diseases including 20 | neurological and neurodegenerative disorders or cancer. Next generation sequencing technologies 21 | offer the unique opportunity to investigate RNA editing in depth even though no dedicated 22 | software has been released up to now. 23 | 24 | REDItools are simple python scripts conceived to facilitate the investigation of RNA editing 25 | at large scale and devoted to research groups that would like to explore such phenomena in their own 26 | data but don’t have sufficient bioinformatics skills. 27 | They work on main operating systems (although unix/linux-based OS are preferred), can handle reads from any 28 | platform in the standard BAM format and implement a variety of filters.
29 | 30 | 31 | Platform: Linux 32 | Platform: Unix 33 | Platform: MacOS 34 | Classifier: Intended Audience :: Computational biologists 35 | Classifier: License :: OSI Approved :: MIT 36 | Classifier: Operating System :: MacOS :: MacOS X 37 | Classifier: Operating System :: POSIX 38 | Classifier: Programming Language :: Python 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | REDItools: python scripts for RNA editing detection by RNA-Seq data 2 | =================================================================== 3 | 4 | Introduction 5 | ============ 6 |

RNA editing is a post-transcriptional phenomenon 7 | involving the insertion/deletion or substitution of specific bases at precise RNA locations. 8 | In humans, RNA editing occurs by deamination of cytosine to uridine (C-to-U) or mostly by the 9 | adenosine to inosine (A-to-I) conversion through ADAR enzymes. A-to-I substitutions may have 10 | profound functional consequences and have been linked to a variety of human diseases including 11 | neurological and neurodegenerative disorders or cancer. Next-generation sequencing technologies 12 | offer the unique opportunity to investigate RNA editing in depth, even though no dedicated 13 | software has been released up to now. 14 | 15 | REDItools are simple Python scripts conceived to facilitate the investigation of RNA editing 16 | at large scale and devoted to research groups that would like to explore such phenomena in their own 17 | data but don’t have sufficient bioinformatics skills. 18 | They work on main operating systems (although unix/linux-based OS are preferred), can handle reads from any 19 | platform in the standard BAM format, and implement various filters.
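As a minimal usage sketch (the option names follow the REDItools V1 manual: -i RNA-Seq BAM, -j DNA-Seq BAM, -f reference FASTA, -o output folder; the file names here are placeholders, and the manual lists the full set of options):

python main/REDItoolDnaRna.py -i rnaseq.bam -j dnaseq.bam -f hg19.fa -o editing_out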

REDItools V1 manual
REDItools V2 manual
REDItools V3 manual

Note. REDItools V2 is useful for HPC environments.
REDItools V3 is the latest optimized version for large-scale investigations.

Important. Reditool_DNA_RNA.py v1.3 is available at this link

29 | -------------------------------------------------------------------------------- /README_2.md: -------------------------------------------------------------------------------- 1 | # REDItools2 2 | 3 | **REDItools2** is the optimized, parallel multi-node version of [REDItools](https://github.com/BioinfoUNIBA/REDItools). 4 | 5 | REDItools takes as input an RNA-Seq (or DNA-Seq) BAM file and outputs a table of RNA editing events. Here is an example of the REDItools output: 6 |

7 | 8 |

9 | 10 | The following image explains the high-level architecture. 11 | 12 |

13 | 14 |

15 | 16 | This version of REDItools shows an average 8x speed improvement over the previous version, even when using only the serial mode: 17 | 18 |

19 | 20 |

21 | 22 | # Index 23 | 24 | - [1. Python setup](#1-python-setup) 25 | - [2. Environment setup](#2-environment-setup) 26 | - [3. Cloning / downloading](#3-cloning--downloading) 27 | - [4. Installing](#4-installing) 28 | - [5. The two versions of REDItools 2.0](#5-the-two-versions-of-reditools-20) 29 | - [5.1 Serial version](#51-serial-version-reditoolspy) 30 | - [5.2 Parallel version](#52-parallel-version--parallel_reditoolspy) 31 | - [6. Running REDItools 2.0 on your own data](#6-running-reditools-20-on-your-own-data) 32 | - [7. REDItools 2.0 options](#7-reditools-20-options) 33 | - [8. DNA-Seq annotation with REDItools 2.0](#8-dna-seq-annotation-with-reditools-20) 34 | - [9. Running REDItools 2.0 in multisample mode](#9-running-reditools-20-in-multisample-mode) 35 | - [10. Displaying benchmarks in HTML with REDItools 2.0 (parallel version only)](#10-displaying-benchmarks-with-reditools-20-parallel-version-only) 36 | 37 | 38 | ## Installation 39 | 40 | ### 1. Python setup 41 | --- 42 | This guide assumes you have Python 2.x (up to 2.7) installed on your system. If you do not have Python, please read the [official Python webpage](https://www.python.org/). 43 | 44 | Make sure to have the following packages installed: 45 | 46 | > sudo apt-get install python-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip zlib-devel zlib zlib1g zlib1g-devel libbz2-dev zlib1g-dev libncurses5-dev libncursesw5-dev liblzma-dev 47 | 48 | Make sure you have your preferred Python version loaded. If you have a single Python version already installed in your system, you do not need to do anything. If you have multiple versions, please be sure to point to a given version; in order to do so, check your environment variables (e.g., PATH). 49 | 50 | If you are running on a cluster (where usually several versions are available), make sure to load a given Python version. For example (if running on the CINECA Marconi supercomputer), the following command would load Python 2.7.12: 51 | > module load autoload python/2.7.12 52 | 53 | Note: REDItools2.0 has been tested with Python 2.7.12. The software comes with no guarantee of being compatible with other versions of Python (e.g., Python >= 3). 54 | 55 | ### 2. Environment setup 56 | --- 57 | Make sure the following libraries are installed: 58 | 59 | - htslib (see http://www.htslib.org/download/ and https://www.biostars.org/p/328831/ for instructions) 60 | - samtools: 61 | 62 | > sudo apt-get install samtools 63 | 64 | - tabix: 65 | 66 | > sudo apt-get install tabix 67 | 68 | - an MPI implementation. We suggest OpenMPI, but you can choose whichever you like most. For installing OpenMPI, try the following command: 69 | > sudo apt-get install openmpi-common libopenmpi-dev 70 | 71 | ### 3. Cloning / Downloading 72 | --- 73 | 74 | The first step is to clone this repository (this assumes you have *git* installed in your system; see the [Git official page](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) otherwise): 75 | > git clone https://github.com/tflati/reditools2.0.git 76 | 77 | (alternatively, you can download a ZIP package of REDItools2.0 from [here](https://github.com/tflati/reditools2.0/archive/master.zip) and uncompress the archive). 78 | 79 | Move into the project main directory: 80 | > cd reditools2.0 81 | 82 | 83 | ### 4. Installing 84 | --- 85 | 86 | REDItools 2.0 requires a few Python modules to be installed in the environment (e.g., pysam, sortedcontainers, mpi4py, etc.).
These can be installed in three ways: 87 | 88 | - **System-level**: in this way the dependencies will be installed system-wide and all users in your system will see the changes. In order to perform this type of installation you need administrator rights. 89 | To install REDItools2.0 in this modality, just run the following command: 90 | > sudo pip install -r requirements.txt 91 | 92 | - **User-level**: in this way the dependencies will be installed only for your current user, usually in your home directory. In order to perform this type of installation you only need to be logged in as a normal user. Note that this type of installation will install additional software in your local Python directory (usually $HOME/.local/lib/python2.7/site-packages/, but it depends on your operating system and distribution). 93 | This modality is convenient if you do not mind altering your user environment. Note that altering your user environment might lead to software corruption. For example, assume you already have the *pysam* package installed (version 0.6); since REDItools 2.0 requires *pysam* >= 0.9, the installation would uninstall the existing version of pysam and install version 0.9, thus altering the state of your environment. Any existing software which relied on pysam 0.6 might break and stop working. In conclusion, choose this modality at your own risk. 94 | To install REDItools2.0 in this modality, just run the following command: 95 | > pip install -r requirements.txt --user 96 | 97 | - **Environment-level**: in this type of installation you create an isolated virtual environment (initially empty) which will contain any new required software, without creating conflicts with any existing environment or requiring any particular rights. 98 | This modality will work regardless of the packages already installed in your system (both at user and system level) and thus gives the maximum possible freedom to the end user. 99 | This is the recommended modality. 100 | The downside of choosing this modality is a potential duplication of code with respect to other existing environments. For example, assume you already have a given version of *sortedcontainers*; installing REDItools2.0 at environment level will download and install a *new* copy of *sortedcontainers* into a new isolated environment (ending up with two copies of the same software present in the system, one inside and one outside the virtual environment). 101 | To install REDItools2.0 in this modality, run the following commands: 102 | 103 | > virtualenv ENV 104 | > 105 | > source ENV/bin/activate 106 | > 107 | > pip install -r requirements.txt 108 | > 109 | > deactivate 110 | 111 | These commands will create a new environment called *ENV* (you can choose any name you like) and will install all dependencies listed in the file *requirements.txt* into it. The commands *activate* and *deactivate* respectively activate (i.e., start/open) and deactivate (i.e., end/close) the virtual environment. 112 | When running the real commands, remember to wrap them between the activate and deactivate commands: 113 | 114 | >source ENV/bin/activate 115 | > 116 | >command... 117 | > 118 | >command... 119 | > 120 | >command... 121 | > 122 | >command... 123 | > 124 | >deactivate 125 | 126 | ## Testing 127 | 128 | ### 5.
The two versions of REDItools 2.0 129 | --- 130 | 131 | This repo includes test data and a test script for checking that the dependencies have been installed properly and the basic REDItools command works. 132 | 133 | In order to have all the data you need, run the following commands: 134 | 135 | > cd test 136 | > 137 | > ./prepare_test.sh 138 | 139 | This will download and index chromosome 21 of the hg19 version of the human genome (from http://hgdownload.cse.ucsc.edu/downloads.html). 140 | Once the script has finished running, you have all you need to perform the tests. 141 | 142 | The software comes with two modalities. Feel free to choose the one which best fits your needs. 143 | 144 | #### 5.1 Serial version (reditools.py) 145 | 146 | In this modality you benefit only from the optimizations introduced after the first version. While significantly faster (by about an 8x factor), it does not exploit the computational power of multiple cores. On the other hand, the setup and launch of REDItools are much easier. 147 | This is probably the modality you will want to try first when using REDItools2.0 for the first time. 148 | 149 | The serial version of REDItools2.0 can be tested by issuing the following command: 150 | 151 | > serial_test.sh 152 | 153 | or, if you are on a SLURM-based cluster: 154 | 155 | > sbatch serial_test_slurm.sh 156 | 157 | #### 5.2 Parallel version (parallel_reditools.py) 158 | 159 | In this modality you benefit both from the serial optimizations and from the parallel computation introduced in this brand-new version, which exploits multiple cores, also across multiple nodes, making it a perfect tool for High Performance Computing facilities. 160 | Using this modality requires a little more system setup, but it will definitely pay off. 161 | 162 | The parallel version leverages coverage information, which reports the number of supporting reads for each position. 163 | 164 | We assume you have already installed and correctly configured the following tools: 165 | 166 | - **samtools** (http://www.htslib.org/) 167 | - **htslib** (http://www.htslib.org/) 168 | 169 | If you can use *mpi* on your machine (e.g., you are not on a multi-user system and there are no limitations on the jobs you can submit to the system), you can try launching the parallel version of REDItools 2.0 as follows: 170 | 171 | > ./parallel_test.sh 172 | 173 | If you are running on a SLURM-based cluster, instead, run the following command: 174 | 175 | > sbatch ./parallel_test_slurm.sh 176 | 177 | This script: 178 | - first defines a set of variables pointing to input, output and accessory files; then 179 | - launches the production of coverage data; then 180 | - launches REDItools 2.0 in parallel, using the specified number of cores; finally 181 | - gathers the results and writes them into a single table (the *-o* parameter provided on the command line). 182 | 183 | ## Running 184 | 185 | ### 6. Running REDItools 2.0 on your own data 186 | --- 187 | You can now customize the test scripts to your needs, with your own input, output and ad-hoc options. 188 | 189 | ### 7.
REDItools 2.0 options 190 | --- 191 | #### 7.1 Basic options 192 | In its most basic form, REDItools 2.0 can be invoked with an input BAM file, a reference genome and an output file: 193 | > python src/cineca/reditools.py -f \$INPUT_BAM_FILE -r $REFERENCE -o \$OUTPUT_FILE 194 | 195 | If you want, you can restrict the analysis only to a certain region (e.g., only chr1), by means of the **-g** option : 196 | > python src/cineca/reditools.py -f \$INPUT_BAM_FILE -r $REFERENCE -o \$OUTPUT_FILE -g chr1 197 | > 198 | or a specific interval: 199 | > python src/cineca/reditools.py -f \$INPUT_BAM_FILE -r $REFERENCE -o \$OUTPUT_FILE -g chr1:1000-2000 200 | 201 | For a complete list of options and their usage and meaning, please type: 202 | 203 | > python src/cineca/reditools.py -h 204 | 205 | #### 7.2 Other options 206 | 207 | Here we report the principal options with a detailed explanation for each of them. 208 | The following are the options accepted by the serial version of REDItools: 209 | 210 | > reditools.py [-h] [-f FILE] [-o OUTPUT_FILE] [-S] [-s STRAND] [-a] 211 | [-r REFERENCE] [-g REGION] [-m OMOPOLYMERIC_FILE] [-c] 212 | [-os OMOPOLYMERIC_SPAN] [-sf SPLICING_FILE] 213 | [-ss SPLICING_SPAN] [-mrl MIN_READ_LENGTH] 214 | [-q MIN_READ_QUALITY] [-bq MIN_BASE_QUALITY] 215 | [-mbp MIN_BASE_POSITION] [-Mbp MAX_BASE_POSITION] 216 | [-l MIN_COLUMN_LENGTH] [-men MIN_EDITS_PER_NUCLEOTIDE] 217 | [-me MIN_EDITS] [-Men MAX_EDITING_NUCLEOTIDES] [-d] 218 | [-T STRAND_CONFIDENCE] [-C] [-Tv STRAND_CONFIDENCE_VALUE] 219 | [-V] [-H] [-D] [-B BED_FILE] 220 | > 221 | > **-h**, --help 222 | > show this help message and exit 223 | > 224 | >**-f** FILE, --file FILE 225 | >The bam file to be analyzed 226 | > 227 | >**-o** OUTPUT_FILE, --output-file OUTPUT_FILE 228 | >The output statistics file 229 | > 230 | >**-S**, --strict 231 | > Activate strict mode: only sites with edits will be included in the output 232 | > 233 | >**-s** STRAND, --strand STRAND 234 | >Strand: this can be 0 (unstranded), 1 (secondstrand oriented) or 2 (firststrand oriented) 235 | > 236 | >**-a**, --append-file 237 | >Appends results to file (and creates if not existing) 238 | > 239 | >**-r** REFERENCE, --reference REFERENCE 240 | >The reference FASTA file 241 | > 242 | >**-g** REGION, --region REGION 243 | >The region of the bam file to be analyzed 244 | > 245 | >**-m** OMOPOLYMERIC_FILE, --omopolymeric-file OMOPOLYMERIC_FILE 246 | >The file containing the omopolymeric positions 247 | > 248 | >**-c**, --create-omopolymeric-file 249 | >Whether to create the omopolymeric span 250 | > 251 | >**-os** OMOPOLYMERIC_SPAN, --omopolymeric-span OMOPOLYMERIC_SPAN 252 | >The omopolymeric span 253 | > 254 | >**-sf** SPLICING_FILE, --splicing-file SPLICING_FILE 255 | >The file containing the splicing sites positions 256 | > 257 | >**-ss** SPLICING_SPAN, --splicing-span SPLICING_SPAN 258 | >The splicing span 259 | > 260 | >**-mrl** MIN_READ_LENGTH, --min-read-length MIN_READ_LENGTH 261 | >The minimum read length. Reads whose length is below this value will be discarded. 262 | > 263 | >**-q** MIN_READ_QUALITY, --min-read-quality MIN_READ_QUALITY 264 | >The minimum read quality. Reads whose mapping quality is below this value will be discarded. 265 | > 266 | >**-bq** MIN_BASE_QUALITY, --min-base-quality MIN_BASE_QUALITY 267 | >The minimum base quality. Bases whose quality is below this value will not be included in the analysis. 268 | > 269 | >**-mbp** MIN_BASE_POSITION, --min-base-position MIN_BASE_POSITION 270 | >The minimum base position. 
Bases which reside in a previous position (in the read) will not be included in the analysis. 271 | > 272 | >**-Mbp** MAX_BASE_POSITION, --max-base-position MAX_BASE_POSITION 273 | >The maximum base position. Bases which reside in a further position (in the read) will not be included in the analysis. 274 | > 275 | >**-l** MIN_COLUMN_LENGTH, --min-column-length MIN_COLUMN_LENGTH 276 | >The minimum length of the editing column (per position). Positions whose columns have length below this value will not be included in the analysis. 277 | > 278 | >**-men** MIN_EDITS_PER_NUCLEOTIDE, --min-edits-per-nucleotide MIN_EDITS_PER_NUCLEOTIDE 279 | >The minimum number of editing events for each nucleotide (per position). Positions whose columns have bases with fewer than min-edits-per-base edits will not be included in the analysis. 280 | > 281 | >**-me** MIN_EDITS, --min-edits MIN_EDITS 282 | > The minimum number of editing events (per position). Positions whose columns have bases with fewer than 'min-edits-per-base' edits will not be included in the analysis. 283 | > 284 | >**-Men** MAX_EDITING_NUCLEOTIDES, --max-editing-nucleotides MAX_EDITING_NUCLEOTIDES 285 | > The maximum number of editing nucleotides, from 0 to 4 (per position). Positions whose columns have more than 'max-editing-nucleotides' will not be included in the analysis. 286 | > 287 | >**-d**, --debug 288 | >REDItools is run in DEBUG mode. 289 | > 290 | >**-T** STRAND_CONFIDENCE, --strand-confidence STRAND_CONFIDENCE 291 | > Strand inference type 292 | > 1:maxValue 293 | > 2:useConfidence [1]; 294 | > maxValue: the most prominent strand count will be used; 295 | > useConfidence: strand is assigned if over a prefixed frequency confidence (-Tv option) 296 | > 297 | >**-C**, --strand-correction 298 | > Strand correction. Once the strand has been inferred, only bases concordant with this strand will be selected. 299 | > 300 | >**-Tv** STRAND_CONFIDENCE_VALUE, --strand-confidence-value STRAND_CONFIDENCE_VALUE 301 | > Strand confidence [0.70] 302 | > 303 | >**-V**, --verbose 304 | > Verbose information in stderr 305 | > 306 | >**-H**, --remove-header 307 | >Do not include the header in the output file 308 | > 309 | >**-N**, --dna 310 | >Run REDItools 2.0 on DNA-Seq data 311 | > 312 | >**-B** BED_FILE, --bed_file BED_FILE 313 | > Path of the BED file containing the target regions 314 | 315 | The parallel version of REDItools 2.0 also has 4 additional parameters, namely: 316 | >**-G** --coverage-file The coverage file of the sample to analyze 317 | > 318 | >**-D** --coverage-dir The coverage directory containing the coverage file of the sample to analyze, divided by chromosome 319 | > 320 | >**-t** --temp-dir The temp directory where temporary data for this sample are stored 321 | > 322 | >**-Z** --chromosome-sizes The file with the chromosome sizes 323 | 324 | ### 8. DNA-Seq annotation with REDItools 2.0 325 | 326 | - Analyze your RNA-Seq data (e.g., file *rna.bam*) with any version of REDItools and obtain the corresponding output table (e.g., *rna_table.txt* or *rna_table.txt.gz*); 327 | - Analyze your DNA-Seq data (e.g., *dna.bam*) with REDItools 2.0, providing as input: 328 | 1. The DNA-Seq file (*dna.bam*) (e.g., option *-f* *dna.bam*); 329 | 2.
The RNA table output by the first step (e.g., option *-B* *rna_table.txt*). 330 | This step will produce the output table (e.g., *dna_table.txt*); 331 | - Annotate the RNA-Seq table by means of the DNA-Seq table by running the REDItools2.0 annotator (script *src/cineca/annotate_with_DNA.py*) with the two tables as input (e.g., *rna_table.txt* and *dna_table.txt*), which will produce the final annotated table (e.g., *final_table.txt*). 332 | 333 |
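Putting the three steps together, a minimal sketch of the whole round trip might look as follows (file names are placeholders; whether the DNA-Seq run also needs the *-N*/--dna flag described above depends on your setup):

> python src/cineca/reditools.py -f rna.bam -r $REFERENCE -o rna_table.txt
>
> python src/cineca/reditools.py -f dna.bam -r $REFERENCE -N -B rna_table.txt -o dna_table.txt
>
> python src/cineca/annotate_with_DNA.py -r rna_table.txt -d dna_table.txt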

334 | 335 |

336 | 337 | When RNA-editing tables are big (e.g., greater than 1GB in gz format), reading the full table in parallel mode can be a really time-consuming task. In order to optimize the loading of target positions, we have provided a script to convert RNA-editing tables to BED files: 338 | 339 | > python src/cineca/reditools_table_to_bed.py -i RNA_TABLE -o BED_FILE 340 | 341 | This can be further optimized by creating the final BED in parallel: 342 | 343 | > extract_bed_dynamic.sh RNA_TABLE TEMP_DIR SIZE_FILE 344 | 345 | where 346 | - RNA_TABLE is the input RNA-editing table; 347 | - TEMP_DIR is the directory that will contain the output BED file; 348 | - SIZE_FILE is the file containing the chromosome information (e.g., the .fai file of your reference genome). 349 | 350 | Finally, run the script *src/cineca/annotate_with_DNA.py*: 351 | 352 | > python src/cineca/annotate_with_DNA.py -r RNA_TABLE -d DNA_TABLE [-Z] 353 | 354 | The option -Z (optional and without arguments) will exclude positions with multiple changes in DNA-Seq. 355 | 356 | #### 8.1 Useful scripts 357 | 358 | In order to ease the annotation of RNA-Seq tables with DNA-Seq information, we also provide two sample scripts that you can customize with your own data: 359 | 360 | - [**WORK IN PROGRESS**] serial_dna_test.sh 361 | - [**WORK IN PROGRESS**] parallel_dna_test.sh 362 | 363 | ### 9. [**WORK IN PROGRESS**] Running REDItools 2.0 in multisample mode 364 | REDItools also supports launching on multiple samples at the same time. This modality is extremely useful if you have a dataset (i.e., a group of homogeneous samples) and wish to run the same analysis on all of them (i.e., with the same options). 365 | 366 | In order to do this, we provide a second script, analogous to parallel_reditools.py, called *reditools2_multisample.py*, which supports an additional option -F SAMPLE_FILE, where SAMPLE_FILE is a file containing the (absolute) paths of the samples to be analyzed. 367 | It can be launched in the following manner: 368 | 369 | > mpirun src/cineca/reditools2_multisample.py -F $SAMPLE_FILE [OPTIONS] 370 | 371 | where OPTIONS are the same options accepted by the parallel version of REDItools 2.0. 372 | 373 | #### 9.1 Running in multisample mode on a SLURM-based cluster 374 | If you wish to run REDItools 2.0 in multisample mode on a SLURM-based cluster, we provide two scripts that will help you: 375 | 376 | - [**WORK IN PROGRESS**] *extract_coverage_slurm_multisample.sh*: calculates the coverage data for all the samples in parallel (by using the script *extract_coverage_dynamic.sh*); 377 | - [**WORK IN PROGRESS**] *multisample_test.sh*: calculates the RNA-editing event tables for all the samples in parallel using MPI. 378 | 379 | First run *extract_coverage_slurm_multisample.sh* and then *multisample_test.sh*. 380 | 381 | ### 10. Displaying benchmarks with REDItools 2.0 (parallel version only) 382 | We also released simple scripts to generate HTML pages containing a snapshot of the amount of time REDItools 2.0 (parallel version) spends on each part of the overall computation for each process (e.g., coverage computation, DIA algorithm, interval analysis, partial results recombination, etc.). 383 | 384 | **Note**: this command will work only when launched *after* the parallel computation has completed.
385 | 386 | All you have to do to create the HTML page is launch the following command: 387 | > create_html.sh TEMP_DIR 388 | 389 | where TEMP_DIR is the directory you specified with the -t option; this directory should in fact contain some auxiliary files (e.g., intervals.txt, progress.txt, times.txt and groups.txt) which serve exactly this purpose. 390 | Once created, the HTML page should display time information similar to the following: 391 | 392 |

393 | 394 |

395 | 396 | By means of this visualization you can *hover* on slices to see more in details the statistics for each interval computation as well as *zoom in* and *zoom out* by using the scroll wheel of your mouse. 397 | 398 | Issues 399 | --- 400 | No issues are known so far. For any problem, write to t.flati@cineca.it. 401 | 408 | -------------------------------------------------------------------------------- /accessory/AnnotateTable.py: -------------------------------------------------------------------------------- 1 | #!/home/epicardi/bin/python27/bin/python 2 | # Copyright (c) 2013-2014 Ernesto Picardi 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | 22 | import sys, os, getopt, random, time 23 | try: import pysam 24 | except: sys.exit('Pysam module not found.') 25 | pid=str(os.getpid()+random.randint(0,999999999)) 26 | 27 | pysamVersion=pysam.__version__ 28 | sys.stderr.write('Pysam version used: %s\n' %(pysamVersion)) 29 | 30 | def usage(): 31 | print """ 32 | USAGE: python AnnotateTable.py [options] 33 | Options: 34 | -a Sorted Annotation file 35 | -i Annotate a file of positions [column1=region, column2=coordinate (1 based)] 36 | or a single position [region:coordinate (1 based)] 37 | -k skip lines starting with: # 38 | -r Add a prefix to chromosome name [] (chr when the name is a number) 39 | -s Strand column in annotation file [4] 40 | -u Not use table strand info (fix it to 2) 41 | -c Add columns separated by comma (feature:1, gene_id:2, transcript_id:3) [1,2] 42 | -n Column name [Col] 43 | -S Correct strand by annotation 44 | -C Columns with base distribution [7,12] (in combination with -S) 45 | -o Save lines to a file 46 | -h Print this help 47 | """ 48 | 49 | try: 50 | opts, args = getopt.getopt(sys.argv[1:], 'i:a:o:hs:c:n:SC:uk:r:',["help"]) 51 | except getopt.GetoptError, err: 52 | print str(err) 53 | usage() 54 | sys.exit() 55 | 56 | if len(opts)==0: 57 | usage() 58 | sys.exit() 59 | tablefile,outfile,annfile='','','' 60 | save,ap,af,addc,cs,nos=0,0,0,[0,1],0,0 61 | csc=[6,11] 62 | strcol=3 63 | colname='Col' 64 | skip='Region' 65 | addchr='' 66 | for o,a in opts: 67 | if o in ("-h","--help"): 68 | usage() 69 | sys.exit() 70 | elif o == "-n": colname = a 71 | elif o == "-k": skip = a 72 | elif o == "-r": addchr = a 73 | elif o == "-i": 74 | tablefile = a 75 | if not os.path.exists(tablefile): ap,af=1,0 76 | else: ap,af=0,1 77 | elif o == "-o": 78 | outfile = a 79 | save=1 80 | elif o == "-s": 
strcol = int(a)-1 81 | elif o == "-S": cs = 1 82 | elif o == "-u": nos = 1 83 | elif o == "-C": csc=[int(x)-1 for x in a.split(',')] 84 | elif o == "-c": 85 | addc = [int(x)-1 for x in a.split(',') if x in ['1','2','3']] 86 | addc.sort() 87 | elif o == "-a": 88 | annfile = a 89 | if annfile=='': 90 | usage() 91 | sys.exit('Sorted annotation file not found.') 92 | else: 93 | assert False, "unhandled option" 94 | 95 | ############## 96 | def gstr(v): 97 | if v=='-': return '0' 98 | else: return '1' 99 | 100 | def comp(s): 101 | a={'A':'T','T':'A','C':'G','G':'C'} 102 | ss='' 103 | for i in s.upper(): 104 | if a.has_key(i): ss+=a[i] 105 | elif i==' ': ss+=' ' 106 | elif i=='-': ss+='-' 107 | else: ss+='N' 108 | return ss 109 | 110 | def bcomp(b): 111 | bb=eval(b) 112 | return str([bb[3],bb[2],bb[1],bb[0]]) 113 | 114 | def checkstr(stringa): 115 | strand='+-' 116 | if stringa=='0': strand='-' 117 | elif stringa=='1': strand='+' 118 | elif stringa=='2': strand='+-' 119 | elif stringa=='-': strand='-' 120 | elif stringa=='+': strand='+' 121 | return strand 122 | 123 | def parse(res): 124 | d={'+':{},'-':{}} 125 | anns='+' 126 | for i in res: 127 | if i[3]=='+': 128 | if d['+'].has_key(i[1]): 129 | if i[0] not in d['+'][i[1]][0]: d['+'][i[1]][0]=d['+'][i[1]][0]+','+i[0] 130 | if i[2]+'-'+i[0] not in d['+'][i[1]][1]: d['+'][i[1]][1]=d['+'][i[1]][1]+','+i[2]+'-'+i[0] 131 | else: 132 | d['+'][i[1]]=[i[0],i[2]+'-'+i[0]] 133 | elif i[3]=='-': 134 | if d['-'].has_key(i[1]): 135 | if i[0] not in d['-'][i[1]][0]: d['-'][i[1]][0]=d['-'][i[1]][0]+','+i[0] 136 | if i[2]+'-'+i[0] not in d['-'][i[1]][1]: d['-'][i[1]][1]=d['-'][i[1]][1]+','+i[2]+'-'+i[0] 137 | else: 138 | d['-'][i[1]]=[i[0],i[2]+'-'+i[0]] 139 | gip='$'.join(d['+'].keys()) 140 | featp='$'.join([d['+'][x][0] for x in d['+'].keys()]) 141 | tip='$'.join([d['+'][x][1] for x in d['+'].keys()]) 142 | gim='$'.join(d['-'].keys()) 143 | featm='$'.join([d['-'][x][0] for x in d['-'].keys()]) 144 | tim='$'.join([d['-'][x][1] for x in d['-'].keys()]) 145 | p=[featp,gip,tip] 146 | m=[featm,gim,tim] 147 | pm=[(featp+'&'+featm).strip('&'),(gip+'&'+gim).strip('&'),(tip+'&'+tim).strip('&')] 148 | if len(d['+'])==0 and len(d['-'])!=0: anns='-' 149 | if len(d['+'])==0: p=['-','-','-'] 150 | if len(d['-'])==0: m=['-','-','-'] 151 | if len(d['+'])==0 and len(d['-'])==0: 152 | pm=['-','-','-'] 153 | anns='+-' 154 | if len(d['+'])!=0 and len(d['-'])!=0: anns='+-' 155 | return (p,m,pm,anns) 156 | 157 | #chr17:7590770 158 | 159 | ############### 160 | if ap and af: 161 | usage() 162 | sys.exit('You can annotate a file of positions or a single positions but not both in one run.') 163 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 164 | sys.stderr.write("Script time --> START: %s\n" %(script_time)) 165 | 166 | 167 | if not os.path.exists(annfile+'.tbi'): 168 | sys.stderr.write('Indexing %s file.\n' %(annfile)) 169 | annfile=pysam.tabix_index(annfile, preset='gff') 170 | 171 | tabix=pysam.Tabixfile(annfile) 172 | contig=tabix.contigs 173 | 174 | if ap: 175 | prinfo=['Feature --> ','Gid --> ','Tid --> '] 176 | try: 177 | query=tablefile.split(':') 178 | chr,pos=addchr+query[0],int(query[1])-1 179 | try: strand=checkstr(query[2]) 180 | except: strand=checkstr('') 181 | if nos: strand='+-' 182 | sres=[] 183 | if chr in contig: 184 | sres=[(kk.feature,kk.gene_id,kk.transcript_id,kk.strand) for kk in tabix.fetch(reference=chr,start=pos,end=pos+1,parser=pysam.asGTF())] 185 | ann=parse(sres) 186 | if strand=='+': res=ann[0] 187 | elif strand=='-': 
res=ann[1] 188 | else: res=ann[2] 189 | for i in addc: 190 | print prinfo[i]+ res[i] 191 | except: sys.exit('Error: not correct position.') 192 | 193 | if af: 194 | if save: o=open(outfile,'w') 195 | f=open(tablefile) 196 | hinfo=['%s_feat' %(colname),'%s_gid' %(colname),'%s_tid' %(colname)] 197 | for i in f: 198 | if i.strip()=='': continue 199 | if i.startswith('Region'): 200 | h=[i.strip()] 201 | for k in addc: h.append(hinfo[k]) 202 | if save: o.write('\t'.join(h)+'\n') 203 | else: print '\t'.join(h) 204 | continue 205 | if i.startswith(skip): continue 206 | l=(i.strip()).split('\t') 207 | chr,pos=addchr+l[0],int(l[1])-1 208 | try: strand=checkstr(l[strcol]) 209 | except: strand='+-' 210 | if nos: strand='+-' 211 | sres=[] 212 | #print chr,pos,pos+1 213 | if chr in contig: 214 | sres=[(kk.feature,kk.gene_id,kk.transcript_id,kk.strand) for kk in tabix.fetch(reference=chr,start=pos,end=pos+1,parser=pysam.asGTF())] 215 | ann=parse(sres) #(p,m,pm,anns) 216 | if cs: 217 | if ann[3]=='+-': pass 218 | elif ann[3]==strand: pass 219 | elif ann[3]!=strand: 220 | l[2]=comp(l[2]) 221 | l[strcol]=gstr(ann[3]) 222 | strand=l[strcol] 223 | for j in csc: 224 | try: 225 | l[j]=bcomp(l[j]) 226 | l[j+1]=comp(l[j+1]) 227 | except: pass 228 | if strand=='+': res=ann[0] 229 | elif strand=='-': res=ann[1] 230 | else: res=ann[2] 231 | for j in addc: l.append(res[j]) 232 | if save: o.write('\t'.join(l)+'\n') 233 | else: print '\t'.join(l) 234 | tabix.close() 235 | if save: 236 | o.close() 237 | sys.stderr.write("Table saved on %s\n" %(outfile)) 238 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 239 | sys.stderr.write("Script time --> END: %s\n" %(script_time)) 240 | -------------------------------------------------------------------------------- /accessory/FilterTable.py: -------------------------------------------------------------------------------- 1 | #!/home/epicardi/bin/python27/bin/python 2 | # Copyright (c) 2013-2014 Ernesto Picardi 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 
21 | 22 | """ 23 | To do: filtering according to strand of positions in table file 24 | """ 25 | 26 | import sys, time, getopt, string, os, random 27 | try: import pysam 28 | except: sys.exit('Pysam module not found.') 29 | 30 | pid=str(os.getpid()+random.randint(0,999999999)) 31 | 32 | def usage(): 33 | print """ 34 | USAGE: python FilterTable.py [options] 35 | Options: 36 | -i Table file 37 | -f Sorted file with positions to filter in 38 | -s Sorted file with positions to filter out 39 | -F Features to filter in (separated by comma) 40 | -S Features to filter out (separated by comma) 41 | -E Exclude positions filtered out 42 | -o Save filtered lines to a file [stdout] 43 | -p Print simple statistics 44 | -h Print this help 45 | 46 | """ 47 | 48 | try: 49 | opts, args = getopt.getopt(sys.argv[1:], 'i:o:f:hs:F:S:Ep',["help"]) 50 | except getopt.GetoptError, err: 51 | print str(err) 52 | usage() 53 | sys.exit() 54 | 55 | if len(opts)==0: 56 | usage() 57 | sys.exit() 58 | tablefile,outfile='','' 59 | ffile,ofile='','' 60 | save,ff,fo,exp,ps=0,0,0,0,0 61 | infeat,outfeat=[],[] 62 | for o,a in opts: 63 | if o in ("-h","--help"): 64 | usage() 65 | sys.exit() 66 | elif o == "-i": 67 | tablefile = a 68 | if not os.path.exists(tablefile): 69 | usage() 70 | sys.exit('Table file not found') 71 | elif o == "-o": 72 | outfile = a 73 | save=1 74 | elif o == "-s": 75 | ofile = a 76 | fo=1 77 | if ofile=='': 78 | usage() 79 | sys.exit('Sorted file with positions to filter out not found.') 80 | elif o == "-f": 81 | ffile = a 82 | ff=1 83 | if ffile=='': 84 | usage() 85 | sys.exit('Sorted file with positions to filter in not found.') 86 | elif o == "-F": 87 | infeat=[x.lower() for x in a.split(',')] 88 | elif o == "-S": 89 | outfeat=[x.lower() for x in a.split(',')] 90 | elif o == "-E": exp=1 91 | elif o == "-p": ps=1 92 | else: 93 | assert False, "unhandled option" 94 | 95 | # Funzioni 96 | def filterIn(chr,exfeat,pos): 97 | if len(exfeat)==0: return 1 98 | if ff and not chr in contigf: return 0 99 | elif not ff: return 1 100 | res=[(kk.feature).lower() for kk in tabixf.fetch(reference=chr,start=pos,end=pos+1,parser=pysam.asGTF())] 101 | for i in exfeat: 102 | if i in res: return 1 103 | return 0 104 | 105 | def filterOut(chr,exfeat,pos): 106 | if len(exfeat)==0: return 0 107 | if fo and not chr in contigo: return 0 108 | elif not fo: return 0 109 | res=[(kk.feature).lower() for kk in tabixo.fetch(reference=chr,start=pos,end=pos+1,parser=pysam.asGTF())] 110 | for i in exfeat: 111 | if i in res: return 1 112 | return 0 113 | 114 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 115 | sys.stderr.write("Script time --> START: %s\n" %(script_time)) 116 | 117 | if fo: 118 | if not os.path.exists(ofile+'.tbi'): 119 | sys.stderr.write('Indexing %s file.\n' %(ofile)) 120 | ofile=pysam.tabix_index(ofile, preset='gff') 121 | if ff: 122 | if not os.path.exists(ffile+'.tbi'): 123 | sys.stderr.write('Indexing %s file.\n' %(ffile)) 124 | ffile=pysam.tabix_index(ffile, preset='gff') 125 | 126 | if fo: 127 | tabixo=pysam.Tabixfile(ofile) 128 | contigo=tabixo.contigs 129 | if ff: 130 | tabixf=pysam.Tabixfile(ffile) 131 | contigf=tabixf.contigs 132 | 133 | sys.stderr.write('Reading Table file...\n') 134 | if save: o=open(outfile,'w') 135 | f=open(tablefile) 136 | y,x,xx=0,0,0 137 | for i in f: 138 | if i.strip()=='': continue 139 | if i.startswith('#'): continue 140 | if i.startswith('Region'): 141 | if save: o.write(i.strip()+'\n') 142 | else: sys.stdout.write(i) 143 | continue 144 | 
l=(i.strip('\n')).split('\t') 145 | xx+=1 146 | reg,pos = l[0],int(l[1]) # sottrarre -1 per la ricerca nella tabella 147 | fin=filterIn(reg,infeat,pos-1) 148 | fout=filterOut(reg,outfeat,pos-1) 149 | if fin: 150 | if fout: 151 | x+=1 152 | if exp: continue 153 | if save: o.write('#'+i) 154 | else: sys.stdout.write('#'+i) 155 | else: 156 | y+=1 157 | if save: o.write(i) 158 | else: sys.stdout.write(i) 159 | else: 160 | x+=1 161 | if exp: continue 162 | if save: o.write('#'+i) 163 | else: sys.stdout.write('#'+i) 164 | 165 | f.close() 166 | if save: o.close() 167 | if ff: tabixf.close() 168 | if fo: tabixo.close() 169 | if ps: 170 | sys.stdout.write("All positions: %i\n" %(xx)) 171 | sys.stdout.write("Positions filtered in: %i\n" %(y)) 172 | sys.stdout.write("Positions filtered out: %i\n" %(x)) 173 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 174 | sys.stderr.write("Script time --> END: %s\n" %(script_time)) 175 | 176 | -------------------------------------------------------------------------------- /accessory/GFFtoTabix.py: -------------------------------------------------------------------------------- 1 | #!/home/epicardi/bin/python27/bin/python 2 | # Copyright (c) 2013-2014 Ernesto Picardi 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 
21 | 22 | import sys, os, getopt, time, random, heapq, shutil 23 | from tempfile import gettempdir 24 | from itertools import islice, cycle 25 | from collections import namedtuple 26 | from operator import itemgetter 27 | try: import pysam 28 | except: sys.exit('Pysam module not found.') 29 | 30 | version='1.0' 31 | pid=str(os.getpid()+random.randint(0,999999999)) 32 | 33 | def usage(): 34 | print """ 35 | USAGE: python GFFtoTabix.py [options] 36 | Options: 37 | -i GFF file 38 | -S Do not sort GFF (sort by default) 39 | -b Buffer size (as number of lines) [32000] 40 | -t Temporary directory to use (multiple -t may be used) 41 | -u Save an uncompressed GFF copy (add _copy suffix) 42 | -h Print this help 43 | 44 | """ 45 | 46 | try: 47 | opts, args = getopt.getopt(sys.argv[1:], "i:Sb:t:hu",["help"]) 48 | if len(opts)==0: 49 | usage() 50 | sys.exit(2) 51 | except getopt.GetoptError as err: 52 | print str(err) # will print something like "option -a not recognized" 53 | usage() 54 | sys.exit(2) 55 | 56 | GFFfile='' 57 | buffer_size=32000 58 | tempdirs=[] 59 | sort=1 60 | mc=0 # save an uncompressed GFF copy, default no 61 | for o, a in opts: 62 | if o in ("-h","--help"): 63 | usage() 64 | sys.exit() 65 | elif o == "-i": 66 | GFFfile=a 67 | outfile='.'.join(GFFfile.split('.')[:-1])+'.sorted.gff' 68 | if not os.path.exists(GFFfile): 69 | usage() 70 | sys.exit('GFF file not found') 71 | elif o == "-b": buffer_size=int(a) 72 | elif o == "-t": tempdirs.append(a) 73 | elif o == "-S": sort=0 74 | elif o == "-u": mc=1 75 | else: 76 | assert False, "Unhandled Option" 77 | 78 | Keyed = namedtuple("Keyed", ["key", "obj"]) 79 | key_=eval('lambda line : (%s)' %('line[:]')) 80 | 81 | def gk(key,obj): 82 | ik=itemgetter(0,3,4)(obj.split('\t')) 83 | return key((ik[0],int(ik[1]),int(ik[2]))) 84 | 85 | def merge(key=None, *iterables): 86 | # based on code posted by Scott David Daniels in c.l.p. 
87 | # http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d 88 | #print iterables 89 | if key is None: 90 | keyed_iterables = iterables 91 | else: 92 | keyed_iterables = [(Keyed(gk(key,obj), obj) for obj in iterable) for iterable in iterables] 93 | #print keyed_iterables 94 | for element in heapq.merge(*keyed_iterables): 95 | yield element.obj 96 | 97 | def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None): 98 | if tempdirs is None: 99 | tempdirs = [] 100 | if not tempdirs: 101 | tempdirs.append(gettempdir()) 102 | chunks = [] 103 | xx=0 104 | try: 105 | with open(input,'rb',64*1024) as input_file: 106 | input_iterator = iter(input_file) 107 | for tempdir in cycle(tempdirs): 108 | current_chunk2=[] 109 | for j in islice(input_iterator,buffer_size): 110 | l=(j.strip()).split('\t') 111 | l[3]=int(l[3]) 112 | l[4]=int(l[4]) 113 | current_chunk2.append(l) 114 | current_chunk3=[] 115 | for j in sorted(current_chunk2, key=itemgetter(0,3,4)): 116 | j[3]=str(j[3]) 117 | j[4]=str(j[4]) 118 | current_chunk3.append('\t'.join(j)+'\n') 119 | xx+=len(current_chunk3) 120 | if not current_chunk3: break 121 | sys.stdout.write("Loaded and sorted %i lines.\n"%(xx)) 122 | output_chunk = open(os.path.join(tempdir,'%06i_%s'%(len(chunks),pid)),'w+b',64*1024) 123 | chunks.append(output_chunk) 124 | output_chunk.writelines(current_chunk3) 125 | output_chunk.flush() 126 | output_chunk.seek(0) 127 | sys.stdout.write("Merging from %i files.\n"%(len(chunks))) 128 | with open(output,'wb',64*1024) as output_file: 129 | output_file.writelines(merge(key, *chunks)) 130 | finally: 131 | for chunk in chunks: 132 | try: 133 | chunk.close() 134 | os.remove(chunk.name) 135 | except Exception: 136 | pass 137 | 138 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 139 | sys.stdout.write("Script time --> START: %s\n"%(script_time)) 140 | if sort: 141 | sys.stdout.write("Sorting GFF file...\n") 142 | batch_sort(GFFfile,outfile,key_,buffer_size,tempdirs) 143 | GFFfile=outfile 144 | if mc: 145 | copyfile=GFFfile+'_copy' 146 | shutil.copyfile(GFFfile,copyfile) 147 | sys.stdout.write("A copy of uncompressed GFF file has been saved on %s.\n" %(copyfile)) 148 | sys.stdout.write("Indexing GFF file...\n") 149 | GFFfile=pysam.tabix_index(GFFfile, preset='gff') 150 | sys.stdout.write("Tabix file saved on %s.\n" %(GFFfile)) 151 | sys.stdout.write("Indices saved on %s.tbi.\n" %(GFFfile)) 152 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 153 | sys.stdout.write("Script time --> END: %s\n"%(script_time)) -------------------------------------------------------------------------------- /accessory/Readme.md: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 |

get_DE_events.py

9 |
This script and its related files are part of the supplemental material for the paper
10 | "Investigating RNA editing in deep transcriptome datasets with REDItools and REDIportal"
11 |

12 | For control-case studies, by launching the get_DE_events.py script the user can filter REDItoolDnaRna.py outputs according to the following criteria: 13 | 14 |
  • RNA-Seq coverage per position (default 10 reads) 15 |
  • Minimum editing frequency per position (default 10%) 16 |
For each editing candidate, the script applies the Mann–Whitney test to check the significance between the two conditions, 17 | control and HD. By default the test is carried out only if the number of editing events per position is at least equal to 50% of the samples per group. 18 | Optionally, p-values can be corrected using Benjamini–Hochberg or Bonferroni tests (a minimal sketch of this logic is given below). 19 |
20 |
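The sketch below illustrates the testing logic just described in plain Python; it is NOT the actual get_DE_events.py implementation, and the helper names (bh_adjust, differential_editing) are invented for illustration. It assumes per-sample editing frequencies have already been extracted from the REDItoolDnaRna.py tables and filtered by coverage and minimum frequency:

# Illustrative only: not the actual get_DE_events.py code.
from scipy.stats import mannwhitneyu

def bh_adjust(pvals):
    # Benjamini-Hochberg step-up adjustment of a list of p-values
    m = len(pvals)
    order = sorted(range(m), key=lambda i: pvals[i])
    adj, running = [0.0] * m, 1.0
    for rank in range(m, 0, -1):
        i = order[rank - 1]
        running = min(running, pvals[i] * m / float(rank))
        adj[i] = running
    return adj

def differential_editing(candidates, n_ctrl, n_hd, min_frac=0.5, alpha=0.05):
    # candidates: {position: (ctrl_freqs, hd_freqs)}, editing frequencies per sample
    keys, pvals = [], []
    for pos, (ctrl, hd) in candidates.items():
        # test only positions edited in at least min_frac of each group (cf. -mts)
        if len(ctrl) < min_frac * n_ctrl or len(hd) < min_frac * n_hd:
            continue
        stat, p = mannwhitneyu(ctrl, hd, alternative='two-sided')
        keys.append(pos)
        pvals.append(p)
    # keep only significant events after multiple-testing correction (cf. -cpval/-sig)
    return {pos: q for pos, q in zip(keys, bh_adjust(pvals)) if q <= alpha}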

Usage:

21 |
22 | get_DE_events.py [-h] [-c MIN_COVERAGE] [-cpval PVALUE_CORRECTION]
23 |                         [-input_file SAMPLES_INFORMATIONS_FILE]
24 |                         [-f MIN_EDIT_FREQUENCY] [-mts MIN_SAMPLE_TESTING]
25 |                         [-sig ONLY_SIGNIFICANT] [-linear]
26 |   
27 | optional arguments:
28 |   -h, --help                             show this help message and exit
29 |   -c MIN_COVERAGE                        Coverage-q30
30 |   -cpval PVALUE_CORRECTION               1 --> Bonferroni correction / 2 --> Benjamini-Hochberg
31 |   -input_file SAMPLES_INFORMATIONS_FILE  Comma-separated file, e.g.: Sample,Status
32 |   -f MIN_EDIT_FREQUENCY                  Editing Frequency
33 |   -mts MIN_SAMPLE_TESTING                min percentage of each sample category
34 |   -sig ONLY_SIGNIFICANT                  Return only significant editing events 
35 |                                          (if -cpval flag is activated)
36 |   -linear                                Calculate differential RNA editing according to Tran et al. (2019)
37 |                                                                                         
38 | e.g. python ../REDItools/accessory/get_DE_events.py -cpval 2 -input_file  sample_information.csv -sig yes
39 | 

The script will filter REDItoolDnaRna.py outputs for each sample contained in the 40 | SAMPLES_INFORMATIONS_FILE, returning only the significant editing events (p-value <= 0.05) 41 | according to the Benjamini-Hochberg correction.

42 | 43 |
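For reference, a minimal SAMPLES_INFORMATIONS_FILE might look like the following (sample names and status labels are invented; see NPfiles/sample_information_file.txt in this repository for a real example):

Sample,Status
sample_01,CTRL
sample_02,CTRL
sample_03,HD
sample_04,HD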
44 | 45 | 46 | -------------------------------------------------------------------------------- /accessory/SearchInTable.py: -------------------------------------------------------------------------------- 1 | #!/home/epicardi/bin/python27/bin/python 2 | # Copyright (c) 2013-2014 Ernesto Picardi 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | 22 | import sys, os, getopt, time 23 | try: import pysam 24 | except: sys.exit('Pysam module not found.') 25 | #pid=str(os.getpid()+random.randint(0,999999999)) 26 | 27 | def usage(): 28 | print """ 29 | USAGE: python SearchInTable.py [options] 30 | Options: 31 | -i Sorted table file (first col=reference; second col=coordinate 1 based) 32 | or tabix indexed table (ending with .gz) 33 | -q Query (file or single positions: chr21:123456) 34 | -C Sequence name column [1] 35 | -S Start column [2] 36 | -E End column; can be identical to '-S' [2] 37 | -P Print to stdout found lines 38 | -p Print position header (like a fasta header >chr21:123456) 39 | -n Print "Not found" 40 | -s Print simple statistics on standard error 41 | -k Skip lines starting with in query file 42 | -o Save found/not found positions on file 43 | -h Print this help 44 | 45 | """ 46 | #-k skip first INT lines [0] 47 | 48 | try: 49 | opts, args = getopt.getopt(sys.argv[1:], "i:q:k:pso:hnC:S:E:O:P",["help"]) 50 | if len(opts)==0: 51 | usage() 52 | sys.exit(2) 53 | except getopt.GetoptError as err: 54 | print str(err) # will print something like "option -a not recognized" 55 | usage() 56 | sys.exit(2) 57 | 58 | tablefile='' 59 | query='' 60 | outfile='' 61 | outfile2='' 62 | pr,prn,prf=0,0,0 63 | ps=0 64 | sv,sv2=0,0 65 | sk=0 66 | ski='' 67 | skil=0 68 | 69 | scol,stcol,ecol=0,1,1 70 | for o, a in opts: 71 | if o in ("-h","--help"): 72 | usage() 73 | sys.exit() 74 | elif o == "-i": 75 | tablefile=a 76 | if not os.path.exists(tablefile): 77 | usage() 78 | sys.exit('Table file not found') 79 | elif o == "-q": 80 | query=a 81 | if query=='': 82 | usage() 83 | sys.exit('Query not found.') 84 | elif o == "-p": pr=1 85 | elif o == "-C": scol=int(a)-1 86 | elif o == "-S": stcol=int(a)-1 87 | elif o == "-E": ecol=int(a)-1 88 | elif o == "-n": prn=1 89 | elif o == "-P": prf=1 90 | elif o == "-k": 91 | ski=a 92 | skil=1 93 | elif o == "-s": ps=1 94 | elif o == "-o": 95 | outfile=a 96 | sv=1 97 | elif o == "-O": 98 | outfile2=a 99 | sv2=1 100 | else: 101 | assert False, "Unhandled Option" 102 | 103 | 104 | 
script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 105 | sys.stdout.write("Script time --> START: %s\n"%(script_time)) 106 | if not os.path.exists(tablefile): 107 | #sys.stderr.write('Compressing table file.\n') 108 | #pysam.tabix_index(tablefile, tablefile+'.gz') 109 | sys.stderr.write('Indexing table file.\n') 110 | tablefile=pysam.tabix_index(tablefile, seq_col=scol, start_col=stcol, end_col=ecol) 111 | #if tablefile.endswith('.gz') and not tablefile.endswith('.tbi'): 112 | # tablefile=pysam.tabix_index(tablefile, seq_col=scol, start_col=stcol, end_col=ecol) 113 | 114 | tabix=pysam.Tabixfile(tablefile) 115 | allref=tabix.contigs 116 | positions=[] 117 | if os.path.exists(query): 118 | f=open(query) 119 | for i in f: 120 | if i.strip()=='': continue 121 | if i.startswith('#'): continue 122 | if i.startswith('Region'): continue 123 | if skil: 124 | if i.startswith(ski): continue 125 | l=(i.strip()).split() 126 | positions.append((l[0],int(l[1])-1)) 127 | f.close() 128 | elif query.count(":")==1: 129 | l=(query.strip()).split(':') 130 | positions.append((l[0],int(l[1])-1)) 131 | else: sys.exit('I cannot read the query.') 132 | 133 | if sv: 134 | outf=open(outfile+'_found','w') 135 | outnf=open(outfile+'_notfound','w') 136 | if sv2: 137 | outf2=open(outfile2+'_foundInSortedTable','w') 138 | xx=0 139 | for pos in positions: 140 | res=[] 141 | if pos[0] in allref: 142 | res=[kk for kk in tabix.fetch(reference=pos[0],start=pos[1],end=pos[1]+1)] 143 | if pr: sys.stdout.write('>%s:%i\n' %(pos[0],pos[1]+1)) 144 | if len(res)==0: 145 | if prn: sys.stdout.write('Not Found\n') 146 | if sv: outnf.write('%s\t%i\n' %(pos[0],pos[1]+1)) 147 | else: 148 | #if sv: outf.write(res[0]+'\n') 149 | if sv: outf.write(res[0]+'\n') 150 | if prf: sys.stdout.write(res[0]+'\n') 151 | xx+=1 152 | tabix.close() 153 | if sv: 154 | outf.close() 155 | outnf.close() 156 | if ps: 157 | sys.stdout.write('Positions in query: %i\n' %(len(positions))) 158 | sys.stdout.write('Positions found: %i\n' %(xx)) 159 | sys.stdout.write('Positions not found: %i\n' %(len(positions)-xx)) 160 | if sv: 161 | sys.stdout.write('Found line(s) saved on: %s\n' %(outfile+'_found')) 162 | sys.stdout.write('Not found line(s) saved on: %s\n' %(outfile+'_notfound')) 163 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 164 | sys.stdout.write("Script time --> END: %s\n"%(script_time)) -------------------------------------------------------------------------------- /accessory/SortGFF.py: -------------------------------------------------------------------------------- 1 | #!/home/epicardi/bin/python27/bin/python 2 | # Copyright (c) 2013-2014 Ernesto Picardi 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | 22 | ## {{{ http://code.activestate.com/recipes/576755/ (r3) 23 | # based on Recipe 466302: Sorting big files the Python 2.4 way 24 | # by Nicolas Lehuen 25 | 26 | # Works on python 2.7+ no 3.x 27 | 28 | import sys, os, getopt, heapq, time, random 29 | from tempfile import gettempdir 30 | from itertools import islice, cycle 31 | from collections import namedtuple 32 | from operator import itemgetter 33 | 34 | version='1.0' 35 | pid=str(os.getpid()+random.randint(0,999999999)) 36 | 37 | def usage(): 38 | print """ 39 | USAGE: python SortGFF.py [options] 40 | Options: 41 | -i GFF file 42 | -o Sorted output file [GFF_sorted_%s] 43 | -b Buffer size (as number of lines) [32000] 44 | -t Temporary directory to use (multiple -t may be used) 45 | -h Print this help 46 | 47 | """%(pid) 48 | 49 | try: 50 | opts, args = getopt.getopt(sys.argv[1:], "i:o:b:t:h",["help"]) 51 | if len(opts)==0: 52 | usage() 53 | sys.exit(2) 54 | except getopt.GetoptError as err: 55 | print str(err) # will print something like "option -a not recognized" 56 | usage() 57 | sys.exit(2) 58 | 59 | GFFfile='' 60 | outfile='GFF_sorted_%s' %(pid) 61 | buffer_size=32000 62 | tempdirs=[] 63 | for o, a in opts: 64 | if o in ("-h","--help"): 65 | usage() 66 | sys.exit() 67 | elif o == "-i": 68 | GFFfile=a 69 | if not os.path.exists(GFFfile): 70 | usage() 71 | sys.exit('GFF file not found') 72 | elif o == "-o": outfile=a 73 | elif o == "-b": buffer_size=int(a) 74 | elif o == "-t": tempdirs.append(a) 75 | else: 76 | assert False, "Unhandled Option" 77 | 78 | Keyed = namedtuple("Keyed", ["key", "obj"]) 79 | key_=eval('lambda line : (%s)' %('line[:]')) 80 | 81 | def gk(key,obj): 82 | ik=itemgetter(0,3,4)(obj.split('\t')) 83 | return key((ik[0],int(ik[1]),int(ik[2]))) 84 | 85 | def merge(key=None, *iterables): 86 | # based on code posted by Scott David Daniels in c.l.p. 
87 | # http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d 88 | #print iterables 89 | if key is None: 90 | keyed_iterables = iterables 91 | else: 92 | keyed_iterables = [(Keyed(gk(key,obj), obj) for obj in iterable) for iterable in iterables] 93 | #print keyed_iterables 94 | for element in heapq.merge(*keyed_iterables): 95 | yield element.obj 96 | 97 | def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None): 98 | if tempdirs is None: 99 | tempdirs = [] 100 | if not tempdirs: 101 | tempdirs.append(gettempdir()) 102 | chunks = [] 103 | xx=0 104 | try: 105 | with open(input,'rb',64*1024) as input_file: 106 | input_iterator = iter(input_file) 107 | for tempdir in cycle(tempdirs): 108 | current_chunk2=[] 109 | for j in islice(input_iterator,buffer_size): 110 | l=(j.strip()).split('\t') 111 | l[3]=int(l[3]) 112 | l[4]=int(l[4]) 113 | current_chunk2.append(l) 114 | current_chunk3=[] 115 | for j in sorted(current_chunk2, key=itemgetter(0,3,4)): 116 | j[3]=str(j[3]) 117 | j[4]=str(j[4]) 118 | current_chunk3.append('\t'.join(j)+'\n') 119 | xx+=len(current_chunk3) 120 | if not current_chunk3: break 121 | sys.stdout.write("Loaded and sorted %i lines.\n"%(xx)) 122 | output_chunk = open(os.path.join(tempdir,'%06i_%s'%(len(chunks),pid)),'w+b',64*1024) 123 | chunks.append(output_chunk) 124 | output_chunk.writelines(current_chunk3) 125 | output_chunk.flush() 126 | output_chunk.seek(0) 127 | sys.stdout.write("Merging from %i files.\n"%(len(chunks))) 128 | with open(output,'wb',64*1024) as output_file: 129 | output_file.writelines(merge(key, *chunks)) 130 | finally: 131 | for chunk in chunks: 132 | try: 133 | chunk.close() 134 | os.remove(chunk.name) 135 | except Exception: 136 | pass 137 | 138 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 139 | sys.stdout.write("Script time --> START: %s\n"%(script_time)) 140 | batch_sort(GFFfile,outfile,key_,buffer_size,tempdirs) 141 | sys.stdout.write("Sorted GFF saved on %s\n"%(outfile)) 142 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 143 | sys.stdout.write("Script time --> END: %s\n"%(script_time)) -------------------------------------------------------------------------------- /accessory/SortTable.py: -------------------------------------------------------------------------------- 1 | #!/home/epicardi/bin/python27/bin/python 2 | # Copyright (c) 2013-2014 Ernesto Picardi 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | 22 | ## {{{ http://code.activestate.com/recipes/576755/ (r3) 23 | # based on Recipe 466302: Sorting big files the Python 2.4 way 24 | # by Nicolas Lehuen 25 | 26 | # Works on python 2.7+ no 3.x 27 | 28 | import sys, os, getopt, heapq, time, random 29 | from tempfile import gettempdir 30 | from itertools import islice, cycle 31 | from collections import namedtuple 32 | from operator import itemgetter 33 | 34 | version='1.0' 35 | pid=str(os.getpid()+random.randint(0,999999999)) 36 | 37 | def usage(): 38 | print """ 39 | USAGE: python SortTable.py [options] 40 | Options: 41 | -i Table file 42 | -d Delimiter character [\\t] (default TAB) 43 | -s Sequence name column [1] 44 | -c Start column [4] 45 | -e End column (can be identical to -c) [5] 46 | -m Skip lines starting with [#] 47 | -o Sorted output file [sortedTable_%s] 48 | -O Output as TAB-delimited 49 | -b Buffer size (as number of lines) [32000] 50 | -t Temporary directory to use (multiple -t may be used) 51 | -h Print this help 52 | 53 | """%(pid) 54 | 55 | try: 56 | opts, args = getopt.getopt(sys.argv[1:], "i:o:b:t:hd:s:c:e:m:O",["help"]) 57 | if len(opts)==0: 58 | usage() 59 | sys.exit(2) 60 | except getopt.GetoptError as err: 61 | print str(err) # will print something like "option -a not recognized" 62 | usage() 63 | sys.exit(2) 64 | 65 | GFFfile='' 66 | outfile='sortedTable_%s' %(pid) 67 | buffer_size=32000 68 | tempdirs=[] 69 | scol=0 # sequence column name 70 | bcol=3 # start column 71 | ecol=4 # end column 72 | schar='#' # skip lines starting with this character 73 | dchar='\t' # delimiter 74 | odel=0 # tab-delimited as output 75 | 76 | for o, a in opts: 77 | if o in ("-h","--help"): 78 | usage() 79 | sys.exit() 80 | elif o == "-i": 81 | GFFfile=a 82 | if not os.path.exists(GFFfile): 83 | usage() 84 | sys.exit('GFF file not found') 85 | elif o == "-o": outfile=a 86 | elif o == "-b": buffer_size=int(a) 87 | elif o == "-t": tempdirs.append(a) 88 | elif o == "-m": schar=a 89 | elif o == "-d": dchar=a 90 | elif o == "-s": scol=int(a)-1 91 | elif o == "-c": bcol=int(a)-1 92 | elif o == "-e": ecol=int(a)-1 93 | elif o == "-O": odel=1 94 | else: 95 | assert False, "Unhandled Option" 96 | 97 | Keyed = namedtuple("Keyed", ["key", "obj"]) 98 | key_=eval('lambda line : (%s)' %('line[:]')) 99 | 100 | def gk(key,obj): 101 | if odel: ik=itemgetter(scol,bcol,ecol)(obj.split('\t')) 102 | else: ik=itemgetter(scol,bcol,ecol)(obj.split(dchar)) 103 | return key((ik[0],int(ik[1]),int(ik[2]))) 104 | 105 | def merge(key=None, *iterables): 106 | # based on code posted by Scott David Daniels in c.l.p. 
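# NOTE: SortTable.py is the generic variant of SortGFF.py: the key columns are
# user-defined (1-based -s/-c/-e, shifted to 0-based above), the delimiter is
# configurable (-d) and lines starting with the -m character are skipped.
# For a REDItools output table (region in column 1, position in column 2) a
# hypothetical call with placeholder file names would be:
#   python SortTable.py -i outTable_12345 -s 1 -c 2 -e 2 -o outTable_12345.sorted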
107 | # http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d 108 | #print iterables 109 | if key is None: 110 | keyed_iterables = iterables 111 | else: 112 | keyed_iterables = [(Keyed(gk(key,obj), obj) for obj in iterable) for iterable in iterables] 113 | #print keyed_iterables 114 | for element in heapq.merge(*keyed_iterables): 115 | yield element.obj 116 | 117 | def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None): 118 | if tempdirs is None: 119 | tempdirs = [] 120 | if not tempdirs: 121 | tempdirs.append(gettempdir()) 122 | chunks = [] 123 | xx=0 124 | try: 125 | with open(input,'rb',64*1024) as input_file: 126 | input_iterator = iter(input_file) 127 | for tempdir in cycle(tempdirs): 128 | current_chunk2=[] 129 | for j in islice(input_iterator,buffer_size): 130 | if j.startswith('Region'): continue 131 | if j.startswith(schar): continue 132 | l=(j.strip()).split(dchar) 133 | l[bcol]=int(l[bcol]) 134 | l[ecol]=int(l[ecol]) 135 | current_chunk2.append(l) 136 | current_chunk3=[] 137 | for j in sorted(current_chunk2, key=itemgetter(scol,bcol,ecol)): 138 | j[bcol]=str(j[bcol]) 139 | j[ecol]=str(j[ecol]) 140 | if odel: current_chunk3.append('\t'.join(j)+'\n') 141 | else: current_chunk3.append(dchar.join(j)+'\n') 142 | xx+=len(current_chunk3) 143 | if not current_chunk3: break 144 | sys.stdout.write("Loaded and sorted %i lines.\n"%(xx)) 145 | output_chunk = open(os.path.join(tempdir,'%06i_%s'%(len(chunks),pid)),'w+b',64*1024) 146 | chunks.append(output_chunk) 147 | output_chunk.writelines(current_chunk3) 148 | output_chunk.flush() 149 | output_chunk.seek(0) 150 | sys.stdout.write("Merging from %i files.\n"%(len(chunks))) 151 | with open(output,'wb',64*1024) as output_file: 152 | output_file.writelines(merge(key, *chunks)) 153 | finally: 154 | for chunk in chunks: 155 | try: 156 | chunk.close() 157 | os.remove(chunk.name) 158 | except Exception: 159 | pass 160 | 161 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 162 | sys.stdout.write("Script time --> START: %s\n"%(script_time)) 163 | batch_sort(GFFfile,outfile,key_,buffer_size,tempdirs) 164 | sys.stdout.write("Sorted GFF saved on %s\n"%(outfile)) 165 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 166 | sys.stdout.write("Script time --> END: %s\n"%(script_time)) -------------------------------------------------------------------------------- /accessory/TableToGFF.py: -------------------------------------------------------------------------------- 1 | #!/home/epicardi/bin/python27/bin/python 2 | # Copyright (c) 2013-2014 Ernesto Picardi 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | 22 | import sys, os, getopt, time, random, heapq, gzip 23 | from tempfile import gettempdir 24 | from itertools import islice, cycle 25 | from collections import namedtuple 26 | from operator import itemgetter 27 | 28 | version='1.0' 29 | pid=str(os.getpid()+random.randint(0,999999999)) 30 | 31 | def usage(): 32 | print """ 33 | USAGE: python TableToGFF.py [options] 34 | Options: 35 | -i Table file from REDItools 36 | -s Sort output GFF 37 | -t Tabix output GFF (requires Pysam module) 38 | -b Buffer size (as number of lines) [32000] (requires -s) 39 | -T Temporary directory (requires -s) 40 | -o Outfile [outTable_%s.gff] 41 | -h Print this help 42 | 43 | """%(pid) 44 | 45 | try: 46 | opts, args = getopt.getopt(sys.argv[1:], "i:o:sthT:b:",["help"]) 47 | if len(opts)==0: 48 | usage() 49 | sys.exit(2) 50 | except getopt.GetoptError as err: 51 | print str(err) # will print something like "option -a not recognized" 52 | usage() 53 | sys.exit(2) 54 | 55 | tablefile='' 56 | outfile='outTable_%s.gff' %(pid) 57 | sort=0 58 | tabix=0 59 | buffer_size=32000 60 | tempdirs=[] 61 | for o, a in opts: 62 | if o in ("-h","--help"): 63 | usage() 64 | sys.exit() 65 | elif o == "-i": 66 | tablefile=a 67 | if not os.path.exists(tablefile): 68 | usage() 69 | sys.exit('Table file not found') 70 | elif o == "-o": outfile=a 71 | elif o == "-s": sort=1 72 | elif o == "-t": tabix=1 73 | elif o == "-b": buffer_size=int(a) 74 | elif o == "-T": tempdirs.append(a) 75 | else: 76 | assert False, "Unhandled Option" 77 | 78 | #Sorting code from SortGFF.py 79 | 80 | Keyed = namedtuple("Keyed", ["key", "obj"]) 81 | key_=eval('lambda line : (%s)' %('line[:]')) 82 | 83 | def gk(key,obj): 84 | ik=itemgetter(0,3,4)(obj.split('\t')) 85 | return key((ik[0],int(ik[1]),int(ik[2]))) 86 | 87 | def merge(key=None, *iterables): 88 | # based on code posted by Scott David Daniels in c.l.p. 
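# NOTE: the conversion itself happens at the bottom of this script: each table
# row becomes a single-base GFF record (feature "pos", start == end) whose
# strand is taken from column 4 ('0' meaning minus); sorting (-s) and tabix
# indexing (-t, via pysam) reuse the batch_sort machinery duplicated here from
# SortGFF.py. A hypothetical run (placeholder file name):
#   python TableToGFF.py -i outTable_12345 -s -t -o outTable_12345.gff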
89 | # http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d 90 | #print iterables 91 | if key is None: 92 | keyed_iterables = iterables 93 | else: 94 | keyed_iterables = [(Keyed(gk(key,obj), obj) for obj in iterable) for iterable in iterables] 95 | #print keyed_iterables 96 | for element in heapq.merge(*keyed_iterables): 97 | yield element.obj 98 | 99 | def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None): 100 | if tempdirs is None: 101 | tempdirs = [] 102 | if not tempdirs: 103 | tempdirs.append(gettempdir()) 104 | chunks = [] 105 | xx=0 106 | try: 107 | with open(input,'rb',64*1024) as input_file: 108 | input_iterator = iter(input_file) 109 | for tempdir in cycle(tempdirs): 110 | current_chunk2=[] 111 | for j in islice(input_iterator,buffer_size): 112 | l=(j.strip()).split('\t') 113 | l[3]=int(l[3]) 114 | l[4]=int(l[4]) 115 | current_chunk2.append(l) 116 | current_chunk3=[] 117 | for j in sorted(current_chunk2, key=itemgetter(0,3,4)): 118 | j[3]=str(j[3]) 119 | j[4]=str(j[4]) 120 | current_chunk3.append('\t'.join(j)+'\n') 121 | xx+=len(current_chunk3) 122 | if not current_chunk3: break 123 | sys.stdout.write("Loaded and sorted %i lines.\n"%(xx)) 124 | output_chunk = open(os.path.join(tempdir,'%06i_%s'%(len(chunks),pid)),'w+b',64*1024) 125 | chunks.append(output_chunk) 126 | output_chunk.writelines(current_chunk3) 127 | output_chunk.flush() 128 | output_chunk.seek(0) 129 | sys.stdout.write("Merging from %i files.\n"%(len(chunks))) 130 | with open(output,'wb',64*1024) as output_file: 131 | output_file.writelines(merge(key, *chunks)) 132 | finally: 133 | for chunk in chunks: 134 | try: 135 | chunk.close() 136 | os.remove(chunk.name) 137 | except Exception: 138 | pass 139 | #END sorting code 140 | 141 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 142 | sys.stdout.write("Script time --> START: %s\n"%(script_time)) 143 | sys.stdout.write("Reading table...\n") 144 | if tablefile.endswith('.gz'): f=gzip.open(tablefile,'rb') 145 | else: f=open(tablefile) 146 | o=open(outfile,'w') 147 | xx=0 148 | #chr21 10205589 C 0 12 34.75 [0, 3, 0, 9] CT 0.75 16 28.56 [0, 16, 0, 0] - 0.00 - 149 | for i in f: 150 | if i.startswith('Region'): continue 151 | if i.strip()=='': continue 152 | l=(i.strip()).split('\t') 153 | strand='+' 154 | if l[3]=='0': strand='-' 155 | gffLine=[l[0],'reditoolTable','pos',l[1],l[1],'.',strand,'.',l[0]+'-'+l[1]] 156 | o.write('\t'.join(gffLine)+'\n') 157 | xx+=1 158 | f.close() 159 | o.close() 160 | sys.stdout.write("Converted %i lines.\n"%(xx)) 161 | sys.stdout.write("GFF saved on %s\n"%(outfile)) 162 | if sort: 163 | sys.stdout.write("Sorting GFF file...\n") 164 | outfileS='.'.join(outfile.split('.')[:-1])+'.sorted.gff' 165 | batch_sort(outfile,outfileS,key_,buffer_size,tempdirs) 166 | outfile=outfileS 167 | sys.stdout.write("Sorted GFF saved on %s\n"%(outfileS)) 168 | if tabix: 169 | try: 170 | import pysam 171 | sys.stdout.write("Indexing GFF file...\n") 172 | outfileS=pysam.tabix_index(outfile, preset='gff') 173 | sys.stdout.write("Tabix file saved on %s.\n" %(outfileS)) 174 | sys.stdout.write("Indices saved on %s.tbi.\n" %(outfileS)) 175 | except: sys.exit('Pysam module not found.\nTabix indexing not available.') 176 | 177 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 178 | sys.stdout.write("Script time --> END: %s\n"%(script_time)) 179 | 180 | -------------------------------------------------------------------------------- /accessory/get_DE_events.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #################################### REDI OUT TABLE ######################################################## 3 | #Region Position Reference Strand Coverage-q30 MeanQ BaseCount[A,C,G,T] # 4 | #AllSubs Frequency gCoverage-q30 gMeanQ gBaseCount[A,C,G,T] gAllSubs gFrequency # 5 | ############################################################################################################ 6 | 7 | ###################################GET_DE_events_table########################################################### 8 | #chromosome position type_editing SRR3306830_CTRL SRR3306831_CTRL SRR3306832_CTRL # 9 | #SRR3306833_CTRL SRR3306834_CTRL SRR3306835_CTRL SRR3306836_CTRL SRR3306823_DIS SRR3306824_DIS # 10 | #SRR3306825_DIS SRR3306826_DIS SRR3306827_DIS SRR3306828_DIS SRR3306829_DIS [num_controls/num_disease] # 11 | #delta_diff pvalue (Mannwhitney) # 12 | ################################################################################################################# 13 | 14 | import os, sys, argparse 15 | from scipy import stats 16 | from scipy.stats import wilcoxon, mannwhitneyu, fisher_exact 17 | import numpy as np 18 | import pandas as pd 19 | import math 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("-c", action = 'store', dest = 'min_coverage', 23 | type = int, default=10, help='Coverage-q30') 24 | parser.add_argument("-cpval", action = 'store', dest = 'pvalue_correction', 25 | type = int, default = 0, help = '1 --> Bonferroni correction / 2 --> Benjamini hochberg') 26 | parser.add_argument("-input_file", action = 'store', dest = 'samples_informations_file', 27 | type = str, default= 'empty', help = 'Comma separated file e.g: SRR3306830,Control \ 28 | SRR3306829,Healthy...etc') 29 | parser.add_argument("-f", action = 'store', dest = 'min_edit_frequency', 30 | type = float, default=0.1, help='Editing Frequency') 31 | parser.add_argument("-mts", action = 'store', dest = 'min_sample_testing', 32 | type = float, default=50.0, help="min percentage of each sample category") 33 | parser.add_argument("-sig", action = 'store', dest = 'only_significant', 34 | type = str, default = 'no', help = 'Return only significant editing events') 35 | parser.add_argument("-linear", action = 'store_true', help = 'Enable linear model') 36 | 37 | args = parser.parse_args() 38 | min_coverage = args.min_coverage 39 | min_edit_frequency = args.min_edit_frequency 40 | min_sample_testing = args.min_sample_testing 41 | only_significants = args.only_significant 42 | pvalue_correction = args.pvalue_correction 43 | samples_informations_file = args.samples_informations_file 44 | enable_linear_model = args.linear 45 | 46 | if args.samples_informations_file == 'empty': 47 | parser.error('sample_informations_file is MISSING!' 
+ '\n' + \ 48 | 'Please type "python get_DE_events.py -h" for more details on usage of this script') 49 | 50 | 51 | def call_differential_editing_sites(config_file): 52 | stability_value = 0.03 #value below which you may use a lower coverage for adding more samples to increase power 53 | min_disease_people = 5 #min number of disease people at the higher coverage on which stability measurements may be based 54 | min_control_people = 5 #min number of control people at the higher coverage on which stability measurements may be based 55 | min_disease_people_5_cov = 10 #min number of disease people at 5x coverage you must have if needing to use unstable 5x coverage 56 | min_control_people_5_cov = 10 #min number of control people at 5x coverage you must have if needing to use unstable 5x coverage 57 | editing_file= './temp.csv' 58 | output_file = './editing_sites.with_stats_converted_disease.csv' 59 | #read in files 60 | editing_table = pd.read_csv(editing_file,sep='\t') 61 | #config_table = pd.read_csv(config_file,sep=',',header=None) 62 | config_table = pd.read_csv(config_file,sep=',',skiprows=1,header=None) 63 | all_people = config_table[0] 64 | disease_people = config_table[0][config_table[1] == "DIS"].reset_index(drop = True) #TODO Change to disease!!! 65 | control_people = config_table[0][config_table[1] == "CTRL"].reset_index(drop = True) #TODO Change to control!!! 66 | 67 | #now get just an editing table and coverage table 68 | edit_level_table = editing_table[all_people] 69 | #edit_level_table = editing_table[np.r_[all_people]] 70 | 71 | def get_editing_levels_for_cov_table(i): 72 | info = i.astype(str).str.split(pat="\\^") 73 | editing_levels = info.apply(lambda x: float('nan') if x[0] == "nan" else x[2]) 74 | return editing_levels 75 | cov_table = edit_level_table.apply(get_editing_levels_for_cov_table) 76 | cov_table = cov_table.apply(lambda x: pd.to_numeric(x)) #TODO check if as.numeric and pandas to_numeric do the same. 77 | 78 | def get_editing_levels(i): 79 | info = i.astype(str).str.split(pat="\\^") 80 | editing_levels = info.apply(lambda x: float('nan') if x[0] == "nan" else x[0]) 81 | return editing_levels 82 | edit_level_table = edit_level_table.apply(get_editing_levels) 83 | edit_level_table = edit_level_table.apply(lambda x: pd.to_numeric(x)) #TODO check precision on R and python 84 | 85 | #go down line by line and get the prevalence info and mean editing levels based off of stable coverages 86 | #WARNING: using float here, not int, to allow NaN values. Is that OK?
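A minimal, self-contained sketch (not part of the script) of the cell parsing performed by the two helpers above, assuming each sample cell packs "editing_level^gnum^coverage" separated by '^', as this function expects; the sample names and values below are hypothetical.

import numpy as np
import pandas as pd

# Toy table in the same cell format; NaN means the site is absent in a sample.
toy = pd.DataFrame({'SRR_A': ['0.25^x^30', np.nan],
                    'SRR_B': ['0.10^x^8', '0.05^x^12']})

def field(col, idx):
    # split each cell on '^'; cells stringified to 'nan' (missing sites) stay NaN
    info = col.astype(str).str.split(pat='\\^')
    return info.apply(lambda x: float('nan') if x[0] == 'nan' else float(x[idx]))

edit_levels = toy.apply(lambda c: field(c, 0))  # first field: editing level
coverages = toy.apply(lambda c: field(c, 2))    # third field: read coverage
print(edit_levels)
print(coverages)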
87 | coverage_threshold_used = np.repeat(0.,edit_level_table.shape[0]) #will hold the coverage threshold required for this editing site 88 | stability_based_on = np.repeat(0.,edit_level_table.shape[0]) #will hold the coverage at which the stability requirement was determined 89 | stable_mean_disease_editing_level = np.repeat(0.,edit_level_table.shape[0]) #mean disease editing level using individuals passing coverage threshold 90 | stable_std_dev_disease_editing_level = np.repeat(0.,edit_level_table.shape[0]) #standard deviation of disease editing level using individuals passing coverage threshold 91 | stable_mean_control_editing_level = np.repeat(0.,edit_level_table.shape[0]) #mean control editing level using individuals passing coverage threshold 92 | stable_std_dev_control_editing_level = np.repeat(0.,edit_level_table.shape[0]) #standard deviation of control editing level using individuals passing coverage threshold 93 | stable_number_disease_with_at_least_min_coverage = np.repeat(0.,edit_level_table.shape[0]) #number of disease individuals passing the coverage threshold 94 | stable_number_disease_nonzero_editing_and_min_coverage = np.repeat(0.,edit_level_table.shape[0]) #number of disease individuals with nonzero editing level and passing coverage threshold 95 | stable_disease_prevalence = np.repeat(0.,edit_level_table.shape[0]) #proportion of disease individuals with nonzero editing 96 | stable_number_control_with_at_least_min_coverage = np.repeat(0.,edit_level_table.shape[0]) #same as disease but for control subjects 97 | stable_number_control_nonzero_editing_and_min_coverage = np.repeat(0.,edit_level_table.shape[0]) 98 | stable_control_prevalence = np.repeat(0.,edit_level_table.shape[0]) 99 | stable_total_number_individuals_nonzero_editing_and_min_coverage = np.repeat(0.,edit_level_table.shape[0]) #total number of disease and control subjects passing the coverage threshold and having nonzero editing level 100 | stable_mann_whitney_p_value = np.repeat(0.,edit_level_table.shape[0]) #wilcoxon rank sum test p value using individuals passing the coverage threshold 101 | stable_editing_level_effect_size = np.repeat(0.,edit_level_table.shape[0]) #difference between mean disease and mean control 102 | stable_frequency_fishers_p_value = np.repeat(0.,edit_level_table.shape[0]) #prevalence p value determined using two-tailed fisher's exact test 103 | stable_frequency_OR = np.repeat(0.,edit_level_table.shape[0]) #odds ratio of the fisher's exact test 104 | stable_prevalence_effect_size = np.repeat(0.,edit_level_table.shape[0]) #difference in editing level prevalences between disease and control subjects 105 | #WARNING: these are NumPy arrays.
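A small self-contained example (not part of the script) of the prevalence test applied further below: the 2x2 table opposes samples with and without nonzero editing in each group, and scipy's two-sided Fisher's exact test returns the odds ratio and p-value. The counts here are made up.

from scipy.stats import fisher_exact

# hypothetical counts: 8/20 disease and 3/25 control samples show nonzero editing
dis_nonzero, dis_covered = 8, 20
ctl_nonzero, ctl_covered = 3, 25
contingency = [[dis_nonzero, dis_covered - dis_nonzero],
               [ctl_nonzero, ctl_covered - ctl_nonzero]]
odds_ratio, p_value = fisher_exact(contingency)  # two-sided by default
print(odds_ratio)
print(p_value)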
106 | 107 | for i in range(0,edit_level_table.shape[0]): 108 | print i #keep track of progress 109 | disease_edit_row = edit_level_table.loc[i, disease_people] 110 | control_edit_row = edit_level_table.loc[i, control_people] 111 | disease_cov_row = cov_table.loc[i, disease_people] 112 | control_cov_row = cov_table.loc[i, control_people] 113 | #find what coverage we can base stability off of 114 | number_disease_20_cov = disease_cov_row[disease_cov_row >= 20].count() 115 | number_control_20_cov = control_cov_row[control_cov_row >=20].count() 116 | number_disease_15_cov = disease_cov_row[disease_cov_row >= 15].count() 117 | number_control_15_cov = control_cov_row[control_cov_row >= 15].count() 118 | number_disease_10_cov = disease_cov_row[disease_cov_row >= 10].count() 119 | number_control_10_cov = control_cov_row[control_cov_row >= 10].count() 120 | number_disease_5_cov = disease_cov_row[disease_cov_row >= 5].count() 121 | number_control_5_cov = control_cov_row[control_cov_row >= 5].count() 122 | if number_disease_20_cov >= min_disease_people and number_control_20_cov >= min_control_people: 123 | stability_based_on[i] = 20 124 | elif number_disease_15_cov >= min_disease_people and number_control_15_cov >= min_control_people: 125 | stability_based_on[i] = 15 126 | elif number_disease_10_cov >= min_disease_people and number_control_10_cov >= min_control_people: 127 | stability_based_on[i] = 10 128 | elif number_disease_5_cov >= min_disease_people_5_cov and number_control_5_cov >= min_control_people_5_cov: 129 | stability_based_on[i] = 5 130 | else: 131 | #stability_based_on[i] = -99999 # there's no np.nan integer representation, only float. We use an invalid value. 132 | stability_based_on[i] = float('nan') 133 | 134 | #need to deal with cases where there just are not enough disease individuals or control individuals to calculate mean 135 | if np.isnan(stability_based_on[i]): 136 | 137 | coverage_threshold_used[i] = 5 #I warn users not to use editing sites that don't have any stability_based_on measurement. 
We include min coverage of 5 just to get statistical information anyways 138 | #stable_min_cov=5 139 | #otherwise we can now try to find the stable_min_cov that'll be used for calculation of all statistics' 140 | 141 | else: 142 | current_stability_cov = stability_based_on[i] 143 | stability_disease_mean = disease_edit_row[disease_cov_row >= current_stability_cov].mean() 144 | stability_control_mean = control_edit_row[control_cov_row >= current_stability_cov].mean() 145 | #print np.arange(5,stability_based_on[i]+1e-4,5) 146 | for j in np.arange(5,stability_based_on[i]+1e-4,5): #WARNING using 1e-4 allowing to include stop 147 | disease_mean = disease_edit_row[disease_cov_row >= j].mean() 148 | control_mean = control_edit_row[control_cov_row >= j].mean() 149 | if np.absolute(disease_mean-stability_disease_mean) <=stability_value and np.absolute(control_mean-stability_control_mean) <=stability_value : 150 | coverage_threshold_used[i] = j 151 | break 152 | #now let's calculate all our statics based on the stable coverage threshold 153 | stable_min_cov = coverage_threshold_used[i] 154 | disease_adju_edit_row = disease_edit_row[np.logical_and(np.logical_and((~np.isnan(disease_edit_row)), (~np.isnan(disease_cov_row))), (disease_cov_row >= stable_min_cov))] 155 | disease_adju_cov_row = disease_cov_row[np.logical_and((~np.isnan(disease_cov_row)), (disease_cov_row >= stable_min_cov))] 156 | control_adju_edit_row = control_edit_row[ np.logical_and(np.logical_and((~np.isnan(control_edit_row)), (~np.isnan(control_cov_row))), (control_cov_row >= stable_min_cov))] 157 | control_adju_cov_row = control_cov_row[np.logical_and((~np.isnan(control_cov_row)), (control_cov_row >= stable_min_cov))] 158 | stable_mean_disease_editing_level[i] = disease_adju_edit_row.mean() 159 | stable_std_dev_disease_editing_level[i] = disease_adju_edit_row.std() 160 | stable_mean_control_editing_level[i] = control_adju_edit_row.mean() 161 | stable_std_dev_control_editing_level[i] = control_adju_edit_row.std() 162 | stable_number_disease_with_at_least_min_coverage[i] = disease_adju_cov_row[disease_adju_cov_row >=stable_min_cov].count() 163 | stable_number_disease_nonzero_editing_and_min_coverage[i] = disease_adju_cov_row[ (~np.isnan(disease_adju_cov_row)) & (disease_adju_cov_row >= stable_min_cov) & (disease_adju_edit_row > 0) ].count() 164 | stable_disease_prevalence[i] = stable_number_disease_nonzero_editing_and_min_coverage[i]/stable_number_disease_with_at_least_min_coverage[i] 165 | stable_number_control_with_at_least_min_coverage[i] = control_adju_cov_row[control_adju_cov_row >=stable_min_cov].count() 166 | stable_number_control_nonzero_editing_and_min_coverage[i] = control_adju_cov_row[(~np.isnan(control_adju_cov_row)) & (control_adju_cov_row >= stable_min_cov) & (control_adju_edit_row > 0)].count() 167 | stable_control_prevalence[i] = stable_number_control_nonzero_editing_and_min_coverage[i]/stable_number_control_with_at_least_min_coverage[i] 168 | stable_total_number_individuals_nonzero_editing_and_min_coverage[i] = (stable_number_disease_nonzero_editing_and_min_coverage[i] + stable_number_control_nonzero_editing_and_min_coverage[i]).sum() 169 | if (len(disease_adju_edit_row) >=1) & (len(control_adju_edit_row) >=1): 170 | if (np.all(disease_adju_edit_row.values == control_adju_edit_row.values)): 171 | stable_mann_whitney_p_value[i] = float('nan') 172 | else: 173 | temp, stable_mann_whitney_p_value[i] = mannwhitneyu(disease_adju_edit_row,control_adju_edit_row, alternative='two-sided') 174 | else: 175 | 
stable_mann_whitney_p_value[i] = float('nan') 176 | stable_editing_level_effect_size[i] = np.absolute(stable_mean_disease_editing_level[i] - stable_mean_control_editing_level[i]) 177 | fisher_matrix = np.matrix([[stable_number_disease_nonzero_editing_and_min_coverage[i], stable_number_disease_with_at_least_min_coverage[i]-stable_number_disease_nonzero_editing_and_min_coverage[i]], [stable_number_control_nonzero_editing_and_min_coverage[i], stable_number_control_with_at_least_min_coverage[i]-stable_number_control_nonzero_editing_and_min_coverage[i]]]) 178 | stable_frequency_OR[i], stable_frequency_fishers_p_value[i] = fisher_exact(fisher_matrix) 179 | #print stable_frequency_OR[i] 180 | #print stable_frequency_fishers_p_value[i] 181 | stable_prevalence_effect_size[i] = np.absolute(stable_disease_prevalence[i] - stable_control_prevalence[i]) 182 | 183 | #now put everything back together as a table 184 | header_info = editing_table[['chromosome','position','type_editing']] 185 | stats_table = pd.DataFrame(coverage_threshold_used) 186 | stats_table = stats_table.rename(columns={stats_table.columns[0]: 'coverage_threshold_used'}) 187 | stats_table['stability_based_on'] = pd.DataFrame(stability_based_on) 188 | stats_table['stable_mean_disease_editing_level'] = pd.DataFrame(stable_mean_disease_editing_level) 189 | stats_table['stable_std_dev_disease_editing_level'] = pd.DataFrame(stable_std_dev_disease_editing_level) 190 | stats_table['stable_mean_control_editing_level'] = pd.DataFrame(stable_mean_control_editing_level) 191 | stats_table['stable_std_dev_control_editing_level'] = pd.DataFrame(stable_std_dev_control_editing_level) 192 | stats_table['stable_number_disease_with_at_least_min_coverage'] = pd.DataFrame(stable_number_disease_with_at_least_min_coverage) 193 | stats_table['stable_number_disease_nonzero_editing_and_min_coverage'] = pd.DataFrame(stable_number_disease_nonzero_editing_and_min_coverage) 194 | stats_table['stable_disease_prevalence'] = pd.DataFrame(stable_disease_prevalence) 195 | stats_table['stable_number_control_with_at_least_min_coverage'] = pd.DataFrame(stable_number_control_with_at_least_min_coverage) 196 | stats_table['stable_number_control_nonzero_editing_and_min_coverage'] = pd.DataFrame(stable_number_control_nonzero_editing_and_min_coverage) 197 | stats_table['stable_control_prevalence'] = pd.DataFrame(stable_control_prevalence) 198 | stats_table['stable_total_number_individuals_nonzero_editing_and_min_coverage'] = pd.DataFrame(stable_total_number_individuals_nonzero_editing_and_min_coverage) 199 | stats_table['stable_mann_whitney_p_value'] = pd.DataFrame(stable_mann_whitney_p_value) 200 | stats_table['stable_editing_level_effect_size'] = pd.DataFrame(stable_editing_level_effect_size) 201 | stats_table['stable_frequency_fishers_p_value'] = pd.DataFrame(stable_frequency_fishers_p_value) 202 | stats_table['stable_frequency_OR'] = pd.DataFrame(stable_frequency_OR) 203 | stats_table['stable_prevalence_effect_size'] = pd.DataFrame(stable_prevalence_effect_size) 204 | 205 | full_table = pd.concat([header_info, stats_table, editing_table[all_people]], axis=1) 206 | 207 | #write the full_table to output 208 | full_table.to_csv(output_file, sep='\t', index=False) 209 | 210 | print "job completed\n" 211 | 212 | 213 | 214 | 215 | def Set_Chr_Nr(Chr): 216 | """ Sort by chromosome """ 217 | if Chr: 218 | New = Chr.lstrip('chr').split('_')[0] 219 | if New == 'X': New = 23 220 | elif New == 'Y': New = 24 221 | elif New == 'M': New = 25 222 | else: New = int(New) 223 | else: 224 | New 
= 0 225 | return New 226 | 227 | def Sample_percentage(row): 228 | """Percentage of samples from each type""" 229 | percentage = (len(filter(lambda x: x!= '-', row))/float(len(row)))*100 230 | return round(percentage) 231 | 232 | def Sample_count(row): 233 | """Number of samples from each type""" 234 | count = len(filter(lambda x: x!= '-', row)) 235 | return count 236 | 237 | def get_bh(pvalue,siglevel): 238 | """B-H correction """ 239 | pvalue.sort() 240 | x=1 241 | y=0 242 | p=0 243 | for i in pvalue: 244 | nf=i[0]*len(pvalue) 245 | fdr=nf/x 246 | if fdr<=siglevel: 247 | i[1].append('True') 248 | p=i[0] 249 | y+=1 250 | else: i[1].append('False') 251 | x+=1 252 | return pvalue,y,p 253 | 254 | def get_b(pvalue,siglevel): 255 | """Bonferroni correction""" 256 | pvalue.sort() 257 | y=0 258 | pp=1.0 259 | for i in pvalue: 260 | p=i[0]*len(pvalue) 261 | if p<=siglevel: 262 | i[1].append('True') 263 | y+=1 264 | if p= min_coverage) and (float(freq) >= min_edit_frequency): 316 | sample_edited_sites.setdefault((directory, site), []).append((freq, freq_gnum_cov)) 317 | 318 | table_columns = map(lambda x: x + '_' + sample_informations[x], sorted(sample_informations.keys())) 319 | 320 | disease = [i for i in table_columns if i.upper().find('DIS') != -1] 321 | controls = [i for i in table_columns if i.upper().find('CTRL') != -1] 322 | 323 | if enable_linear_model: 324 | outtable='' 325 | header = ['chromosome', 'position', 'type_editing'] + map(remove_underscore, controls) + map(remove_underscore, disease) 326 | outtable += '\t'.join(header) 327 | outtable += '\n' 328 | #print '\t'.join(header) 329 | for chrom in sorted(all_available_sites, key = lambda x: Set_Chr_Nr(x)): 330 | row = [chrom] 331 | for col in header[2:]:#header.index('[num_controls/num_disease]')]: 332 | row.append(sample_edited_sites.get((col.split('_')[0],chrom), ['-'])[0]) 333 | ctrls = zip(*(zip(controls,row[1:])))[1] 334 | dss = zip(*(zip(disease,row[len(ctrls)+1:])))[1] 335 | ctrls_freq = map(tuple_replace, ctrls) 336 | dss_freq = map(tuple_replace, dss) 337 | row.append(str([Sample_count(ctrls), Sample_count(dss)])) 338 | 339 | row_b = map(tuple_replace_bis, row) 340 | row_b = row_b[0].split('_') + row_b[2:] 341 | row_b.insert(2, 'A.to.G') 342 | final_list = row_b[:-1] 343 | #print '\t'.join(map(str,final_list)) 344 | outtable += '\t'.join(map(str,final_list)).replace('-','NA') 345 | outtable += '\n' 346 | 347 | with open('temp.csv','w') as t: 348 | t.write(outtable) 349 | t.close() 350 | 351 | # call linear model script 352 | call_differential_editing_sites(samples_informations_file) 353 | 354 | 355 | else: 356 | header = ['chromosome', 'position', 'type_editing'] + controls + disease + ['[num_controls/num_disease]'] + ['delta_diff'] + ['pvalue (Mannwhitney)'] 357 | 358 | if pvalue_correction == 1: 359 | header += ['pvalue Bonferroni corrected'] 360 | if pvalue_correction == 2: 361 | header += ['pvalue BH corrected'] 362 | 363 | print '\t'.join(header) 364 | 365 | for chrom in sorted(all_available_sites, key = lambda x: Set_Chr_Nr(x)): 366 | row = [chrom] 367 | for col in header[3:header.index('[num_controls/num_disease]')]: 368 | row.append(sample_edited_sites.get((col.split('_')[0],chrom), ['-'])[0]) 369 | ctrls = zip(*(zip(controls,row[1:])))[1] 370 | dss = zip(*(zip(disease,row[len(ctrls)+1:])))[1] 371 | ctrls_freq = map(tuple_replace, ctrls) 372 | dss_freq = map(tuple_replace, dss) 373 | row.append(str([Sample_count(ctrls), Sample_count(dss)])) 374 | if (Sample_percentage(ctrls) >= min_sample_testing) and 
(Sample_percentage(dss) >= min_sample_testing): 375 | ctrls_mean = sum(map(float, filter(lambda x: x!= '-', ctrls_freq)))/len(filter(lambda x: x!= '-', ctrls_freq)) 376 | dss_mean = sum(map(float, filter(lambda x: x!= '-', dss_freq)))/len(filter(lambda x : x!= '-', dss_freq)) 377 | delta_diff = abs(ctrls_mean - dss_mean) 378 | pvalue=stats.mannwhitneyu(ctrls_freq, dss_freq, alternative='two-sided') 379 | row.append(round(delta_diff, 3)) 380 | row.append(str(round(pvalue[1], 3))) 381 | correction_argmnt = [(pvalue[1], ctrls_freq+dss_freq)] 382 | 383 | if pvalue_correction == 1: 384 | row.append(round(get_b(correction_argmnt, 0.05)[-1], 6)) 385 | elif pvalue_correction == 2: 386 | row.append(round(get_bh(correction_argmnt, 0.05)[-1], 6)) 387 | else: 388 | if pvalue_correction == 0: 389 | row += ['-', '-'] 390 | else: 391 | row += ['-', '-', '-'] 392 | row_a = map(tuple_replace, row) 393 | row_b = map(tuple_replace_bis, row) 394 | if pvalue_correction != 0 and only_significants == 'yes': 395 | only_sig(row_a,row_b) 396 | else: 397 | row_b = row_b[0].split('_') + row_b[2:] 398 | row_b.insert(2, 'A.to.G') 399 | print '\t'.join(map(str,row_b)) 400 | 401 | -------------------------------------------------------------------------------- /accessory/readPsl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | import math 4 | 5 | try: 6 | pslfile=sys.argv[1] 7 | outfile=sys.argv[2] 8 | except: 9 | sys.exit('USAGE: ') 10 | 11 | #### for blat 12 | def getPS(line): 13 | pid = (100.0 - (pslCalcMilliBad(line) * 0.1)) 14 | score = pslScore(line) 15 | #print "The percentage:",pid 16 | #print "Score:",score 17 | return pid,score 18 | 19 | def pslScore(cols): 20 | sizeMul = 1 21 | return sizeMul * (int(cols[0]) + (int(cols[2]))) - sizeMul * int(cols[1]) - int(cols[4]) - int(cols[6]) 22 | 23 | def round(number): 24 | return int(number + .5); 25 | 26 | def pslCalcMilliBad(cols): 27 | sizeMul = 1 28 | # cols[0] matches 29 | # cols[1] misMatches 30 | # cols[2] repMaches 31 | # cols[4] qNumInsert 32 | # cols[6] tNumInsert 33 | # cols[11] qStart 34 | # cols[12] qEnd 35 | # cols[15] tStart 36 | # cols[16] tEnd 37 | qAliSize = sizeMul * (int(cols[12]) - int(cols[11])) 38 | tAliSize = int(cols[16]) - int(cols[15]) 39 | # I want the minimum of qAliSize and tAliSize 40 | if qAliSize < tAliSize: aliSize = qAliSize #? 
$aliSize = $qAliSize : $aliSize = $tAliSize; 41 | else: aliSize = tAliSize 42 | # return 0 is AliSize == 0 43 | if aliSize <= 0: return 0 44 | # size diff 45 | sizeDiff = qAliSize - tAliSize 46 | if sizeDiff < 0: sizeDiff = 0 47 | # insert Factor 48 | insertFactor = int(cols[4]) 49 | # $insertFactor += $cols[6]; 50 | milliBad = (1000 * (int(cols[1])*sizeMul + insertFactor + round(3*math.log( 1 + sizeDiff)))) / (sizeMul * (int(cols[0]) + int(cols[2])+ int(cols[1]))) 51 | return milliBad 52 | 53 | def com(num,list): 54 | for i in list: 55 | if i[0]<=num<=i[1]: return 1 56 | return 0 57 | 58 | def min95(val,score): 59 | #if val < (score*95.0)/100: return 1 60 | if val < (score*0.95): return 1 61 | return 0 62 | 63 | def readLines(lines): 64 | res=[] 65 | for line in lines: 66 | pidd,score=getPS(line) 67 | #print pidd,score 68 | sp=[int(x) for x in (line[18].strip(',')).split(',')] 69 | tstarts=[int(x) for x in (line[20].strip(',')).split(',')] 70 | ex=[(tstarts[x]+1,tstarts[x]+sp[x]) for x in range(len(sp))] 71 | nl=[line[9],score,str(int(line[11])+1),line[12],str(line[10]),pidd,line[13],line[8],int(line[15])+1,int(line[16]),ex,int(line[0])] 72 | res.append((int(line[0]),nl)) #score 73 | #if d.has_key(line[9]): d[line[9]].append((score,nl)) 74 | #else: d[line[9]]=[(score,nl)] 75 | return res 76 | 77 | def comp(ri,hits): 78 | g,ng=0,0 79 | hits.sort() 80 | hits.reverse() 81 | if len(hits)==1: #unique hit with editing candidate position included 82 | if hits[0][1][6]==ri[2] and com(ri[1],hits[0][1][10]): g+=1 #float(hits[0][1][5])>=90.0 83 | else: ng+=1 84 | elif len(hits)>1: #multiple hits 85 | if hits[0][1][6]==ri[2] and min95(hits[1][0],hits[0][0]): # if second best score less than 95% of first best score 86 | if com(ri[1],hits[0][1][10]): g+=1 # if first best hit include editing position 87 | else: ng+=1 88 | else: ng+=1 89 | if g>ng: return 1 90 | return 0 91 | 92 | def readPSL(infile,outfile): 93 | f=open(infile) 94 | o=open(outfile,'w') 95 | name,lines,xx='',[],0 96 | while 1: 97 | line=f.readline() 98 | if not line: 99 | if name=='': break 100 | nn=name.split('$') 101 | oread=(name,int(nn[2]),nn[1]) 102 | bread=readLines(lines) 103 | badr='' 104 | if len(bread)==0: badr=name 105 | else: 106 | if not comp(oread,bread): badr=name 107 | if badr!='': 108 | #o.write(name[:-2]+' '+name[-1]+'\n') 109 | o.write(name.split('_')[0]+' '+name.split('$')[0][-1]+'\n') 110 | xx+=1 111 | break 112 | if line.strip()=='': continue 113 | if line.startswith('psL'): continue 114 | if (line.strip()).startswith('match'): continue 115 | if line.startswith('-'): continue 116 | l=(line.strip()).split('\t') 117 | if l[9]!=name: 118 | if len(lines)!=0: 119 | nn=name.split('$') 120 | #(rname,pileupcolumn.pos+1,chr) 121 | oread=(name,int(nn[2]),nn[1]) #dread[name] 122 | bread=readLines(lines) 123 | badr='' 124 | if len(bread)==0: badr=name 125 | else: 126 | if not comp(oread,bread): badr=name 127 | if badr!='': 128 | #o.write(name[:-2]+' '+name[-1]+'\n') 129 | o.write(name.split('_')[0]+' '+name.split('$')[0][-1]+'\n') 130 | xx+=1 131 | lines=[l] 132 | name=l[9] 133 | else: lines.append(l) 134 | f.close() 135 | o.close() 136 | return xx 137 | 138 | def readgf(infile): 139 | f=open(infile) 140 | for i in f: 141 | if 'Server ready for queries!' 
in i: 142 | f.close() 143 | return 1 144 | f.close() 145 | return 0 146 | 147 | def parse(line): 148 | l=(line.strip()).split('\t') 149 | cc=(int(l[3]),int(l[4])) 150 | return cc 151 | 152 | readPSL(pslfile,outfile) 153 | 154 | -------------------------------------------------------------------------------- /accessory/rediportal2recoding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | # THE SOFTWARE. 20 | 21 | 22 | #GFF structure 23 | #chr/tvalue/tfeature/tstart/tend/t./tstrand/./t gene_id ed_numb; transcript_id ed_numb; 24 | 25 | import sys 26 | 27 | try: 28 | in_table = sys.argv[1] 29 | except: 30 | sys.exit('') 31 | 32 | i=0 33 | with open(in_table,'r') as e: 34 | e.readline() 35 | for line in e: 36 | line = line.split('\t') 37 | if line[6] == 'NONREP' and line[9] == 'exonic': 38 | if ('\t'.join(line).count('nonsynonymous')) == 3: 39 | i+=1 40 | valore = line[12].split(':')[0] + '_' + line[12].split('.')[-1] 41 | gff_row = line[0] + '\t'+ valore + '\t' + 'ed' + '\t' + line[1] + \ 42 | '\t' + line[1] + '\t' + '.' + '\t' + line[4] + '\t' + '.' + '\t' + \ 43 | 'gene_id' + ' ' + '"ed_%s";' %(i) + ' ' + 'transcript_id' + ' ' + '"ed_%s";' %(i) 44 | print gff_row 45 | -------------------------------------------------------------------------------- /accessory/selectPositions.py: -------------------------------------------------------------------------------- 1 | #!/home/epicardi/bin/python27/bin/python 2 | # Copyright (c) 2013-2014 Ernesto Picardi 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | 22 | import sys, getopt, os, time, random, gzip 23 | 24 | version='1.0' 25 | pid=str(os.getpid()+random.randint(0,999999999)) 26 | 27 | def usage(): 28 | print """ 29 | USAGE: python selectPositions.py [options] 30 | Options: 31 | -i Table file from REDItools 32 | -d Base distribution column for DNA-Seq (-1: no DNA-Seq) [-1] 33 | -c Coverage RNA-Seq [5] 34 | -C Coverage DNA-Seq [5] 35 | -v Bases supporting RNA-Seq variation [1] 36 | -V Bases supporting DNA-Seq variation [0] 37 | -s Substitutions to select in RNA-Seq (separated by comma AG,CT) [all] 38 | -f Frequency of variation in RNA-Seq [0.1] 39 | -F Frequency of non-variation in DNA-Seq [0.95] 40 | -e Exclude multiple substitutions in RNA-Seq 41 | -r Exclude invariant sites in RNA-Seq 42 | -R Exclude variant sites in DNA-Seq # 43 | -u Use only positions supported by DNA-Seq 44 | -o Save selected positions on outTable_%s 45 | -h Print this help 46 | 47 | """%(pid) 48 | 49 | try: 50 | opts, args = getopt.getopt(sys.argv[1:], "i:c:C:v:s:f:F:euo:hrd:RV:",["help"]) 51 | if len(opts)==0: 52 | usage() 53 | sys.exit(2) 54 | except getopt.GetoptError as err: 55 | print str(err) # will print something like "option -a not recognized" 56 | usage() 57 | sys.exit(2) 58 | 59 | tablefile='' 60 | outfile='outTable_%s' %(pid) 61 | #rna-seq 62 | cov=5 63 | bvar=1 64 | sfreq=0.1 65 | expos=0 66 | upos=0 67 | exinv=0 68 | subs=[x+y for x in 'ACGT' for y in 'ACGT' if x!=y] 69 | #dna-seq 70 | dnacol=11 71 | dnacols=[x for x in range(dnacol-2,dnacol+3,1)] 72 | isdna=0 73 | gcov=5 74 | gsfreq=0.95 75 | gexvar=0 76 | gbvar=0 77 | 78 | for o, a in opts: 79 | if o in ("-h","--help"): 80 | usage() 81 | sys.exit() 82 | elif o == "-i": 83 | tablefile=a 84 | if not os.path.exists(tablefile): 85 | usage() 86 | sys.exit('Table file not found') 87 | elif o == "-c": cov=int(a) 88 | elif o == "-C": gcov=int(a) 89 | elif o == "-v": bvar=int(a) 90 | elif o == "-V": gbvar=int(a) 91 | elif o == "-s": subs=[x.upper() for x in a.split(',') if x.strip()!=''] 92 | elif o == "-f": sfreq=float(a) 93 | elif o == "-F": gsfreq=float(a) 94 | elif o == "-e": expos=1 95 | elif o == "-u": upos=1 96 | elif o == "-r": exinv=1 97 | elif o == "-R": gexvar=1 98 | elif o == "-d": 99 | dnacol=int(a) 100 | if dnacol>3: 101 | isdna=1 102 | dnacols=[x-1 for x in range(dnacol-2,dnacol+3,1)] 103 | elif o == "-o": outfile=a 104 | else: 105 | assert False, "Unhandled Option" 106 | 107 | def isnvar(nuc,idx,val): 108 | n=eval(nuc) 109 | x=0 110 | for j in range(4): 111 | if j!=idx and n[j]>=val: 112 | x+=1 113 | if x>0: return 1 114 | return 0 115 | 116 | def isnvar2(nuc,idx,val): 117 | n=eval(nuc) 118 | x=0 119 | for j in range(4): 120 | if j!=idx: x+=n[j] 121 | if x<=val: return 1 122 | return 0 123 | 124 | def issub(osubs,esubs): 125 | x=0 126 | for i in osubs: 127 | if i in esubs: x+=1 128 | if x>0: return 1 129 | return 0 130 | 131 | def vinv(nuc,idx,val): 132 | n=eval(nuc) 133 | try: v=float(n[idx])/sum(n) 134 | except: v=0.0 135 | if v>=val: return 1 136 | return 0 137 | 138 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 139 | sys.stdout.write("Script time --> START: %s\n"%(script_time)) 140 | sys.stdout.write("Reading table...\n") 141 | 142 | if tablefile.endswith('.gz'): 
f=gzip.open(tablefile,'rb') 143 | else: f=open(tablefile) 144 | db={'A':0,'C':1,'G':2,'T':3} 145 | o=open(outfile,'w') 146 | xx,yy=0,0 147 | for i in f: 148 | if i.startswith('Region'): 149 | o.write(i) 150 | continue 151 | if i.strip()=='': continue 152 | l=(i.strip()).split('\t') 153 | xx+=1 154 | if l[2] not in 'ACGTacgt': continue 155 | if exinv and l[7]=='-': continue 156 | if int(l[4])1: continue 161 | if not issub(osubs,subs): continue 162 | if float(l[8]) END: %s\n"%(script_time)) 181 | 182 | -------------------------------------------------------------------------------- /accessory/subCount.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | try: 5 | infile=sys.argv[1] 6 | except: 7 | sys.exit('USAGE: ') 8 | 9 | s={} 10 | for i in 'ACGT': 11 | for j in 'ACGT': 12 | if i!=j: s[i+j]=0 13 | n={} 14 | x=0 15 | for i in 'ACGT': 16 | n[i]=x 17 | x+=1 18 | all=0 19 | f=open(infile) 20 | for i in f: 21 | if i.startswith('Reg'): continue 22 | l=(i.strip()).split('\t') 23 | if l[7]=='-': continue 24 | sub=l[7].split()[0] 25 | nuc=eval(l[6]) 26 | nv= nuc[n[sub[1]]] 27 | s[sub]+=nv 28 | all+=nv 29 | f.close() 30 | 31 | for i in s: 32 | try: v=(s[i]/float(all))*100 33 | except: v=0.0 34 | print i,s[i],all,v 35 | 36 | -------------------------------------------------------------------------------- /accessory/subCount2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | try: 5 | infile=sys.argv[1] 6 | except: 7 | sys.exit('USAGE: ') 8 | 9 | s={} 10 | for i in 'ACGT': 11 | for j in 'ACGT': 12 | if i!=j: s[i+j]=0 13 | n={} 14 | x=0 15 | for i in 'ACGT': 16 | n[i]=x 17 | x+=1 18 | all=0 19 | f=open(infile) 20 | for i in f: 21 | if i.startswith('Reg'): continue 22 | l=(i.strip()).split('\t') 23 | if l[7]=='-': continue 24 | sub=l[7].split()[0] 25 | s[sub]+=1 26 | all+=1 27 | f.close() 28 | 29 | for i in s: 30 | try: v=(s[i]/float(all))*100 31 | except: v=0.0 32 | print i,s[i],all,v 33 | 34 | -------------------------------------------------------------------------------- /accessory/tableToTabix.py: -------------------------------------------------------------------------------- 1 | #!/home/epicardi/bin/python27/bin/python 2 | # Copyright (c) 2013-2014 Ernesto Picardi 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 
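The two subCount scripts above differ only in what they accumulate per substitution type listed in AllSubs (table column 8): subCount.py adds the number of reads supporting the variant base, looked up in the BaseCount[A,C,G,T] vector of column 7, while subCount2.py adds one per site. A minimal sketch of both counts, mirroring the scripts' use of eval on the base-count string; the rows below are made up.

idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
rows = [  # Region, Position, Ref, Strand, Coverage, MeanQ, BaseCount, AllSubs
    ['chr21', '101', 'A', '1', '30', '35.0', '[10, 0, 5, 0]', 'AG'],
    ['chr21', '202', 'A', '1', '25', '34.0', '[20, 0, 2, 0]', 'AG'],
    ['chr21', '303', 'C', '1', '15', '33.0', '[0, 12, 0, 3]', 'CT'],
]
site_counts, base_counts = {}, {}
for l in rows:
    sub = l[7].split()[0]
    nv = eval(l[6])[idx[sub[1]]]                     # reads supporting the variant base
    site_counts[sub] = site_counts.get(sub, 0) + 1   # subCount2 logic: one per site
    base_counts[sub] = base_counts.get(sub, 0) + nv  # subCount logic: per supporting read
print(site_counts)  # AG seen at 2 sites, CT at 1
print(base_counts)  # AG supported by 7 reads, CT by 3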
21 | 22 | import sys, os, getopt, time, random, heapq, shutil 23 | from tempfile import gettempdir 24 | from itertools import islice, cycle 25 | from collections import namedtuple 26 | from operator import itemgetter 27 | try: import pysam 28 | except: sys.exit('Pysam module not found.') 29 | 30 | version='1.0' 31 | pid=str(os.getpid()+random.randint(0,999999999)) 32 | 33 | def usage(): 34 | print """ 35 | USAGE: python tableToTabix.py [options] 36 | Options: 37 | -i TAB-delimited file 38 | -s Sequence name column [1] 39 | -c Start column [4] 40 | -e End column (can be identical to -c) [5] 41 | -m Skip lines starting with [#] 42 | -0 Zero based coordinates 43 | -S Do not sort input file (sort by default) 44 | -b Buffer size (as number of lines) [32000] 45 | -t Temporary directory to use (multiple -t may be used) 46 | -u Save an uncompressed GFF copy (add _copy suffix) 47 | -h Print this help 48 | 49 | """ 50 | 51 | try: 52 | opts, args = getopt.getopt(sys.argv[1:], "i:Sb:t:hus:c:e:m:0",["help"]) 53 | if len(opts)==0: 54 | usage() 55 | sys.exit(2) 56 | except getopt.GetoptError as err: 57 | print str(err) # will print something like "option -a not recognized" 58 | usage() 59 | sys.exit(2) 60 | 61 | GFFfile='' 62 | buffer_size=32000 63 | tempdirs=[] 64 | sort=1 65 | mc=0 # save an uncompressed GFF copy, default no 66 | scol=0 # sequence column name 67 | bcol=3 # start column 68 | ecol=4 # end column 69 | schar='#' # skip lines starting with this character 70 | zcoord=False # zero based coordinated 71 | for o, a in opts: 72 | if o in ("-h","--help"): 73 | usage() 74 | sys.exit() 75 | elif o == "-i": 76 | GFFfile=a 77 | outfile='.'.join(GFFfile.split('.')[:-1])+'.sorted.gff' 78 | if not os.path.exists(GFFfile): 79 | usage() 80 | sys.exit('GFF file not found') 81 | elif o == "-b": buffer_size=int(a) 82 | elif o == "-t": tempdirs.append(a) 83 | elif o == "-S": sort=0 84 | elif o == "-u": mc=1 85 | elif o == "-m": schar=a 86 | elif o == "-s": scol=int(a)-1 87 | elif o == "-c": bcol=int(a)-1 88 | elif o == "-e": ecol=int(a)-1 89 | elif o == "-0": zcoord=True 90 | else: 91 | assert False, "Unhandled Option" 92 | 93 | Keyed = namedtuple("Keyed", ["key", "obj"]) 94 | key_=eval('lambda line : (%s)' %('line[:]')) 95 | 96 | def gk(key,obj): 97 | ik=itemgetter(scol,bcol,ecol)(obj.split('\t')) 98 | return key((ik[0],int(ik[1]),int(ik[2]))) 99 | 100 | def merge(key=None, *iterables): 101 | # based on code posted by Scott David Daniels in c.l.p. 
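# NOTE: unlike TableToGFF.py, the indexing step at the bottom of this script
# passes the column layout straight to pysam.tabix_index (seq_col/start_col/
# end_col plus the -0 zero-based flag) instead of using a GFF preset, so any
# sorted TAB-delimited table can be compressed and indexed. A hypothetical
# call on a REDItools table (placeholder file name):
#   python tableToTabix.py -i outTable_12345 -s 1 -c 2 -e 2 -u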
102 | # http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d 103 | #print iterables 104 | if key is None: 105 | keyed_iterables = iterables 106 | else: 107 | keyed_iterables = [(Keyed(gk(key,obj), obj) for obj in iterable) for iterable in iterables] 108 | #print keyed_iterables 109 | for element in heapq.merge(*keyed_iterables): 110 | yield element.obj 111 | 112 | def batch_sort(input, output, key=None, buffer_size=32000, tempdirs=None): 113 | if tempdirs is None: 114 | tempdirs = [] 115 | if not tempdirs: 116 | tempdirs.append(gettempdir()) 117 | chunks = [] 118 | xx=0 119 | try: 120 | with open(input,'rb',64*1024) as input_file: 121 | input_iterator = iter(input_file) 122 | for tempdir in cycle(tempdirs): 123 | current_chunk2=[] 124 | for j in islice(input_iterator,buffer_size): 125 | if j.startswith('Region'): continue 126 | if j.startswith(schar): continue 127 | l=(j.strip()).split('\t') 128 | l[bcol]=int(l[bcol]) 129 | l[ecol]=int(l[ecol]) 130 | current_chunk2.append(l) 131 | current_chunk3=[] 132 | for j in sorted(current_chunk2, key=itemgetter(scol,bcol,ecol)): 133 | j[bcol]=str(j[bcol]) 134 | j[ecol]=str(j[ecol]) 135 | current_chunk3.append('\t'.join(j)+'\n') 136 | xx+=len(current_chunk3) 137 | if not current_chunk3: break 138 | sys.stdout.write("Loaded and sorted %i lines.\n"%(xx)) 139 | output_chunk = open(os.path.join(tempdir,'%06i_%s'%(len(chunks),pid)),'w+b',64*1024) 140 | chunks.append(output_chunk) 141 | output_chunk.writelines(current_chunk3) 142 | output_chunk.flush() 143 | output_chunk.seek(0) 144 | sys.stdout.write("Merging from %i files.\n"%(len(chunks))) 145 | with open(output,'wb',64*1024) as output_file: 146 | output_file.writelines(merge(key, *chunks)) 147 | finally: 148 | for chunk in chunks: 149 | try: 150 | chunk.close() 151 | os.remove(chunk.name) 152 | except Exception: 153 | pass 154 | 155 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 156 | sys.stdout.write("Script time --> START: %s\n"%(script_time)) 157 | if sort: 158 | sys.stdout.write("Sorting GFF file...\n") 159 | batch_sort(GFFfile,outfile,key_,buffer_size,tempdirs) 160 | GFFfile=outfile 161 | if mc: 162 | copyfile=GFFfile+'_copy' 163 | shutil.copyfile(GFFfile,copyfile) 164 | sys.stdout.write("A copy of uncompressed GFF file has been saved on %s.\n" %(copyfile)) 165 | sys.stdout.write("Indexing GFF file...\n") 166 | GFFfile=pysam.tabix_index(GFFfile,seq_col=scol, start_col=bcol, end_col=ecol, zerobased=zcoord) 167 | sys.stdout.write("Tabix file saved on %s.\n" %(GFFfile)) 168 | sys.stdout.write("Indices saved on %s.tbi.\n" %(GFFfile)) 169 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 170 | sys.stdout.write("Script time --> END: %s\n"%(script_time)) -------------------------------------------------------------------------------- /main/REDItoolKnown.py: -------------------------------------------------------------------------------- 1 | #!/home/epicardi/bin/python27/bin/python 2 | # Copyright (c) 2013-2014 Ernesto Picardi 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice 
and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | # THE SOFTWARE. 21 | 22 | import sys, os, time, math, random, getopt, operator, string, errno 23 | try: import pysam 24 | except: sys.exit('Pysam module not found.') 25 | from multiprocessing import Process, Queue 26 | from Queue import Empty 27 | 28 | pysamVersion=pysam.__version__ 29 | 30 | sys.stderr.write('Pysam version used: %s\n' %(pysamVersion)) 31 | 32 | version='1.3' 33 | 34 | pid=str(os.getpid()+random.randint(0,999999999)) 35 | 36 | def usage(): 37 | print """ 38 | USAGE: python REDItoolKnown.py [options] 39 | Options: 40 | -i BAM file 41 | -I Sort input BAM file 42 | -f Reference in fasta file 43 | -l List of known RNA editing events 44 | -C Base interval to explore [100000] 45 | -k List of chromosomes to skip separated by comma or file 46 | -t Number of threads [1] 47 | -o Output folder [rediFolder_%s] 48 | -F Internal folder name [null] 49 | -c Min. read coverage [10] 50 | -q Min. quality score [30] 51 | -m Min. mapping quality score [30]* 52 | -O Min. homopolymeric length [5] 53 | -s Infer strand (for strand oriented reads) [1] 54 | -g Strand inference type 1:maxValue 2:useConfidence [1] 55 | -x Strand confidence [0.70] 56 | -S Strand correction 57 | -G Infer strand by gff annotation (must be sorted, otherwise use -X) 58 | -X Sort annotation files 59 | -K File with positions to exclude 60 | -e Exclude multi hits 61 | -d Exclude duplicates 62 | -p Use paired concordant reads only 63 | -u Consider mapping quality 64 | -T Trim x bases up and y bases down per read [0-0] 65 | -B Blat file for correction 66 | -U Remove substitutions in homopolymeric regions 67 | -v Min. num. of reads supporting the variation [3] 68 | -n Min. editing frequency [0.1] 69 | -E Exclude positions with multiple changes 70 | -P File containing splice sites annotations 71 | -r Num.
72 | -H No Table Header
73 | -h Print this help
74 |
75 | *This value may change according to the aligner:
76 | - For Bowtie use 255
77 | - For Bowtie2 use 40
78 | - For BWA use 30
79 | - For RNA-STAR use 255
80 | - For HiSAT2 use 60
81 | - For Tophat1 use 255
82 | - For Tophat2 use 50
83 | - For GSNAP use 30
84 |
85 | """%(pid)
86 |
87 | try:
88 | opts, args = getopt.getopt(sys.argv[1:], "i:f:k:t:o:c:q:m:O:s:edpuT:B:Sv:n:EP:r:hHIXG:K:l:C:F:x:g:U")
89 | except getopt.GetoptError as err:
90 | print str(err) # will print something like "option -a not recognized"
91 | usage()
92 | sys.exit(2)
93 |
94 | MAX_DEPTH=100000
95 | corrstr=0
96 | strconf=0.70 # strand confidence
97 | useconf=0
98 | bamfile=''
99 | fastafile=''
100 | sortbam=0
101 | kfile=''
102 | nochrs=[]
103 | NCPU=1
104 | infolder=''
105 | outfolder_='rediFolder_%s' %(pid)
106 | MINCOV=10
107 | QVAL=33 #NOT USED
108 | MQUAL=30
109 | MAPQ=30
110 | homo=5
111 | rmpv = '0-0'
112 | rmp = [int(x) for x in rmpv.split('-')]
113 | getstrand=0 # infer the strand
114 | exh=0 # exclude multi hits
115 | exd=0 # exclude duplicates
116 | conc=0 # if paired-end reads are present, use only concordant pairs
117 | mq=0 # take mapping quality into account
118 | rmnuc=0 # trim nucleotides at the 5' and 3' ends of each read; tied to rmp and rmpv
119 | blatr=0 # apply the Blat correction
120 | blatfolder=''
121 | rmsh=0 # remove substitutions in homopolymers of length >= homo
122 | vnuc=3 # min. number of reads supporting the variation
123 | mmf=0.1 # min. variation frequency
124 | exms=0 # exclude multiple substitutions
125 | exss=0 # exclude intronic positions within nss nucleotides of splice sites
126 | nss=4 # intronic bases to explore for each splice site
127 | splicefile='' #'splicesites.hg18.sorted.txt'
128 | #custsub=0 # use custom distribution
129 | #custfile='' # custom distribution file
130 | #sigsites=0 # select significant sites
131 | #test = 'bh' # select statistical test
132 | usubs=[x+y for x in 'ACGT' for y in 'ACGT' if x!=y] # use these substitutions [default all]
133 | #sval=0.05 # significant value
134 | annfile='' # use annotation file for strand correction and features
135 | sortann=0 # sort annotation file
136 | uann=0 # use annotation
137 | exfile='' # use annotations to exclude positions
138 | expos=0 # exclude positions listed in the -K file
139 | chunckval=100000
140 | unchange1=1
141 | unchange2=0
142 | noheader=0
143 |
144 | for o, a in opts:
145 | if o in ("-h","--help"):
146 | usage()
147 | sys.exit()
148 | elif o == "-H": noheader=1
149 | elif o == "-i": bamfile=a
150 | elif o == "-f": fastafile=a
151 | elif o == "-l": kfile=a
152 | elif o == "-k":
153 | if os.path.exists(a):
154 | f=open(a)
155 | nochrs=[x.strip() for x in f if x.strip()!='']
156 | f.close()
157 | else: nochrs=[x for x in a.split(',') if x.strip()!='']
158 | elif o == "-t": NCPU=int(a)
159 | elif o == "-F": infolder=a
160 | elif o == "-o": outfolder_=a
161 | elif o == "-c": MINCOV=int(a)
162 | #elif o == "-Q": QVAL=int(a)
163 | elif o == "-q": MQUAL=int(a)
164 | elif o == "-m": MAPQ=int(a)
165 | elif o == "-O": homo=int(a)
166 | elif o == "-x": strconf=float(a)
167 | elif o == "-g":
168 | if a=='2': useconf=1
169 | elif o == "-s":
170 | getstrand=1
171 | if int(a)==1: unchange1,unchange2=1,0
172 | elif int(a)==0: unchange1,unchange2=0,0
173 | elif int(a)==2: unchange1,unchange2=0,1
174 | elif int(a)==12: unchange1,unchange2=1,1
175 | #elif o == "-U": usubs=[x.upper() for x in a.split(',') if a.strip()!=''] # disabled: this stale handler shadowed the -U option below; per the usage text, -U enables homopolymer filtering (rmsh)
176 | elif o == "-e": exh=1
o == "-e": exh=1 177 | elif o == "-d": exd=1 178 | elif o == "-p": conc=1 179 | elif o == "-I": sortbam=1 180 | elif o == "-X": sortann=1 181 | elif o == "-C": chunckval=int(a) 182 | elif o == "-u": mq=1 183 | elif o == "-T": 184 | rmpv = a 185 | try: 186 | rmp = [int(x) for x in rmpv.split('-')] 187 | rmnuc=1 188 | except: rmnuc=0 189 | elif o == "-B": 190 | blatfolder=a 191 | if os.path.exists(blatfolder): blatr=1 192 | elif o == "-S": corrstr=1 193 | elif o == "-U": rmsh=1 194 | elif o == "-v": vnuc=int(a) 195 | elif o == "-n": mmf=float(a) 196 | elif o == "-E": exms=1 197 | elif o == "-P": 198 | splicefile=a 199 | if os.path.exists(splicefile): exss=1 200 | elif o == "-K": 201 | exfile=a 202 | if os.path.exists(exfile): expos=1 203 | elif o == "-r": nss=int(a) 204 | elif o == "-G": 205 | annfile=a 206 | uann=1 207 | else: 208 | assert False, "Unhandled Option" 209 | 210 | ####### 211 | commandLine=' '.join(sys.argv[1:]) 212 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 213 | params=[] 214 | #Input parameters 215 | params.append('REDItoolKnown version %s\n' %(version)) 216 | params.append('User command line: %s\n' %(commandLine)) 217 | params.append('Analysis ID: %s\n' %(pid)) 218 | params.append('Analysis time: %s\n' %(script_time)) 219 | params.append('-i --> BAM file: %s\n' %(bamfile)) 220 | params.append('-f --> Reference file: %s\n' %(fastafile)) 221 | params.append('-I --> Sort input BAM file: %i\n' %(sortbam)) 222 | params.append('-l --> File with known RNA editing positions: %s\n' %(kfile)) 223 | params.append('-X --> Sort annotation files: %i\n' %(sortann)) 224 | params.append('-k --> Regions to exclude: %s\n' %(','.join(nochrs))) 225 | params.append('-t --> Number of working threads: %i\n' %(NCPU)) 226 | params.append('-C --> Base interval to explore: %i\n' %(chunckval)) 227 | params.append('-o --> Output folder: %s\n' %(outfolder_)) 228 | params.append('-F --> Infolder folder: %s\n' %(infolder)) 229 | params.append('-c --> Min. per base coverage: %i\n' %(MINCOV)) 230 | #params.append('-Q --> FastQ offset value: %i\n' %(QVAL)) 231 | params.append('-q --> Min. per base quality: %i\n' %(MQUAL)) 232 | params.append('-m --> Min. mapping quality: %i\n' %(MAPQ)) 233 | params.append('-O --> Min. homoplymeric length: %i\n' %(homo)) 234 | params.append('-s --> Infer strand: %i - %i-%i\n' %(getstrand,unchange1,unchange2)) 235 | params.append('-g --> Use confidence: %i\n' %(useconf)) 236 | params.append('-x --> Strand confidence: %.2f\n' %(strconf)) 237 | params.append('-S --> Strand correction : %i\n' %(corrstr)) 238 | params.append('-G --> GFF annotation to infer strand: %s\n' %(annfile)) 239 | params.append('-e --> Exclude multi hits: %i\n' %(exh)) 240 | params.append('-d --> Exclude duplicates: %i\n' %(exd)) 241 | params.append('-p --> Use paired concardant reads only: %i\n' %(conc)) 242 | params.append('-u --> Consider mapping quality: %i\n' %(mq)) 243 | params.append('-T --> Trim x bases up and y bases down per read: %i - %i-%i\n' %(rmnuc,rmp[0],rmp[1])) 244 | params.append('-B --> Blat folder for correction: %s\n' %(blatfolder)) 245 | params.append('-S --> Remove substitutions in homopolymeric regions: %i\n' %(rmsh)) 246 | params.append('-v --> Min. num. of reads supporting the variation: %i\n' %(vnuc)) 247 | params.append('-n --> Min. 
248 | params.append('-E --> Exclude positions with multiple changes: %i\n' %(exms))
249 | params.append('-P --> File containing splice sites annotations: %s\n' %(splicefile))
250 | params.append('-r --> Num. of bases near splice sites to explore: %i\n' %(nss))
251 | params.append('-K --> File with positions to exclude: %s\n' %(exfile))
252 | #######
253 |
254 | def pid_exists(pid):
255 | """Check whether pid exists in the current process table."""
256 | if pid < 0:
257 | return False
258 | try:
259 | os.kill(pid, 0)
260 | except OSError, e:
261 | return e.errno == errno.EPERM
262 | else:
263 | return True
264 |
265 | def get_no(pvalue,siglevel,ngenes): # No Correction
266 | lista=[]
267 | pp=siglevel
268 | y=0
269 | for i in pvalue:
270 | p=i[0]
271 | if p<=siglevel:
272 | lista.append(i)
273 | y+=1
274 | return lista,y,pp
275 |
276 | def get_b(pvalue,siglevel,ngenes): # Bonferroni
277 | pvalue.sort()
278 | lista=[]
279 | y=0
280 | #bcorr=siglevel/ngenes
281 | pp=1.0
282 | for i in pvalue:
283 | p=i[0]*ngenes
284 | if p<=siglevel:
285 | lista.append(i)
286 | #lista[i[1]]=i[0]
287 | y+=1
288 | if p<pp: pp=p
289 | return lista,y,pp
[lines 290-352 are truncated in this copy of the file; they held the remaining statistical helpers referenced elsewhere in the script, i.e. get_bh (Benjamini-Hochberg) and the getDicSS/getFreads/getSub functions used by addPvalue; only the tail of getSub survives below]
353 | if len(res) > 1:
354 | rr=[float(x[-1]) for x in res]
355 | idx=rr.index(min(rr))
356 | return res[idx][5] #,allsub,fread
357 | else: return '1.0' #,0,0
358 |
359 | def BaseCount(seq,ref):
360 | b={'A':0,'C':0,'G':0,'T':0}
361 | subs=[]
362 | subv=[]
363 | for i in seq.upper():
364 | if b.has_key(i): b[i]+=1
365 | for i in b:
366 | if not b.has_key(ref): continue
367 | if b[i]!=0 and i!=ref:
368 | vv=float(b[i])/(b[i]+b[ref])
369 | subv.append((b[i],vv,ref+i))
370 | subv.sort()
371 | subv.reverse()
372 | for i in subv:
373 | if i[0]>=vnuc and i[1]>=mmf: subs.append(i[2])
374 | freq=0.0
375 | if len(subs)==0: subs.append('-')
376 | else: freq=subv[0][1]
377 | return sum(b.values()),[b['A'],b['C'],b['G'],b['T']],' '.join(subs),'%.2f'%(freq)
378 |
379 | def meanq(v,n):
380 | try:m=float(v)/n
381 | except: m=0.0
382 | return '%.2f'%(m)
383 |
384 | def rmHomo(sequp,seqdw,gh,ref):
385 | if len(sequp)==0 and len(seqdw)==0: return 0
386 | up,dw=0,0
387 | for i in seqdw:
388 | if i==ref:dw+=1
389 | else:break
390 | for i in sequp[::-1]:
391 | if i==ref:up+=1
392 | else:break
393 | if up+dw+1 >= gh : return 1
394 | return 0
395 |
396 | def prop(tot,va):
397 | try: av=float(va)/tot
398 | except: av=0.0
399 | return av
400 |
401 | def vstand(strand):
402 | vv=[(strand.count('+'),'+'),(strand.count('-'),'-'),(strand.count('*'),'*')]
403 | if vv[0][0]==0 and vv[1][0]==0: return '*'
404 | if useconf:
405 | totvv=sum([x[0] for x in vv[:2]])
406 | if prop(totvv,vv[0][0])>=strconf: return '+'
407 | if prop(totvv,vv[1][0])>=strconf: return '-'
408 | return '*'
409 | else:
410 | if vv[0][0]==vv[1][0] and vv[2][0]==0: return '+'
411 | return max(vv)[1]
412 |
413 | def comp(s):
414 | a={'A':'T','T':'A','C':'G','G':'C'}
415 | ss=''
416 | for i in s.upper():
417 | if a.has_key(i): ss+=a[i]
418 | else: ss+='N'
419 | return ss
420 |
421 | def whereis(program):
422 | for path in os.environ.get('PATH', '').split(':'):
423 | if os.path.exists(os.path.join(path, program)) and not os.path.isdir(os.path.join(path, program)): return 1
424 | return 0
425 |
426 | def vstrand(lista):
427 | if len(lista)==0: return '2'
428 | p=lista.count('+')
429 | m=lista.count('-')
430 | if p==len(lista): return '1'
431 | elif m==len(lista): return '0'
432 | else: return '2'
433 |
434 | def getd(lines): #fixed error in reading strand 6/3/2014
435 | d={}
436 | for i in lines:
437 | l=(i.strip('\n\r')).split('\t')
438 | if 
len(l)>=3: 439 | if l[2]=='+': strand='1' 440 | elif l[2]=='-': strand='0' 441 | elif l[2] in '012': strand=l[2] 442 | else: strand='2' 443 | else: strand='2' 444 | d[int(l[1])]=strand 445 | return d 446 | 447 | def normByStrand(seq_,strand_,squal_,mystrand_): 448 | st='+' 449 | if mystrand_=='0': st='-' 450 | seq,qual,squal='',0,[] 451 | for i in range(len(seq_)): 452 | if strand_[i]==st: 453 | seq+=seq_[i] 454 | qual+=squal_[i] #-QVAL 455 | squal.append(squal_[i]) 456 | return seq,qual,squal 457 | 458 | def normByBlat(seq_,strand_,squal_,blatc_): 459 | seq,qual,squal,strand='',0,[],'' 460 | for i in range(len(seq_)): 461 | if blatc_[i]=='1': 462 | seq+=seq_[i] 463 | qual+=squal_[i] 464 | squal.append(squal_[i]) 465 | strand+=strand_[i] 466 | return seq,qual,squal,strand 467 | 468 | def testBlat(blc): 469 | if blc.count('1') > blc.count('0'): return 1 470 | return 0 471 | 472 | ####### 473 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time())) 474 | sys.stderr.write("Script time --> START: %s\n"%(script_time)) 475 | sys.stderr.write("Analysis ID: %s\n"%(pid)) 476 | 477 | if not os.path.exists(bamfile): 478 | usage() 479 | sys.exit('BAM file %s not found.' %(bamfile)) 480 | if sortbam: 481 | sys.stderr.write('Sorting BAM file.\n') 482 | pysam.sort(bamfile,'sorted_%s'%(pid)) 483 | os.rename(bamfile,bamfile+'_old') 484 | os.rename('sorted_%s.bam'%(pid),bamfile) 485 | sys.stderr.write('Indexing BAM file.\n') 486 | pysam.index(bamfile) 487 | if not os.path.exists(bamfile+'.bai') and not sortbam: 488 | sys.stderr.write('Indexing BAM file.\n') 489 | pysam.index(bamfile) 490 | if not os.path.exists(fastafile): 491 | usage() 492 | sys.exit('Fasta file %s not found.' %(fastafile)) 493 | if not os.path.exists(fastafile+'.fai'): 494 | sys.stderr.write('Indexing Fasta file.\n') 495 | pysam.faidx(fastafile) 496 | if not os.path.exists(kfile): sys.exit('File containing RNA editing positions not found.') 497 | if sortann: 498 | if not whereis('grep'): sys.exit('grep command not found.') 499 | if not whereis('sort'): sys.exit('sort command not found.') 500 | sys.stderr.write('Sorting file with known editing positions.\n') 501 | scmd='grep -v ^"chrom" %s | grep -v "^[[:space:]]*$" | sort -k1,1 -k2,2n > %s' %(kfile,'positions_%s'%(pid)) 502 | os.system(scmd) 503 | os.rename(kfile,kfile+'_old') 504 | os.rename('positions_%s'%(pid),kfile) 505 | if not os.path.exists(kfile+'.tbi'): 506 | sys.stderr.write('Indexing file with known positions.\n') 507 | kfile=pysam.tabix_index(kfile, seq_col=0, start_col=1, end_col=1) 508 | # Format for tabfile with positions: 509 | # chr start strand 510 | ################################## 511 | # check reference names 512 | rrefs={} 513 | ridxinfo=pysam.idxstats(bamfile) 514 | for j in ridxinfo.split('\n'): #MOD 515 | l=(j.strip()).split('\t') 516 | if l[0] in ['*','']: continue #MOD 517 | if int(l[2])+int(l[3]) > 0: rrefs[l[0]]=int(l[1]) 518 | frefs=[] 519 | fidxinfo=open(fastafile+'.fai') 520 | for j in fidxinfo: 521 | l=(j.strip()).split('\t') 522 | if l[0]=='': continue 523 | frefs.append(l[0]) 524 | fidxinfo.close() 525 | # in rna-seq 526 | rnof=[] 527 | for i in rrefs.keys(): 528 | if i not in frefs: sys.stderr.write('WARNING: Region %s in RNA-Seq not found in reference file.\n' %(i)) 529 | ################################## 530 | 531 | if uann: 532 | getstrand=0 533 | if not os.path.exists(annfile): 534 | usage() 535 | sys.exit('Annotation file %s not found.' 
%(annfile)) 536 | if sortann: 537 | if not whereis('grep'): sys.exit('grep command not found.') 538 | if not whereis('sort'): sys.exit('sort command not found.') 539 | sys.stderr.write('Sorting annotation file.\n') 540 | scmd='grep ^"#" %s; grep -v ^"#" %s | sort -k1,1 -k4,4n > %s' %(annfile,annfile,'annotation_%s'%(pid)) 541 | os.system(scmd) 542 | os.rename(annfile,annfile+'_old') 543 | os.rename('annotation_%s'%(pid),annfile) 544 | if not os.path.exists(annfile+'.tbi'): 545 | sys.stderr.write('Indexing annotation file.\n') 546 | annfile=pysam.tabix_index(annfile, preset='gff') 547 | ########################################################### 548 | # Annotation file to exclude positions 549 | if expos: 550 | if not os.path.exists(exfile): 551 | usage() 552 | sys.exit('File %s not found.' %(exfile)) 553 | if sortann: 554 | if not whereis('grep'): sys.exit('grep command not found.') 555 | if not whereis('sort'): sys.exit('sort command not found.') 556 | sys.stderr.write('Sorting file.\n') 557 | scmd='grep ^"#" %s; grep -v ^"#" %s | sort -k1,1 -k4,4n > %s' %(exfile,exfile,'exfile_%s'%(pid)) 558 | os.system(scmd) 559 | os.rename(exfile,exfile+'_old') 560 | os.rename('exfile_%s'%(pid),exfile) 561 | if not os.path.exists(exfile+'.tbi'): 562 | sys.stderr.write('Indexing %s file.\n' %(exfile)) 563 | exfile=pysam.tabix_index(exfile, preset='gff') 564 | ########################################################### 565 | #mainbam=pysam.Samfile(bamfile,"rb") 566 | #regions=mainbam.references 567 | #regionslens=mainbam.lengths 568 | #mainbam.close() 569 | #dicregions=dict([(regions[x],regionslens[x]) for x in range(len(regions))]) 570 | #chrs=[x for x in regions if x not in nochrs] 571 | dicregions=dict(rrefs.items()) 572 | chrs=[x for x in dicregions.keys() if x not in nochrs] 573 | sys.stderr.write('Analysis on %i regions.\n' %(len(chrs))) 574 | 575 | if infolder!='': outfolder=os.path.join(outfolder_,'known_%s_%s' %(infolder,pid)) 576 | else: outfolder=os.path.join(outfolder_,'known_%s' %(pid)) 577 | if not os.path.exists(outfolder): 578 | splitfolder=os.path.split(outfolder) 579 | if not os.path.exists(splitfolder[0]): os.mkdir(splitfolder[0]) 580 | os.mkdir(outfolder) 581 | outtable=os.path.join(outfolder,'outTable_%s' %(pid)) 582 | #write command line and input parameters 583 | f=open(os.path.join(outfolder,'parameters.txt'),'w') 584 | f.writelines(params) 585 | f.close() 586 | 587 | ############################################# 588 | d={} 589 | if blatr: 590 | badblat=blatfolder #os.path.join(blatfolder,'blatseqs_%s.bad'%(chr)) 591 | if os.path.exists(badblat): 592 | sys.stderr.write('Using Blat mapping for RNAseq...\n') 593 | f=open(badblat) 594 | for i in f: 595 | l=(i.strip()).split() 596 | d[l[0]+'_'+l[1]]=int(l[1]) 597 | f.close() 598 | sys.stderr.write('Found %i reads.\n'%(len(d))) 599 | 600 | def exploreBAM(myinput): 601 | inputs=myinput.split('$') 602 | chr,bamfile=inputs[0],inputs[1] 603 | outfile=os.path.join(outfolder,'table_%s_%s'%(chr,pid)) 604 | #outfile2=os.path.join(outfolder,'subs_%s_%s'%(chr,pid)) 605 | d,di={},{} 606 | bam=pysam.Samfile(bamfile,"rb") 607 | fasta=pysam.Fastafile(fastafile) 608 | ktabix=pysam.Tabixfile(kfile) 609 | lenregion=dicregions[chr] 610 | if uann: tabix=pysam.Tabixfile(annfile) 611 | if expos: extabix=pysam.Tabixfile(exfile) 612 | out=open(outfile,'w') 613 | #if not custsub: 614 | # dsubs=dict([(x+y, 0) for x in 'ACGT' for y in 'ACGT']) 615 | # out2=open(outfile2,'w') 616 | 
#header='Region\tPosition\tReference\tCoverage\tMeanQuality\tBaseCount\tSubs\tFrequency\n'
617 | #out.write(header)
618 | sys.stderr.write('Started analysis on region: %s\n'%(chr))
619 | #if blatr:
620 | # badblat=os.path.join(blatfolder,'blatseqs_%s.bad'%(chr))
621 | # if os.path.exists(badblat):
622 | # sys.stderr.write('Using Blat mapping for region %s\n'%(chr))
623 | # f=open(badblat)
624 | # for i in f:
625 | # l=(i.strip()).split()
626 | # d[l[0]+'_'+l[1]]=int(l[1])
627 | # f.close()
628 | # sys.stderr.write('Found %i reads for region %s\n'%(len(d),chr))
629 | if exss:
630 | if os.path.exists(splicefile):
631 | sys.stderr.write('Loading known splice sites for region %s\n'%(chr))
632 | f=open(splicefile)
633 | for i in f:
634 | l=(i.strip()).split()
635 | if l[0]!=chr: continue
636 | st,tp,cc=l[4],l[3],int(l[1])
637 | if st=='+' and tp=='D':
638 | for j in range(nss): di[cc+(j+1)]=0
639 | if st=='+' and tp=='A':
640 | for j in range(nss): di[cc-(j+1)]=0
641 | if st=='-' and tp=='D':
642 | for j in range(nss): di[cc-(j+1)]=0
643 | if st=='-' and tp=='A':
644 | for j in range(nss): di[cc+(j+1)]=0
645 | f.close()
646 | sys.stderr.write('Loaded %i positions for %s\n'%(len(di),chr))
647 | if chr in ktabix.contigs:
648 | for kpos in range(0,lenregion,chunckval):
649 | startk,endk=kpos,(kpos+chunckval)-1
650 | kres=[kk for kk in ktabix.fetch(reference=chr,start=startk,end=endk)]
651 | if len(kres)==0: continue
652 | kdic=getd(kres)
653 | #print kdic
654 | # else explore bam to find exact positions
655 | for pileupcolumn in bam.pileup(chr,startk,endk,stepper='nofilter', max_depth=MAX_DEPTH):
656 | if not startk<=pileupcolumn.reference_pos<=endk: continue
657 | if not kdic.has_key(pileupcolumn.reference_pos+1): continue
658 | ref=fasta.fetch(chr,pileupcolumn.reference_pos,pileupcolumn.reference_pos+1).upper()
659 | seq,qual,strand,squal,blatc='',0,'',[],''
660 | if rmsh:
661 | if ((pileupcolumn.reference_pos+1)-homo)-1 < 0: sequp=''
662 | else: sequp=(fasta.fetch(chr,((pileupcolumn.reference_pos+1)-homo)-1,(pileupcolumn.reference_pos+1)-1)).upper()
663 | seqdw=(fasta.fetch(chr,pileupcolumn.reference_pos+1,(pileupcolumn.reference_pos+1)+homo)).upper()
664 | for pileupread in pileupcolumn.pileups: # for each read covering the pileup column
665 | #s,q,t,qq=pileupread.alignment.seq[pileupread.qpos].upper(),ord(pileupread.alignment.qual[pileupread.qpos])-QVAL,'*',pileupread.alignment.qual[pileupread.qpos]
666 | if pileupread.is_del: continue
667 | if pileupread.alignment.is_qcfail: continue
668 | if pileupread.alignment.is_supplementary: continue
669 | if pileupread.alignment.has_tag('SA'): continue
670 | # exclude intronic positions near splice sites
671 | if exss and di.has_key(pileupcolumn.reference_pos+1): continue
672 | # multiple hit
673 | if exh:
674 | if pileupread.alignment.is_secondary: continue
675 | if pileupread.alignment.has_tag('NH'):
676 | if pileupread.alignment.get_tag('NH') > 1: continue
677 | # duplicates
678 | if exd and pileupread.alignment.is_duplicate: continue
679 | # paired-end reads
680 | if conc: # use only concordant pairs
681 | # skip reads that are not paired
682 | if not pileupread.alignment.is_paired: continue
683 | # skip pairs that are not concordant
684 | if not pileupread.alignment.is_proper_pair: continue
685 | # skip concordant pairs mapped in the same orientation
686 | flag=pileupread.alignment.flag
687 | if pileupread.alignment.is_duplicate: flag=flag-1024
688 | if pileupread.alignment.is_secondary: flag=flag-256
689 | if flag in [67,131,115,179]: continue
690 | # mapping quality
691 | if mq and pileupread.alignment.mapping_quality < MAPQ: continue
692 | # keep the base only if its quality is >= the minimum quality
693 | if not pileupread.alignment.query_qualities: pileupread.alignment.query_qualities=[30 for vn in range(len(pileupread.alignment.query_sequence))]
694 | #
695 | #print pileupread.alignment.query_sequence
696 | #print pileupread.query_position
697 | s,q,t,qq=pileupread.alignment.query_sequence[pileupread.query_position].upper(),pileupread.alignment.query_qualities[pileupread.query_position],'*',pileupread.alignment.query_qualities[pileupread.query_position]
698 | if q >= MQUAL and pileupcolumn.reference_pos in pileupread.alignment.get_reference_positions():
699 | #tags=dict(pileupread.alignment.tags)
700 | # infer the strand for each position
701 | if getstrand:
702 | # use the mapping info if reads are strand-oriented
703 | if pileupread.alignment.is_read1:
704 | if unchange1:
705 | if pileupread.alignment.is_reverse: t='-'
706 | else: t='+'
707 | else:
708 | if pileupread.alignment.is_reverse: t='+'
709 | else: t='-'
710 | elif pileupread.alignment.is_read2:
711 | if unchange2:
712 | if pileupread.alignment.is_reverse: t='-'
713 | else: t='+'
714 | else:
715 | if pileupread.alignment.is_reverse: t='+'
716 | else: t='-'
717 | else: # for single ends
718 | if unchange1:
719 | if pileupread.alignment.is_reverse: t='-'
720 | else: t='+'
721 | else:
722 | if pileupread.alignment.is_reverse: t='+'
723 | else: t='-'
724 | if rmnuc:
725 | #rlen=pileupread.alignment.rlen #pileupread.alignment.qlen # length of this read
726 | #print rlen,pileupread.qpos,pileupread.alignment.qstart,pileupread.alignment.qend
727 | # check whether the nucleotide should be trimmed at the read ends within the x-y range
728 | # test the forward case
729 | #qp=pileupread.qpos #pileupread.qpos-pileupread.alignment.qstart
730 | #print pileupread.qpos,pileupread.alignment.rlen,len(pileupread.alignment.seq)
731 | #if pileupread.alignment.is_reverse:
732 | # if (rlen-qp)-1 < rmp[0]:continue
733 | # if (rlen-qp)-1 > ((rlen)-rmp[1])-1: continue
734 | #else:
735 | # if qp<rmp[0]: continue
736 | # if qp>(rlen-rmp[1])-1: continue
737 | rlen=pileupread.alignment.query_length #pileupread.alignment.qlen # length of this read
738 | qp=pileupread.query_position #pileupread.qpos-pileupread.alignment.qstart
739 | if rmp[0]>0: # trim positions at the 5' end
740 | if pileupread.alignment.is_reverse:
741 | if (pileupread.alignment.query_alignment_end-rmp[1]) <=qp<= pileupread.alignment.query_alignment_end-1: continue
742 | else:
743 | if pileupread.alignment.query_alignment_start <=qp<= (pileupread.alignment.query_alignment_start+rmp[0])-1: continue
744 | if rmp[1]>0: # trim positions at the 3' end
745 | if pileupread.alignment.is_reverse:
746 | if pileupread.alignment.query_alignment_start <=qp<= (pileupread.alignment.query_alignment_start+rmp[0])-1: continue
747 | else:
748 | if (pileupread.alignment.query_alignment_end-rmp[1]) <=qp<= pileupread.alignment.query_alignment_end-1: continue
749 | #print qp, rmp
750 | # if the read does not map uniquely according to Blat
751 | if blatr:
752 | rt=0
753 | if pileupread.alignment.is_read1: rt=1
754 | elif pileupread.alignment.is_read2: rt=2
755 | else: rt=0
756 | rname=pileupread.alignment.query_name+'_%i'%(rt)
757 | if d.has_key(rname): blatc+='0' #continue
758 | else: blatc+='1'
759 | # if the base differs from the reference
760 | # discard the position if it falls in a homopolymeric region
761 | if rmsh and rmHomo(sequp,seqdw,homo,ref): continue
762 | seq+=s
763 | qual+=q
764 | strand+=t
765 | squal.append(qq)
766 | if seq.strip()!='': 
767 | if blatr: 768 | if testBlat(blatc): seq,qual,squal,strand=normByBlat(seq,strand,squal,blatc) 769 | else: continue 770 | #print pileupcolumn.reference_pos+1,seq,squal 771 | #mystrand=kdic[pileupcolumn.reference_pos+1] 772 | #print mystrand 773 | try: mystrand=kdic[pileupcolumn.reference_pos+1] 774 | except: mystrand='2' 775 | #print chr,pileupcolumn.reference_pos+1,seq,strand, mystrand 776 | if uann and not getstrand: 777 | if chr in tabix.contigs: 778 | sres=[kk.strand for kk in tabix.fetch(reference=chr,start=(pileupcolumn.reference_pos),end=(pileupcolumn.reference_pos+1),parser=pysam.asGTF())] 779 | mystrand=vstrand(sres) 780 | if getstrand and not uann: 781 | mystr=vstand(strand) 782 | if mystr=='-': mystrand='0' 783 | elif mystr=='+': mystrand='1' 784 | else: mystrand='2' 785 | if mystrand=='0': 786 | seq=comp(seq) 787 | ref=comp(ref) 788 | #if getstrand and mystrand in ['1','0'] and not useconf: seq,qual,squal=normByStrand(seq,strand,squal,mystrand) 789 | if getstrand and mystrand in ['1','0'] and corrstr: seq,qual,squal=normByStrand(seq,strand,squal,mystrand) 790 | if uann and mystrand in ['1','0'] and corrstr: seq,qual,squal=normByStrand(seq,strand,squal,mystrand) 791 | #if not getstrand and not uann and mystrand in ['1','0']: seq,qual,squal=normByStrand(seq,strand,squal,mystrand) 792 | #print chr,pileupcolumn.reference_pos+1,seq,strand,mystrand 793 | cov,bcomp,subs,freq=BaseCount(seq,ref) 794 | if cov < MINCOV: continue 795 | if exms and subs.count(' ')>0: continue 796 | mqua=meanq(qual,len(seq)) 797 | if expos: 798 | if chr in extabix.contigs: 799 | exres=[kk for kk in extabix.fetch(reference=chr,start=(pileupcolumn.reference_pos),end=(pileupcolumn.reference_pos+1))] 800 | if len(exres)>0: continue 801 | line='\t'.join([chr,str(pileupcolumn.reference_pos+1),ref,mystrand,str(cov),(mqua),str(bcomp),subs,freq])+'\n' 802 | out.write(line) 803 | bam.close() 804 | fasta.close() 805 | ktabix.close() 806 | out.close() 807 | if uann: tabix.close() 808 | if expos: extabix.close() 809 | sys.stderr.write('Job completed for region: %s\n'%(chr)) 810 | 811 | def addPvalue(myinput2): # not used here 812 | inputs=myinput2.split('$') 813 | f=open(inputs[0]) 814 | subs=eval((f.readline()).strip()) 815 | f.close() 816 | dsubs={} 817 | for i in subs: dsubs[i]=float(subs[i])/sum(subs.values()) 818 | dsubss=getDicSS(dsubs) 819 | #print dsubss 820 | o=open(inputs[2],'w') 821 | f=open(inputs[1]) 822 | for i in f: 823 | l=(i.strip()).split('\t') 824 | if i.strip()=='': continue 825 | #if i.startswith('Region'): 826 | # l.append('Pvalue') 827 | # o.write('\t'.join(l)+'\n') 828 | # continue 829 | if l[6]!='-': pval=getSub(l[2],getFreads(eval(l[6])),dsubss) 830 | else: pval='1.0' 831 | l.append(pval) 832 | o.write('\t'.join(l)+'\n') 833 | o.close() 834 | 835 | def do_work(q): 836 | while True: 837 | try: 838 | x=q.get(block=False) 839 | exploreBAM(x) 840 | except Empty: 841 | break 842 | 843 | work_queue = Queue() 844 | for i in chrs: 845 | strinput=i+'$'+bamfile 846 | work_queue.put(strinput) 847 | processes=[Process(target=do_work, args=(work_queue,)) for i in range(NCPU)] 848 | for t in processes: 849 | t.start() 850 | for t in processes: 851 | t.join() 852 | time.sleep(0.5) 853 | # 854 | head='Region\tPosition\tReference\tStrand\tCoverage-q%i\tMeanQ\tBaseCount[A,C,G,T]\tAllSubs\tFrequency\n' %(MQUAL) 855 | sys.stderr.write('Merging Tables.\n') 856 | o=open(outtable,'w') 857 | if noheader==0: o.write(head) 858 | for i in chrs: 859 | #tabfile=os.path.join(outfolder,'outTable_%s_%s' %(i,pid)) 860 | 
tabfile=os.path.join(outfolder,'table_%s_%s' %(i,pid))
861 | if os.path.exists(tabfile):
862 | f=open(tabfile)
863 | for j in f: o.write(j)
864 | f.close()
865 | os.remove(tabfile)
866 | #os.remove(intabfile)
867 | o.close()
868 |
869 | #if sigsites:
870 | # sys.stderr.write('Selecting significant sites.\n')
871 | # outsig=os.path.join(outfolder,'outTableSig_%s' %(pid))
872 | # f=open(outtable)
873 | # o=open(outsig,'w')
874 | # o.write(head)
875 | # allv=[]
876 | # for i in f:
877 | # if i.startswith('Region'): continue
878 | # if i.strip()=='': continue
879 | # l=(i.strip()).split('\t')
880 | # if l[7]=='-': continue
881 | # if l[7] not in usubs: continue
882 | # pp=float(l[9])
883 | # allv.append((pp,i))
884 | # if test=='bh': rr=get_bh(allv,sval,len(allv))
885 | # elif test=='bo': rr=get_b(allv,sval,len(allv))
886 | # else: rr=get_no(allv,sval,len(allv))
887 | # for i in rr[0]: o.write(i[1])
888 | # f.close()
889 | # o.close()
890 |
891 | sys.stderr.write('Results saved on %s\n'%(outtable))
892 | #if sigsites: sys.stderr.write('Significant sites saved on %s\n'%(outsig))
893 |
894 | script_time=time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
895 | sys.stderr.write("Script time --> END: %s\n"%(script_time))
896 |
897 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from distutils.core import setup
4 |
5 | setup(name='REDItools',
6 | version='1.3',
7 | description='Python scripts for RNA editing detection from RNA-Seq data',
8 | author='Ernesto Picardi',
9 | author_email='ernesto.picardi@gmail.com',
10 | url='https://github.com/BioinfoUNIBA/REDItools',
11 | scripts=['main/REDItoolDenovo.py',
12 | 'main/REDItoolDnaRna.py',
13 | 'main/REDItoolKnown.py',
14 | 'accessory/AnnotateTable.py',
15 | 'accessory/FilterTable.py',
16 | 'accessory/SearchInTable.py',
17 | 'accessory/selectPositions.py',
18 | 'accessory/GFFtoTabix.py',
19 | 'accessory/SortGFF.py',
20 | 'accessory/SortTable.py',
21 | 'accessory/TableToGFF.py',
22 | 'accessory/tableToTabix.py',
23 | 'accessory/readPsl.py',
24 | 'accessory/subCount.py',
25 | 'accessory/subCount2.py',
26 | 'accessory/rediportal2recoding.py'
27 | ],
28 | license='LICENSE.txt',
29 | classifiers=[
30 | 'Intended Audience :: Science/Research',
31 | 'License :: OSI Approved :: MIT License',
32 | 'Operating System :: MacOS :: MacOS X',
33 | 'Operating System :: POSIX',
34 | 'Programming Language :: Python',
35 | ],
36 | long_description=open('README_1.md').read(),
37 | platforms=['Linux','Unix','MacOS']
38 | )
39 |
40 |
--------------------------------------------------------------------------------
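A minimal usage sketch for the script above, with hypothetical file names: sample.bam is assumed to be coordinate-sorted and indexed, reference.fa is the reference genome, and known_positions.txt is a tab-delimited, tabix-compatible table of known editing sites (chromosome, 1-based position, strand). Assuming the scripts are on PATH after python setup.py install, a run could look like:

    python REDItoolKnown.py -i sample.bam -f reference.fa -l known_positions.txt -t 4 -o known_out -c 10 -q 30 -m 255 -v 3 -n 0.1

Here -m follows the aligner-specific note in the usage text (e.g. 255 for Bowtie or RNA-STAR alignments). The merged outTable_<pid> reports one row per known site with the columns Region, Position, Reference, Strand, Coverage-q<minQuality>, MeanQ, BaseCount[A,C,G,T], AllSubs and Frequency, matching the header written at merge time.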