├── .gitignore ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── analysis_scripts ├── alignment │ ├── README.md │ ├── RNACocktail-Alignment-Analysis.ipynb │ └── RNACocktail-Alignment-Analysis.py ├── denovo │ ├── README.md │ ├── RNACocktail-Denovo-Analysis.ipynb │ └── RNACocktail-Denovo-Analysis.py ├── diff │ ├── README.md │ ├── RNACocktail-DIFF-Analysis.ipynb │ └── RNACocktail-DIFF-Analysis.py ├── editing │ ├── README.md │ ├── RNACocktail-Editing-Analysis.ipynb │ └── RNACocktail-Editing-Analysis.py ├── fusion │ ├── README.md │ ├── RNACocktail-Fusion-Analysis.ipynb │ └── RNACocktail-Fusion-Analysis.py ├── quantification │ ├── README.md │ ├── RNACocktail-Quant-Analysis.ipynb │ └── RNACocktail-Quant-Analysis.py ├── reconstruction │ ├── README.md │ ├── RNACocktail-Reconstruction-Analysis.ipynb │ └── RNACocktail-Reconstruction-Analysis.py └── variant │ ├── README.md │ ├── RNACocktail-Variant-Analysis.ipynb │ └── RNACocktail-Variant-Analysis.py ├── docker └── Dockerfile ├── ez_setup.py ├── index.html ├── scripts ├── gpd2gtf.py ├── hisat2_jun2bed.py └── run_rnacocktail.py ├── setup.py ├── src ├── __init__.py ├── _version.py ├── defaults.py ├── external_cmd.py ├── main.py ├── run_diff.py ├── run_dnv_assemebly.py ├── run_editing.py ├── run_fusion.py ├── run_lr_align.py ├── run_lr_correct.py ├── run_lr_fusion.py ├── run_lr_reconstruct.py ├── run_quantify.py ├── run_reconstruct.py ├── run_sr_align.py ├── run_variant.py └── utils.py └── test ├── A1_1.fq.gz ├── A1_2.fq.gz ├── A2_1.fq.gz ├── A2_2.fq.gz ├── B1_1.fq.gz ├── B1_2.fq.gz ├── B2_1.fq.gz ├── B2_2.fq.gz ├── C_long.fa.gz ├── C_short.fa.gz ├── C_short_1.fq.gz ├── C_short_2.fq.gz ├── GRCh37_genes_pos.bed.gz ├── GRCh37_strand_pos.bed.gz ├── GRCh38.21.gpd.gz ├── GRCh38_genes_pos.bed.gz ├── GRCh38_strand_pos.bed.gz ├── docker_test.sh ├── hg19.known.21.gpd.gz └── test_run.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | .idea/** 4 | 5 | /build 6 | 7 | /RNACocktail_Pipeline.egg-info 8 | 9 | /dist 10 | 11 | /test/example* 12 | 13 | /test/*.jar 14 | 15 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | RNACocktail (c) 2016 by Roche Sequencing Solutions, Inc. All rights reserved. 2 | RNACocktail is licensed under Apache License Version 2.0. 3 | ------------------------------------------------------------- 4 | 5 | The script "gpd2gtf.py" is modified from the original code from 6 | https://github.com/jason-weirather/Au-public/blob/master/gold/gpd2gtf.py 7 | available under Apache License Version 2.0. 8 | 9 | ------------------------------------------------------------- 10 | 11 | Apache License 12 | Version 2.0, January 2004 13 | http://www.apache.org/licenses/ 14 | 15 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 16 | 17 | 1. Definitions. 18 | 19 | "License" shall mean the terms and conditions for use, reproduction, 20 | and distribution as defined by Sections 1 through 9 of this document. 21 | 22 | "Licensor" shall mean the copyright owner or entity authorized by 23 | the copyright owner that is granting the License. 24 | 25 | "Legal Entity" shall mean the union of the acting entity and all 26 | other entities that control, are controlled by, or are under common 27 | control with that entity. 
For the purposes of this definition, 28 | "control" means (i) the power, direct or indirect, to cause the 29 | direction or management of such entity, whether by contract or 30 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 31 | outstanding shares, or (iii) beneficial ownership of such entity. 32 | 33 | "You" (or "Your") shall mean an individual or Legal Entity 34 | exercising permissions granted by this License. 35 | 36 | "Source" form shall mean the preferred form for making modifications, 37 | including but not limited to software source code, documentation 38 | source, and configuration files. 39 | 40 | "Object" form shall mean any form resulting from mechanical 41 | transformation or translation of a Source form, including but 42 | not limited to compiled object code, generated documentation, 43 | and conversions to other media types. 44 | 45 | "Work" shall mean the work of authorship, whether in Source or 46 | Object form, made available under the License, as indicated by a 47 | copyright notice that is included in or attached to the work 48 | (an example is provided in the Appendix below). 49 | 50 | "Derivative Works" shall mean any work, whether in Source or Object 51 | form, that is based on (or derived from) the Work and for which the 52 | editorial revisions, annotations, elaborations, or other modifications 53 | represent, as a whole, an original work of authorship. For the purposes 54 | of this License, Derivative Works shall not include works that remain 55 | separable from, or merely link (or bind by name) to the interfaces of, 56 | the Work and Derivative Works thereof. 57 | 58 | "Contribution" shall mean any work of authorship, including 59 | the original version of the Work and any modifications or additions 60 | to that Work or Derivative Works thereof, that is intentionally 61 | submitted to Licensor for inclusion in the Work by the copyright owner 62 | or by an individual or Legal Entity authorized to submit on behalf of 63 | the copyright owner. For the purposes of this definition, "submitted" 64 | means any form of electronic, verbal, or written communication sent 65 | to the Licensor or its representatives, including but not limited to 66 | communication on electronic mailing lists, source code control systems, 67 | and issue tracking systems that are managed by, or on behalf of, the 68 | Licensor for the purpose of discussing and improving the Work, but 69 | excluding communication that is conspicuously marked or otherwise 70 | designated in writing by the copyright owner as "Not a Contribution." 71 | 72 | "Contributor" shall mean Licensor and any individual or Legal Entity 73 | on behalf of whom a Contribution has been received by Licensor and 74 | subsequently incorporated within the Work. 75 | 76 | 2. Grant of Copyright License. Subject to the terms and conditions of 77 | this License, each Contributor hereby grants to You a perpetual, 78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 79 | copyright license to reproduce, prepare Derivative Works of, 80 | publicly display, publicly perform, sublicense, and distribute the 81 | Work and such Derivative Works in Source or Object form. 82 | 83 | 3. Grant of Patent License. 
Subject to the terms and conditions of 84 | this License, each Contributor hereby grants to You a perpetual, 85 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 86 | (except as stated in this section) patent license to make, have made, 87 | use, offer to sell, sell, import, and otherwise transfer the Work, 88 | where such license applies only to those patent claims licensable 89 | by such Contributor that are necessarily infringed by their 90 | Contribution(s) alone or by combination of their Contribution(s) 91 | with the Work to which such Contribution(s) was submitted. If You 92 | institute patent litigation against any entity (including a 93 | cross-claim or counterclaim in a lawsuit) alleging that the Work 94 | or a Contribution incorporated within the Work constitutes direct 95 | or contributory patent infringement, then any patent licenses 96 | granted to You under this License for that Work shall terminate 97 | as of the date such litigation is filed. 98 | 99 | 4. Redistribution. You may reproduce and distribute copies of the 100 | Work or Derivative Works thereof in any medium, with or without 101 | modifications, and in Source or Object form, provided that You 102 | meet the following conditions: 103 | 104 | (a) You must give any other recipients of the Work or 105 | Derivative Works a copy of this License; and 106 | 107 | (b) You must cause any modified files to carry prominent notices 108 | stating that You changed the files; and 109 | 110 | (c) You must retain, in the Source form of any Derivative Works 111 | that You distribute, all copyright, patent, trademark, and 112 | attribution notices from the Source form of the Work, 113 | excluding those notices that do not pertain to any part of 114 | the Derivative Works; and 115 | 116 | (d) If the Work includes a "NOTICE" text file as part of its 117 | distribution, then any Derivative Works that You distribute must 118 | include a readable copy of the attribution notices contained 119 | within such NOTICE file, excluding those notices that do not 120 | pertain to any part of the Derivative Works, in at least one 121 | of the following places: within a NOTICE text file distributed 122 | as part of the Derivative Works; within the Source form or 123 | documentation, if provided along with the Derivative Works; or, 124 | within a display generated by the Derivative Works, if and 125 | wherever such third-party notices normally appear. The contents 126 | of the NOTICE file are for informational purposes only and 127 | do not modify the License. You may add Your own attribution 128 | notices within Derivative Works that You distribute, alongside 129 | or as an addendum to the NOTICE text from the Work, provided 130 | that such additional attribution notices cannot be construed 131 | as modifying the License. 132 | 133 | You may add Your own copyright statement to Your modifications and 134 | may provide additional or different license terms and conditions 135 | for use, reproduction, or distribution of Your modifications, or 136 | for any such Derivative Works as a whole, provided Your use, 137 | reproduction, and distribution of the Work otherwise complies with 138 | the conditions stated in this License. 139 | 140 | 5. Submission of Contributions. Unless You explicitly state otherwise, 141 | any Contribution intentionally submitted for inclusion in the Work 142 | by You to the Licensor shall be under the terms and conditions of 143 | this License, without any additional terms or conditions. 
144 | Notwithstanding the above, nothing herein shall supersede or modify 145 | the terms of any separate license agreement you may have executed 146 | with Licensor regarding such Contributions. 147 | 148 | 6. Trademarks. This License does not grant permission to use the trade 149 | names, trademarks, service marks, or product names of the Licensor, 150 | except as required for reasonable and customary use in describing the 151 | origin of the Work and reproducing the content of the NOTICE file. 152 | 153 | 7. Disclaimer of Warranty. Unless required by applicable law or 154 | agreed to in writing, Licensor provides the Work (and each 155 | Contributor provides its Contributions) on an "AS IS" BASIS, 156 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 157 | implied, including, without limitation, any warranties or conditions 158 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 159 | PARTICULAR PURPOSE. You are solely responsible for determining the 160 | appropriateness of using or redistributing the Work and assume any 161 | risks associated with Your exercise of permissions under this License. 162 | 163 | 8. Limitation of Liability. In no event and under no legal theory, 164 | whether in tort (including negligence), contract, or otherwise, 165 | unless required by applicable law (such as deliberate and grossly 166 | negligent acts) or agreed to in writing, shall any Contributor be 167 | liable to You for damages, including any direct, indirect, special, 168 | incidental, or consequential damages of any character arising as a 169 | result of this License or out of the use or inability to use the 170 | Work (including but not limited to damages for loss of goodwill, 171 | work stoppage, computer failure or malfunction, or any and all 172 | other commercial damages or losses), even if such Contributor 173 | has been advised of the possibility of such damages. 174 | 175 | 9. Accepting Warranty or Additional Liability. While redistributing 176 | the Work or Derivative Works thereof, You may choose to offer, 177 | and charge a fee for, acceptance of support, warranty, indemnity, 178 | or other liability obligations and/or rights consistent with this 179 | License. However, in accepting such obligations, You may act only 180 | on Your own behalf and on Your sole responsibility, not on behalf 181 | of any other Contributor, and only if You agree to indemnify, 182 | defend, and hold each Contributor harmless for any liability 183 | incurred by, or claims asserted against, such Contributor by reason 184 | of your accepting any such warranty or additional liability. 185 | 186 | END OF TERMS AND CONDITIONS 187 | 188 | APPENDIX: How to apply the Apache License to your work. 189 | 190 | To apply the Apache License to your work, attach the following 191 | boilerplate notice, with the fields enclosed by brackets "{}" 192 | replaced with your own identifying information. (Don't include 193 | the brackets!) The text should be enclosed in the appropriate 194 | comment syntax for the file format. We also recommend that a 195 | file or class name and description of purpose be included on the 196 | same "printed page" as the copyright notice for easier 197 | identification within third-party archives. 198 | 199 | Copyright {yyyy} {name of copyright owner} 200 | 201 | Licensed under the Apache License, Version 2.0 (the "License"); 202 | you may not use this file except in compliance with the License. 
203 | You may obtain a copy of the License at 204 | 205 | http://www.apache.org/licenses/LICENSE-2.0 206 | 207 | Unless required by applicable law or agreed to in writing, software 208 | distributed under the License is distributed on an "AS IS" BASIS, 209 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 210 | See the License for the specific language governing permissions and 211 | limitations under the License. 212 | 213 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt 2 | include ez_setup.py 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | RNACocktail: A comprehensive framework for accurate and efficient RNA-Seq analysis 2 | 3 | See http://bioinform.github.io/rnacocktail/ for help and downloads. 4 | -------------------------------------------------------------------------------- /analysis_scripts/alignment/README.md: -------------------------------------------------------------------------------- 1 | RNACocktail Alignment Analysis 2 | =========== 3 | 4 | ### [Read it online here](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/bioinform/rnacocktail/master/analysis_scripts/alignment/RNACocktail-Alignment-Analysis.ipynb) 5 | -------------------------------------------------------------------------------- /analysis_scripts/alignment/RNACocktail-Alignment-Analysis.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | get_ipython().magic(u'pylab inline') 7 | 8 | 9 | # In[2]: 10 | 11 | import pybedtools 12 | import pickle 13 | from matplotlib_venn import venn3, venn3_circles,venn3_unweighted,venn2 14 | import seaborn as sns 15 | from pandas import DataFrame 16 | import os 17 | import csv 18 | import matplotlib.patches as patches 19 | import pysam 20 | 21 | 22 | # # Initialization 23 | 24 | # In[3]: 25 | 26 | methods=["Tophat","STAR","HISAT2"] 27 | sample="NA12878" 28 | reliable_est_bed="/path/to/reliable/EST/junctions.bed" 29 | 30 | 31 | # # Predictions 32 | 33 | # In[4]: 34 | 35 | bed_files={'Tophat':'/path/to/TopHat/junctions.bed', 36 | 'STAR':'/path/to/STAR/SJ.out.tab', 37 | 'HISAT2':'/path/to/HISAT/splicesites.txt', 38 | } 39 | bam_files={'Tophat':'/path/to/TopHat/alignments.bam', 40 | 'STAR':'/path/to/STAR/alignments.bam', 41 | 'HISAT2':'/path/to/HISAT2/alignments.bam', 42 | } 43 | 44 | 45 | # In[ ]: 46 | 47 | 48 | 49 | 50 | # # Functions 51 | 52 | # In[5]: 53 | 54 | def find_stats(bamfile,statfile): 55 | sam_file = pysam.Samfile(bamfile, "rb") 56 | seq={"1":[],"2":[]} 57 | current_qname="" 58 | uniqmap_uniqmap=0 59 | uniqmap_multimap=0 60 | multimap_multimap=0 61 | uniqmap_unmap=0 62 | multimap_unmap=0 63 | unmap_unmap=0 64 | cnts=0 65 | for line in sam_file: 66 | qname=line.qname 67 | if current_qname=="": 68 | current_qname=qname 69 | if qname!=current_qname: 70 | uniqed_multi_un={} 71 | for fs in ["1","2"]: 72 | NHs=map(lambda x:x[1],seq[fs]) 73 | if len(set(NHs))==1: 74 | NH=NHs[0] 75 | if NH==1: 76 | uniqed_multi_un[fs]=0 77 | elif NH==-1: 78 | uniqed_multi_un[fs]=2 79 | else: 80 | uniqed_multi_un[fs]=1 81 | if uniqed_multi_un["1"]==0 and uniqed_multi_un["2"]==0: 82 | uniqmap_uniqmap+=1 83 | elif (uniqed_multi_un["1"]==0 and uniqed_multi_un["2"]==1) or ( 84 | uniqed_multi_un["1"]==1 and 
uniqed_multi_un["2"]==0): 85 | uniqmap_multimap+=1 86 | elif (uniqed_multi_un["1"]==1 and uniqed_multi_un["2"]==1): 87 | multimap_multimap+=1 88 | elif (uniqed_multi_un["1"]==0 and uniqed_multi_un["2"]==2) or ( 89 | uniqed_multi_un["1"]==2 and uniqed_multi_un["2"]==0): 90 | uniqmap_unmap+=1 91 | elif (uniqed_multi_un["1"]==1 and uniqed_multi_un["2"]==2) or ( 92 | uniqed_multi_un["1"]==2 and uniqed_multi_un["2"]==1): 93 | multimap_unmap+=1 94 | elif (uniqed_multi_un["1"]==2 and uniqed_multi_un["2"]==2): 95 | unmap_unmap+=1 96 | else: 97 | print "ERRR3 ", line 98 | aaaa 99 | current_qname=qname 100 | seq={"1":[],"2":[]} 101 | 102 | flag=np.binary_repr(line.flag,12) 103 | tags=dict(line.get_tags()) 104 | NH=-1 if "NH" not in tags else tags["NH"] 105 | mpd=flag[-3]=="0" 106 | pmpd=flag[-4]=="0" 107 | first=flag[-7]=="1" 108 | second=flag[-8]=="1" 109 | if not (first ^ second): 110 | print "ERRR1 ", line 111 | aaaa 112 | 113 | if (not mpd) and NH>0: 114 | print "ERRR1 ", line 115 | aaaa 116 | 117 | fs="1" if first else "2" 118 | seq[fs].append([flag,NH,mpd,pmpd]) 119 | cnts+=1 120 | with open(statfile, 'wb') as csvfile: 121 | spamwriter = csv.writer(csvfile, delimiter='\t', 122 | quotechar='|', quoting=csv.QUOTE_MINIMAL) 123 | spamwriter.writerow(["uniqmap_uniqmap", "uniqmap_multimap", "multimap_multimap", "uniqmap_unmap", "multimap_unmap", "unmap_unmap","total","cnts"]) 124 | spamwriter.writerow([uniqmap_uniqmap, uniqmap_multimap, multimap_multimap, uniqmap_unmap, multimap_unmap, unmap_unmap, 125 | sum([uniqmap_uniqmap, uniqmap_multimap, multimap_multimap, uniqmap_unmap, multimap_unmap, unmap_unmap]),cnts]) 126 | 127 | def find_matchstats(bamfile,matchstatfile): 128 | sam_file = pysam.Samfile(bamfile, "rb") 129 | match_stats={} 130 | for line in sam_file: 131 | if line.cigar: 132 | codes={} 133 | for k,v in line.cigar: 134 | if k not in codes: 135 | codes[k]=0 136 | codes[k]+=v 137 | for k,v in codes.iteritems(): 138 | if k not in match_stats: 139 | match_stats[k]={} 140 | if v not in match_stats[k]: 141 | match_stats[k][v]=0 142 | match_stats[k][v]+=1 143 | pickle.dump(match_stats,open(matchstatfile,"w")) 144 | 145 | def find_NMstats(bamfile,NMstatfile): 146 | sam_file = pysam.Samfile(bamfile, "rb") 147 | NM_stats={} 148 | for line in sam_file: 149 | unmapped=(line.flag/4)%2==1 150 | if unmapped: 151 | continue 152 | tags=dict(line.tags) 153 | if "NM" in tags: 154 | nm=tags["NM"] 155 | if nm not in NM_stats: 156 | NM_stats[nm]=0 157 | NM_stats[nm]+=1 158 | elif "nM" in tags: 159 | nm=tags["nM"] 160 | if nm not in NM_stats: 161 | NM_stats[nm]=0 162 | NM_stats[nm]+=1 163 | else: 164 | print tags 165 | aaaa 166 | pickle.dump(NM_stats,open(NMstatfile,"w")) 167 | 168 | 169 | # # Analysis 170 | 171 | # In[6]: 172 | 173 | est_junctions_reliable=pybedtools.BedTool(reliable_est_bed) 174 | 175 | 176 | # In[7]: 177 | 178 | all_beds={} 179 | for method,bedfile in bed_files.iteritems(): 180 | mybed=pybedtools.BedTool(bedfile) 181 | if method == "STAR": 182 | mybed=mybed.filter(lambda x: (int(x[2])-int(x[1]))>1).each(lambda x:[x[0],int(x[1])+1,x[2]]).saveas() 183 | elif method == "HISAT2": 184 | mybed=mybed.each(lambda x:[x[0],int(x[1])+1,x[2]]).saveas() 185 | elif method == "Tophat": 186 | mybed=mybed.each(lambda x:[x[0],int(x[1])+int(x[10].split(",")[0]),int(x[2])-int(x[10].split(",")[1])]).saveas() 187 | all_beds[method]=mybed.each(lambda x:["chr%s"%x[0],x[1],x[2]]).saveas() 188 | 189 | 190 | # In[8]: 191 | 192 | for method,bamfile in bam_files.iteritems(): 193 | statfile=bamfile+".mystats" 194 | if 
os.path.exists(bamfile): 195 | if not os.path.exists(statfile): 196 | find_stats(bamfile,statfile) 197 | 198 | 199 | # In[9]: 200 | 201 | for method,bamfile in bam_files.iteritems(): 202 | statfile=bamfile+".mystats_match" 203 | if os.path.exists(bamfile): 204 | if not os.path.exists(statfile): 205 | find_matchstats(bamfile,statfile) 206 | 207 | 208 | # In[10]: 209 | 210 | for method,bamfile in bam_files.iteritems(): 211 | statfile=bamfile+".mystats_NM" 212 | if os.path.exists(bamfile): 213 | if not os.path.exists(statfile): 214 | print sample,method 215 | find_NMstats(bamfile,statfile) 216 | 217 | 218 | # In[11]: 219 | 220 | def parse_my_stats(stat_file): 221 | mystats={} 222 | with open(stat_file, 'r') as csv_f: 223 | spamreader = csv.reader(csv_f, delimiter='\t', quotechar='|') 224 | cnt=0 225 | for row in spamreader: 226 | if cnt==0: 227 | keys=row 228 | cnt=1 229 | else: 230 | vals=row 231 | mystats={x[0]:int(x[1]) for x in zip(keys,vals)} 232 | return mystats 233 | return {} 234 | 235 | 236 | # In[12]: 237 | 238 | alignment_stats={} 239 | for method,bed in all_beds.iteritems(): 240 | alignment_stats[method]={} 241 | L=len(bed) 242 | L_est_reliable=len(bed.intersect(est_junctions_reliable,f=0.99,u=True,r=True)) 243 | alignment_stats[method].update({"n_junctions":L, "n_est_reliable":L_est_reliable, "r_est_reliable":round(float(L_est_reliable)/float(L),2)}) 244 | 245 | 246 | # In[13]: 247 | 248 | for method,bamfile in bam_files.iteritems(): 249 | statfile=bamfile+".mystats" 250 | mystats=parse_my_stats(statfile) 251 | alignment_stats[method].update(mystats) 252 | 253 | 254 | # In[14]: 255 | 256 | for method,bamfile in bam_files.iteritems(): 257 | statfile=bamfile+".mystats_match" 258 | mystats=pickle.load(open(statfile)) 259 | alignment_stats[method].update({"match_stats":mystats}) 260 | 261 | 262 | # In[15]: 263 | 264 | for method,bamfile in bam_files.iteritems(): 265 | statfile=bamfile+".mystats_NM" 266 | mystats=pickle.load(open(statfile)) 267 | alignment_stats[method].update({"NM":mystats}) 268 | 269 | 270 | # In[16]: 271 | 272 | intersect_3methods={i:{} for i in range(8)} 273 | for iii in range(8): 274 | if iii==0: 275 | continue 276 | i=iii%2 277 | j=(iii/2)%2 278 | k=(iii/4)%2 279 | bed1=all_beds[methods[0]] 280 | bed2=all_beds[methods[1]] 281 | bed3=all_beds[methods[2]] 282 | if i==1: 283 | bed=bed1 284 | elif j==1: 285 | bed=bed2 286 | elif k==1: 287 | bed=bed3 288 | bed=bed.intersect(bed1,f=0.99,u=True if i==1 else False,v=True if i==0 else False,r=True) 289 | bed=bed.intersect(bed2,f=0.99,u=True if j==1 else False,v=True if j==0 else False,r=True) 290 | bed=bed.intersect(bed3,f=0.99,u=True if k==1 else False,v=True if k==0 else False,r=True) 291 | L=len(bed) 292 | L_est_reliable=len(bed.intersect(est_junctions_reliable,f=0.99,u=True,r=True)) 293 | intersect_3methods[iii].update({"n_junctions":L, "n_est_reliable":L_est_reliable, "r_est_reliable":round(float(L_est_reliable)/float(L),2)}) 294 | 295 | 296 | # ## Plots 297 | 298 | # ## junction validation 299 | 300 | # In[17]: 301 | 302 | sns.set(style="white",font_scale=1.5) 303 | fig, ax = plt.subplots(figsize=(8,2)) 304 | bin_labels=["Reliable" , "Not Reliable"] 305 | A=[] 306 | B=[] 307 | res=[] 308 | labels=[] 309 | my_colors=sns.color_palette("Set1",n_colors=10) 310 | for jjj,method in enumerate(methods): 311 | A.append(alignment_stats[method]["n_junctions"]) 312 | B.append(alignment_stats[method]["n_est_reliable"]) 313 | labels.append(method) 314 | res.append(np.array(A)) 315 | res.append(np.array(B)) 316 | 
my_data=DataFrame(np.array(res).transpose(),index=labels,columns=bin_labels[::-1]) 317 | for ii,b in enumerate(bin_labels[::-1]): 318 | cg=sns.barplot(data=my_data,x=b,y=labels,label=b, color=my_colors[ii],ax=ax) 319 | for i,ytick in enumerate(cg.get_yticklabels()): 320 | ytick.set_fontsize(12) 321 | ax.set_xlabel("Number of Junctions") 322 | ax.set_xticks(range(0,600000,200000)) 323 | ax.set_yticks(range(len(labels))) 324 | ax.set_xticklabels(["%sk"%(x/1000) if x>0 else "0" for x in range(0,600000,200000)]) 325 | 326 | ax.set_xlim([0,500000]) 327 | ax.set_title("Validation rate of splicing junctions on dbEST",fontsize=16) 328 | sns.despine(left=True) 329 | handles, labels = ax.get_legend_handles_labels() 330 | # reverse the order 331 | ax.legend(handles[::-1], labels[::-1],bbox_to_anchor=(0.85, 0.65, 0.5, .3), 332 | loc=1,ncol=1, 333 | mode="expand", borderaxespad=0.,frameon=False,fontsize=14) 334 | 335 | 336 | # In[18]: 337 | 338 | sns.set(style="white",font_scale=2.2) 339 | fig, ax = plt.subplots(figsize=(10,10)) 340 | keys=["r_est","r_est_reliable"] 341 | labels=["% of EST matches","% Reliable EST matches"] 342 | index = np.arange(len(methods)) 343 | bar_width = 0.2 344 | opacity = 0.5 345 | my_colors=sns.color_palette("Set2",n_colors=10) 346 | v = venn3(subsets=[intersect_3methods[k]['n_junctions'] for k in range(1,8)], 347 | set_labels = ('A','B','C'),ax=ax,alpha=0.6,set_colors=my_colors[0:3]) 348 | for c in range(1,8): 349 | i=c%2 350 | j=(c/2)%2 351 | k=(c/4)%2 352 | v.get_label_by_id('%d%d%d'%(i,j,k)).set_text("%d%%"%( 353 | intersect_3methods[c]['r_est_reliable']*100)) 354 | v.get_label_by_id('A').set_text('TopHat\n%s,%03d\n(%d%%)'%(alignment_stats['Tophat']['n_junctions']/1000, 355 | alignment_stats['Tophat']['n_junctions']%1000, 356 | alignment_stats['Tophat']['r_est_reliable']*100)) 357 | v.get_label_by_id('B').set_text('STAR\n%s,%03d\n(%d%%)'%(alignment_stats['STAR']['n_junctions']/1000, 358 | alignment_stats['STAR']['n_junctions']%1000, 359 | alignment_stats['STAR']['r_est_reliable']*100)) 360 | v.get_label_by_id('C').set_text('HISAT2\n%s,%03d\n(%d%%)'%(alignment_stats['HISAT2']['n_junctions']/1000, 361 | alignment_stats['HISAT2']['n_junctions']%1000, 362 | alignment_stats['HISAT2']['r_est_reliable']*100)) 363 | for labe_id in ["A","B","C"]: 364 | v.get_label_by_id(labe_id).set_fontsize(25) 365 | ax.set_title(sample,fontsize=25) 366 | 367 | for labe_id in ["A","B","C","110","101","111","011"]: 368 | v.get_patch_by_id(labe_id).set_linewidth(0) 369 | 370 | ax.legend(["Only TopHat","Only STAR","Only TopHat & STAR","Only HISAT2", 371 | "Only TopHat & HISAT2","Only STAR & HISAT2","TopHat & STAR & HISAT2"],bbox_to_anchor=(0, 1.1, 1.2, .3), 372 | loc=0,ncol=2, 373 | mode="expand", borderaxespad=0.,frameon=False) 374 | 375 | 376 | 377 | # ## Read mapping analysis 378 | 379 | # In[19]: 380 | 381 | sns.set(style="white",font_scale=1.2) 382 | colors=[4] 383 | nt=["A","C","G","T"] 384 | etypes=[] 385 | for i in nt: 386 | for j in nt: 387 | if i!=j: 388 | etypes.append(i+j) 389 | print etypes 390 | bin_labels=["Both pairs uniquely mapped","Both pairs multi-mapped", "One pair uniquely, one multi-mapped", 391 | "One pair uniquely mapped, one unmapped","One pair multi-mapped, one unmapped", "Both pairs unmapped"] 392 | keys=['uniqmap_uniqmap','multimap_multimap', 'uniqmap_multimap', 'uniqmap_unmap', 'multimap_unmap', 'unmap_unmap'] 393 | my_colors=sns.color_palette("Set3",n_colors=10) 394 | 395 | fig, axes = plt.subplots(1,3,figsize=(17,2)) 396 | ax=axes[0] 397 | res=[] 398 | labels=[] 399 
| for method in methods: 400 | if method not in alignment_stats: 401 | continue 402 | if "uniqmap_uniqmap" in alignment_stats[method]: 403 | myres=[alignment_stats[method][k]/float(alignment_stats[method]["total"])*100 for k in keys][::-1] 404 | myres=[sum(myres[i:]) for i in range(len(myres))] 405 | res.append(myres) 406 | label=method 407 | labels.append(label) 408 | my_data=DataFrame(np.array(res),index=labels,columns=bin_labels) 409 | for ii,b in enumerate(bin_labels): 410 | cg=sns.barplot(data=my_data,x=b,y=labels,label=b, color=my_colors[ii],ax=ax) 411 | 412 | ax.set_xlabel("% of fragments") 413 | ax.set_xlim([0,100]) 414 | sns.despine(left=True) 415 | handles, labels = ax.get_legend_handles_labels() 416 | # reverse the order 417 | ax.legend(handles[::-1], labels,bbox_to_anchor=(-0.4, 1, 1.52, .3), 418 | loc=0,ncol=2, 419 | mode="expand", borderaxespad=0.,frameon=False,fontsize=12) 420 | plt.tight_layout() 421 | 422 | 423 | ax=axes[1] 424 | bin_labels=["1","2-3","4-6","7-10","11-20",">20"] 425 | bins=[1,3,6,10,20,1000] 426 | 427 | codes=[4] 428 | res=[] 429 | labels=[] 430 | for method in methods: 431 | if method not in alignment_stats: 432 | continue 433 | if "match_stats" not in alignment_stats[method]: 434 | continue 435 | if set(alignment_stats[method]["match_stats"].keys())&set(codes): 436 | my_res=[] 437 | for b in bins[::-1]: 438 | my_res.append(sum([v for code in set(alignment_stats[method]["match_stats"].keys())&set(codes) 439 | for k,v in alignment_stats[method]["match_stats"][code].iteritems() if ( 440 | k<=b)])/float(sum(alignment_stats[method]["NM"].values()))*100) 441 | my_res=my_res 442 | res.append(my_res) 443 | label=method 444 | labels.append(label) 445 | else: 446 | my_res=[] 447 | for b in bins: 448 | my_res.append(0) 449 | my_res=my_res 450 | res.append(my_res) 451 | label=method 452 | labels.append(label) 453 | 454 | my_data=DataFrame(np.array(res),index=labels,columns=bin_labels) 455 | for ii,b in enumerate(bin_labels): 456 | cg=sns.barplot(data=my_data,x=b,y=labels,label=b, color=my_colors[ii],ax=ax) 457 | 458 | ax.set_yticklabels([]) 459 | 460 | ax.set_xlabel("% of mapped fragments") 461 | sns.despine(left=True) 462 | handles, labels = ax.get_legend_handles_labels() 463 | ax.legend(handles[::-1], labels,bbox_to_anchor=(0.2, 1, .6, .3), 464 | loc=0,ncol=3, 465 | mode="expand", borderaxespad=0.,frameon=False,fontsize=12, title="Number of soft clipped bases") 466 | plt.tight_layout() 467 | 468 | 469 | 470 | 471 | ax=axes[2] 472 | 473 | bin_labels=["1","2","3-4","5-6","7-9",">9"] 474 | bins=[1,2,4,6,9,1000] 475 | res=[] 476 | labels=[] 477 | for method in methods: 478 | if method not in alignment_stats: 479 | continue 480 | if "NM" not in alignment_stats[method]: 481 | continue 482 | my_res=[] 483 | for b in bins[::-1]: 484 | my_res.append(sum([v/float(sum(alignment_stats[method]["NM"].values()))*100 485 | for k,v in alignment_stats[method]["NM"].iteritems() if ( 486 | 01: 164 | cnt+=1 165 | if kk: 166 | diff_g_res_taq[k]=res[list(kk)[0]] 167 | print len(diff_g_res_taq),cnt 168 | 169 | 170 | # In[64]: 171 | 172 | diff_g_res_ercc={k:res[k] for k in (set(res.keys())&set(ercc_control.keys()))} 173 | print len(diff_g_res_ercc) 174 | 175 | 176 | # ## Plots 177 | 178 | # In[65]: 179 | 180 | taq_corr={} 181 | my_res=diff_g_res_taq 182 | taq_genes=TAQ_control.keys() 183 | x=[my_res[k][0] if k in my_res else 0 for k in taq_genes] 184 | 185 | x=map(lambda i:max(i,-14),x) 186 | x=map(lambda i:min(i,13),x) 187 | y=[TAQ_control[k]["logfc"] for k in taq_genes] 188 | 
taq_corr=find_corr(x,y) 189 | print taq_corr 190 | 191 | 192 | # In[66]: 193 | 194 | sns.set(style="white",font_scale=3) 195 | logFC_cutoffs=np.arange(0.5,2.5,0.5) 196 | AUC_TAQ={} 197 | for logFC_cutoff in logFC_cutoffs: 198 | SNs=[0] 199 | SPs=[1] 200 | prev_SP=0 201 | my_res=diff_g_res_taq 202 | for pval_cutof in sorted(map(lambda x:x[1],my_res.values())): 203 | taq_genes=TAQ_control.keys() 204 | T=set(filter(lambda x:abs(TAQ_control[x]["logfc"])>=logFC_cutoff,taq_genes)) 205 | F=set(taq_genes)-set(T) 206 | homos=set(filter(lambda x:sign(my_res[x][0])==sign(TAQ_control[x]["logfc"]),my_res.keys())) 207 | P=set(filter(lambda x:my_res[x][1]<=pval_cutof,my_res.keys())) 208 | N=set(filter(lambda x:my_res[x][1]>pval_cutof,my_res.keys())) 209 | N=N|(set(taq_genes)-(P|N)) 210 | 211 | TP=T&P&homos 212 | FP=(P&F)|(P&(T-homos)) 213 | TN=F&N 214 | FN=T&N 215 | SN=len(TP)/float(len(TP)+len(FN)+0.00001) 216 | SP=len(TN)/float(len(TN)+len(FP)+0.00001) 217 | if SPs[-1]>0.7: 218 | SNs.append(SN) 219 | SPs.append(SP) 220 | prev_SP=SP 221 | SP_1=SPs[-1] 222 | SP_2=SPs[-2] 223 | SN_1=SNs[-1] 224 | SN_2=SNs[-2] 225 | SP=0.7 226 | SN=(SN_2-SN_1)/(SP_2-SP_1+0.0000001)*(SP-SP_1)+SN_1 227 | SNs[-1]=SN 228 | SPs[-1]=SP 229 | AUC_TAQ[logFC_cutoff]=metrics.auc(1-np.array(SPs),SNs) 230 | 231 | 232 | 233 | # In[104]: 234 | 235 | logFC_cutoff=0.5 236 | pval_cutof=0.05 237 | sns.set(style="white",font_scale=2) 238 | SNs=[0] 239 | SPs=[1] 240 | my_res=diff_g_res_taq 241 | for pval_cutof in sorted(map(lambda x:x[1],my_res.values())): 242 | taq_genes=TAQ_control.keys() 243 | T=set(filter(lambda x:abs(TAQ_control[x]["logfc"])>=logFC_cutoff,taq_genes)) 244 | F=set(taq_genes)-set(T) 245 | homos=set(filter(lambda x:sign(my_res[x][0])==sign(TAQ_control[x]["logfc"]),my_res.keys())) 246 | P=set(filter(lambda x:my_res[x][1]<=pval_cutof,my_res.keys())) 247 | N=set(filter(lambda x:my_res[x][1]>pval_cutof,my_res.keys())) 248 | N=N|(set(taq_genes)-(P|N)) 249 | TP=T&P&homos 250 | FP=(P&F)|(P&(T-homos)) 251 | TN=F&N 252 | FN=T&N 253 | SN=len(TP)/float(len(TP)+len(FN)) 254 | SP=len(TN)/float(len(TN)+len(FP)) 255 | SNs.append(SN) 256 | SPs.append(SP) 257 | plot(1-np.array(SPs),SNs) 258 | xlabel("FPR (1-Specificity)") 259 | ylabel("TPR (Sensitivity)") 260 | title("ROC analysis of qRT-PCR measured genes",fontsize=18) 261 | 262 | 263 | # In[106]: 264 | 265 | logFC_cutoff=0.5 266 | pval_cutof=0.05 267 | sns.set(style="white",font_scale=2) 268 | x=logFC_cutoffs 269 | y=[AUC_TAQ[w] for w in logFC_cutoffs] 270 | plot(x,y) 271 | xlabel("log2-fold change threshold") 272 | ylabel("AUC-30") 273 | title("AUC-30 vs. 
log2-fold change for qRT-PCR experiment",fontsize=18) 274 | 275 | 276 | # In[80]: 277 | 278 | taq_corr 279 | 280 | 281 | # In[108]: 282 | 283 | sns.set(style="white",font_scale=1.5) 284 | my_data=DataFrame([["DESeq2+Salmon-SMEM",taq_corr["spearman"][0],"Spearman rank correlation"], 285 | ["DESeq2+Salmon-SMEM",taq_corr["RMSD"],"RMSD"], 286 | ["DESeq2+Salmon-SMEM",AUC_TAQ[0.5],"AUC-30"]] 287 | , 288 | columns=["tool","score","Measure"]) 289 | fig, axes = plt.subplots(1,3,figsize=(16,5)) 290 | for iii,key in enumerate(["Spearman rank correlation","RMSD","AUC-30"]): 291 | ax=axes[iii] 292 | my_data_=my_data[my_data["Measure"]==key] 293 | my_data_=my_data_.sort_values(by='score', ascending=[1 if key=="RMSD" else 0]) 294 | 295 | cg=sns.stripplot(y="tool", x="score",data=my_data_,size=10, hue="Measure", orient="h",edgecolor="gray",ax=ax) 296 | ax.set_ylabel("") 297 | ax.set_xlabel(key) 298 | ax.legend([]) 299 | ax.xaxis.grid(False) 300 | if iii==0: 301 | ax.set_xticks(np.arange(0.65,1,.1)) 302 | ax.set_xlim([0.65,0.95]) 303 | elif iii==1: 304 | ax.set_xticks(np.arange(1.5,4,1)) 305 | ax.set_xlim([1.5,3.5]) 306 | elif iii==2: 307 | ax.set_xticks(np.arange(0.08,0.24,.04)) 308 | ax.set_xlim([0.08,0.2]) 309 | ax.yaxis.grid(True) 310 | sns.despine(bottom=True) 311 | sns.despine(top=True) 312 | sns.despine(right=True) 313 | sns.despine(left=True) 314 | plt.tight_layout() 315 | 316 | 317 | # In[91]: 318 | 319 | ercc_corr={} 320 | my_res=diff_g_res_ercc 321 | ercc_genes=ercc_control.keys() 322 | x=[my_res[k][0] if k in my_res else 0 for k in ercc_genes ] 323 | x=map(lambda i:max(i,-14),x) 324 | x=map(lambda i:min(i,13),x) 325 | y=[ercc_control[k]["logfc"] for k in ercc_genes ] 326 | print len(x),len(y) 327 | ercc_corr=find_corr(x,y) 328 | 329 | 330 | # In[94]: 331 | 332 | logFC_cutoff=0.5 333 | pval_cutof=0.05 334 | sns.set(style="white",font_scale=3) 335 | SNs=[0] 336 | SPs=[1] 337 | my_res=diff_g_res_ercc 338 | for thr in (np.arange(0,3,0.1).tolist()+range(3,100))[::-1]: 339 | pval_cutof=10**-thr 340 | ercc_genes=ercc_control.keys() 341 | T=set(filter(lambda x:abs(ercc_control[x]["logfc"])>0,ercc_genes)) 342 | F=set(ercc_genes)-set(T) 343 | homos=set(filter(lambda x:sign(my_res[x][0])==sign(ercc_control[x]["logfc"]),my_res.keys())) 344 | P=set(filter(lambda x:my_res[x][1]<=pval_cutof,my_res.keys())) 345 | N=set(filter(lambda x:my_res[x][1]>pval_cutof,my_res.keys())) 346 | N=N|(set(ercc_genes)-(P|N)) 347 | TP=T&P&homos 348 | FP=(P&F)|(P&(T-homos)) 349 | TN=F&N 350 | FN=T&N 351 | SN=len(TP)/float(len(TP)+len(FN)+0.0001) 352 | SP=len(TN)/float(len(TN)+len(FP)+0.0001) 353 | if SPs[-1]>0.7: 354 | SNs.append(SN) 355 | SPs.append(SP) 356 | SP_1=SPs[-1] 357 | SP_2=SPs[-2] 358 | SN_1=SNs[-1] 359 | SN_2=SNs[-2] 360 | SP=0.7 361 | SN=(SN_2-SN_1)/(SP_2-SP_1+0.0000001)*(SP-SP_1)+SN_1 362 | SNs[-1]=SN 363 | SPs[-1]=SP 364 | AUC_ERCC=metrics.auc(1-np.array(SPs),SNs) 365 | 366 | 367 | # In[105]: 368 | 369 | logFC_cutoff=0.5 370 | pval_cutof=0.05 371 | sns.set(style="white",font_scale=2) 372 | SPs_ERCC={} 373 | SNs_ERCC={} 374 | SNs=[0] 375 | SPs=[1] 376 | my_res=diff_g_res_ercc 377 | for thr in (np.arange(0,3,0.1).tolist()+range(3,100))[::-1]: 378 | pval_cutof=10**-thr 379 | ercc_genes=ercc_control.keys() 380 | T=set(filter(lambda x:abs(ercc_control[x]["logfc"])>0,ercc_genes)) 381 | F=set(ercc_genes)-set(T) 382 | homos=set(filter(lambda x:sign(my_res[x][0])==sign(ercc_control[x]["logfc"]),my_res.keys())) 383 | P=set(filter(lambda x:my_res[x][1]<=pval_cutof,my_res.keys())) 384 | N=set(filter(lambda 
x:my_res[x][1]>pval_cutof,my_res.keys())) 385 | N=N|(set(ercc_genes)-(P|N)) 386 | TP=T&P&homos 387 | FP=(P&F)|(P&(T-homos)) 388 | TN=F&N 389 | FN=T&N 390 | SN=len(TP)/float(len(TP)+len(FN)+0.0001) 391 | SP=len(TN)/float(len(TN)+len(FP)+0.0001) 392 | SNs.append(SN) 393 | SPs.append(SP) 394 | plot(1-np.array(SPs),SNs) 395 | xlabel("FPR (1-Specificity)") 396 | ylabel("TPR (Sensitivity)") 397 | title("ROC analysis of ERCC genes",fontsize=18) 398 | 399 | 400 | # In[109]: 401 | 402 | sns.set(style="white",font_scale=1.5) 403 | my_data=DataFrame([["DESeq2+Salmon-SMEM",ercc_corr["spearman"][0],"Spearman rank correlation"], 404 | ["DESeq2+Salmon-SMEM",ercc_corr["RMSD"],"RMSD"], 405 | ["DESeq2+Salmon-SMEM",AUC_ERCC,"AUC-30"]] 406 | , 407 | columns=["tool","score","Measure"]) 408 | fig, axes = plt.subplots(1,3,figsize=(16,5)) 409 | for iii,key in enumerate(["Spearman rank correlation","RMSD","AUC-30"]): 410 | ax=axes[iii] 411 | my_data_=my_data[my_data["Measure"]==key] 412 | my_data_=my_data_.sort_values(by='score', ascending=[1 if key=="RMSD" else 0]) 413 | 414 | cg=sns.stripplot(y="tool", x="score",data=my_data_,size=10, hue="Measure", orient="h",edgecolor="gray",ax=ax) 415 | ax.set_ylabel("") 416 | ax.set_xlabel(key) 417 | ax.legend([]) 418 | ax.xaxis.grid(False) 419 | if iii==0: 420 | ax.set_xticks(np.arange(0.55,0.95,.1)) 421 | ax.set_xlim([0.55,0.9]) 422 | elif iii==1: 423 | ax.set_xticks(np.arange(0.5,3.5,.5)) 424 | ax.set_xlim([0.5,3]) 425 | elif iii==2: 426 | ax.set_xticks(np.arange(0.05,0.3,.05)) 427 | ax.set_xlim([0.05,0.25]) 428 | ax.yaxis.grid(True) 429 | sns.despine(bottom=True) 430 | sns.despine(top=True) 431 | sns.despine(right=True) 432 | sns.despine(left=True) 433 | plt.tight_layout() 434 | 435 | -------------------------------------------------------------------------------- /analysis_scripts/editing/README.md: -------------------------------------------------------------------------------- 1 | RNACocktail Editing Analysis 2 | =========== 3 | 4 | ### [Read it online here](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/bioinform/rnacocktail/master/analysis_scripts/editing/RNACocktail-Editing-Analysis.ipynb) 5 | -------------------------------------------------------------------------------- /analysis_scripts/editing/RNACocktail-Editing-Analysis.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | get_ipython().magic(u'pylab inline') 7 | 8 | 9 | # In[2]: 10 | 11 | import numpy as np 12 | import os 13 | import glob 14 | import pickle 15 | from operator import itemgetter 16 | from Bio import SeqIO 17 | import csv 18 | import scipy 19 | from scipy import stats 20 | import pybedtools 21 | from matplotlib_venn import venn3, venn3_circles,venn3_unweighted,venn2 22 | import seaborn as sns 23 | from pandas import DataFrame 24 | import matplotlib.patches as patches 25 | 26 | 27 | # # Initialization 28 | 29 | # In[3]: 30 | 31 | tool="HISAT2" 32 | sample="NA12878" 33 | callers="GATK" 34 | assemblers="StringTie" 35 | editor="GIREMI" 36 | 37 | 38 | varsim_jar="/path/to/VarSim.jar" 39 | NIST_HC_nonDB="/path/to/NIST_HC_nonDB.vcf" 40 | NIST_HC_vcf="/path/to/NIST_HC.vcf" 41 | b37_regions="/path/to/b37_regions" 42 | b37_rmask_bed="/path/to/b37.rmask.bed" 43 | 44 | 45 | 46 | 47 | # # Predictions 48 | 49 | # In[4]: 50 | 51 | 52 | 53 | pred_file="/path/to/giremi_out_good.txt.res" 54 | pred_file_pcnt_hidden={i:"/path/to/giremi_out_good_%s.txt.res"%i 55 | for i in range(10,110,10)} 56 | 57 | 58 | # # Functions 59 |
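# --- Editorial sketch (not part of the original notebook) -------------------
# The helpers defined in this section share one strand convention: calls on
# '-' strand genes are complemented so mismatch types are reported on the
# transcribed strand (this is exactly what find_etype() and find_er() below
# do). A standalone version of that relabeling:
REVNT = {"A": "T", "C": "G", "T": "A", "G": "C"}

def to_transcribed_strand(ref, alt, strand):
    """E.g. ("A","G","+") -> ("A","G"), but ("A","G","-") -> ("T","C")."""
    if strand == "-":
        ref, alt = REVNT[ref], REVNT[alt]
    return ref, alt

# Typical call flow for this section's helpers (as used in In[16] below):
#   editor_pred = parse_giremi(pred_file)
#   giremi_to_vcf(editor_pred, pred_file + ".vcf")
#   editor_bed  = vcf_to_bed(pred_file + ".vcf", all_otherfields=True)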
60 | # In[5]: 61 | 62 | def parse_giremi(outfile): 63 | preds=[] 64 | with open(outfile,"r") as csv_file: 65 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|') 66 | cnt=0 67 | for row in spamreader: 68 | if cnt==0: 69 | cnt=1 70 | continue 71 | preds.append(row) 72 | return preds 73 | 74 | 75 | # In[6]: 76 | 77 | def parse_ga(outfile): 78 | preds=[] 79 | llrs=[] 80 | with open(outfile,"r") as csv_file: 81 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|') 82 | cnt=0 83 | for row in spamreader: 84 | x=row 85 | if int(x[1])==int(x[5]) and int(x[1])==(int(x[4])+1) and (x[2]==x[9]) and (x[3]==x[10]): 86 | keys=x[11].split(":") 87 | vals=x[12].split(":") 88 | if "AD" not in keys: 89 | cnts=["1","1"] 90 | else: 91 | cnts=vals[keys.index("AD")].split(",") 92 | if int(cnts[0])==0: 93 | continue 94 | preds.append([x[0],x[1],x[2],x[3],x[6],x[7],cnts[0],cnts[1]]) 95 | print len(preds) 96 | return preds 97 | 98 | 99 | # In[7]: 100 | 101 | def vcf_to_bed(vcf_file,all_otherfields=False,otherfields=[]): 102 | with open(vcf_file,"r") as csv_file: 103 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|') 104 | intervals=[] 105 | for row in spamreader: 106 | if row[0]=="#": 107 | continue 108 | if all_otherfields: 109 | otherfields=range(2,len(row)) 110 | if otherfields: 111 | intervals.append(pybedtools.Interval(row[0],int(row[1])-1,int(row[1]),otherfields=[row[i] 112 | for i in otherfields])) 113 | else: 114 | intervals.append(pybedtools.Interval(row[0],int(row[1])-1,int(row[1]))) 115 | return pybedtools.BedTool(intervals) 116 | 117 | 118 | # In[8]: 119 | 120 | def find_etype(ref,alt,strand): 121 | revnt={"A":"T","C":"G","T":"A","G":"C"} 122 | if strand=="-": 123 | alt=revnt[alt] 124 | ref=revnt[ref] 125 | return ref+alt 126 | 127 | 128 | # In[9]: 129 | 130 | def find_er(ref,alt,strand,counts): 131 | revnt={"A":"T","C":"G","T":"A","G":"C"} 132 | id_n={"A":0,"C":1,"G":2,"T":3} 133 | counts=map(int,counts) 134 | if strand=="-": 135 | alt=revnt[alt] 136 | ref=revnt[ref] 137 | eratio=int(counts[id_n[alt]]/float(sum(counts))*100) 138 | return eratio 139 | 140 | 141 | 142 | 143 | 144 | # In[10]: 145 | 146 | revnt={"A":"T","C":"G","T":"A","G":"C"} 147 | def giremi_to_vcf(giremi_list,vcf_file): 148 | with open(vcf_file,"w") as csv_file: 149 | spamwriter = csv.writer(csv_file, delimiter='\t', quotechar='|') 150 | for x in giremi_list: 151 | if int(x[22])==0: 152 | continue 153 | ref,alt=x[17][0],x[17][1] 154 | strand=x[3] 155 | if strand=="-": 156 | alt=revnt[alt] 157 | ref=revnt[ref] 158 | spamwriter.writerow([x[1],x[2],".",ref,alt,".","PASS","."]) 159 | 160 | 161 | # In[11]: 162 | 163 | revnt={"A":"T","C":"G","T":"A","G":"C"} 164 | def ga_to_vcf(ga_list,vcf_file): 165 | with open(vcf_file,"w") as csv_file: 166 | spamwriter = csv.writer(csv_file, delimiter='\t', quotechar='|') 167 | for x in ga_list: 168 | spamwriter.writerow([x[0],x[1],".",x[2],x[3],".","PASS","."]) 169 | 170 | 171 | # In[12]: 172 | 173 | revnt={"A":"T","C":"G","T":"A","G":"C"} 174 | 175 | 176 | # In[13]: 177 | 178 | Alu_regions=pybedtools.BedTool(b37_rmask_bed 179 | ).filter(lambda x: "Alu" in x.name).merge().sort() 180 | print len(Alu_regions) 181 | 182 | 183 | # In[14]: 184 | 185 | reps=["repeats_b37_duplicates.bed","repeats_b37_Low_complexity.bed","repeats_b37_SINE.bed", 186 | "repeats_b37_duplicates_unique.bed", "repeats_b37_Satellite.bed", "repeats_b37_LINE.bed", "repeats_b37_Simple_repeat.bed"] 187 | rep_regions=pybedtools.BedTool([]) 188 | for rep in reps: 189 | 
rep_regions=rep_regions.cat("%s/%s"%(b37_regions,rep)) 190 | rep_regions=rep_regions.sort().merge() 191 | 192 | 193 | # In[15]: 194 | 195 | nonAlu_rep_regions=rep_regions.subtract(Alu_regions).sort() 196 | 197 | 198 | # In[16]: 199 | 200 | vcf_file="%s.vcf"%pred_file 201 | editor_pred=parse_giremi(pred_file) 202 | giremi_to_vcf(editor_pred,vcf_file) 203 | editor_bed=vcf_to_bed(vcf_file,all_otherfields=True) 204 | cmd="java -jar %s vcfcompare -true_vcf %s -prefix %s.NISTHCnonDB %s"%(varsim_jar,NIST_HC_nonDB,pred_file,vcf_file) 205 | if not os.path.exists("%s.NISTHCnonDB_TP.vcf"%(pred_file)): 206 | a=os.system(cmd) 207 | print cmd 208 | if a!=0: 209 | print a 210 | 211 | 212 | # In[17]: 213 | 214 | pred_edited={} 215 | edit_bed=pybedtools.BedTool([pybedtools.Interval(x[1],int(x[2])-1,int(x[2]),x[17],find_er(x[17][0],x[17][1],x[3],x[18:22])) 216 | for x in editor_pred if int(x[22])>0]) 217 | for region,region_bed in [["Alu",Alu_regions],["nonAlu-reps",nonAlu_rep_regions],["nonreps",""],["all",""]]: 218 | if region in ["Alu","nonAlu-reps"]: 219 | my_edit_bed=edit_bed.window(region_bed,w=0,u=True) 220 | elif region=="nonreps": 221 | my_edit_bed=edit_bed.window(Alu_regions,w=0,v=True) 222 | my_edit_bed=my_edit_bed.window(nonAlu_rep_regions,w=0,v=True) 223 | elif region=="all": 224 | my_edit_bed=edit_bed.sort() 225 | edit_types=[x[3] for x in my_edit_bed] 226 | edit_ratios=[x[4] for x in my_edit_bed] 227 | vcf_file="%s.NISTHCnonDB_TP.vcf"%pred_file 228 | NIST_errors=len(vcf_to_bed(vcf_file)) 229 | pred_edited[region]={ 230 | "dist":{etype:edit_types.count(etype) for etype in set(edit_types)}, 231 | "ratio":edit_ratios, 232 | "types":edit_types, 233 | "errors":NIST_errors 234 | } 235 | 236 | 237 | # In[18]: 238 | 239 | sns.set(style="white",font_scale=1.5) 240 | colors=[4] 241 | nt=["A","C","G","T"] 242 | etypes=[] 243 | for i in nt: 244 | for j in nt: 245 | if i!=j: 246 | etypes.append(i+j) 247 | rgn_name={"Alu": "Alu","nonAlu-reps":"Repetitive non-Alu","nonreps":"Nonrepetitive"} 248 | bin_labels=[r"A$\rightarrow$G",r"T$\rightarrow$C",r"C$\rightarrow$T",r"G$\rightarrow$A","Other Mismatches"] 249 | my_palette=sns.color_palette("Set3",n_colors=10) 250 | fig, ax = plt.subplots(figsize=(9,1.4)) 251 | res=[] 252 | labels=[] 253 | n={} 254 | for rrr,rgn in enumerate(["Alu","nonAlu-reps","nonreps"]): 255 | my_dist=pred_edited 256 | if set(my_dist[rgn]["dist"].keys())-set(etypes): 257 | print aaaa  # intentional NameError: halts the cell on an unexpected mismatch type 258 | z=[my_dist[rgn]["dist"][k] if k in 259 | my_dist[rgn]["dist"] else 0 260 | for k in etypes] 261 | 262 | sz=sum(z)+0.000001 263 | z=map(lambda x:round(x/float(sz),4)*100,z) 264 | z=[z[1],z[10],z[5],z[6],z[0]+sum(z[2:5])+sum(z[7:10])+z[11]] 265 | res_bin=[sum(z),sum(z[:4]),sum(z[:3]),sum(z[:2]),z[0]] 266 | res.append(res_bin) 267 | label="%s: %s"%(rgn_name[rgn], tool.replace("Tophat","TopHat")) 268 | n[label]=int(sz) 269 | labels.append(label) 270 | my_data=DataFrame(np.array(res),index=labels,columns=bin_labels) 271 | for ii,b in enumerate(bin_labels): 272 | cg=sns.barplot(data=my_data,x=b,y=labels,label=b, color=my_palette[ii],ax=ax) 273 | for ii,label in enumerate(labels): 274 | ax.text(101,ii+.25,"%d,%03d"%(n[label]/1000,n[label]%1000) if n[label]>=1000 else n[label] ,fontsize=12) 275 | ax.set_xlabel("% of Edits") 276 | ax.set_xlim([0,100]) 277 | sns.despine(left=True) 278 | handles, labels = ax.get_legend_handles_labels() 279 | # reverse the order 280 | ax.legend(handles[::-1], labels,bbox_to_anchor=(1.2, 0.7, .5, .3), 281 | loc=0,ncol=1, 282 | mode="expand", borderaxespad=0.,frameon=False,fontsize=12
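# --- Editorial sketch (not part of the original notebook) -------------------
# The stacked bars in In[18] above reduce to a per-region tally of mismatch
# types. A minimal standalone version of that reduction, assuming edit_types
# is a list of two-letter mismatch codes such as ["AG", "TC", "AG", ...]:
def edit_type_percentages(edit_types):
    """Return {mismatch_code: percent of all edits}, e.g. {"AG": 85.2, ...}."""
    total = float(len(edit_types)) + 1e-6  # guard against an empty list
    dist = {}
    for etype in edit_types:
        dist[etype] = dist.get(etype, 0) + 1
    return dict((k, round(v / total * 100, 2)) for k, v in dist.items())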
283 | 284 | # In[19]: 285 | 286 | nist_editor_pred={} 287 | nist_editor_bed={} 288 | nist_editor_out={} 289 | for x in range(0,110,10): 290 | if x==0: 291 | path=pred_file 292 | else: 293 | path=pred_file_pcnt_hidden[x] 294 | vcf_file="%s.vcf"%path 295 | nist_editor_out[x]=path 296 | nist_editor_pred[x]=parse_giremi(path) 297 | giremi_to_vcf(nist_editor_pred[x],vcf_file) 298 | nist_editor_bed[x]=vcf_to_bed(vcf_file,all_otherfields=True) 299 | cmd="java -jar %s vcfcompare -true_vcf %s -prefix %s.NISTHC_%s %s"%(varsim_jar,NIST_HC_vcf,path,x,vcf_file) 300 | if not os.path.exists("%s.NISTHC_%s_TP.vcf"%(path,x)): 301 | a=os.system(cmd) 302 | print cmd 303 | if a!=0: 304 | print a 305 | 306 | 307 | # In[20]: 308 | 309 | FDR={"all_calls":[],"FPs":[],"FDR":[],"AG":[]} 310 | for x in range(0,110,10): 311 | g=nist_editor_out[x] 312 | vcf_file="%s.NISTHC_%s_TP.vcf"%(g,x) 313 | NIST_errors=len(vcf_to_bed(vcf_file)) 314 | all_calls=len(vcf_to_bed(vcf_file="%s.vcf"%g)) 315 | fdr=NIST_errors/float(all_calls)*100 316 | FDR["all_calls"].append(all_calls) 317 | FDR["FPs"].append(NIST_errors) 318 | FDR["FDR"].append(fdr) 319 | edit_bed=pybedtools.BedTool([pybedtools.Interval(w[1],int(w[2])-1,int(w[2]),w[17],find_er(w[17][0],w[17][1],w[3],w[18:22])) 320 | for w in nist_editor_pred[x] if int(w[22])>0]) 321 | edit_types=[w[3] for w in edit_bed] 322 | dist={etype:edit_types.count(etype) for etype in set(edit_types)} 323 | FDR["AG"].append((dist["AG"])/float((all_calls))*100) 324 | 325 | 326 | # In[21]: 327 | 328 | sns.set(style="white",font_scale=1.5) 329 | fig, axes = plt.subplots(1,3,figsize=(18,5)) 330 | hiddens=range(0,110,10) 331 | for iii,key in enumerate(["FDR","all_calls","AG"]): 332 | ax=axes[iii] 333 | rects1 = ax.plot(hiddens,FDR[key],alpha=0.8, 334 | label="%s: %s"%(editor,tool), linewidth=3) 335 | ax.set_xticks(range(0,110,10)) 336 | ax.set_xlabel("Proportion of hidden SNPs (%)") 337 | if key=="FDR": 338 | ax.set_yticks(range(0,50,10)) 339 | ax.set_ylim([0,40]) 340 | ax.set_ylabel(r"FDR (%)") 341 | elif key=="all_calls": 342 | ax.set_yticks(range(0,9000,2000)) 343 | ax.set_ylim([0,8000]) 344 | ax.set_ylabel(r"Number of predicted RNA editing events") 345 | elif key=="AG": 346 | ax.set_yticks(range(75,105,5)) 347 | ax.set_ylim([75,100]) 348 | ax.set_ylabel(r"Proportion of A$\rightarrow$G events (%)") 349 | 350 | ax.legend(bbox_to_anchor=(0.4,1.1, 2, .102), loc=1,ncol=1, 351 | mode="expand", borderaxespad=0.,frameon=False,fontsize=18) 352 | plt.tight_layout() 353 | 354 | 355 | # In[22]: 356 | 357 | sns.set(style="white",font_scale=1.5) 358 | fig, ax = plt.subplots(figsize=(10,8)) 359 | levels=np.arange(0,100,10) 360 | my_dist=pred_edited 361 | etypes=my_dist['all']["types"] 362 | ratios=np.array(map(int,my_dist['all']["ratio"])) 363 | E=[] 364 | for level in levels: 365 | es=[etypes[i] for i in range(len(ratios)) if ratios[i]>level] 366 | E.append((es.count("AG")+es.count("TC"))/float(len(es)+0.00001)*100) 367 | rects1 = ax.plot(levels,E, alpha=0.8, 368 | label="%s: %s"%(editor,tool), linewidth=3) 369 | ax.set_ylim([40,100]) 370 | ax.set_xlabel("Minimum editing level (%)") 371 | ax.set_ylabel(r"Proportion of A$\rightarrow$G/T$\rightarrow$C events (%)") 372 | plt.tight_layout() 373 | ax.legend(bbox_to_anchor=(-.1, 1.15, 1.2, .102), loc=2,ncol=4, 374 | mode="expand", borderaxespad=0.,frameon=False,fontsize=12) 375 | 376 | 377 | # In[ ]: 378 | 379 | 380 | 381 | -------------------------------------------------------------------------------- /analysis_scripts/fusion/README.md:
-------------------------------------------------------------------------------- 1 | RNACocktail Fusion Analysis 2 | =========== 3 | 4 | ### [Read it online here](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/bioinform/rnacocktail/master/analysis_scripts/fusion/RNACocktail-Fusion-Analysis.ipynb) 5 | -------------------------------------------------------------------------------- /analysis_scripts/fusion/RNACocktail-Fusion-Analysis.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | get_ipython().magic(u'pylab inline') 7 | 8 | 9 | # In[2]: 10 | 11 | import numpy as np 12 | import os 13 | import glob 14 | import pickle 15 | from operator import itemgetter 16 | from Bio import SeqIO 17 | import csv 18 | import scipy 19 | from scipy import stats 20 | import pybedtools 21 | from matplotlib_venn import venn3, venn3_circles,venn3_unweighted,venn2 22 | import seaborn as sns 23 | from pandas import DataFrame 24 | 25 | 26 | # # Initialization 27 | 28 | # In[11]: 29 | 30 | tools=["IDP-fusion","FusionCatcher"] 31 | sample="MCF7" 32 | 33 | gencode_gtf="/path/to/gencode.v19.annotation.gtf" 34 | gold_set="/path/to/idp_gold_set.txt" 35 | 36 | 37 | # # Prediction 38 | 39 | # In[4]: 40 | 41 | pred_file={} 42 | 43 | 44 | pred_file["FusionCatcher"]="/path/to/final-list_candidate-fusion-genes.txt" 45 | pred_file["IDP-fusion"]="/path/to/preds.txt" 46 | 47 | 48 | # # Functions 49 | 50 | # In[5]: 51 | 52 | def parse_fusion(predfile,tool): 53 | preds=[] 54 | 55 | if tool=="FusionCatcher": 56 | with open(predfile,"r") as csv_file: 57 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|') 58 | cnt=0 59 | for row in spamreader: 60 | if cnt==0: 61 | cnt+=1 62 | continue 63 | if cnt==1: 64 | preds.append([row[0],row[1],row[8].split(":")[0], 65 | row[8].split(":")[1],row[9].split(":")[0],row[9].split(":")[1]]) 66 | elif tool=="IDP-fusion": 67 | with open(predfile,"r") as csv_file: 68 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|') 69 | cnt=0 70 | for row in spamreader: 71 | if cnt==0: 72 | cnt+=1 73 | continue 74 | if cnt==1: 75 | if not row[0]: 76 | continue 77 | preds.append([row[0].split("-")[0],row[0].split("-")[1], 78 | row[9].split("chr")[1], 79 | row[10], 80 | row[13].split("chr")[1], 81 | row[14] 82 | ]) 83 | else: 84 | print "NO file ", tool 85 | 86 | 87 | Fs=set([]) 88 | nonredundant_preds=[] 89 | for pred in preds: 90 | g1,g2=pred[0:2] 91 | fs="%s:%s"%(g1,g2) 92 | if fs not in Fs: 93 | nonredundant_preds.append(pred) 94 | Fs.add(fs) 95 | 96 | return nonredundant_preds 97 | 98 | 99 | 100 | # In[6]: 101 | 102 | def parse_gold(goldfile): 103 | gs=[] 104 | with open(goldfile,"r") as csv_file: 105 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|') 106 | for row in spamreader: 107 | gs.append(row) 108 | 109 | genes=set([x for w in gs for x in w]) 110 | coord={} 111 | for gene in genes: 112 | if gene[0:3]=="chr": 113 | c=gene.split(":")[0][3:] 114 | p1=gene.split(":")[1].split("-")[0] 115 | p2=gene.split(":")[1].split("-")[1] if "-" in gene else str(int(p1)+1) 116 | coord[gene]=[c,p1,p2] 117 | 118 | with open(gencode_gtf,"r") as csv_file: 119 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|') 120 | for row in spamreader: 121 | if row[0][0]=="#": 122 | continue 123 | if row[2]=="gene": 124 | gene_info = {k.split()[0]:k.split()[1][1:-1] for k in ' '.join(row[8:]).split(";")[:-1]} 125 | name=gene_info["gene_name"] 126 | if name in genes: 127 | if name in coord: 
128 | print "DUP",name 129 | coord[name]=[row[0],row[3],row[4]] 130 | 131 | gs=map(lambda x:x+coord[x[0]]+coord[x[1]],gs) 132 | return gs,coord 133 | 134 | gs,coord=parse_gold(gold_set) 135 | genes_gs=set([x for w in gs for x in w[0:2]]) 136 | gs_dict={} 137 | for g in gs: 138 | if g[0] not in gs_dict: 139 | gs_dict[g[0]]={} 140 | if g[1] not in gs_dict[g[0]]: 141 | gs_dict[g[0]][g[1]]=[] 142 | gs_dict[g[0]][g[1]].append(g[2:]) 143 | if g[1] not in gs_dict: 144 | gs_dict[g[1]]={} 145 | if g[0] not in gs_dict[g[1]]: 146 | gs_dict[g[1]][g[0]]=[] 147 | gs_dict[g[0]][g[1]].append(g[2:]) 148 | 149 | intervals=[] 150 | processed_gs=set([]) 151 | for g in gs: 152 | if g[0] not in processed_gs: 153 | intervals.append(pybedtools.Interval(chrom=g[2],start=int(g[3]),end=int(g[4]),name=g[0])) 154 | processed_gs.add(g[0]) 155 | if g[1] not in processed_gs: 156 | intervals.append(pybedtools.Interval(chrom=g[5],start=int(g[6]),end=int(g[7]),name=g[1])) 157 | processed_gs.add(g[1]) 158 | 159 | gs_bed=pybedtools.BedTool(intervals).sort() 160 | 161 | 162 | 163 | # In[7]: 164 | 165 | def evaluate(pred): 166 | tp=0 167 | fp=0 168 | for fusion in pred: 169 | g1,g2,c1,p1,c2,p2=fusion 170 | if g1 not in genes_gs: 171 | my_bed1=pybedtools.BedTool([pybedtools.Interval(chrom=c1,start=int(p1),end=int(p1)+1,name=g1)]) 172 | matches1=my_bed1.window(gs_bed,w=0) 173 | if len(matches1)>1: 174 | aaaa 175 | elif len(matches1)==1: 176 | g1=matches1[0][9] 177 | else: 178 | fp+=1 179 | continue 180 | 181 | if g2 not in genes_gs: 182 | my_bed2=pybedtools.BedTool([pybedtools.Interval(chrom=c2,start=int(p2),end=int(p2)+1,name=g2)]) 183 | matches2=my_bed2.window(gs_bed,w=0) 184 | if len(matches2)>1: 185 | aaaa 186 | elif len(matches2)==1: 187 | g2=matches2[0][9] 188 | else: 189 | fp+=1 190 | continue 191 | 192 | 193 | if g2 in gs_dict[g1]: 194 | tp+=1 195 | else: 196 | fp+=1 197 | 198 | print fp,tp,len(pred)-fp-tp 199 | return fp,tp 200 | 201 | 202 | # In[8]: 203 | 204 | preds={} 205 | for tool in tools: 206 | pred=parse_fusion(pred_file[tool],tool) 207 | preds[tool]=pred 208 | 209 | 210 | # In[9]: 211 | 212 | performance={} 213 | for tool in tools: 214 | fp,tp=evaluate(preds[tool]) 215 | performance[tool]={"FP":fp,"TP":tp,"PR":tp/float(tp+fp+0.0001),"SN":tp/float(len(gs))} 216 | print tool,performance[tool] 217 | 218 | 219 | # In[10]: 220 | 221 | sns.set(style="white",font_scale=2) 222 | fig, ax = plt.subplots(figsize=(6,6)) 223 | x=[] 224 | y=[] 225 | for tool in tools: 226 | x=(performance[tool]["SN"]*100) 227 | y=(performance[tool]["PR"]*100) 228 | label=tool 229 | ax.plot(x,y,label=label,linestyle="" 230 | ,marker="o",markersize=25) 231 | ax.set_yticks(range(0,70,10)) 232 | ax.set_ylim([0,60]) 233 | ax.set_xticks(range(20,55,5)) 234 | ax.set_xlim([20,50]) 235 | ax.set_xlabel("Sensitivity(%)") 236 | ax.set_ylabel("Precision(%)") 237 | 238 | ax.legend(bbox_to_anchor=(1, 0.8, 1.1, 0.1), loc=1,ncol=1, 239 | mode="expand", borderaxespad=0.,frameon=False) 240 | 241 | 242 | # In[ ]: 243 | 244 | 245 | 246 | 247 | # In[ ]: 248 | 249 | 250 | 251 | -------------------------------------------------------------------------------- /analysis_scripts/quantification/README.md: -------------------------------------------------------------------------------- 1 | RNACocktail Quantification Analysis 2 | =========== 3 | 4 | ### [Read it online here](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/bioinform/rnacocktail/master/analysis_scripts/quantification/RNACocktail-Quant-Analysis.ipynb) 5 | 
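
A minimal sketch of the normalization used by the notebook below (the helper is illustrative, not part of the pipeline): each transcript's quantified abundance is divided by the median abundance of the known housekeeping transcripts observed in the same sample, making replicates comparable before log-fold changes are computed.

```python
import numpy as np

def normalize_by_housekeeping(abundance, hk_transcripts):
    # abundance: {transcript_id: quantified value}; hk_transcripts: set of IDs
    hk_values = [abundance[t] for t in hk_transcripts if t in abundance]
    median_hk = float(np.median(hk_values))
    return {t: v / median_hk for t, v in abundance.items()}
```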
-------------------------------------------------------------------------------- /analysis_scripts/quantification/RNACocktail-Quant-Analysis.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | get_ipython().magic(u'pylab inline') 7 | 8 | 9 | # In[2]: 10 | 11 | import numpy as np 12 | import os 13 | import glob 14 | import pickle 15 | from operator import itemgetter 16 | from Bio import SeqIO 17 | import csv 18 | import scipy 19 | from scipy import stats 20 | import statsmodels.api as sm 21 | import seaborn as sns 22 | 23 | 24 | # # Initialization 25 | 26 | # In[3]: 27 | 28 | housekeeping_file="/path/to/housekeeping.txt" 29 | 30 | samples=["SEQC_A1","SEQC_A2"] 31 | 32 | 33 | # # Predictions 34 | 35 | # In[4]: 36 | 37 | quant_file={} 38 | 39 | quant_file["SEQC_A1"]="/path/to/quant.sf" 40 | quant_file["SEQC_A2"]="/path/to/quant.sf" 41 | 42 | 43 | # # Functions 44 | 45 | # In[5]: 46 | 47 | def parse_quant_results(res_file): 48 | mat=[] 49 | with open(res_file, 'r') as csv_f: 50 | spamreader = csv.reader(csv_f, delimiter='\t', quotechar='|') 51 | cnt=-1 52 | for row in spamreader: 53 | cnt+=1 54 | if cnt==0: 55 | continue 56 | mat.append([row[0],int(row[1]),float(row[2]),float(row[4]),float(row[3])]) 57 | return mat 58 | 59 | 60 | # In[6]: 61 | 62 | def find_corr(x,y): 63 | corr={} 64 | [r,p]=scipy.stats.spearmanr(np.log2(x), np.log2(y)) 65 | corr['spearman_log']=[r,p] 66 | return corr 67 | 68 | 69 | 70 | # In[7]: 71 | 72 | with open(housekeeping_file) as csv_file: 73 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|') 74 | hk_genes=[] 75 | hk_transcripts=[] 76 | for row in spamreader: 77 | hk_genes.append(row[0]) 78 | hk_transcripts.append(row[1]) 79 | hk_genes=set(hk_genes) 80 | hk_transcripts=set(hk_transcripts) 81 | print len(hk_genes),len(hk_transcripts) 82 | 83 | 84 | # ## Read Assembly transcripts 85 | 86 | # In[8]: 87 | 88 | quant_stats={} 89 | median_hk_trans={} 90 | for sample in samples: 91 | quant_stats[sample]=parse_quant_results(quant_file[sample]) 92 | x_dict={w[0]:w[4] for w in quant_stats[sample] if w[0]} 93 | hks=[x_dict[k] for k in set(x_dict.keys())&hk_transcripts] 94 | median_hk_trans[sample]=np.median(hks) 95 | 96 | 97 | # ## Plots 98 | 99 | # In[9]: 100 | 101 | pscnt=1 102 | sample1,sample2=samples 103 | labels=[] 104 | my_data=[] 105 | res1=quant_stats[sample1] 106 | res2=quant_stats[sample2] 107 | x_dict={w[0]:w[4]/float(median_hk_trans[sample1]) for w in res1} 108 | y_dict={w[0]:w[4]/float(median_hk_trans[sample2]) for w in res2} 109 | keys=list(set(x_dict.keys())|set(y_dict.keys())) 110 | keys=filter(lambda x:"ENST" in x,keys) 111 | x=np.array(map(lambda w:x_dict[w] if w in x_dict else 0,keys)) 112 | y=np.array(map(lambda w:y_dict[w] if w in y_dict else 0,keys)) 113 | x=x+0.5 114 | y=y+0.5 115 | f4=find((np.multiply((y>pscnt),(x>pscnt))) 116 | +(np.multiply((y>pscnt),(x<=pscnt))) 117 | +(np.multiply((y<=pscnt),(x>pscnt)))) 118 | w=(np.log2(x[f4])-np.log2(y[f4])) 119 | w=filter(lambda x: abs(x)>=0.000, w) 120 | logfc_data=w 121 | 122 | 123 | # In[10]: 124 | 125 | import seaborn as sns 126 | sns.set(style="whitegrid",font_scale=2) 127 | fig, ax = plt.subplots(figsize=(1,4)) 128 | my_data=logfc_data 129 | cg=sns.violinplot(data=my_data, palette="Set3" , bw=0.2, cut=10, 130 | linewidth=1,scale="area",inner="quartile",saturation=0.75,gridsize=500) 131 | ax.set_xticklabels(labels,rotation=90) 132 | ax.set_ylim([-1.5,1.5]) 133 | ax.set_yticks(np.arange(-1,2,1)) 134 | 
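# Note on the cell above: `find` comes from the pylab namespace
# (matplotlib.mlab.find) and returns the indices at which a boolean array is
# nonzero. The three combined masks keep transcripts whose normalized TPM
# exceeds the pseudo-count threshold `pscnt` in at least one of the two
# replicates; `logfc_data` is then the log2 ratio between replicates for
# those transcripts, and the cells below count |log2 ratio| > 1 as an
# expression disagreement between replicates.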
sns.despine(left=True, bottom=True) 135 | ax.set_title("Percentage of expression \n disagreement between replicates",fontsize=12) 136 | 137 | 138 | # In[11]: 139 | 140 | miss_diff_pscnt={} 141 | sample1,sample2=samples 142 | res1=quant_stats[sample1] 143 | res2=quant_stats[sample2] 144 | x_dict={w[0]:w[4]/float(median_hk_trans[sample1]) for w in res1} 145 | y_dict={w[0]:w[4]/float(median_hk_trans[sample2]) for w in res2} 146 | keys=list(set(x_dict.keys())|set(y_dict.keys())) 147 | keys=filter(lambda x:"ENST" in x,keys) 148 | x=np.array(map(lambda w:x_dict[w] if w in x_dict else 0,keys)) 149 | y=np.array(map(lambda w:y_dict[w] if w in y_dict else 0,keys)) 150 | for pscnt in np.arange(0,5,0.1): 151 | zz=find((np.multiply((y<=pscnt),(x<=pscnt)))) 152 | gg=find((np.multiply((y>pscnt),(x>pscnt)))+ 153 | (np.multiply((y>pscnt),(x<=pscnt)))+ 154 | (np.multiply((x>pscnt),(y<=pscnt)))) 155 | lfc=(np.log2(x[gg]+0.5)-np.log2(y[gg]+0.5)) 156 | miss_diff_pscnt[pscnt]=[sum(np.multiply(abs(lfc)>1,lfc>=0)), 157 | sum(np.multiply(abs(lfc)>1,lfc<0)), 158 | sum(abs(lfc)<=1), 159 | len(zz) 160 | ] 161 | 162 | 163 | # In[12]: 164 | 165 | sns.set(style="white",font_scale=2) 166 | fig, ax = plt.subplots(figsize=(14,8)) 167 | x=[] 168 | y=[] 169 | for pscnt in np.arange(0,5,0.1): 170 | md=miss_diff_pscnt[pscnt] 171 | x.append(md[3]/float(sum(md))*100) 172 | y.append(sum(md[0:2])/float(sum(md[0:4]))*100) 173 | ax.plot(x,y, alpha=0.8,linewidth=2) 174 | ax.set_xlabel("% Excluded") 175 | ax.set_ylabel("% Expression disagreement") 176 | ax.spines['top'].set_visible(False) 177 | ax.spines['right'].set_visible(False) 178 | ax.get_xaxis().tick_bottom() 179 | ax.get_yaxis().tick_left() 180 | 181 | -------------------------------------------------------------------------------- /analysis_scripts/reconstruction/README.md: -------------------------------------------------------------------------------- 1 | RNACocktail Transcriptome Reconstruction Analysis 2 | =========== 3 | 4 | ### [Read it online here](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/bioinform/rnacocktail/master/analysis_scripts/reconstruction/RNACocktail-Reconstruction-Analysis.ipynb) 5 | -------------------------------------------------------------------------------- /analysis_scripts/variant/README.md: -------------------------------------------------------------------------------- 1 | RNACocktail Variant Analysis 2 | =========== 3 | 4 | ### [Read it online here](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/bioinform/rnacocktail/master/analysis_scripts/variant/RNACocktail-Variant-Analysis.ipynb) 5 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | 4 | ENV RNACOCKTAIL_VERSION 0.3.2 5 | ENV R_VERSION 3.6.1-3bionic 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAMTOOLS_VERSION 1.2 9 | ENV BEDTOOLS2_VERSION 2.29.0 10 | ENV PYBEDTOOLS_VERSION 0.8.0 11 | ENV PYSAM_VERSION 0.15.0 12 | ENV HISAT2_VERSION 2.1.0 13 | ENV STRINGTIE_VERSION 2.0.4 14 | ENV SALMON_VERSION 0.11.0 15 | ENV OASES_VERSION 0.2.09 16 | ENV VELVET_VERSION 1.2.10 17 | ENV SUBREAD_VERSION 2.0.0 18 | ENV LORDEC_VERSION 0.9 19 | ENV STAR_VERSION 2.7.2b 20 | ENV PICARD_VERSION 2.19.0 21 | ENV HTSLIB_VERSION 1.9 22 | ENV GIREMI_VERSION 0.2.1 23 | ENV BIOPYTHON_VERSION 1.74 24 | ENV OPENPYXL_VERSION 2.6.4 25 | ENV XLRD_VERSION 1.1.0 26 | ENV BOWTIE_VERSION 1.2.3 27 | ENV 
BOWTIE2_VERSION 2.3.5.1 28 | ENV BWA_VERSION 0.7.17 29 | ENV SRA_VERSION 2.9.6 30 | ENV COREUTILS_VERSION 8.27 31 | ENV PIGZ_VERSION 2.4 32 | ENV GMAP_VERSION 2019-09-12 33 | ENV BBMAP_VERSION 38.44 34 | ENV FUSIONCATCHER_VERSION 1.20 35 | ENV GFFREAD_VERSION 0.11.5 36 | ENV IDPFUSION_VERSION 1.1.1 37 | ENV GATK_VERSION 4.1.4.0 38 | 39 | RUN apt-get update && \ 40 | apt-get install -y --fix-missing build-essential zlib1g-dev unzip libncurses5-dev curl wget python python-pip python-dev cmake libboost-all-dev libxml2-dev libcurl4-gnutls-dev software-properties-common apt-transport-https default-jre default-jdk less vim libtbb-dev git tabix 41 | 42 | RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 43 | RUN add-apt-repository 'deb [arch=amd64,i386] https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/' 44 | RUN apt-get update 45 | RUN apt-get install -y --fix-missing r-base=${R_VERSION} r-recommended=${R_VERSION} 46 | RUN apt-get install -y --fix-missing --allow-downgrades r-base-core=${R_VERSION} 47 | 48 | RUN echo 'local({r <- getOption("repos"); r["CRAN"] <- "http://cran.r-project.org"; options(repos=r)})' > ~/.Rprofile 49 | RUN R -e 'install.packages("BiocManager"); BiocManager::install(); BiocManager::install("DESeq2"); BiocManager::install("tximport"); BiocManager::install("readr");' 50 | 51 | ADD https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 /opt/samtools-${SAMTOOLS_VERSION}.tar.bz2 52 | RUN cd /opt && tar -xjvf samtools-${SAMTOOLS_VERSION}.tar.bz2 && cd samtools-${SAMTOOLS_VERSION} && make && make install && cd /opt && rm -rf samtools* 53 | 54 | ADD https://github.com/arq5x/bedtools2/releases/download/v${BEDTOOLS2_VERSION}/bedtools-${BEDTOOLS2_VERSION}.tar.gz /opt/bedtools-${BEDTOOLS2_VERSION}.tar.gz 55 | RUN cd /opt && tar -zxvf bedtools-${BEDTOOLS2_VERSION}.tar.gz && cd bedtools2 && make && make install && cd /opt && rm -rf bedtools* 56 | 57 | RUN wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-${HISAT2_VERSION}-Linux_x86_64.zip -O /opt/hisat2-${HISAT2_VERSION}-Linux_x86_64.zip && cd /opt && unzip hisat2-${HISAT2_VERSION}-Linux_x86_64.zip && cp -p /opt/hisat2-${HISAT2_VERSION}/hisat2* /usr/local/bin && cd /opt && rm -rf hisat2* 58 | 59 | ADD https://github.com/gpertea/stringtie/archive/v${STRINGTIE_VERSION}.tar.gz /opt/stringtie-${STRINGTIE_VERSION}.Linux_x86_64.tar.gz 60 | RUN cd /opt && tar -zxvf stringtie-${STRINGTIE_VERSION}.Linux_x86_64.tar.gz && cd stringtie-${STRINGTIE_VERSION} && make && cp -p /opt/stringtie-${STRINGTIE_VERSION}/stringtie /usr/local/bin && cd /opt && rm -rf stringtie* 61 | 62 | ADD https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/salmon-${SALMON_VERSION}-linux_x86_64.tar.gz /opt/salmon-${SALMON_VERSION}-linux_x86_64.tar.gz 63 | RUN cd /opt && tar -zxvf salmon-${SALMON_VERSION}-linux_x86_64.tar.gz && cp -p /opt/salmon-*/bin/salmon /usr/local/bin && cp -p /opt/salmon-*/lib/* /usr/local/lib && cd /opt && rm -rf salmon* 64 | 65 | ADD https://github.com/dzerbino/oases/archive/${OASES_VERSION}.tar.gz /opt/${OASES_VERSION}.tar.gz 66 | RUN cd /opt && tar -zxvf ${OASES_VERSION}.tar.gz && rm -rf /opt/oases-${OASES_VERSION}/velvet /opt/${OASES_VERSION}.tar.gz 67 | 68 | ADD https://www.ebi.ac.uk/~zerbino/velvet/velvet_${VELVET_VERSION}.tgz /opt/velvet_${VELVET_VERSION}.tgz 69 | RUN cd /opt && tar -zxvf velvet_${VELVET_VERSION}.tgz && cd velvet_${VELVET_VERSION} && make OPENMP=1 && mv 
/opt/velvet_${VELVET_VERSION} /opt/oases-${OASES_VERSION}/velvet && cd /opt/oases-${OASES_VERSION} && make OPENMP=1 && cp -p /opt/oases-${OASES_VERSION}/oases /usr/local/bin && cp -p /opt/oases-${OASES_VERSION}/velvet/velvet* /usr/local/bin && rm -rf /opt/velvet_${VELVET_VERSION}.tgz 70 | RUN rm -rf /opt/oases-${OASES_VERSION}/velvet/* && rm -rf /opt/oases-${OASES_VERSION}/velvet/.gitignore && rm -rf /opt/oases-${OASES_VERSION}/* && rm -rf /opt/oases* 71 | 72 | RUN wget http://downloads.sourceforge.net/project/subread/subread-${SUBREAD_VERSION}/subread-${SUBREAD_VERSION}-Linux-x86_64.tar.gz -O /opt/subread-${SUBREAD_VERSION}-Linux-x86_64.tar.gz && cd /opt && tar -zxvf subread-${SUBREAD_VERSION}-Linux-x86_64.tar.gz && cp -p /opt/subread-${SUBREAD_VERSION}-Linux-x86_64/bin/featureCounts /usr/local/bin && cd /opt && rm -rf subread* 73 | 74 | ADD https://gite.lirmm.fr/lordec/lordec-releases/uploads/710113d83c210b6989ccfbdbafa89234/lordec-bin_${LORDEC_VERSION}_linux64.tar.bz2 /opt/lordec-bin_${LORDEC_VERSION}_linux64.tar.bz2 75 | RUN cd /opt && tar xjf lordec-bin_${LORDEC_VERSION}_linux64.tar.bz2 && cd lordec-bin_${LORDEC_VERSION}_linux64 && cp -p /opt/lordec-bin_${LORDEC_VERSION}_linux64/lordec* /usr/local/bin && chmod -R 777 /usr/local/bin/lordec* && chown -R root /usr/local/bin/lordec* && chgrp -R root /usr/local/bin/lordec* && cd /opt && rm -rf lordec* 76 | 77 | ADD https://github.com/alexdobin/STAR/archive/${STAR_VERSION}.tar.gz /opt/STAR_${STAR_VERSION}.tar.gz 78 | RUN cd /opt && tar -zxvf STAR_${STAR_VERSION}.tar.gz && cp -p /opt/STAR-${STAR_VERSION}/bin/Linux_x86_64_static/* /usr/local/bin && cd /opt && rm -rf STAR* 79 | 80 | ADD https://github.com/broadinstitute/picard/releases/download/${PICARD_VERSION}/picard.jar /opt/picard.jar 81 | RUN cd /opt && cp -p picard.jar /usr/local/bin && chmod 755 /usr/local/bin/picard.jar && cd /opt && rm -rf picard* 82 | 83 | ENV HTSLIB_VERSION 1.3 84 | 85 | ADD https://github.com/samtools/htslib/releases/download/${HTSLIB_VERSION}/htslib-${HTSLIB_VERSION}.tar.bz2 /opt/htslib-${HTSLIB_VERSION}.tar.bz2 86 | RUN cd /opt && tar xjf htslib-${HTSLIB_VERSION}.tar.bz2 && cd htslib-${HTSLIB_VERSION} && ./configure && make && rm -rf /opt/htslib-${HTSLIB_VERSION}.tar.bz2 87 | 88 | ADD https://github.com/zhqingit/giremi/archive/v${GIREMI_VERSION}.tar.gz /opt/giremi-${GIREMI_VERSION}.tar.gz 89 | RUN cd /opt && tar -zxvf giremi-${GIREMI_VERSION}.tar.gz && cp -p giremi-${GIREMI_VERSION}/giremi* /usr/local/bin && chmod -R 777 /usr/local/bin/giremi* && cd /opt && rm -rf giremi-* 90 | 91 | RUN pip install --upgrade pip 92 | RUN pip install pybedtools==${PYBEDTOOLS_VERSION} pysam==${PYSAM_VERSION} biopython==${BIOPYTHON_VERSION} openpyxl==${OPENPYXL_VERSION} xlrd==${XLRD_VERSION} numpy pandas scipy 93 | 94 | RUN wget https://sourceforge.net/projects/bowtie-bio/files/bowtie/${BOWTIE_VERSION}/bowtie-${BOWTIE_VERSION}-linux-x86_64.zip -O /opt/bowtie-${BOWTIE_VERSION}-linux-x86_64.zip 95 | RUN cd /opt && unzip bowtie-${BOWTIE_VERSION}-linux-x86_64.zip && cp -p /opt/bowtie-${BOWTIE_VERSION}-linux-x86_64/bowtie* /usr/local/bin && cd /opt && rm -rf bowtie* 96 | 97 | RUN wget https://sourceforge.net/projects/bowtie-bio/files/bowtie2/${BOWTIE2_VERSION}/bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip -O /opt/bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip 98 | RUN cd /opt && unzip bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip && cp -p /opt/bowtie2-${BOWTIE2_VERSION}-linux-x86_64/bowtie2* /usr/local/bin && cd /opt && rm -rf bowtie2* 99 | 100 | RUN wget 
https://sourceforge.net/projects/bio-bwa/files/bwa-${BWA_VERSION}.tar.bz2/download -O /opt/bwa-${BWA_VERSION}.tar.bz2 101 | RUN cd /opt && tar xjf bwa-${BWA_VERSION}.tar.bz2 && cd bwa-${BWA_VERSION} && make && cp -p /opt/bwa-${BWA_VERSION}/bwa /usr/local/bin && cd /opt && rm -rf bwa* 102 | 103 | ADD https://github.com/ndaniel/seqtk/archive/1.2-r101c.tar.gz /opt/seqtk-1.2-r101c.tar.gz 104 | RUN cd /opt && tar -zxvf /opt/seqtk-1.2-r101c.tar.gz && cd seqtk-1.2-r101c && make && cp -p /opt/seqtk-1.2-r101c/seqtk /usr/local/bin && cd /opt && rm -rf seqtk* 105 | 106 | ADD http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/blat/blat /usr/local/bin/blat 107 | RUN chmod 755 /usr/local/bin/blat 108 | 109 | ADD http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/faToTwoBit /usr/local/bin/faToTwoBit 110 | RUN chmod 755 /usr/local/bin/faToTwoBit 111 | 112 | ADD http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver /usr/local/bin/liftOver 113 | RUN chmod 755 /usr/local/bin/liftOver 114 | 115 | ADD https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz /opt/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz 116 | RUN cd /opt && tar -zxvf sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz && cp -Rp /opt/sratoolkit.${SRA_VERSION}-ubuntu64/bin/* /usr/local/bin/ && cd /opt && rm -rf sratoolkit* 117 | 118 | ADD http://ftp.gnu.org/gnu/coreutils/coreutils-${COREUTILS_VERSION}.tar.xz /opt/coreutils-${COREUTILS_VERSION}.tar.xz 119 | RUN cd /opt && tar -xJf coreutils-${COREUTILS_VERSION}.tar.xz && cd coreutils-${COREUTILS_VERSION} && ./configure FORCE_UNSAFE_CONFIGURE=1 && make && make install && cd /opt && rm -rf coreutils* 120 | 121 | ADD https://github.com/madler/pigz/archive/v${PIGZ_VERSION}.tar.gz /opt/pigz-${PIGZ_VERSION}.tar.gz 122 | RUN cd /opt && tar -zxvf pigz-${PIGZ_VERSION}.tar.gz && cd pigz-${PIGZ_VERSION} && make && cp -p /opt/pigz-${PIGZ_VERSION}/pigz /usr/local/bin && cd /opt && rm -rf pigz* 123 | 124 | ADD http://research-pub.gene.com/gmap/src/gmap-gsnap-${GMAP_VERSION}.tar.gz /opt/gmap-gsnap-${GMAP_VERSION}.tar.gz 125 | RUN cd /opt && tar -zxvf gmap-gsnap-${GMAP_VERSION}.tar.gz && cd gmap-${GMAP_VERSION} && ./configure && make && make install && cd /opt && rm -rf gmap* 126 | 127 | ENV PATH $PATH:/opt/bbmap/ 128 | 129 | RUN wget https://sourceforge.net/projects/bbmap/files/BBMap_${BBMAP_VERSION}.tar.gz -O /opt/BBMap_${BBMAP_VERSION}.tar.gz 130 | RUN cd /opt && tar -xzvf BBMap_${BBMAP_VERSION}.tar.gz 131 | 132 | ENV PATH $PATH:/opt/fusioncatcher_v${FUSIONCATCHER_VERSION}/bin/ 133 | 134 | RUN wget https://github.com/ndaniel/fusioncatcher/releases/download/${FUSIONCATCHER_VERSION}/fusioncatcher_v${FUSIONCATCHER_VERSION}.zip -O /opt/fusioncatcher_v${FUSIONCATCHER_VERSION}.zip && cd /opt && unzip fusioncatcher_v${FUSIONCATCHER_VERSION}.zip && cp -p /opt/fusioncatcher_v${FUSIONCATCHER_VERSION}/bin/sam2psl.py /usr/local/bin && cp -p /opt/fusioncatcher_v${FUSIONCATCHER_VERSION}/bin/FC /opt/fusioncatcher_v${FUSIONCATCHER_VERSION}/bin/fusioncatcher 135 | 136 | 137 | ADD http://ccb.jhu.edu/software/stringtie/dl/gffread-${GFFREAD_VERSION}.Linux_x86_64.tar.gz opt/gffread-${GFFREAD_VERSION}.Linux_x86_64.tar.gz 138 | RUN cd /opt && tar -xzvf gffread-${GFFREAD_VERSION}.Linux_x86_64.tar.gz && cp -p /opt/gffread-${GFFREAD_VERSION}.Linux_x86_64/gffread /usr/local/bin && rm -rf /opt/gffread* 139 | 140 | RUN cd /opt/ && git clone https://github.com/bioinform/IDP.git && cd IDP && git checkout a5d2d624ab8e4545feff3f51d264931b440d0b53 141 | 142 | ADD 
http://augroup.org/IDP-fusion/files/IDP-fusion_${IDPFUSION_VERSION}.tar.gz /opt/IDP-fusion_${IDPFUSION_VERSION}.tar.gz 143 | RUN cd /opt && tar -xzvf IDP-fusion_${IDPFUSION_VERSION}.tar.gz && rm -rf /opt/IDP-fusion_${IDPFUSION_VERSION}.tar.gz 144 | 145 | RUN wget https://github.com/broadinstitute/gatk/releases/download/${GATK_VERSION}/gatk-${GATK_VERSION}.zip -O /opt/gatk-${GATK_VERSION}.zip && cd /opt && unzip gatk-${GATK_VERSION}.zip && chmod -R 777 /opt/gatk-${GATK_VERSION} 146 | 147 | RUN pip install https://github.com/bioinform/rnacocktail/archive/v${RNACOCKTAIL_VERSION}.tar.gz 148 | 149 | VOLUME /work_dir 150 | 151 | 152 | -------------------------------------------------------------------------------- /ez_setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Bootstrap setuptools installation 3 | 4 | To use setuptools in your package's setup.py, include this 5 | file in the same directory and add this to the top of your setup.py:: 6 | 7 | from ez_setup import use_setuptools 8 | use_setuptools() 9 | 10 | To require a specific version of setuptools, set a download 11 | mirror, or use an alternate download directory, simply supply 12 | the appropriate options to ``use_setuptools()``. 13 | 14 | This file can also be run as a script to install or upgrade setuptools. 15 | """ 16 | import os 17 | import shutil 18 | import sys 19 | import tempfile 20 | import zipfile 21 | import optparse 22 | import subprocess 23 | import platform 24 | import textwrap 25 | import contextlib 26 | 27 | from distutils import log 28 | 29 | try: 30 | from urllib.request import urlopen 31 | except ImportError: 32 | from urllib2 import urlopen 33 | 34 | try: 35 | from site import USER_SITE 36 | except ImportError: 37 | USER_SITE = None 38 | 39 | DEFAULT_VERSION = "12.0.4" 40 | DEFAULT_URL = "https://pypi.python.org/packages/source/s/setuptools/" 41 | 42 | def _python_cmd(*args): 43 | """ 44 | Return True if the command succeeded. 
45 | """ 46 | args = (sys.executable,) + args 47 | return subprocess.call(args) == 0 48 | 49 | 50 | def _install(archive_filename, install_args=()): 51 | with archive_context(archive_filename): 52 | # installing 53 | log.warn('Installing Setuptools') 54 | if not _python_cmd('setup.py', 'install', *install_args): 55 | log.warn('Something went wrong during the installation.') 56 | log.warn('See the error message above.') 57 | # exitcode will be 2 58 | return 2 59 | 60 | 61 | def _build_egg(egg, archive_filename, to_dir): 62 | with archive_context(archive_filename): 63 | # building an egg 64 | log.warn('Building a Setuptools egg in %s', to_dir) 65 | _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) 66 | # returning the result 67 | log.warn(egg) 68 | if not os.path.exists(egg): 69 | raise IOError('Could not build the egg.') 70 | 71 | 72 | class ContextualZipFile(zipfile.ZipFile): 73 | """ 74 | Supplement ZipFile class to support context manager for Python 2.6 75 | """ 76 | 77 | def __enter__(self): 78 | return self 79 | 80 | def __exit__(self, type, value, traceback): 81 | self.close() 82 | 83 | def __new__(cls, *args, **kwargs): 84 | """ 85 | Construct a ZipFile or ContextualZipFile as appropriate 86 | """ 87 | if hasattr(zipfile.ZipFile, '__exit__'): 88 | return zipfile.ZipFile(*args, **kwargs) 89 | return super(ContextualZipFile, cls).__new__(cls) 90 | 91 | 92 | @contextlib.contextmanager 93 | def archive_context(filename): 94 | # extracting the archive 95 | tmpdir = tempfile.mkdtemp() 96 | log.warn('Extracting in %s', tmpdir) 97 | old_wd = os.getcwd() 98 | try: 99 | os.chdir(tmpdir) 100 | with ContextualZipFile(filename) as archive: 101 | archive.extractall() 102 | 103 | # going in the directory 104 | subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) 105 | os.chdir(subdir) 106 | log.warn('Now working in %s', subdir) 107 | yield 108 | 109 | finally: 110 | os.chdir(old_wd) 111 | shutil.rmtree(tmpdir) 112 | 113 | 114 | def _do_download(version, download_base, to_dir, download_delay): 115 | egg = os.path.join(to_dir, 'setuptools-%s-py%d.%d.egg' 116 | % (version, sys.version_info[0], sys.version_info[1])) 117 | if not os.path.exists(egg): 118 | archive = download_setuptools(version, download_base, 119 | to_dir, download_delay) 120 | _build_egg(egg, archive, to_dir) 121 | sys.path.insert(0, egg) 122 | 123 | # Remove previously-imported pkg_resources if present (see 124 | # https://bitbucket.org/pypa/setuptools/pull-request/7/ for details). 125 | if 'pkg_resources' in sys.modules: 126 | del sys.modules['pkg_resources'] 127 | 128 | import setuptools 129 | setuptools.bootstrap_install_from = egg 130 | 131 | 132 | def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, 133 | to_dir=os.curdir, download_delay=15): 134 | to_dir = os.path.abspath(to_dir) 135 | rep_modules = 'pkg_resources', 'setuptools' 136 | imported = set(sys.modules).intersection(rep_modules) 137 | try: 138 | import pkg_resources 139 | except ImportError: 140 | return _do_download(version, download_base, to_dir, download_delay) 141 | try: 142 | pkg_resources.require("setuptools>=" + version) 143 | return 144 | except pkg_resources.DistributionNotFound: 145 | return _do_download(version, download_base, to_dir, download_delay) 146 | except pkg_resources.VersionConflict as VC_err: 147 | if imported: 148 | msg = textwrap.dedent(""" 149 | The required version of setuptools (>={version}) is not available, 150 | and can't be installed while this script is running. 
Please 151 | install a more recent version first, using 152 | 'easy_install -U setuptools'. 153 | 154 | (Currently using {VC_err.args[0]!r}) 155 | """).format(VC_err=VC_err, version=version) 156 | sys.stderr.write(msg) 157 | sys.exit(2) 158 | 159 | # otherwise, reload ok 160 | del pkg_resources, sys.modules['pkg_resources'] 161 | return _do_download(version, download_base, to_dir, download_delay) 162 | 163 | def _clean_check(cmd, target): 164 | """ 165 | Run the command to download target. If the command fails, clean up before 166 | re-raising the error. 167 | """ 168 | try: 169 | subprocess.check_call(cmd) 170 | except subprocess.CalledProcessError: 171 | if os.access(target, os.F_OK): 172 | os.unlink(target) 173 | raise 174 | 175 | def download_file_powershell(url, target): 176 | """ 177 | Download the file at url to target using Powershell (which will validate 178 | trust). Raise an exception if the command cannot complete. 179 | """ 180 | target = os.path.abspath(target) 181 | ps_cmd = ( 182 | "[System.Net.WebRequest]::DefaultWebProxy.Credentials = " 183 | "[System.Net.CredentialCache]::DefaultCredentials; " 184 | "(new-object System.Net.WebClient).DownloadFile(%(url)r, %(target)r)" 185 | % vars() 186 | ) 187 | cmd = [ 188 | 'powershell', 189 | '-Command', 190 | ps_cmd, 191 | ] 192 | _clean_check(cmd, target) 193 | 194 | def has_powershell(): 195 | if platform.system() != 'Windows': 196 | return False 197 | cmd = ['powershell', '-Command', 'echo test'] 198 | with open(os.path.devnull, 'wb') as devnull: 199 | try: 200 | subprocess.check_call(cmd, stdout=devnull, stderr=devnull) 201 | except Exception: 202 | return False 203 | return True 204 | 205 | download_file_powershell.viable = has_powershell 206 | 207 | def download_file_curl(url, target): 208 | cmd = ['curl', url, '--silent', '--output', target] 209 | _clean_check(cmd, target) 210 | 211 | def has_curl(): 212 | cmd = ['curl', '--version'] 213 | with open(os.path.devnull, 'wb') as devnull: 214 | try: 215 | subprocess.check_call(cmd, stdout=devnull, stderr=devnull) 216 | except Exception: 217 | return False 218 | return True 219 | 220 | download_file_curl.viable = has_curl 221 | 222 | def download_file_wget(url, target): 223 | cmd = ['wget', url, '--quiet', '--output-document', target] 224 | _clean_check(cmd, target) 225 | 226 | def has_wget(): 227 | cmd = ['wget', '--version'] 228 | with open(os.path.devnull, 'wb') as devnull: 229 | try: 230 | subprocess.check_call(cmd, stdout=devnull, stderr=devnull) 231 | except Exception: 232 | return False 233 | return True 234 | 235 | download_file_wget.viable = has_wget 236 | 237 | def download_file_insecure(url, target): 238 | """ 239 | Use Python to download the file, even though it cannot authenticate the 240 | connection. 241 | """ 242 | src = urlopen(url) 243 | try: 244 | # Read all the data in one block. 245 | data = src.read() 246 | finally: 247 | src.close() 248 | 249 | # Write all the data in one block to avoid creating a partial file. 
250 | with open(target, "wb") as dst: 251 | dst.write(data) 252 | 253 | download_file_insecure.viable = lambda: True 254 | 255 | def get_best_downloader(): 256 | downloaders = ( 257 | download_file_powershell, 258 | download_file_curl, 259 | download_file_wget, 260 | download_file_insecure, 261 | ) 262 | viable_downloaders = (dl for dl in downloaders if dl.viable()) 263 | return next(viable_downloaders, None) 264 | 265 | def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, 266 | to_dir=os.curdir, delay=15, downloader_factory=get_best_downloader): 267 | """ 268 | Download setuptools from a specified location and return its filename 269 | 270 | `version` should be a valid setuptools version number that is available 271 | as an sdist for download under the `download_base` URL (which should end 272 | with a '/'). `to_dir` is the directory where the egg will be downloaded. 273 | `delay` is the number of seconds to pause before an actual download 274 | attempt. 275 | 276 | ``downloader_factory`` should be a function taking no arguments and 277 | returning a function for downloading a URL to a target. 278 | """ 279 | # making sure we use the absolute path 280 | to_dir = os.path.abspath(to_dir) 281 | zip_name = "setuptools-%s.zip" % version 282 | url = download_base + zip_name 283 | saveto = os.path.join(to_dir, zip_name) 284 | if not os.path.exists(saveto): # Avoid repeated downloads 285 | log.warn("Downloading %s", url) 286 | downloader = downloader_factory() 287 | downloader(url, saveto) 288 | return os.path.realpath(saveto) 289 | 290 | def _build_install_args(options): 291 | """ 292 | Build the arguments to 'python setup.py install' on the setuptools package 293 | """ 294 | return ['--user'] if options.user_install else [] 295 | 296 | def _parse_args(): 297 | """ 298 | Parse the command line for options 299 | """ 300 | parser = optparse.OptionParser() 301 | parser.add_option( 302 | '--user', dest='user_install', action='store_true', default=False, 303 | help='install in user site package (requires Python 2.6 or later)') 304 | parser.add_option( 305 | '--download-base', dest='download_base', metavar="URL", 306 | default=DEFAULT_URL, 307 | help='alternative URL from where to download the setuptools package') 308 | parser.add_option( 309 | '--insecure', dest='downloader_factory', action='store_const', 310 | const=lambda: download_file_insecure, default=get_best_downloader, 311 | help='Use internal, non-validating downloader' 312 | ) 313 | parser.add_option( 314 | '--version', help="Specify which version to download", 315 | default=DEFAULT_VERSION, 316 | ) 317 | options, args = parser.parse_args() 318 | # positional arguments are ignored 319 | return options 320 | 321 | def main(): 322 | """Install or upgrade setuptools and EasyInstall""" 323 | options = _parse_args() 324 | archive = download_setuptools( 325 | version=options.version, 326 | download_base=options.download_base, 327 | downloader_factory=options.downloader_factory, 328 | ) 329 | return _install(archive, _build_install_args(options)) 330 | 331 | if __name__ == '__main__': 332 | sys.exit(main()) 333 | -------------------------------------------------------------------------------- /scripts/gpd2gtf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################ 4 | #This script is modified from the original code by Kin Fai Au 5 | #Obtained from 
https://github.com/jason-weirather/Au-public/blob/master/gold/gpd2gtf.py 6 | #Available under Apache License Version 2.0 7 | ############################################################################ 8 | 9 | import sys 10 | import math 11 | 12 | ### generate_transcript_list 13 | ############################ 14 | def generate_transcript_list(gpd_file, transcript_list): 15 | 16 | for line in gpd_file: 17 | 18 | if (line[0] == '#'): 19 | continue 20 | 21 | fields = line.split() 22 | num_exons = int(fields[8]) 23 | 24 | start_pos_list = fields[9].split(',') 25 | end_pos_list = fields[10].split(',') 26 | 27 | exon_pos = [0] * num_exons 28 | for i in range(num_exons): 29 | exon_pos[i] = [start_pos_list[i], end_pos_list[i]] 30 | 31 | transcript_list.append([fields[0], fields[1], fields[2], fields[3], exon_pos]) 32 | 33 | 34 | ### generate_FPKM_dict 35 | ####################### 36 | def generate_FPKM_dict(FPKM_file, FPKM_dict): 37 | 38 | for line in FPKM_file: 39 | fields = line.split() 40 | FPKM_dict[fields[0]] = fields[1] 41 | 42 | 43 | 44 | ### generate_gtf_format 45 | ####################### 46 | def generate_gtf_format(gtf_file, transcript_list, FPKM_dict, source): 47 | 48 | 49 | for line in transcript_list: 50 | exon_pos = line[4] 51 | # transcript line 52 | 53 | # chr name 54 | gtf_file.write(line[2] + '\t' + source + '\t' + "transcript" + '\t') 55 | # start-end pos, score 56 | gtf_file.write("%s"%(int(exon_pos[0][0])+1) + '\t' + exon_pos[-1][1] + '\t' + '*' + '\t') 57 | # Direction 58 | gtf_file.write(line[3] + '\t' + '.' + '\t') 59 | 60 | if (FPKM_dict.has_key( line[1]) ): 61 | FPKM = FPKM_dict[line[1]] 62 | else: 63 | FPKM = '*' 64 | attribute_1 = 'gene_id "' + line[0] + '"; transcript_id "' + line[1] + '"; ' 65 | attribute_2 = ('FPKM "' + FPKM + '"; frac "' + '*' + '"; conf_lo "' + '*' + '"; ' + 66 | 'conf_hi "' + '*' + '"; cov "' + '*' + '";\n') 67 | 68 | gtf_file.write(attribute_1) 69 | gtf_file.write(attribute_2) 70 | 71 | num_exons = len(exon_pos) 72 | for i in range(num_exons): 73 | # chr name 74 | gtf_file.write(line[2] + '\t' + source + '\t' + "exon" + '\t') 75 | # start-end pos, score 76 | gtf_file.write("%s"%(int(exon_pos[i][0])+1) + '\t' + exon_pos[i][1] + '\t' + '*' + '\t') 77 | # Direction 78 | gtf_file.write(line[3] + '\t' + '.' 
+ '\t') 79 | gtf_file.write(attribute_1) 80 | gtf_file.write('exon_number "' + str(i+1) + '"; ') 81 | gtf_file.write(attribute_2) 82 | 83 | 84 | 85 | ### Main 86 | ######## 87 | def main(): 88 | gpd_file = open(sys.argv[1], 'r') 89 | FPKM_file = open(sys.argv[2], 'r') 90 | gtf_file = open(sys.argv[3], 'w') 91 | source = sys.argv[4] 92 | 93 | transcript_list = [] 94 | FPKM_dict = dict() 95 | generate_transcript_list(gpd_file, transcript_list) 96 | generate_FPKM_dict(FPKM_file, FPKM_dict) 97 | generate_gtf_format(gtf_file, transcript_list, FPKM_dict, source) 98 | 99 | gpd_file.close() 100 | gtf_file.close() 101 | 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /scripts/hisat2_jun2bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import os 5 | 6 | if len(sys.argv) >= 3: 7 | HISATJun_filename = sys.argv[1] 8 | bed_filename = sys.argv[2] 9 | else: 10 | print("usage: python hisat2_jun2bed.py HISAT2_splicesites.txt junction.bed") 11 | sys.exit(1) 12 | 13 | jun_s = set() 14 | junction=open(HISATJun_filename,'r') 15 | output_file = open(bed_filename,'w') 16 | for line in junction: 17 | if line[0:5]=='track': 18 | continue 19 | else: 20 | line_list=line.strip().split("\t") 21 | leftpos=str(int(line_list[1])) 22 | rightpos=str(int(line_list[2])) 23 | locus = "___".join([line_list[0],leftpos,rightpos,line_list[3]]) 24 | jun_s.add(locus) 25 | 26 | output_file.write("track name=junctions description=\"HISAT2 junctions\"\n") 27 | i=0 28 | for locus in jun_s: 29 | output_ls = [] 30 | locus_ls = locus.split("___") 31 | chr_name = locus_ls[0] 32 | int_start = int(locus_ls[1])-51 33 | if int_start<=0: 34 | start = "1" 35 | width_start = str(49+int_start) 36 | else: 37 | start = str(int_start) 38 | width_start = "50" 39 | end = str( int(locus_ls[2]) + 50 ) 40 | distance = str( int(locus_ls[2]) - int(locus_ls[1])+51 ) 41 | 42 | sign = locus_ls[3] 43 | 44 | name = "HISAT" + str(i) 45 | 46 | i += 1 47 | output_ls = [chr_name,start,end,name,"50",sign,start,end,"0,0,0","2",width_start+",50","0,"+distance] 48 | output_file.write( '\t'.join(output_ls) + "\n" ) 49 | junction.close() 50 | output_file.close() 51 | 52 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | version = "Unknown" 4 | for line in open("src/_version.py"): 5 | if line.startswith("__version__"): 6 | version = line.strip().split("=")[1].strip().replace('"', '') 7 | 8 | print version 9 | setup( 10 | name='RNACocktail Pipeline', 11 | version=version, 12 | description='RNACocktail: A comprehensive framework for accurate and efficient RNA-Seq analysis', 13 | author='Roche Sequencing Solutions, Inc', 14 | author_email='bina.rd@roche.com', 15 | url='https://github.com/bioinform/rnacocktail', 16 | packages=find_packages(), 17 | install_requires=["pysam", "pybedtools"], 18 | scripts=['scripts/run_rnacocktail.py','scripts/hisat2_jun2bed.py', 19 | 'scripts/gpd2gtf.py'] 20 | ) 21 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | from _version import __version__ 2 | -------------------------------------------------------------------------------- /src/_version.py: 
-------------------------------------------------------------------------------- 1 | __version__ = "0.3.2" 2 | -------------------------------------------------------------------------------- /src/defaults.py: -------------------------------------------------------------------------------- 1 | MODES = set(["align", "reconstruct", "denovo", 2 | "quantify", "diff", "long_correct", "long_align", 3 | "long_reconstruct", "long_fusion", "variant", "editing", "fusion","all"]) 4 | SR_ALIGNERS = set(["HISAT2"]) 5 | RECONSTRUCTORS = set(["StringTie"]) 6 | QUANTIFIERS = set(["Salmon-SMEM"]) 7 | DIFFS = set(["DESeq2"]) 8 | DNV_ASSEMBLERS = set(["Oases"]) 9 | LR_CORRECTORS = set(["LoRDEC"]) 10 | LR_ALIGNERS= set(["STARlong"]) 11 | LR_RECONSTRUCTORS= set(["IDP"]) 12 | LR_FUSION= set(["IDP-fusion"]) 13 | variant_caller= set(["GATK"]) 14 | editing_caller= set(["GIREMI"]) 15 | fusion_caller= set(["FusionCatcher"]) 16 | TIMEOUT = 10000000 # in seconds 17 | 18 | 19 | SALMON_LIBTYPE = "IU" 20 | SALMON_SMEM_k = 19 21 | DESeq2_MINCNT = 2 22 | DESeq2_ALPHA = 0.05 23 | DNV_HASH = 25 24 | DNV_FORMAT = "fasta" 25 | DNV_READTYPE = "short" 26 | STARLONG_DEFAULTS = {"outSAMattributes": "NH HI NM MD", "readNameSeparator": "space", 27 | "outFilterMultimapScoreRange": "1", "outFilterMismatchNmax": "2000", 28 | "scoreGapNoncan": "-20", "scoreGapGCAG":"-4", "scoreGapATAC":"-8", 29 | "scoreDelOpen": "-1", "scoreDelBase": "-1", "scoreInsOpen": "-1", "scoreInsBase": "-1", 30 | "alignEndsType": "Local", "seedSearchStartLmax": "50", "seedPerReadNmax": "100000", 31 | "seedPerWindowNmax": "1000", "alignTranscriptsPerReadNmax": "100000", 32 | "alignTranscriptsPerWindowNmax": "10000"} 33 | 34 | 35 | GATK_SN_OPT = "" 36 | 37 | GATK_HC_STANDCALLCONF = 20.0 38 | GATK_HC_STANDEMITCONF = 20.0 39 | GATK_HC_OPT = (("-stand-call-conf %f " % GATK_HC_STANDCALLCONF) if GATK_HC_STANDCALLCONF else "") + \ 40 | "--dont-use-soft-clipped-bases " 41 | 42 | GATK_VF_WINDOW = 35 43 | GATK_VF_CLUSTER = 3 44 | GATK_VF_FSMIN = 30.0 45 | GATK_VF_QDMAX = 2.0 46 | GATK_VF_OPT = (("-window %d " % GATK_VF_WINDOW) if GATK_VF_WINDOW else "") + \ 47 | (("-cluster %d " % GATK_VF_CLUSTER) if GATK_VF_CLUSTER else "") + \ 48 | (("--filter-name FS -filter 'FS > %f' " % GATK_VF_FSMIN) if GATK_VF_FSMIN else "") + \ 49 | (("--filter-name QD -filter 'QD < %f' " % GATK_VF_QDMAX) if GATK_VF_QDMAX else "") 50 | 51 | JAVA_XMS = "-Xms1g" 52 | JAVA_XMG = "-Xmx5g" 53 | JAVA_OPT= "%s %s"%(JAVA_XMS,JAVA_XMG) 54 | 55 | 56 | HISAT2 = "hisat2" 57 | HISAT2_SPS = "hisat2_extract_splice_sites.py" 58 | SAMTOOLS = "samtools" 59 | STRINGTIE = "stringtie" 60 | SALMON = "salmon" 61 | R_CMD = "R" 62 | FEATURECOUNTS = "featureCounts" 63 | VELVETG = "velvetg" 64 | VELVETH = "velveth" 65 | OASES = "oases" 66 | LORDEC = "lordec-correct" 67 | STARLONG = "STARlong" 68 | SAM2PSL = "sam2psl.py" 69 | IDP = "runIDP.py" 70 | IDPFUSION = "runIDP.py" 71 | GMAP="gmap" 72 | STAR_DIR = "/usr/local/bin" 73 | BOWTIE2_DIR = "/usr/local/bin" 74 | PICARD = "picard.jar" 75 | GATK = "GenomeAnalysisTK.jar" 76 | JAVA = "java" 77 | GIREMI = "giremi" 78 | HTSLIB = "" 79 | FUSIONCATCHER= "fusioncatcher" -------------------------------------------------------------------------------- /src/external_cmd.py: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | #This script is modified from the original code 3 | #obtained from https://github.com/bioinform/metasv/blob/master/metasv/external_cmd.py 4 | #Copyright (c) 
2014, Bina Technologies inc. 5 | ############################################################################ 6 | 7 | 8 | import time 9 | import shlex 10 | import subprocess 11 | from threading import Timer 12 | import unittest 13 | import os 14 | from utils import * 15 | 16 | class TimedExternalCmd: 17 | def __init__(self, cmd, logger, raise_exception=False, env_dict={}): 18 | self.cmd = shlex.split(cmd) 19 | self.p = None 20 | self.did_timeout = False 21 | self.logger = logger 22 | self.raise_exception = raise_exception 23 | self.env_dict = env_dict 24 | def enforce_timeout(self): 25 | self.p.terminate() 26 | self.did_timeout = True 27 | def run(self, cmd_log_fd_out=None, cmd_log_fd_err=None, cmd_log="", msg="", timeout=None): 28 | self.logger.info("Task: %s " % (msg)) 29 | self.logger.info("Running \"%s\" " % (" ".join(self.cmd))) 30 | cmd_log_fd_err = cmd_log_fd_err or cmd_log_fd_out 31 | if self.env_dict: 32 | my_env = os.environ.copy() 33 | for k,v in self.env_dict.iteritems(): 34 | my_env[k] = v 35 | self.p = subprocess.Popen(self.cmd, stderr=cmd_log_fd_err, stdout=cmd_log_fd_out, env=my_env) 36 | else: 37 | self.p = subprocess.Popen(self.cmd, stderr=cmd_log_fd_err, stdout=cmd_log_fd_out) 38 | 39 | start_time = time.time() 40 | if timeout: 41 | t = Timer(timeout, self.enforce_timeout) 42 | t.start() 43 | self.p.wait() 44 | if timeout: 45 | t.cancel() 46 | if self.did_timeout: 47 | if not self.raise_exception: 48 | self.logger.error("Timed out after %d seconds.", timeout) 49 | return None 50 | else: 51 | self.logger.error("Aborting!") 52 | raise Exception("Timed out after %d seconds."%timeout) 53 | retcode = self.p.returncode 54 | if retcode == 0: 55 | self.logger.info("Done %s " % msg) 56 | else: 57 | if self.raise_exception: 58 | self.logger.info("Returned code %d (%g seconds)" % (retcode, time.time() - start_time)) 59 | self.logger.error("Aborting!") 60 | if cmd_log: 61 | raise Exception("Failed %s. 
Log file: %s" % (msg,cmd_log)) 62 | else: 63 | raise Exception(msg) 64 | self.logger.info("Returned code %d (%g seconds)" % (retcode, time.time() - start_time)) 65 | return retcode 66 | 67 | 68 | class TestTimedExternalCmd(unittest.TestCase): 69 | def test_run_complete(self): 70 | cmd = TimedExternalCmd("sleep 1", self.logger) 71 | self.assertEqual(cmd.run(timeout = 2), 0) 72 | self.assertFalse(cmd.did_timeout) 73 | return 74 | 75 | def test_run_timeout(self): 76 | start_tick = time.time() 77 | cmd = TimedExternalCmd("sleep 2", self.logger) 78 | cmd.run(timeout = 1) 79 | run_time = time.time() - start_tick 80 | self.assertTrue(cmd.did_timeout) 81 | self.assertAlmostEqual(run_time, 1, delta=0.2) 82 | return 83 | 84 | def test_run_no_timeout(self): 85 | cmd = TimedExternalCmd("sleep 1", self.logger) 86 | retcode = cmd.run() 87 | self.assertEqual(cmd.run(), 0) 88 | self.assertFalse(cmd.did_timeout) 89 | return 90 | 91 | def test_run_fail(self): 92 | cmd = TimedExternalCmd("sleep 1 2 3", self.logger) 93 | retcode = cmd.run(timeout = 1) 94 | self.assertIsNotNone(retcode) 95 | self.assertIsNot(retcode, 0) 96 | return 97 | 98 | logger = None 99 | 100 | 101 | if __name__ == '__main__': 102 | TestTimedExternalCmd.logger = logging.getLogger(__name__) 103 | unittest.main() 104 | -------------------------------------------------------------------------------- /src/run_dnv_assemebly.py: -------------------------------------------------------------------------------- 1 | import os 2 | from external_cmd import TimedExternalCmd 3 | from defaults import * 4 | from utils import * 5 | 6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' 7 | logFormatter = logging.Formatter(FORMAT) 8 | logger = logging.getLogger(__name__) 9 | consoleHandler = logging.StreamHandler() 10 | consoleHandler.setFormatter(logFormatter) 11 | logger.addHandler(consoleHandler) 12 | 13 | def run_oases(assmebly_hash=DNV_HASH, 14 | seq_1="", seq_2="", seq_u="", seq_i="", 15 | file_format=DNV_FORMAT, read_type=DNV_READTYPE, 16 | oases=OASES, velvetg=VELVETG, velveth=VELVETH, 17 | oases_opts="", velvetg_opts="", velveth_opts="", 18 | start=0, sample= "", nthreads=1, 19 | workdir=None, outdir=None, timeout=TIMEOUT): 20 | 21 | logger.info("Running de novo assembly (OASES) for %s"%sample) 22 | 23 | if seq_1 and seq_2: 24 | for s1 in seq_1.split(","): 25 | if not os.path.exists(s1): 26 | logger.error("Aborting!") 27 | raise Exception("No Mate 1 sequence file %s"%s1) 28 | for s2 in seq_2.split(","): 29 | if not os.path.exists(s2): 30 | logger.error("Aborting!") 31 | raise Exception("No Mate 2 sequence file %s"%s2) 32 | seq_argument="-separate %s %s"%(seq_1,seq_2) 33 | elif seq_u: 34 | seq_argument=seq_u 35 | for su in seq_u.split(","): 36 | if not os.path.exists(su): 37 | logger.error("Aborting!") 38 | raise Exception("No unpaired sequence file %s"%su) 39 | 40 | elif seq_i: 41 | seq_argument=seq_i 42 | for sr in seq_i.split(","): 43 | if not os.path.exists(sr): 44 | logger.error("Aborting!") 45 | raise Exception("No sra sequence file %s"%sr) 46 | 47 | work_oases=os.path.join(workdir,"oases",sample) 48 | create_dirs([work_oases]) 49 | 50 | step=0 51 | if start<=step: 52 | logger.info("--------------------------STEP %s--------------------------"%step) 53 | msg = "Erase Oases work directory for %s"%sample 54 | command="rm -rf %s/*" % ( 55 | work_oases) 56 | command="bash -c \"%s\""%command 57 | cmd = TimedExternalCmd(command, logger, raise_exception=False) 58 | retcode = cmd.run(msg=msg, timeout=timeout) 59 | step+=1 60 | 61 | 
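# Note: the start/step bookkeeping below is the resume mechanism shared by the
# run_* modules. Each stage executes only when start <= step, and step is
# incremented afterwards, so a run that failed at, say, velvetg can be
# re-invoked with a larger start value to skip the stages that already
# completed. velveth is the one stage launched with env_dict (OMP_NUM_THREADS)
# because Velvet parallelizes via OpenMP when compiled with OPENMP=1, as in
# the Dockerfile above.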
oases_log = os.path.join(work_oases, "oases.log") 62 | oases_log_fd = open(oases_log, "w") 63 | 64 | 65 | seq_argument="-%s -%s %s "%(file_format,read_type,seq_argument) 66 | 67 | msg = "velveth for %s"%sample 68 | if start<=step: 69 | logger.info("--------------------------STEP %s--------------------------"%step) 70 | command="%s %s %d %s %s" % ( 71 | velveth, work_oases, assmebly_hash, velveth_opts, seq_argument) 72 | command="bash -c \"%s\""%command 73 | cmd = TimedExternalCmd(command, logger, raise_exception=True, env_dict={"OMP_NUM_THREADS":str(nthreads)}) 74 | retcode = cmd.run(cmd_log_fd_out=oases_log_fd, cmd_log=oases_log, msg=msg, timeout=timeout) 75 | else: 76 | logger.info("Skipping step %d: %s"%(step,msg)) 77 | step+=1 78 | 79 | 80 | msg = "velvetg for %s"%sample 81 | if start<=step: 82 | logger.info("--------------------------STEP %s--------------------------"%step) 83 | command="%s %s %s -read_trkg yes " % ( 84 | velvetg, work_oases, velvetg_opts) 85 | command="bash -c \"%s\""%command 86 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 87 | retcode = cmd.run(cmd_log_fd_out=oases_log_fd, cmd_log=oases_log, msg=msg, timeout=timeout) 88 | else: 89 | logger.info("Skipping step %d: %s"%(step,msg)) 90 | step+=1 91 | 92 | msg = "oases for %s"%sample 93 | if start<=step: 94 | logger.info("--------------------------STEP %s--------------------------"%step) 95 | command="%s %s %s " % ( 96 | oases, work_oases, oases_opts) 97 | command="bash -c \"%s\""%command 98 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 99 | retcode = cmd.run(cmd_log_fd_out=oases_log_fd, cmd_log=oases_log, msg=msg, timeout=timeout) 100 | else: 101 | logger.info("Skipping step %d: %s"%(step,msg)) 102 | step+=1 103 | 104 | out_oases=os.path.join(outdir,"oases",sample) 105 | create_dirs([out_oases]) 106 | msg="Copy predictions to output directory for %s."%sample 107 | if start<=step: 108 | logger.info("--------------------------STEP %s--------------------------"%step) 109 | if os.path.exists("%s/transcripts.fa"%work_oases): 110 | command = "cp %s/transcripts.fa %s/transcripts.fa"%( 111 | work_oases, out_oases) 112 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 113 | retcode = cmd.run(cmd_log_fd_out=oases_log_fd, cmd_log=oases_log, msg=msg, timeout=timeout) 114 | else: 115 | logger.info("Skipping step %d: %s"%(step,msg)) 116 | step+=1 117 | 118 | 119 | transcripts = "" 120 | if os.path.exists("%s/transcripts.fa"%out_oases): 121 | logger.info("Oases was successful!") 122 | logger.info("Output transcripts: %s/transcripts.fa"%out_oases) 123 | transcripts = "%s/transcripts.fa"%out_oases 124 | else: 125 | logger.info("Oases failed!") 126 | return transcripts 127 | 128 | def run_dnv_assemebly(assembler="Oases", assmebly_hash=DNV_HASH, 129 | seq_1="", seq_2="", seq_u="", seq_i="", 130 | file_format=DNV_FORMAT, read_type=DNV_READTYPE, 131 | oases=OASES, velvetg=VELVETG, velveth=VELVETH, 132 | oases_opts="", velvetg_opts="", velveth_opts="", 133 | start=0, sample= "", nthreads=1, 134 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False): 135 | transcripts="" 136 | if assembler.upper()=="OASES": 137 | try: 138 | transcripts=run_oases(assmebly_hash=assmebly_hash, 139 | seq_1=seq_1, seq_2=seq_2, seq_u=seq_u, seq_i=seq_i, 140 | file_format=file_format, read_type=read_type, 141 | oases=oases, velvetg=velvetg, velveth=velveth, 142 | oases_opts=oases_opts, velvetg_opts=velvetg_opts, velveth_opts=velveth_opts, 143 | start=start, sample= sample, nthreads=nthreads, 144 | 
workdir=workdir, outdir=outdir, timeout=timeout) 145 | except Exception as excp: 146 | logger.info("Oases failed!") 147 | logger.error(excp) 148 | if not ignore_exceptions: 149 | raise Exception(excp) 150 | return transcripts -------------------------------------------------------------------------------- /src/run_editing.py: -------------------------------------------------------------------------------- 1 | import os 2 | from external_cmd import TimedExternalCmd 3 | from defaults import * 4 | from utils import * 5 | import pysam 6 | import sys 7 | import csv 8 | import pybedtools 9 | 10 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' 11 | logFormatter = logging.Formatter(FORMAT) 12 | logger = logging.getLogger(__name__) 13 | consoleHandler = logging.StreamHandler() 14 | consoleHandler.setFormatter(logFormatter) 15 | logger.addHandler(consoleHandler) 16 | 17 | def filter_multi_chr_alignments(in_file,out_file): 18 | current_read="" 19 | chrms=set([]) 20 | reads=[] 21 | infile=pysam.AlignmentFile(in_file, "rb") 22 | outfile=pysam.AlignmentFile(out_file, "wb",template=infile) 23 | for read in infile: 24 | if read.qname !=current_read: 25 | if current_read!="": 26 | if len(chrms)==1: 27 | for r in reads: 28 | outfile.write(r) 29 | current_read=read.qname 30 | chrms=set([read.tid]) 31 | reads=[read] 32 | else: 33 | chrms.add(read.tid) 34 | reads.append(read) 35 | if len(chrms)==1: 36 | for r in reads: 37 | outfile.write(r) 38 | outfile.close() 39 | 40 | 41 | 42 | 43 | def fix_SNV_no(feature): 44 | return pybedtools.Interval(feature.chrom, feature.start, feature.end, name="SNV", 45 | score=feature.score, strand=".",otherfields=[".","."]) 46 | 47 | def merge_info_SNV(feature): 48 | pos=round(min(abs(int(feature[9])-feature.start), 49 | abs(int(feature[10])-feature.start))/float(int(feature[10])-int(feature[9])+1)*100) 50 | isin=1 if ( feature.start>= int(feature[9]) and feature.start<=int(feature[10])) else -1 51 | pos=pos*isin 52 | name="%s,%s"%(feature[3],feature[11]) 53 | otherfields= [str(pos),feature[12]] 54 | return pybedtools.Interval(chrom=feature.chrom,start=feature.start,end=feature.end,name=name, 55 | score=feature.score,strand=feature[13],otherfields=otherfields) 56 | 57 | def find_SNV_strands(strand_pos_bed,genes_pos_bed,input_annotated_vcf,output_annotated_bed): 58 | 59 | final_fwd=pybedtools.BedTool(strand_pos_bed).filter(lambda x:x.strand=="+").sort() 60 | final_rev=pybedtools.BedTool(strand_pos_bed).filter(lambda x:x.strand=="-").sort() 61 | 62 | vcf_intervals=[] 63 | with open(input_annotated_vcf, 'rb') as csvfile: 64 | spamreader = csv.reader(csvfile, delimiter='\t', quotechar='|') 65 | for x in spamreader: 66 | if x[0][0]=="#": 67 | continue 68 | if x[6]!="PASS": 69 | continue 70 | if len(x[3])!=1 or len(x[4])!=1: 71 | continue 72 | 73 | gt=x[9].split(":")[0] 74 | gt=gt.split("|") if "|" in gt else gt.split("/") 75 | if gt[0]==gt[1]: 76 | continue 77 | 78 | vcf_intervals.append(pybedtools.Interval(x[0], int(x[1])-1, int(x[1]), name="SNV", 79 | score=1 if "DB" in x[7] else 0, strand=".",otherfields=[".","."])) 80 | SNV=pybedtools.BedTool(vcf_intervals).sort().saveas() 81 | 82 | 83 | 84 | 85 | for w in [0,10,50,100,200,400,800,1000]: 86 | if w==0: 87 | SNV_no=SNV 88 | SNV_fwd=SNV_no.window(final_fwd,w=w).each(merge_info_SNV).sort() 89 | if len(SNV_fwd)>0: 90 | SNV_fwd=SNV_fwd.groupby(g=[1,2,3],c=[4,5,6,7,8],o="first,first,first,max,min") 91 | SNV_fwd1=SNV_no.window(final_fwd,w=w,v=True) 92 | SNV_fwd=SNV_fwd.cat(SNV_fwd1,postmerge=False).sort() 93 | 94 | 
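# Note: the reverse-strand pass below mirrors the forward-strand pass above.
# The enclosing loop widens the window w from 0 to 1000 bp: SNVs overlapping
# annotated features on exactly one strand are assigned that strand, SNVs with
# no hit are reset by fix_SNV_no and retried at the next window size, and SNVs
# matching features on both strands are set aside (and, after the first
# iteration, folded back into the unassigned pool).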
SNV_rev=SNV_no.window(final_rev,w=w).each(merge_info_SNV).sort() 95 | if len(SNV_rev)>0: 96 | SNV_rev=SNV_rev.groupby(g=[1,2,3],c=[4,5,6,7,8],o="first,first,first,max,min") 97 | SNV_rev1=SNV_no.window(final_rev,w=w,v=True) 98 | SNV_rev=SNV_rev.cat(SNV_rev1,postmerge=False).sort() 99 | SNV_final=SNV_fwd.cat(SNV_rev,postmerge=False).sort() 100 | if len(SNV_final)>0: 101 | SNV_final=SNV_final.groupby(g=[1,2,3],c=[4,5,6,7,8],o="collapse,first,collapse,collapse,collapse") 102 | 103 | SNV_good_=SNV_final.filter(lambda x:len(set(x[5].split(","))-set("."))==1).sort() 104 | SNV_no=SNV_final.filter(lambda x:len(set(x[5].split(","))-set("."))==0).each(fix_SNV_no).sort() 105 | SNV_bad_=SNV_final.filter(lambda x:len(set(x[5].split(","))-set("."))>1).sort() 106 | 107 | if w==0: 108 | SNV_good=SNV_good_ 109 | SNV_bad=SNV_bad_ 110 | else: 111 | SNV_good=SNV_good.cat(SNV_good_,postmerge=False).sort() 112 | SNV_no=SNV_no.cat(SNV_bad_,postmerge=False).sort() 113 | 114 | 115 | SNV_annotated=[] 116 | cnt=0 117 | for i in SNV_good: 118 | name=list(set(i.name.split(","))-set(["SNV"]))[0] 119 | strand=list(set(i.strand.split(","))-set(["."])) 120 | strand=strand[0] 121 | SNV_annotated.append(pybedtools.Interval(chrom=i.chrom,start=i.start,end=i.end,name=name, 122 | score=i.score,strand=strand)) 123 | for i in SNV_no: 124 | SNV_annotated.append(pybedtools.Interval(chrom=i.chrom,start=i.start,end=i.end,name="SNV%d"%cnt, 125 | score=i.score,strand=".")) 126 | cnt+=1 127 | SNV_output_annotated_bed=pybedtools.BedTool(SNV_annotated).sort() 128 | 129 | Intes=SNV_output_annotated_bed.window(genes_pos_bed,v=True).each(lambda x: 130 | pybedtools.Interval(x[0],int(x[1]),int(x[2]),"Inte",x[4],"#")).sort() 131 | Genes=SNV_output_annotated_bed.window(genes_pos_bed,u=True) 132 | SNV_output_annotated_bed=Intes.cat(Genes,postmerge=False).sort().saveas(output_annotated_bed) 133 | 134 | 135 | 136 | def run_giremi(alignment="", variant="", 137 | strand_pos="", genes_pos="", 138 | ref_genome="", knownsites="", 139 | giremi_dir="", htslib_dir="", 140 | samtools=SAMTOOLS, gatk=GATK, 141 | java=JAVA, giremi_opts="", java_opts="", 142 | VariantAnnotator_opts="", 143 | start=0, sample= "", nthreads=1, 144 | workdir=None, outdir=None, timeout=TIMEOUT): 145 | 146 | 147 | logger.info("Running RNA editing detection (GIREMI) for %s"%sample) 148 | if not os.path.exists(alignment): 149 | logger.error("Aborting!") 150 | raise Exception("No alignment file %s"%alignment) 151 | if not os.path.exists(variant): 152 | logger.error("Aborting!") 153 | raise Exception("No variant VCF file %s"%variant) 154 | if not os.path.exists(strand_pos): 155 | logger.error("Aborting!") 156 | raise Exception("No strand position BED file %s"%strand_pos) 157 | if not os.path.exists(genes_pos): 158 | logger.error("Aborting!") 159 | raise Exception("No genes position BED file %s"%genes_pos) 160 | if not os.path.exists(ref_genome): 161 | logger.error("Aborting!") 162 | raise Exception("No reference genome FASTA file %s"%ref_genome) 163 | if not os.path.exists(knownsites): 164 | logger.error("Aborting!") 165 | raise Exception("No VCF knownsites file %s"%knownsites) 166 | if giremi_dir: 167 | if not os.path.exists(giremi_dir): 168 | logger.error("Aborting!") 169 | raise Exception("No GIREMI directory %s"%giremi_dir) 170 | 171 | work_giremi=os.path.join(workdir,"giremi",sample) 172 | create_dirs([work_giremi]) 173 | 174 | tmp_dir = "" 175 | if "-Xms" not in java_opts: 176 | java_opts += " %s"%JAVA_XMS 177 | if "-Xmx" not in java_opts: 178 | java_opts += " %s"%JAVA_XMG 179 | if 
"-Djava.io.tmpdir" not in java_opts: 180 | java_opts += " -Djava.io.tmpdir=%s/javatmp/"%(work_giremi) 181 | tmp_dir="%s/javatmp/"%(work_giremi) 182 | 183 | 184 | step=0 185 | if start<=step: 186 | logger.info("--------------------------STEP %s--------------------------"%step) 187 | msg = "Erase GIREMI work directory for %s"%sample 188 | command="rm -rf %s/*" % ( 189 | work_giremi) 190 | command="bash -c \"%s\""%command 191 | cmd = TimedExternalCmd(command, logger, raise_exception=False) 192 | retcode = cmd.run(msg=msg,timeout=timeout) 193 | step+=1 194 | 195 | giremi_log = os.path.join(work_giremi, "giremi.log") 196 | giremi_log_fd = open(giremi_log, "w") 197 | 198 | if tmp_dir: 199 | create_dirs([tmp_dir]) 200 | 201 | msg = "Sort BAM by name for %s"%sample 202 | if start<=step: 203 | logger.info("--------------------------STEP %s--------------------------"%step) 204 | command="%s sort -n -@ %d -T %s/alignments.name_sorted -o %s/alignments.name_sorted.bam %s" % ( 205 | samtools, nthreads, work_giremi, work_giremi, alignment) 206 | command="bash -c \"%s\""%command 207 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 208 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout) 209 | else: 210 | logger.info("Skipping step %d: %s"%(step,msg)) 211 | step+=1 212 | 213 | 214 | msg = "Filter alignments mapped to multiple chromosomes for %s"%sample 215 | if start<=step: 216 | logger.info("--------------------------STEP %s--------------------------"%step) 217 | logger.info(msg) 218 | filter_multi_chr_alignments("%s/alignments.name_sorted.bam"%work_giremi,"%s/alignments.chr_unique.bam"%work_giremi) 219 | else: 220 | logger.info("Skipping step %d: %s"%(step,msg)) 221 | step+=1 222 | 223 | msg = "Sort BAM by pos for %s"%sample 224 | if start<=step: 225 | logger.info("--------------------------STEP %s--------------------------"%step) 226 | command="%s sort -@ %d -T %s/alignments.pos_sorted -o %s/alignments.pos_sorted.bam %s/alignments.chr_unique.bam" % ( 227 | samtools, nthreads, work_giremi, work_giremi, work_giremi) 228 | command="bash -c \"%s\""%command 229 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 230 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout) 231 | else: 232 | logger.info("Skipping step %d: %s"%(step,msg)) 233 | step+=1 234 | 235 | msg = "GATK IndexFeatureFile for %s"%sample 236 | if start<=step: 237 | logger.info("--------------------------STEP %s--------------------------"%step) 238 | command="%s %s -jar %s IndexFeatureFile -F %s" % ( 239 | java, java_opts, gatk, variant) 240 | command="bash -c \"%s\""%command 241 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 242 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout) 243 | else: 244 | logger.info("Skipping step %d: %s"%(step,msg)) 245 | step+=1 246 | 247 | 248 | msg = "GATK VariantAnnotator for %s"%sample 249 | if start<=step: 250 | logger.info("--------------------------STEP %s--------------------------"%step) 251 | command="%s %s -jar %s VariantAnnotator -R %s -V %s -L %s -O %s/annotated.vcf --dbsnp %s %s" % ( 252 | java, java_opts, gatk, ref_genome,variant,variant,work_giremi,knownsites,VariantAnnotator_opts) 253 | command="bash -c \"%s\""%command 254 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 255 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout) 256 | else: 257 | logger.info("Skipping step %d: 
%s"%(step,msg)) 258 | step+=1 259 | 260 | msg="Find variant strands for %s"%sample 261 | if start<=step: 262 | logger.info("--------------------------STEP %s--------------------------"%step) 263 | logger.info(msg) 264 | find_SNV_strands(strand_pos, genes_pos, "%s/annotated.vcf"%work_giremi, "%s/SNV_annotated.bed"%work_giremi) 265 | else: 266 | logger.info("Skipping step %d: %s"%(step,msg)) 267 | step+=1 268 | 269 | if htslib_dir: 270 | if "LD_LIBRARY_PATH" in os.environ: 271 | os.environ["LD_LIBRARY_PATH"] += ":%s/"%htslib_dir 272 | else: 273 | os.environ["LD_LIBRARY_PATH"] = htslib_dir 274 | 275 | if giremi_dir: 276 | os.environ["PATH"] += ":%s/"%giremi_dir 277 | 278 | msg = "Run GIREMI for %s"%sample 279 | if start<=step: 280 | logger.info("--------------------------STEP %s--------------------------"%step) 281 | command="cd %s && %s %s -f %s -l %s/SNV_annotated.bed -o %s/giremi_out.txt %s/alignments.pos_sorted.bam" % ( 282 | giremi_dir,GIREMI, giremi_opts, os.path.abspath(ref_genome), os.path.abspath(work_giremi), os.path.abspath(work_giremi),os.path.abspath(work_giremi)) 283 | command="bash -c \"%s\""%command 284 | cmd = TimedExternalCmd(command, logger, raise_exception=False) 285 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout) 286 | else: 287 | logger.info("Skipping step %d: %s"%(step,msg)) 288 | step+=1 289 | 290 | 291 | if os.path.exists("%s/giremi_out.txt"%work_giremi) and not os.path.exists("%s/giremi_out.txt.res"%work_giremi): 292 | 293 | msg="Identify N variants for %s"%sample 294 | if start<=step: 295 | logger.info("--------------------------STEP %s--------------------------"%step) 296 | logger.info(msg) 297 | with open("%s/giremi_out.txt"%work_giremi) as csv_file_i: 298 | spamreader = csv.reader(csv_file_i, delimiter='\t', quotechar='|') 299 | with open("%s/N.bed"%work_giremi, 'wb') as csvfile_o: 300 | spamwriter = csv.writer(csvfile_o, delimiter='\t', 301 | quotechar='|', quoting=csv.QUOTE_MINIMAL) 302 | for row in spamreader: 303 | if (row[5]=="N" or row[8]=="N"): 304 | spamwriter.writerow([row[0],int(row[1])-1,row[1]]) 305 | else: 306 | logger.info("Skipping step %d: %s"%(step,msg)) 307 | step+=1 308 | 309 | cnt=len(pybedtools.BedTool("%s/N.bed"%work_giremi)) 310 | if cnt>0: 311 | msg="Remove N variants for %s"%sample 312 | if start<=step: 313 | logger.info("--------------------------STEP %s--------------------------"%step) 314 | logger.info(msg) 315 | pybedtools.BedTool("%s/SNV_annotated.bed"%work_giremi).intersect( 316 | "%s/N.bed"%work_giremi,r=True, f=1, v=True).saveas("%s/SNV_annotated_filtered.bed"%work_giremi) 317 | else: 318 | logger.info("Skipping step %d: %s"%(step,msg)) 319 | step+=1 320 | 321 | msg = "Rerun GIREMI for %s"%sample 322 | if start<=step: 323 | logger.info("--------------------------STEP %s--------------------------"%step) 324 | if os.path.exists("%s/SNV_annotated_filtered.bed"%work_giremi): 325 | command="cd %s && %s %s -f %s -l %s/SNV_annotated_filtered.bed -o %s/giremi_out.txt %s/alignments.pos_sorted.bam" % ( 326 | giremi_dir,GIREMI, giremi_opts, os.path.abspath(ref_genome), os.path.abspath(work_giremi), os.path.abspath(work_giremi),os.path.abspath(work_giremi)) 327 | command="bash -c \"%s\""%command 328 | cmd = TimedExternalCmd(command, logger, raise_exception=False) 329 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout) 330 | else: 331 | logger.info("No file %s/SNV_annotated_filtered.bed"%work_giremi) 332 | else: 333 | logger.info("Skipping step %d: 
%s"%(step,msg)) 334 | step+=1 335 | else: 336 | step+=2 337 | else: 338 | step+=3 339 | 340 | out_giremi=os.path.join(outdir,"giremi",sample) 341 | create_dirs([out_giremi]) 342 | msg="Copy predictions to output directory for %s."%sample 343 | if start<=step: 344 | logger.info("--------------------------STEP %s--------------------------"%step) 345 | if os.path.exists("%s/giremi_out.txt.res"%work_giremi): 346 | command = "cp %s/giremi_out.txt.res %s/giremi_out.txt.res"%( 347 | work_giremi, out_giremi) 348 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 349 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout) 350 | else: 351 | logger.info("Skipping step %d: %s"%(step,msg)) 352 | step+=1 353 | 354 | 355 | edits = "" 356 | if os.path.exists("%s/giremi_out.txt.res"%out_giremi): 357 | logger.info("GIREMI was successfull!") 358 | logger.info("Output edits: %s/giremi_out.txt.res"%out_giremi) 359 | edits = "%s/giremi_out.txt.res"%out_giremi 360 | else: 361 | logger.info("GIREMI failed!") 362 | return edits 363 | 364 | def run_editing(editing_caller="GIREMI", alignment="", variant="", 365 | strand_pos="", genes_pos="", 366 | ref_genome="", knownsites="", 367 | giremi_dir="", htslib_dir="", 368 | samtools=SAMTOOLS, gatk=GATK, 369 | java=JAVA, giremi_opts="", java_opts="", 370 | VariantAnnotator_opts="", 371 | start=0, sample= "", nthreads=1, 372 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False): 373 | edits="" 374 | 375 | if editing_caller.upper()=="GIREMI": 376 | try: 377 | edits=run_giremi(alignment=alignment, variant=variant, 378 | strand_pos=strand_pos, genes_pos=genes_pos, 379 | ref_genome=ref_genome, knownsites=knownsites, 380 | giremi_dir=giremi_dir, htslib_dir=htslib_dir, 381 | samtools=samtools, gatk=gatk, 382 | java=java, giremi_opts=giremi_opts, java_opts=java_opts, 383 | VariantAnnotator_opts=VariantAnnotator_opts, 384 | start=start, sample= sample, nthreads=nthreads, 385 | workdir=workdir, outdir=outdir, timeout=timeout) 386 | except Exception as excp: 387 | logger.info("GIREMI failed!") 388 | logger.error(excp) 389 | if not ignore_exceptions: 390 | raise Exception(excp) 391 | 392 | return edits 393 | 394 | 395 | 396 | -------------------------------------------------------------------------------- /src/run_fusion.py: -------------------------------------------------------------------------------- 1 | import os 2 | from external_cmd import TimedExternalCmd 3 | from defaults import * 4 | from utils import * 5 | 6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' 7 | logFormatter = logging.Formatter(FORMAT) 8 | logger = logging.getLogger(__name__) 9 | consoleHandler = logging.StreamHandler() 10 | consoleHandler.setFormatter(logFormatter) 11 | logger.addHandler(consoleHandler) 12 | 13 | def run_fusioncatcher(data_dir="", input="", start=0, 14 | fusioncatcher=FUSIONCATCHER, fusioncatcher_opts="", 15 | sample= "", nthreads=1, 16 | workdir=None, outdir=None, timeout=TIMEOUT): 17 | 18 | 19 | logger.info("Running RNA fusion detection (FusionCatcher) for %s"%sample) 20 | if not os.path.exists(data_dir): 21 | logger.error("Aborting!") 22 | raise Exception("No data directory %s"%data_dir) 23 | 24 | 25 | work_fusioncatcher=os.path.join(workdir,"fusioncatcher",sample) 26 | create_dirs([work_fusioncatcher]) 27 | fusioncatcher_log = os.path.join(work_fusioncatcher, "fusioncatcher.log") 28 | fusioncatcher_log_fd = open(fusioncatcher_log, "w") 29 | 30 | if nthreads>1: 31 | if "-p " not in fusioncatcher_opts: 
32 | fusioncatcher_opts += " -p %d"%nthreads
33 | msg = "Run FusionCatcher for %s"%sample
34 | command="%s %s -d %s -i %s --start %d -o %s" % (
35 | fusioncatcher, fusioncatcher_opts, data_dir, input, start, work_fusioncatcher)
36 | command="bash -c \"%s\""%command
37 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
38 | retcode = cmd.run(cmd_log_fd_out=fusioncatcher_log_fd, cmd_log=fusioncatcher_log, msg=msg, timeout=timeout)
39 | 
40 | out_fusioncatcher=os.path.join(outdir,"fusioncatcher",sample)
41 | create_dirs([out_fusioncatcher])
42 | msg="Copy predictions to output directory for %s."%sample
43 | if os.path.exists("%s/final-list_candidate-fusion-genes.txt"%work_fusioncatcher):
44 | command = "cp %s/final-list_candidate-fusion-genes.txt %s/final-list_candidate-fusion-genes.txt"%(
45 | work_fusioncatcher, out_fusioncatcher)
46 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
47 | retcode = cmd.run(cmd_log_fd_out=fusioncatcher_log_fd, cmd_log=fusioncatcher_log, msg=msg, timeout=timeout)
48 | 
49 | fusions = ""
50 | if os.path.exists("%s/final-list_candidate-fusion-genes.txt"%out_fusioncatcher):
51 | logger.info("FusionCatcher was successful!")
52 | logger.info("Output fusions: %s/final-list_candidate-fusion-genes.txt"%out_fusioncatcher)
53 | fusions = "%s/final-list_candidate-fusion-genes.txt"%out_fusioncatcher
54 | else:
55 | logger.info("FusionCatcher failed!")
56 | return fusions
57 | 
58 | 
59 | def run_fusion(fusion_caller="FusionCatcher",
60 | data_dir="", input="", start=0,
61 | fusioncatcher=FUSIONCATCHER, fusioncatcher_opts="",
62 | sample= "", nthreads=1,
63 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False):
64 | fusions=""
65 | if fusion_caller.upper()=="FUSIONCATCHER":
66 | try:
67 | fusions=run_fusioncatcher(data_dir=data_dir, input=input, start=start,
68 | fusioncatcher=fusioncatcher, fusioncatcher_opts=fusioncatcher_opts,
69 | sample= sample, nthreads=nthreads,
70 | workdir=workdir, outdir=outdir, timeout=timeout)
71 | except Exception as excp:
72 | logger.info("FusionCatcher failed!")
73 | logger.error(excp)
74 | if not ignore_exceptions:
75 | raise Exception(excp)
76 | return fusions
77 | 
78 | 
79 | 
-------------------------------------------------------------------------------- /src/run_lr_align.py:
--------------------------------------------------------------------------------
1 | import os
2 | from external_cmd import TimedExternalCmd
3 | from defaults import *
4 | from utils import *
5 | 
6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
7 | logFormatter = logging.Formatter(FORMAT)
8 | logger = logging.getLogger(__name__)
9 | consoleHandler = logging.StreamHandler()
10 | consoleHandler.setFormatter(logFormatter)
11 | logger.addHandler(consoleHandler)
12 | 
13 | def run_starlong(long="",
14 | genome_dir="", ref_gtf="",
15 | starlong=STARLONG, sam2psl=SAM2PSL, samtools=SAMTOOLS,
16 | starlong_opts="",
17 | start=0, sample= "", nthreads=1,
18 | workdir=None, outdir=None, timeout=TIMEOUT):
19 | 
20 | logger.info("Running long read alignment (STARlong) for %s"%sample)
21 | if not os.path.exists(os.path.join(genome_dir,"SAindex")):
22 | logger.error("Aborting!")
23 | raise Exception("No SAindex file in %s"%genome_dir)
24 | 
25 | if long:
26 | if not os.path.exists(long):
27 | logger.error("Aborting!")
28 | raise Exception("No long read sequence file %s"%long)
29 | 
30 | work_starlong=os.path.join(workdir,"starlong",sample)
31 | create_dirs([work_starlong])
32 | 
33 | step=0
34 | if start<=step:
35 | 
logger.info("--------------------------STEP %s--------------------------"%step) 36 | msg = "Erase STARlong work directory for %s"%sample 37 | command="rm -rf %s/*" % ( 38 | work_starlong) 39 | command="bash -c \"%s\""%command 40 | cmd = TimedExternalCmd(command, logger, raise_exception=False) 41 | retcode = cmd.run(msg=msg,timeout=timeout) 42 | step+=1 43 | 44 | starlong_log = os.path.join(work_starlong, "starlong.log") 45 | starlong_log_fd = open(starlong_log, "w") 46 | 47 | 48 | 49 | if ref_gtf: 50 | if not os.path.exists(ref_gtf): 51 | logger.error("Aborting!") 52 | raise Exception("No reference GTF file %s"%ref_gtf) 53 | 54 | if "--outSAMattrRGline" not in starlong_opts: 55 | starlong_opts += " --outSAMattrRGline ID:STARlong SM:%s"%sample 56 | if "--runThreadN " not in starlong_opts: 57 | starlong_opts += " --runThreadN %d"%nthreads 58 | if ref_gtf: 59 | starlong_opts += " --sjdbGTFfile %s"%ref_gtf 60 | for k,v in STARLONG_DEFAULTS.iteritems(): 61 | if k not in starlong_opts: 62 | starlong_opts += " --%s %s"%(k,v) 63 | 64 | 65 | msg = "STARlong for %s"%sample 66 | if start<=step: 67 | logger.info("--------------------------STEP %s--------------------------"%step) 68 | command="%s --runMode alignReads %s --genomeDir %s --readFilesIn %s --outFileNamePrefix %s/" % ( 69 | starlong, starlong_opts, genome_dir, long, work_starlong ) 70 | command="bash -c \"%s\""%command 71 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 72 | retcode = cmd.run(cmd_log_fd_out=starlong_log_fd, cmd_log=starlong_log, msg=msg, timeout=timeout) 73 | else: 74 | logger.info("Skipping step %d: %s"%(step,msg)) 75 | step+=1 76 | 77 | 78 | msg = "converting SAM to PSL for %s"%sample 79 | if start<=step: 80 | logger.info("--------------------------STEP %s--------------------------"%step) 81 | command="%s -i %s/Aligned.out.sam -o %s/Aligned.out.psl" % ( 82 | sam2psl, work_starlong, work_starlong) 83 | command="bash -c \"%s\""%command 84 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 85 | retcode = cmd.run(cmd_log_fd_out=starlong_log_fd, cmd_log=starlong_log, msg=msg, timeout=timeout) 86 | else: 87 | logger.info("Skipping step %d: %s"%(step,msg)) 88 | step+=1 89 | 90 | msg = "converting SAM to BAM for %s"%sample 91 | if start<=step: 92 | logger.info("--------------------------STEP %s--------------------------"%step) 93 | command="%s view -Su %s/Aligned.out.sam -@ %d -o %s/Aligned.out.bam" % ( 94 | samtools, work_starlong, nthreads, work_starlong) 95 | command="bash -c \"%s\""%command 96 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 97 | retcode = cmd.run(cmd_log_fd_out=starlong_log_fd, cmd_log=starlong_log, msg=msg, timeout=timeout) 98 | else: 99 | logger.info("Skipping step %d: %s"%(step,msg)) 100 | step+=1 101 | 102 | # 103 | # msg = "Clean temp alignment files for %s"%sample 104 | # if start<=step: 105 | # logger.info("--------------------------STEP %s--------------------------"%step) 106 | # command="rm %s/Aligned.out.sam" % (work_starlong) 107 | # command="bash -c \"%s\""%command 108 | # cmd = TimedExternalCmd(command, logger, raise_exception=True) 109 | # retcode = cmd.run(cmd_log_fd_out=starlong_log_fd, cmd_log=starlong_log, msg=msg, timeout=timeout) 110 | # else: 111 | # logger.info("Skipping step %d: %s"%(step,msg)) 112 | # step+=1 113 | 114 | 115 | out_starlong=os.path.join(outdir,"starlong",sample) 116 | create_dirs([out_starlong]) 117 | msg="Copy predictions to output directory for %s."%sample 118 | if start<=step: 119 | 
logger.info("--------------------------STEP %s--------------------------"%step) 120 | if os.path.exists("%s/Aligned.out.psl"%work_starlong): 121 | command = "cp %s/Aligned.out.psl %s/Aligned.out.psl"%( 122 | work_starlong, out_starlong) 123 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 124 | retcode = cmd.run(cmd_log_fd_out=starlong_log_fd, cmd_log=starlong_log, msg=msg, timeout=timeout) 125 | else: 126 | logger.info("Skipping step %d: %s"%(step,msg)) 127 | step+=1 128 | 129 | 130 | alignments_psl = "" 131 | if os.path.exists("%s/Aligned.out.psl"%out_starlong): 132 | logger.info("STARlong was successfull!") 133 | logger.info("Output alignment: %s/Aligned.out.psl"%out_starlong) 134 | alignments_psl = "%s/Aligned.out.psl"%out_starlong 135 | else: 136 | logger.info("STARlong failed!") 137 | return alignments_psl 138 | 139 | def run_lr_align(long_aligner="STARlong", long="", 140 | genome_dir="", ref_gtf="", 141 | starlong=STARLONG, sam2psl=SAM2PSL, samtools=SAMTOOLS, 142 | starlong_opts="", 143 | start=0, sample= "", nthreads=1, 144 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False): 145 | alignments_psl="" 146 | if long_aligner.upper()=="STARLONG": 147 | try: 148 | alignments_psl=run_starlong(genome_dir=genome_dir, ref_gtf=ref_gtf, 149 | long=long, starlong=starlong, sam2psl=sam2psl, samtools=samtools, 150 | starlong_opts=starlong_opts, 151 | start=start, sample= sample, nthreads=nthreads, 152 | workdir=workdir, outdir=outdir, timeout=timeout) 153 | except Exception as excp: 154 | logger.info("STARlong failed!") 155 | logger.error(excp) 156 | if not ignore_exceptions: 157 | raise Exception(excp) 158 | return alignments_psl -------------------------------------------------------------------------------- /src/run_lr_correct.py: -------------------------------------------------------------------------------- 1 | import os 2 | from external_cmd import TimedExternalCmd 3 | from defaults import * 4 | from utils import * 5 | 6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' 7 | logFormatter = logging.Formatter(FORMAT) 8 | logger = logging.getLogger(__name__) 9 | consoleHandler = logging.StreamHandler() 10 | consoleHandler.setFormatter(logFormatter) 11 | logger.addHandler(consoleHandler) 12 | 13 | def run_lordec(kmer=23, 14 | solid=3, long="", short="", 15 | lordec=LORDEC, lordec_opts="", 16 | start=0, sample= "", nthreads=1, 17 | workdir=None, outdir=None, timeout=TIMEOUT): 18 | 19 | logger.info("Running long read error correction (LoRDEC) for %s"%sample) 20 | if not os.path.exists(long): 21 | logger.error("Aborting!") 22 | raise Exception("No long read sequence file %s"%long) 23 | 24 | if not os.path.exists(short): 25 | logger.error("Aborting!") 26 | raise Exception("No short read sequence file %s"%short) 27 | 28 | work_lordec=os.path.join(workdir,"lordec",sample) 29 | create_dirs([work_lordec]) 30 | 31 | step=0 32 | if start<=step: 33 | logger.info("--------------------------STEP %s--------------------------"%step) 34 | msg = "Erase LoRDEC work directory for %s"%sample 35 | command="rm -rf %s/*" % ( 36 | work_lordec) 37 | command="bash -c \"%s\""%command 38 | cmd = TimedExternalCmd(command, logger, raise_exception=False) 39 | retcode = cmd.run(msg=msg,timeout=timeout) 40 | step+=1 41 | 42 | lordec_log = os.path.join(work_lordec, "lordec.log") 43 | lordec_log_fd = open(lordec_log, "w") 44 | ksps = "" 45 | 46 | if "-T " not in lordec_opts: 47 | lordec_opts += " -T %d"%nthreads 48 | 49 | msg = "LoRDEC for %s"%sample 50 | if start<=step: 51 | 
logger.info("--------------------------STEP %s--------------------------"%step) 52 | command="%s %s -k %d -s %d -i %s -2 %s -O %s -o %s/long_corrected.fa" % ( 53 | lordec, lordec_opts, kmer, solid, long, short, work_lordec, work_lordec) 54 | command="bash -c \"%s\""%command 55 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 56 | retcode = cmd.run(cmd_log_fd_out=lordec_log_fd, cmd_log=lordec_log, msg=msg, timeout=timeout) 57 | else: 58 | logger.info("Skipping step %d: %s"%(step,msg)) 59 | step+=1 60 | 61 | out_lordec=os.path.join(outdir,"lordec",sample) 62 | create_dirs([out_lordec]) 63 | msg="Copy predictions to output directory for %s."%sample 64 | if start<=step: 65 | logger.info("--------------------------STEP %s--------------------------"%step) 66 | if os.path.exists("%s/long_corrected.fa"%work_lordec): 67 | command = "cp %s/long_corrected.fa %s/long_corrected.fa"%( 68 | work_lordec, out_lordec) 69 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 70 | retcode = cmd.run(cmd_log_fd_out=lordec_log_fd, cmd_log=lordec_log, msg=msg, timeout=timeout) 71 | else: 72 | logger.info("Skipping step %d: %s"%(step,msg)) 73 | step+=1 74 | 75 | 76 | corrected = "" 77 | if os.path.exists("%s/long_corrected.fa"%out_lordec): 78 | logger.info("LoRDEC was successfull!") 79 | logger.info("Output corrected reads: %s/long_corrected.fa"%out_lordec) 80 | corrected = "%s/long_corrected.fa"%out_lordec 81 | else: 82 | logger.info("LoRDEC failed!") 83 | return corrected 84 | 85 | def run_lr_correct(long_corrector="LoRDEC", kmer=23, 86 | solid=3, long="", short="", 87 | lordec=LORDEC, lordec_opts="", 88 | start=0, sample= "", nthreads=1, 89 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False): 90 | corrected="" 91 | if long_corrector.upper()=="LORDEC": 92 | try: 93 | corrected=run_lordec(kmer=kmer, solid=solid, long=long, short=short, 94 | lordec=lordec, lordec_opts=lordec_opts, 95 | start=start, sample= sample, nthreads=nthreads, 96 | workdir=workdir, outdir=outdir, timeout=timeout) 97 | except Exception as excp: 98 | logger.info("LoRDEC failed!") 99 | logger.error(excp) 100 | if not ignore_exceptions: 101 | raise Exception(excp) 102 | return corrected -------------------------------------------------------------------------------- /src/run_lr_fusion.py: -------------------------------------------------------------------------------- 1 | import os 2 | from external_cmd import TimedExternalCmd 3 | from defaults import * 4 | from utils import * 5 | import csv 6 | import re 7 | 8 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' 9 | logFormatter = logging.Formatter(FORMAT) 10 | logger = logging.getLogger(__name__) 11 | consoleHandler = logging.StreamHandler() 12 | consoleHandler.setFormatter(logFormatter) 13 | logger.addHandler(consoleHandler) 14 | 15 | def sort_gpd(in_file,out_file,order_chrs=dict([("%s"%k,k) for k in range(1,23)]+[("MT",23),("X",24),("Y",25)]+[ 16 | ("chr%s"%k,k) for k in range(1,23)]+[("chrM",23),("chrX",24),("chrY",25)])): 17 | with open(in_file) as csv_file: 18 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|') 19 | rows=[] 20 | for row in spamreader: 21 | rows.append(row) 22 | others_chrs=sorted(set(map(lambda x:x[2],rows))-set(order_chrs.keys())) 23 | if others_chrs: 24 | max_id=max(order_chrs.values()) 25 | for i,c in enumerate(others_chrs): 26 | order_chrs[c]=max_id+i+1 27 | sorted_rows=sorted(rows,key=lambda x: (order_chrs[x[2]],int(x[4]))) 28 | with open(out_file, 'wb') as csvfile: 29 | spamwriter = 
csv.writer(csvfile, delimiter='\t', 30 | quotechar='|', quoting=csv.QUOTE_MINIMAL) 31 | spamwriter.writerows(sorted_rows) 32 | 33 | 34 | 35 | CIGAR_MATCH = 0 36 | CIGAR_INS = 1 37 | CIGAR_DEL = 2 38 | CIGAR_SOFTCLIP = 4 39 | CIGAR_EQUAL = 7 40 | CIGAR_DIFF = 8 41 | CIGAR_PATTERN = re.compile(r'([0-9]+)([MIDNSHPX=])') 42 | CIGAR_OP_DICT = {op: index for index, op in enumerate("MIDNSHP=X")} 43 | CIGAR_OP_DICT_rev = {index: op for index, op in enumerate("MIDNSHP=X")} 44 | CIGAR_REFERENCE_OPS = [CIGAR_MATCH, CIGAR_DEL, CIGAR_EQUAL, CIGAR_DIFF] 45 | 46 | def cigarstring_to_tuple(cigarstring): 47 | return tuple((CIGAR_OP_DICT[op], int(length)) for length, op in CIGAR_PATTERN.findall(cigarstring)) 48 | 49 | 50 | def run_idpfusion(alignment="", short_junction="", long_alignment="",mode_number=0, 51 | short_fasta="", long_fasta="", 52 | ref_genome="", ref_all_gpd="", ref_gpd="", uniqueness_bedgraph="", 53 | genome_bowtie2_idx="", transcriptome_bowtie2_idx="", 54 | read_length=100, 55 | idpfusion_cfg="", idpfusion=IDPFUSION, samtools=SAMTOOLS, 56 | gmap=GMAP, gmap_idx="", star_dir=STAR_DIR, bowtie2_dir=BOWTIE2_DIR, 57 | start=0, sample= "", nthreads=1, 58 | workdir=None, outdir=None, timeout=TIMEOUT): 59 | 60 | logger.info("Running long read fusion Detection (IDP-fusion) for %s"%sample) 61 | if not os.path.exists(alignment): 62 | logger.error("Aborting!") 63 | raise Exception("No input short read alignment BAM/SAM file %s"%alignment) 64 | if not os.path.exists(short_junction): 65 | logger.error("Aborting!") 66 | raise Exception("No input short read junction BED file %s"%short_junction) 67 | 68 | if idpfusion_cfg: 69 | if not os.path.exists(idpfusion_cfg): 70 | logger.error("Aborting!") 71 | raise Exception("No input .cfg file %s"%idpfusion_cfg) 72 | 73 | 74 | 75 | if mode_number>0: 76 | start=4 77 | 78 | work_idpfusion="%s/idpfusion/%s/"%(workdir,sample) 79 | create_dirs([work_idpfusion]) 80 | 81 | step=0 82 | if start<=step: 83 | logger.info("--------------------------STEP %s--------------------------"%step) 84 | msg = "Erase IDP-fusion work directory for %s"%sample 85 | command="rm -rf %s/*" % ( 86 | work_idpfusion) 87 | command="bash -c \"%s\""%command 88 | cmd = TimedExternalCmd(command, logger, raise_exception=False) 89 | retcode = cmd.run(msg=msg,timeout=timeout) 90 | step+=1 91 | 92 | 93 | 94 | idpfusion_log = os.path.join(work_idpfusion, "idpfusion.log") 95 | idpfusion_log_fd = open(idpfusion_log, "w") 96 | 97 | msg = "converting BAM to SAM for %s"%sample 98 | logger.info("--------------------------STEP %s--------------------------"%step) 99 | if start<=step: 100 | if alignment.endswith('.bam'): 101 | command = "%s view -h -o %s/alignments.sam %s " % (samtools,work_idpfusion,alignment) 102 | command="bash -c \"%s\""%command 103 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 104 | retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd, cmd_log=idpfusion_log, msg=msg, timeout=timeout) 105 | alignment = "%s/alignments.sam"%(work_idpfusion) 106 | else: 107 | logger.info("Skipping step %d: %s"%(step,msg)) 108 | step+=1 109 | 110 | 111 | msg = "Fix soft-clipped reads in SAM for %s"%sample 112 | logger.info("--------------------------STEP %s--------------------------"%step) 113 | if start<=step: 114 | logger.info("Task: %s"%msg) 115 | corrected_alignment = "%s/alignments_corrected.sam"%(work_idpfusion) 116 | with open(alignment,"r") as csv_file_i: 117 | with open(corrected_alignment,"w") as csv_file_o: 118 | spamreader = csv.reader(csv_file_i, delimiter='\t', quotechar='|') 119 | 
spamwriter = csv.writer(csv_file_o, delimiter='\t', 120 | quotechar='|', quoting=csv.QUOTE_MINIMAL) 121 | for row in spamreader: 122 | if row[0][0]=="@": 123 | spamwriter.writerow(row) 124 | continue 125 | if row[5]=="*": 126 | continue 127 | if "S" in row[5]: 128 | cigartuple=cigarstring_to_tuple(row[5]) 129 | if cigartuple[0][0]==4: 130 | row[9]=row[9][cigartuple[0][1]:] 131 | row[10]=row[10][cigartuple[0][1]:] 132 | cigartuple=cigartuple[1:] 133 | if cigartuple[-1][0]==4: 134 | row[9]=row[9][:-cigartuple[-1][1]] 135 | row[10]=row[10][:-cigartuple[-1][1]] 136 | cigartuple=cigartuple[:-1] 137 | row[5]="".join(["%d%s"%(x[1],CIGAR_OP_DICT_rev[x[0]]) for x in cigartuple]) 138 | spamwriter.writerow(row) 139 | alignment=corrected_alignment 140 | else: 141 | logger.info("Skipping step %d: %s"%(step,msg)) 142 | step+=1 143 | 144 | 145 | msg = "Fix junction bed for %s"%sample 146 | logger.info("--------------------------STEP %s--------------------------"%step) 147 | if start<=step: 148 | logger.info("Task: %s"%msg) 149 | corrected_junction = "%s/splicesites_corrected.bed"%(work_idpfusion) 150 | with open(short_junction,"r") as csv_file_i: 151 | with open(corrected_junction,"w") as csv_file_o: 152 | spamreader = csv.reader(csv_file_i, delimiter='\t', quotechar='|') 153 | spamwriter = csv.writer(csv_file_o, delimiter='\t', 154 | quotechar='|', quoting=csv.QUOTE_MINIMAL) 155 | for row in spamreader: 156 | if len(row)<4: 157 | spamwriter.writerow(row) 158 | continue 159 | if "]" in row[3]: 160 | spamwriter.writerow(row) 161 | continue 162 | row[3]="(2)[2_2](2/0)" 163 | spamwriter.writerow(row) 164 | short_junction=corrected_junction 165 | else: 166 | logger.info("Skipping step %d: %s"%(step,msg)) 167 | step+=1 168 | 169 | 170 | msg = "Preparing run.cfg for %s"%sample 171 | if start<=step: 172 | logger.info("--------------------------STEP %s--------------------------"%step) 173 | logger.info("Task: %s"%msg) 174 | if idpfusion_cfg: 175 | msg = "copy IDP-fusion .cfg file for %s"%sample 176 | command="cp %s %s/run.cfg" % ( 177 | idpfusion_cfg, work_idpfusion) 178 | command="bash -c \"%s\""%command 179 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 180 | retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd, cmd_log=idpfusion_log, msg=msg, timeout=timeout) 181 | else: 182 | f=open("%s/run.cfg"%work_idpfusion, 'w') 183 | f.close() 184 | 185 | cgf_dict={} 186 | with open("%s/run.cfg"%work_idpfusion , 'r') as cfg_file: 187 | for line in cfg_file: 188 | line = line.strip() 189 | if line=='': 190 | continue 191 | if "=" in line and not line[0]=='#' : 192 | k,v=line.split("=") 193 | k=k.strip() 194 | v=v.strip() 195 | cgf_dict[k]=v 196 | 197 | 198 | with open("%s/run.cfg"%work_idpfusion , 'w') as cfg_file: 199 | for k,v in cgf_dict.iteritems(): 200 | cfg_file.write("%s = %s \n"%(k,v)) 201 | if "temp_foldername" not in cgf_dict: 202 | cfg_file.write("temp_foldername = %s/tmp/ \n"%work_idpfusion) 203 | if "output_foldername" not in cgf_dict: 204 | cfg_file.write("output_foldername = %s/out/ \n"%work_idpfusion) 205 | if "Nthread" not in cgf_dict: 206 | cfg_file.write("Nthread = %d \n"%nthreads) 207 | if "LR_psl_pathfilename" not in cgf_dict: 208 | if long_alignment and os.path.exists(long_alignment): 209 | cfg_file.write("LR_psl_pathfilename = %s \n"%long_alignment) 210 | if "LR_pathfilename" not in cgf_dict: 211 | cfg_file.write("LR_pathfilename = %s \n"%long_fasta) 212 | if "SR_sam_pathfilename" not in cgf_dict: 213 | cfg_file.write("SR_sam_pathfilename = %s \n"%alignment) 214 | if 
"SR_jun_pathfilename" not in cgf_dict: 215 | cfg_file.write("SR_jun_pathfilename = %s \n"%short_junction) 216 | if "SR_pathfilename" not in cgf_dict: 217 | cfg_file.write("SR_pathfilename = %s \n"%short_fasta) 218 | if "SR_aligner_choice" not in cgf_dict: 219 | cfg_file.write("SR_aligner_choice = STAR \n") 220 | if "star_path" not in cgf_dict: 221 | cfg_file.write("star_path = %s \n"%star_dir) 222 | if "gmap_executable_pathfilename" not in cgf_dict: 223 | cfg_file.write("gmap_executable_pathfilename = %s \n"%gmap) 224 | if "gmap_index_pathfoldername" not in cgf_dict: 225 | cfg_file.write("gmap_index_pathfoldername = %s \n"%gmap_idx) 226 | if "genome_bowtie2_index_pathfilename" not in cgf_dict: 227 | cfg_file.write("genome_bowtie2_index_pathfilename = %s \n"%genome_bowtie2_idx) 228 | if "transcriptome_bowtie2_index_pathfilename" not in cgf_dict: 229 | cfg_file.write("transcriptome_bowtie2_index_pathfilename = %s \n"%transcriptome_bowtie2_idx) 230 | if "allref_annotation_pathfilename" not in cgf_dict: 231 | cfg_file.write("allref_annotation_pathfilename = %s \n"%ref_all_gpd) 232 | if "ref_annotation_pathfilename" not in cgf_dict: 233 | cfg_file.write("ref_annotation_pathfilename = %s \n"%ref_gpd) 234 | if "genome_pathfilename" not in cgf_dict: 235 | cfg_file.write("genome_pathfilename = %s \n"%ref_genome) 236 | if "estimator_choice" not in cgf_dict: 237 | cfg_file.write("estimator_choice = MAP \n") 238 | if "FPR" not in cgf_dict: 239 | cfg_file.write("FPR = 0.1 \n") 240 | if "Njun_limit" not in cgf_dict: 241 | cfg_file.write("Njun_limit = 10 \n") 242 | if "Niso_limit" not in cgf_dict: 243 | cfg_file.write("Niso_limit = 20 \n") 244 | if "L_exon_limit" not in cgf_dict: 245 | cfg_file.write("L_exon_limit = 1700 \n") 246 | if "L_min_intron" not in cgf_dict: 247 | cfg_file.write("L_min_intron = 68 \n") 248 | if "Bfile_Npt" not in cgf_dict: 249 | cfg_file.write("Bfile_Npt = 50 \n") 250 | if "Bfile_Nbin" not in cgf_dict: 251 | cfg_file.write("Bfile_Nbin = 5 \n") 252 | if "min_LR_overlap_len" not in cgf_dict: 253 | cfg_file.write("min_LR_overlap_len = 100 \n") 254 | if "LR_fusion_point_err_margin" not in cgf_dict: 255 | cfg_file.write("LR_fusion_point_err_margin = 100 \n") 256 | if "min_LR_fusion_point_search_distance" not in cgf_dict: 257 | cfg_file.write("min_LR_fusion_point_search_distance = 20 \n") 258 | if "uniq_LR_alignment_margin_perc" not in cgf_dict: 259 | cfg_file.write("uniq_LR_alignment_margin_perc = 20 \n") 260 | if "Niso_fusion_limit" not in cgf_dict: 261 | cfg_file.write("Niso_fusion_limit = 1000 \n") 262 | if "psl_type" not in cgf_dict: 263 | cfg_file.write("psl_type = 0 \n") 264 | if "read_length" not in cgf_dict: 265 | cfg_file.write("read_length = %d \n"%read_length) 266 | if "min_junction_overlap_len" not in cgf_dict: 267 | cfg_file.write("min_junction_overlap_len = 10 \n") 268 | if "I_refjun_isoformconstruction" not in cgf_dict: 269 | cfg_file.write("I_refjun_isoformconstruction = 1 \n") 270 | if "I_ref5end_isoformconstruction" not in cgf_dict: 271 | cfg_file.write("I_ref5end_isoformconstruction = 1 \n") 272 | if "I_ref3end_isoformconstruction" not in cgf_dict: 273 | cfg_file.write("I_ref3end_isoformconstruction = 1 \n") 274 | if "fusion_mode" not in cgf_dict: 275 | cfg_file.write("fusion_mode = 1 \n") 276 | if "uniqueness_bedGraph_pathfilename" not in cgf_dict: 277 | cfg_file.write("uniqueness_bedGraph_pathfilename = %s \n"%uniqueness_bedgraph) 278 | if "exon_construction_junction_span" not in cgf_dict: 279 | cfg_file.write("exon_construction_junction_span = 1 \n") 280 | if 
"aligner_choice" not in cgf_dict: 281 | cfg_file.write("aligner_choice = gmap \n") 282 | if "aligner_choice" not in cgf_dict: 283 | cfg_file.write("aligner_choice = gmap \n") 284 | if "three_primer" not in cgf_dict: 285 | cfg_file.write("three_primer = \n") 286 | if "five_primer" not in cgf_dict: 287 | cfg_file.write("five_primer = \n") 288 | else: 289 | logger.info("Skipping step %d: %s"%(step,msg)) 290 | step+=1 291 | 292 | if star_dir: 293 | os.environ["PATH"] += ":%s/"%star_dir 294 | if bowtie2_dir: 295 | os.environ["PATH"] += ":%s/"%bowtie2_dir 296 | 297 | 298 | msg = "IDP-fusion for %s"%sample 299 | if start<=step: 300 | logger.info("--------------------------STEP %s--------------------------"%step) 301 | command="%s %s/run.cfg %d" % ( 302 | idpfusion, work_idpfusion, mode_number) 303 | command="bash -c \"%s\""%command 304 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 305 | retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd, cmd_log=idpfusion_log, msg=msg, timeout=timeout) 306 | else: 307 | logger.info("Skipping step %d: %s"%(step,msg)) 308 | step+=1 309 | 310 | msg = "Convert transcript GPD file to GTF for %s"%sample 311 | if start<=step: 312 | logger.info("--------------------------STEP %s--------------------------"%step) 313 | if os.path.exists("%s/out/isoform.gpd"%work_idpfusion): 314 | sort_gpd("%s/out/isoform.gpd"%work_idpfusion,"%s/out/isoform_sorted.gpd"%work_idpfusion) 315 | command="gpd2gtf.py \ 316 | %s/out/isoform_sorted.gpd %s/out/isoform.exp %s/out/isoform.gtf IDP"%(work_idpfusion,work_idpfusion,work_idpfusion) 317 | command="bash -c \"%s\""%command 318 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 319 | retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd, cmd_log=idpfusion_log, msg=msg, timeout=timeout) 320 | else: 321 | logger.info("Skipping step %d: %s"%(step,msg)) 322 | step+=1 323 | 324 | out_idpfusion=os.path.join(outdir,"idpfusion",sample) 325 | create_dirs([out_idpfusion]) 326 | msg="Copy predictions to output directory for %s."%sample 327 | if start<=step: 328 | logger.info("--------------------------STEP %s--------------------------"%step) 329 | if os.path.exists("%s/out/fusion_report.tsv"%work_idpfusion): 330 | command = "cp %s/out/fusion_report.tsv %s/fusion_report.tsv"%( 331 | work_idpfusion, out_idpfusion) 332 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 333 | retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd, cmd_log=idpfusion_log, msg=msg, timeout=timeout) 334 | else: 335 | logger.info("Skipping step %d: %s"%(step,msg)) 336 | step+=1 337 | 338 | 339 | 340 | fusions = "" 341 | if os.path.exists("%s/fusion_report.tsv"%out_idpfusion): 342 | logger.info("IDP-fusion was successfull!") 343 | logger.info("Output fusions: %s/fusion_report.tsv"%out_idpfusion) 344 | fusions = "%s/fusion_report.tsv"%out_idpfusion 345 | else: 346 | logger.info("IDP-fusion failed!") 347 | return fusions 348 | 349 | def run_lr_fusion(long_fusion_caller="IDP-fusion", alignment="", 350 | short_junction="", long_alignment="", mode_number=0, 351 | short_fasta="", long_fasta="", 352 | ref_genome="", ref_all_gpd="", ref_gpd="", uniqueness_bedgraph="", 353 | genome_bowtie2_idx="", transcriptome_bowtie2_idx="", 354 | read_length=100, 355 | idpfusion_cfg="", idpfusion=IDPFUSION, samtools=SAMTOOLS, 356 | gmap=GMAP, gmap_idx="", star_dir=STAR_DIR, bowtie2_dir=BOWTIE2_DIR, 357 | start=0, sample= "", nthreads=1, 358 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False): 359 | fusions = "" 360 | if 
long_fusion_caller.upper()=="IDP-FUSION": 361 | try: 362 | fusions=run_idpfusion(alignment=alignment, 363 | short_junction=short_junction, long_alignment=long_alignment, 364 | mode_number=mode_number, 365 | short_fasta=short_fasta, long_fasta=long_fasta, 366 | ref_genome=ref_genome, ref_all_gpd=ref_all_gpd, 367 | ref_gpd=ref_gpd, uniqueness_bedgraph=uniqueness_bedgraph, 368 | genome_bowtie2_idx=genome_bowtie2_idx, transcriptome_bowtie2_idx=transcriptome_bowtie2_idx, 369 | read_length=read_length, 370 | idpfusion_cfg=idpfusion_cfg, idpfusion=idpfusion, samtools=samtools, 371 | gmap=gmap, gmap_idx=gmap_idx, star_dir=star_dir, 372 | bowtie2_dir=bowtie2_dir, 373 | start=start, sample= sample, nthreads=nthreads, 374 | workdir=workdir, outdir=outdir, timeout=timeout) 375 | except Exception as excp: 376 | logger.info("IDP-fusion failed!") 377 | logger.error(excp) 378 | if not ignore_exceptions: 379 | raise Exception(excp) 380 | return fusions -------------------------------------------------------------------------------- /src/run_lr_reconstruct.py: -------------------------------------------------------------------------------- 1 | import os 2 | from external_cmd import TimedExternalCmd 3 | from defaults import * 4 | from utils import * 5 | import csv 6 | 7 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' 8 | logFormatter = logging.Formatter(FORMAT) 9 | logger = logging.getLogger(__name__) 10 | consoleHandler = logging.StreamHandler() 11 | consoleHandler.setFormatter(logFormatter) 12 | logger.addHandler(consoleHandler) 13 | 14 | def sort_gpd(in_file,out_file,order_chrs=dict([("%s"%k,k) for k in range(1,23)]+[("MT",23),("X",24),("Y",25)]+[ 15 | ("chr%s"%k,k) for k in range(1,23)]+[("chrM",23),("chrX",24),("chrY",25)])): 16 | with open(in_file) as csv_file: 17 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|') 18 | rows=[] 19 | for row in spamreader: 20 | rows.append(row) 21 | others_chrs=sorted(set(map(lambda x:x[2],rows))-set(order_chrs.keys())) 22 | if others_chrs: 23 | max_id=max(order_chrs.values()) 24 | for i,c in enumerate(others_chrs): 25 | order_chrs[c]=max_id+i+1 26 | sorted_rows=sorted(rows,key=lambda x: (order_chrs[x[2]],int(x[4]))) 27 | with open(out_file, 'wb') as csvfile: 28 | spamwriter = csv.writer(csvfile, delimiter='\t', 29 | quotechar='|', quoting=csv.QUOTE_MINIMAL) 30 | spamwriter.writerows(sorted_rows) 31 | 32 | 33 | 34 | def run_idp(alignment="", short_junction="", long_alignment="",mode_number=0, 35 | ref_genome="", ref_all_gpd="", ref_gpd="",read_length=100, 36 | idp_cfg="", idp=IDP, samtools=SAMTOOLS, 37 | start=0, sample= "", nthreads=1, 38 | workdir=None, outdir=None, timeout=TIMEOUT): 39 | 40 | logger.info("Running long-read transcriptome reconstruction (IDP) for %s"%sample) 41 | if not os.path.exists(alignment): 42 | logger.error("Aborting!") 43 | raise Exception("No input short read alignment BAM/SAM file %s"%alignment) 44 | if not os.path.exists(short_junction): 45 | logger.error("Aborting!") 46 | raise Exception("No input short read junction BED file %s"%short_junction) 47 | if not os.path.exists(long_alignment): 48 | logger.error("Aborting!") 49 | raise Exception("No input long read alignment PSL file %s"%long_alignment) 50 | 51 | if idp_cfg: 52 | if not os.path.exists(idp_cfg): 53 | logger.error("Aborting!") 54 | raise Exception("No input .cfg file %s"%idp_cfg) 55 | 56 | 57 | 58 | if mode_number>0: 59 | start=4 60 | 61 | work_idp="%s/idp/%s/"%(workdir,sample) 62 | create_dirs([work_idp]) 63 | 64 | step=0 65 | if start<=step: 66 | 
logger.info("--------------------------STEP %s--------------------------"%step) 67 | msg = "Erase IDP work directory for %s"%sample 68 | command="rm -rf %s/*" % ( 69 | work_idp) 70 | command="bash -c \"%s\""%command 71 | cmd = TimedExternalCmd(command, logger, raise_exception=False) 72 | retcode = cmd.run(msg=msg,timeout=timeout) 73 | step+=1 74 | 75 | 76 | 77 | idp_log = os.path.join(work_idp, "idp.log") 78 | idp_log_fd = open(idp_log, "w") 79 | 80 | msg = "converting BAM to SAM for %s"%sample 81 | logger.info("--------------------------STEP %s--------------------------"%step) 82 | if start<=step: 83 | if alignment.endswith('.bam'): 84 | command = "%s view -h -o %s/alignments.sam %s " % (samtools,work_idp,alignment) 85 | command="bash -c \"%s\""%command 86 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 87 | retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout) 88 | alignment = "%s/alignments.sam"%(work_idp) 89 | else: 90 | logger.info("Skipping step %d: %s"%(step,msg)) 91 | step+=1 92 | 93 | 94 | msg = "Preparing run.cfg for %s"%sample 95 | if start<=step: 96 | logger.info("--------------------------STEP %s--------------------------"%step) 97 | if idp_cfg: 98 | msg = "copy IDP .cfg file for %s"%sample 99 | command="cp %s %s/run.cfg" % ( 100 | idp_cfg, work_idp) 101 | command="bash -c \"%s\""%command 102 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 103 | retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout) 104 | else: 105 | f=open("%s/run.cfg"%work_idp, 'w') 106 | f.close() 107 | 108 | cgf_dict={} 109 | with open("%s/run.cfg"%work_idp , 'r') as cfg_file: 110 | for line in cfg_file: 111 | line = line.strip() 112 | if line=='': 113 | continue 114 | if "=" in line and not line[0]=='#' : 115 | k,v=line.split("=") 116 | k=k.strip() 117 | v=v.strip() 118 | cgf_dict[k]=v 119 | 120 | with open("%s/run.cfg"%work_idp , 'w') as cfg_file: 121 | for k,v in cgf_dict.iteritems(): 122 | cfg_file.write("%s = %s \n"%(k,v)) 123 | if "temp_foldername" not in cgf_dict: 124 | cfg_file.write("temp_foldername = %s/tmp/ \n"%work_idp) 125 | if "output_foldername" not in cgf_dict: 126 | cfg_file.write("output_foldername = %s/out/ \n"%work_idp) 127 | if "Nthread" not in cgf_dict: 128 | cfg_file.write("Nthread = %d \n"%nthreads) 129 | if "LR_psl_pathfilename" not in cgf_dict: 130 | cfg_file.write("LR_psl_pathfilename = %s \n"%long_alignment) 131 | if "SR_sam_pathfilename" not in cgf_dict: 132 | cfg_file.write("SR_sam_pathfilename = %s \n"%alignment) 133 | if "SR_jun_pathfilename" not in cgf_dict: 134 | cfg_file.write("SR_jun_pathfilename = %s \n"%short_junction) 135 | if "genome_pathfilename" not in cgf_dict: 136 | cfg_file.write("genome_pathfilename = %s \n"%ref_genome) 137 | if "allref_annotation_pathfilename" not in cgf_dict: 138 | cfg_file.write("allref_annotation_pathfilename = %s \n"%ref_all_gpd) 139 | if "ref_annotation_pathfilename" not in cgf_dict: 140 | cfg_file.write("ref_annotation_pathfilename = %s \n"%ref_gpd) 141 | if "estimator_choice" not in cgf_dict: 142 | cfg_file.write("estimator_choice = MLE \n") 143 | if "FPR" not in cgf_dict: 144 | cfg_file.write("FPR = 0.05 \n") 145 | if "Njun_limit" not in cgf_dict: 146 | cfg_file.write("Njun_limit = 10 \n") 147 | if "Niso_limit" not in cgf_dict: 148 | cfg_file.write("Niso_limit = 100 \n") 149 | if "aligner_choice" not in cgf_dict: 150 | cfg_file.write("aligner_choice = gmap \n") 151 | if "exon_construction_junction_span" not in cgf_dict: 152 | 
cfg_file.write("exon_construction_junction_span = 1 \n") 153 | if "read_length" not in cgf_dict: 154 | cfg_file.write("read_length = %d \n"%read_length) 155 | else: 156 | logger.info("Skipping step %d: %s"%(step,msg)) 157 | step+=1 158 | 159 | 160 | 161 | msg = "IDP for %s"%sample 162 | if start<=step: 163 | logger.info("--------------------------STEP %s--------------------------"%step) 164 | command="%s %s/run.cfg %d" % ( 165 | idp, work_idp, mode_number) 166 | command="bash -c \"%s\""%command 167 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 168 | retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout) 169 | else: 170 | logger.info("Skipping step %d: %s"%(step,msg)) 171 | step+=1 172 | 173 | msg = "Convert transcript GPD file to GTF for %s"%sample 174 | if start<=step: 175 | logger.info("--------------------------STEP %s--------------------------"%step) 176 | if os.path.exists("%s/out/isoform.gpd"%work_idp): 177 | sort_gpd("%s/out/isoform.gpd"%work_idp,"%s/out/isoform_sorted.gpd"%work_idp) 178 | command="gpd2gtf.py \ 179 | %s/out/isoform_sorted.gpd %s/out/isoform.exp %s/out/isoform.gtf IDP"%(work_idp,work_idp,work_idp) 180 | command="bash -c \"%s\""%command 181 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 182 | retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout) 183 | else: 184 | logger.info("Skipping step %d: %s"%(step,msg)) 185 | step+=1 186 | 187 | out_idp=os.path.join(outdir,"idp",sample) 188 | create_dirs([out_idp]) 189 | msg="Copy predictions to output directory for %s."%sample 190 | if start<=step: 191 | logger.info("--------------------------STEP %s--------------------------"%step) 192 | if os.path.exists("%s/out/isoform.gtf"%work_idp) and \ 193 | os.path.exists("%s/out/isoform.exp"%work_idp): 194 | command = "cp %s/out/isoform.gtf %s/isoform.gtf"%( 195 | work_idp, out_idp) 196 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 197 | retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout) 198 | 199 | command = "cp %s/out/isoform.exp %s/isoform.exp"%( 200 | work_idp, out_idp) 201 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 202 | retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout) 203 | else: 204 | logger.info("Skipping step %d: %s"%(step,msg)) 205 | step+=1 206 | 207 | 208 | 209 | transcripts = "" 210 | abundances = "" 211 | if os.path.exists("%s/isoform.gtf"%out_idp) and \ 212 | os.path.exists("%s/isoform.exp"%out_idp): 213 | logger.info("IDP was successfull!") 214 | logger.info("Output isoforms: %s/isoform.gtf"%out_idp) 215 | logger.info("Output expressions: %s/isoform.exp"%out_idp) 216 | transcripts = "%s/isoform.gtf"%out_idp 217 | abundances = "%s/isoform.exp"%out_idp 218 | else: 219 | logger.info("IDP failed!") 220 | return transcripts,abundances 221 | 222 | def run_lr_reconstruct(long_reconstructor="IDP", alignment="", 223 | short_junction="", long_alignment="", mode_number=0, 224 | ref_genome="", ref_all_gpd="", ref_gpd="", read_length=100, 225 | idp_cfg="", idp=IDP, samtools=SAMTOOLS, 226 | start=0, sample= "", nthreads=1, 227 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False): 228 | transcripts = "" 229 | abundances = "" 230 | if long_reconstructor.upper()=="IDP": 231 | try: 232 | transcripts,abundances=run_idp(alignment=alignment, 233 | short_junction=short_junction, long_alignment=long_alignment, 234 | mode_number=mode_number, 235 | 
ref_genome=ref_genome, ref_all_gpd=ref_all_gpd, ref_gpd=ref_gpd,
236 | read_length=read_length,
237 | idp_cfg=idp_cfg, idp=idp, samtools=samtools,
238 | start=start, sample= sample, nthreads=nthreads,
239 | workdir=workdir, outdir=outdir, timeout=timeout)
240 | except Exception as excp:
241 | logger.info("IDP failed!")
242 | logger.error(excp)
243 | if not ignore_exceptions:
244 | raise Exception(excp)
245 | return transcripts,abundances
-------------------------------------------------------------------------------- /src/run_quantify.py:
--------------------------------------------------------------------------------
1 | import os
2 | from external_cmd import TimedExternalCmd
3 | from defaults import *
4 | from utils import *
5 | 
6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
7 | logFormatter = logging.Formatter(FORMAT)
8 | logger = logging.getLogger(__name__)
9 | consoleHandler = logging.StreamHandler()
10 | consoleHandler.setFormatter(logFormatter)
11 | logger.addHandler(consoleHandler)
12 | 
13 | def run_salmon_smem(quantifier_idx=None,
14 | seq_1="", seq_2="", seq_u="",
15 | salmon_k=SALMON_SMEM_k, libtype="",
16 | salmon_smem_opts="", salmon=SALMON,
17 | start=0, sample= "", nthreads=1, unzip=False,
18 | workdir=None, outdir=None, timeout=TIMEOUT):
19 | 
20 | logger.info("Running quantification (Salmon-SMEM) for %s"%sample)
21 | if not os.path.exists(quantifier_idx):
22 | logger.error("Aborting!")
23 | raise Exception("No Salmon FMD index directory %s"%quantifier_idx)
24 | 
25 | if seq_1 and seq_2:
26 | for s1 in seq_1.split(","):
27 | if not os.path.exists(s1):
28 | logger.error("Aborting!")
29 | raise Exception("No Mate 1 sequence file %s"%s1)
30 | 
31 | for s2 in seq_2.split(","):
32 | if not os.path.exists(s2):
33 | logger.error("Aborting!")
34 | raise Exception("No Mate 2 sequence file %s"%s2)
35 | 
36 | if unzip:
37 | seq_argument="-1 <(gunzip -c %s) -2 <(gunzip -c %s)"%(" ".join(seq_1.split(","))," ".join(seq_2.split(",")))
38 | else:
39 | if "," in seq_1:
40 | seq_1="<(cat %s)"%(" ".join(seq_1.split(",")))
41 | if "," in seq_2:
42 | seq_2="<(cat %s)"%(" ".join(seq_2.split(",")))
43 | seq_argument="-1 %s -2 %s"%(seq_1,seq_2)
44 | elif seq_u:
45 | if unzip:
46 | seq_argument="-r <(gunzip -c %s)"%(" ".join(seq_u.split(",")))
47 | elif "," in seq_u:
48 | seq_argument="-r <(cat %s)"%(" ".join(seq_u.split(",")))
49 | else:
50 | seq_argument="-r %s"%(seq_u)
51 | for su in seq_u.split(","):
52 | if not os.path.exists(su):
53 | logger.error("Aborting!")
54 | raise Exception("No unpaired sequence file %s"%su)
55 | 
56 | 
57 | work_salmon_smem=os.path.join(workdir,"salmon_smem",sample)
58 | create_dirs([work_salmon_smem])
59 | 
60 | step=0
61 | if start<=step:
62 | logger.info("--------------------------STEP %s--------------------------"%step)
63 | msg = "Erase Salmon-SMEM work directory for %s"%sample
64 | command="rm -rf %s/*" % (
65 | work_salmon_smem)
66 | command="bash -c \"%s\""%command
67 | cmd = TimedExternalCmd(command, logger, raise_exception=False)
68 | retcode = cmd.run(msg=msg,timeout=timeout)
69 | step+=1
70 | 
71 | 
72 | salmon_smem_log = os.path.join(work_salmon_smem, "salmon_smem.log")
73 | salmon_smem_log_fd = open(salmon_smem_log, "w")
74 | 
75 | if "-p " not in salmon_smem_opts:
76 | salmon_smem_opts += " -p %d"%nthreads
77 | 
78 | salmon_smem_opts += " -k %d"%salmon_k
79 | salmon_smem_opts += " -l %s"%libtype
80 | 
81 | msg = "Salmon-SMEM for %s"%sample
82 | if start<=step:
83 | logger.info("--------------------------STEP %s--------------------------"%step)
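# The command assembled below runs Salmon in its FMD-index (SMEM-based)
# mode. An illustrative final command line (hypothetical paths and values,
# shown only to make the string formatting concrete):
#   bash -c "salmon quant -i /ref/salmon_fmd_idx -p 8 -k 19 -l IU \
#            -1 A1_1.fq -2 A1_2.fq -o workdir/salmon_smem/A1"
# The "bash -c" wrapper matters because the process-substitution inputs
# built above, e.g. <(gunzip -c reads.fq.gz), are bash syntax that a plain
# /bin/sh would reject.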
84 | command="%s quant -i %s %s %s -o %s" % ( 85 | salmon, quantifier_idx, salmon_smem_opts, seq_argument,work_salmon_smem ) 86 | command="bash -c \"%s\""%command 87 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 88 | retcode = cmd.run(cmd_log_fd_out=salmon_smem_log_fd, cmd_log=salmon_smem_log, msg=msg, timeout=timeout) 89 | else: 90 | logger.info("Skipping step %d: %s"%(step,msg)) 91 | step+=1 92 | 93 | 94 | out_salmon_smem=os.path.join(outdir,"salmon_smem",sample) 95 | create_dirs([out_salmon_smem]) 96 | msg="Copy predictions to output directory for %s."%sample 97 | if start<=step: 98 | logger.info("--------------------------STEP %s--------------------------"%step) 99 | if os.path.exists("%s/quant.sf"%work_salmon_smem): 100 | command = "cp %s/quant.sf %s/quant.sf"%( 101 | work_salmon_smem, out_salmon_smem) 102 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 103 | retcode = cmd.run(cmd_log_fd_out=salmon_smem_log_fd, cmd_log=salmon_smem_log, msg=msg, timeout=timeout) 104 | else: 105 | logger.info("Skipping step %d: %s"%(step,msg)) 106 | step+=1 107 | 108 | 109 | quant = "" 110 | if os.path.exists("%s/quant.sf"%out_salmon_smem): 111 | logger.info("Salmon-SMEM was successfull!") 112 | logger.info("Output expressions: %s/quant.sf"%out_salmon_smem) 113 | quant = "%s/quant.sf"%out_salmon_smem 114 | else: 115 | logger.info("Salmon-SMEM failed!") 116 | return quant 117 | 118 | def run_quantify(quantifier="Salmon-SMEM", quantifier_idx=None, 119 | seq_1="", seq_2="", seq_u="", 120 | salmon_k=SALMON_SMEM_k, libtype="", 121 | salmon_smem_opts="", salmon=SALMON, 122 | start=0, sample= "", nthreads=1, unzip=False, 123 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False): 124 | quant="" 125 | if quantifier.upper()=="SALMON-SMEM": 126 | try: 127 | quant=run_salmon_smem(quantifier_idx=quantifier_idx, 128 | seq_1=seq_1, seq_2=seq_2, seq_u=seq_u, 129 | salmon_k=salmon_k, libtype=libtype, 130 | salmon_smem_opts=salmon_smem_opts, salmon=salmon, 131 | start=start, sample= sample, nthreads=nthreads, unzip=unzip, 132 | workdir=workdir, outdir=outdir, timeout=timeout) 133 | except Exception as excp: 134 | logger.info("Salmon-SMEM failed!") 135 | logger.error(excp) 136 | if not ignore_exceptions: 137 | raise Exception(excp) 138 | return quant -------------------------------------------------------------------------------- /src/run_reconstruct.py: -------------------------------------------------------------------------------- 1 | import os 2 | from external_cmd import TimedExternalCmd 3 | from defaults import * 4 | from utils import * 5 | 6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' 7 | logFormatter = logging.Formatter(FORMAT) 8 | logger = logging.getLogger(__name__) 9 | consoleHandler = logging.StreamHandler() 10 | consoleHandler.setFormatter(logFormatter) 11 | logger.addHandler(consoleHandler) 12 | 13 | def run_stringtie(alignment_bam="",ref_gtf="", 14 | stringtie_opts="", stringtie=STRINGTIE, 15 | start=0, sample= "", nthreads=1, 16 | workdir=None, outdir=None, timeout=TIMEOUT): 17 | 18 | logger.info("Running transcriptome reconstruction (StringTie) for %s"%sample) 19 | if not os.path.exists(alignment_bam): 20 | logger.error("Aborting!") 21 | raise Exception("No input alignment BAM file %s"%alignment_bam) 22 | 23 | work_stringtie="%s/stringtie/%s/"%(workdir,sample) 24 | create_dirs([work_stringtie]) 25 | step=0 26 | if start<=step: 27 | logger.info("--------------------------STEP %s--------------------------"%step) 28 | msg = "Erase 
StringTie work directory for %s"%sample 29 | command="rm -rf %s/*" % ( 30 | work_stringtie) 31 | command="bash -c \"%s\""%command 32 | cmd = TimedExternalCmd(command, logger, raise_exception=False) 33 | retcode = cmd.run(msg=msg,timeout=timeout) 34 | step+=1 35 | stringtie_log = os.path.join(work_stringtie, "stringtie.log") 36 | stringtie_log_fd = open(stringtie_log, "w") 37 | 38 | if ref_gtf: 39 | if not os.path.exists(ref_gtf): 40 | logger.error("Aborting!") 41 | raise Exception("No reference GTF file %s"%ref_gtf) 42 | 43 | if ref_gtf: 44 | stringtie_opts += " -G %s"%ref_gtf 45 | if "-p " not in stringtie_opts: 46 | stringtie_opts += " -p %d"%nthreads 47 | 48 | msg = "StringTie for %s"%sample 49 | if start<=step: 50 | logger.info("--------------------------STEP %s--------------------------"%step) 51 | command="%s %s %s -o %s/transcripts.gtf -A %s/gene_abund.tab -v" % ( 52 | stringtie, alignment_bam, stringtie_opts, work_stringtie, work_stringtie) 53 | command="bash -c \"%s\""%command 54 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 55 | retcode = cmd.run(cmd_log_fd_out=stringtie_log_fd, cmd_log=stringtie_log, msg=msg, timeout=timeout) 56 | else: 57 | logger.info("Skipping step %d: %s"%(step,msg)) 58 | step+=1 59 | 60 | out_stringtie=os.path.join(outdir,"stringtie",sample) 61 | create_dirs([out_stringtie]) 62 | msg="Copy predictions to output directory for %s."%sample 63 | if start<=step: 64 | logger.info("--------------------------STEP %s--------------------------"%step) 65 | if os.path.exists("%s/transcripts.gtf"%work_stringtie) and \ 66 | os.path.exists("%s/gene_abund.tab"%work_stringtie): 67 | command = "cp %s/transcripts.gtf %s/transcripts.gtf"%( 68 | work_stringtie, out_stringtie) 69 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 70 | retcode = cmd.run(cmd_log_fd_out=stringtie_log_fd, cmd_log=stringtie_log, msg=msg, timeout=timeout) 71 | 72 | command = "cp %s/gene_abund.tab %s/gene_abund.tab"%( 73 | work_stringtie, out_stringtie) 74 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 75 | retcode = cmd.run(cmd_log_fd_out=stringtie_log_fd, cmd_log=stringtie_log, msg=msg, timeout=timeout) 76 | else: 77 | logger.info("Skipping step %d: %s"%(step,msg)) 78 | step+=1 79 | 80 | 81 | transcripts = "" 82 | abundances = "" 83 | if os.path.exists("%s/transcripts.gtf"%out_stringtie) and \ 84 | os.path.exists("%s/gene_abund.tab"%out_stringtie): 85 | logger.info("StringTie was successfull!") 86 | logger.info("Output isoforms: %s/transcripts.gtf"%out_stringtie) 87 | logger.info("Output expressions: %s/gene_abund.tab"%out_stringtie) 88 | transcripts = "%s/transcripts.gtf"%out_stringtie 89 | abundances = "%s/gene_abund.tab"%out_stringtie 90 | else: 91 | logger.info("StringTie failed!") 92 | return transcripts,abundances 93 | 94 | def run_reconstruct(reconstructor="StringTie", alignment_bam="", 95 | ref_gtf="", 96 | stringtie_opts="", stringtie=STRINGTIE, 97 | start=0, sample= "", nthreads=1, 98 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False): 99 | transcripts = "" 100 | abundances = "" 101 | if reconstructor.upper()=="STRINGTIE": 102 | try: 103 | transcripts,abundances=run_stringtie(alignment_bam=alignment_bam, 104 | ref_gtf=ref_gtf, 105 | stringtie_opts=stringtie_opts, stringtie=stringtie, 106 | start=start, sample= sample, nthreads=nthreads, 107 | workdir=workdir, outdir=outdir, timeout=timeout) 108 | except Exception as excp: 109 | logger.info("StringTie failed!") 110 | logger.error(excp) 111 | if not ignore_exceptions: 112 | 
raise Exception(excp) 113 | return transcripts,abundances -------------------------------------------------------------------------------- /src/run_sr_align.py: -------------------------------------------------------------------------------- 1 | import os 2 | from external_cmd import TimedExternalCmd 3 | from defaults import * 4 | from utils import * 5 | 6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' 7 | logFormatter = logging.Formatter(FORMAT) 8 | logger = logging.getLogger(__name__) 9 | consoleHandler = logging.StreamHandler() 10 | consoleHandler.setFormatter(logFormatter) 11 | logger.addHandler(consoleHandler) 12 | 13 | 14 | def run_hisat2(align_idx=None, 15 | seq_1="", seq_2="", seq_u="", 16 | seq_sra="", ref_gtf="", 17 | hisat2_opts="", hisat2=HISAT2, hisat2_sps=HISAT2_SPS, 18 | samtools=SAMTOOLS, 19 | start=0, sample= "", nthreads=1, 20 | workdir=None, outdir=None, timeout=TIMEOUT): 21 | 22 | logger.info("Running alignment (HISAT2) for %s"%sample) 23 | if not os.path.exists(align_idx+".1.ht2"): 24 | logger.error("Aborting!") 25 | raise Exception("No HISAT index file %s.1.ht2"%align_idx) 26 | 27 | if seq_1 and seq_2: 28 | for s1 in seq_1.split(","): 29 | if not os.path.exists(s1): 30 | logger.error("Aborting!") 31 | raise Exception("No Mate 1 sequence file %s"%s1) 32 | for s2 in seq_2.split(","): 33 | if not os.path.exists(s2): 34 | logger.error("Aborting!") 35 | raise Exception("No Mate 2 sequence file %s"%s2) 36 | seq_argument="-1 %s -2 %s"%(seq_1,seq_2) 37 | elif seq_u: 38 | seq_argument="-U %s"%(seq_u) 39 | for su in seq_u.split(","): 40 | if not os.path.exists(su): 41 | logger.error("Aborting!") 42 | raise Exception("No unpaired sequence file %s"%su) 43 | 44 | elif seq_sra: 45 | seq_argument="--sra-acc %s"%(seq_sra) 46 | for sr in seq_sra.split(","): 47 | if not os.path.exists(sr): 48 | logger.error("Aborting!") 49 | raise Exception("No sra sequence file %s"%sr) 50 | 51 | 52 | work_hisat2=os.path.join(workdir,"hisat2",sample) 53 | create_dirs([work_hisat2]) 54 | 55 | step=0 56 | if start<=step: 57 | logger.info("--------------------------STEP %s--------------------------"%step) 58 | msg = "Erase HISAT2 work directory for %s"%sample 59 | command="rm -rf %s/*" % ( 60 | work_hisat2) 61 | command="bash -c \"%s\""%command 62 | cmd = TimedExternalCmd(command, logger, raise_exception=False) 63 | retcode = cmd.run(msg=msg,timeout=timeout) 64 | step+=1 65 | 66 | hisat2_log = os.path.join(work_hisat2, "hisat2.log") 67 | hisat2_log_fd = open(hisat2_log, "w") 68 | 69 | ksps = "" 70 | msg = "Prepare known-splicesites for %s"%sample 71 | if start<=step: 72 | logger.info("--------------------------STEP %s--------------------------"%step) 73 | if ref_gtf: 74 | if not os.path.exists(ref_gtf): 75 | logger.error("Aborting!") 76 | raise Exception("No reference GTF file %s"%ref_gtf) 77 | else: 78 | ksps = ref_gtf.strip() + "known-splicesite.txt" 79 | if os.path.exists(ksps): 80 | logger.info("Will use the precomputed %s as --known-splicesite-infile for HISAT2"%ksps) 81 | else: 82 | msg="compute --known-splicesite-infile for HISAT2" 83 | ksps = os.path.join(work_hisat2, "known-splicesite.txt") 84 | ksps_fd = open(ksps, "w") 85 | 86 | command="%s %s" % (hisat2_sps,ref_gtf) 87 | command="bash -c \"%s\""%command 88 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 89 | retcode = cmd.run(cmd_log_fd_out=ksps_fd, msg=msg, timeout=timeout) 90 | else: 91 | logger.info("Skipping step %d: %s"%(step,msg)) 92 | step+=1 93 | 94 | 95 | 96 | if "--dta " not in hisat2_opts: 97 | 
hisat2_opts += " --dta" 98 | if "--rg-id " not in hisat2_opts: 99 | hisat2_opts += " --rg-id hisat2" 100 | if "--rg " not in hisat2_opts: 101 | hisat2_opts += " --rg SM:%s"%sample 102 | if "--threads " not in hisat2_opts: 103 | hisat2_opts += " --threads %d"%nthreads 104 | if ksps: 105 | hisat2_opts += " --known-splicesite-infile %s"%ksps 106 | 107 | msg = "HISAT2 for %s"%sample 108 | if start<=step: 109 | logger.info("--------------------------STEP %s--------------------------"%step) 110 | command="%s %s -x %s %s -S %s/alignments.sam --novel-splicesite-outfile %s/splicesites.tab" % ( 111 | hisat2, hisat2_opts, align_idx, seq_argument,work_hisat2, work_hisat2 ) 112 | command="bash -c \"%s\""%command 113 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 114 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout) 115 | else: 116 | logger.info("Skipping step %d: %s"%(step,msg)) 117 | step+=1 118 | 119 | msg = "converting SAM to BAM for %s"%sample 120 | if start<=step: 121 | logger.info("--------------------------STEP %s--------------------------"%step) 122 | command="%s view -Su %s/alignments.sam -@ %d -o %s/alignments.bam" % ( 123 | samtools, work_hisat2, nthreads, work_hisat2) 124 | command="bash -c \"%s\""%command 125 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 126 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout) 127 | else: 128 | logger.info("Skipping step %d: %s"%(step,msg)) 129 | step+=1 130 | 131 | msg = "sorting BAM for %s"%sample 132 | if start<=step: 133 | logger.info("--------------------------STEP %s--------------------------"%step) 134 | command="%s sort -@ %d -T %s/alignments.sorted -o %s/alignments.sorted.bam %s/alignments.bam " % ( 135 | samtools, nthreads, work_hisat2, work_hisat2, work_hisat2) 136 | command="bash -c \"%s\""%command 137 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 138 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout) 139 | else: 140 | logger.info("Skipping step %d: %s"%(step,msg)) 141 | step+=1 142 | 143 | 144 | 145 | msg = "Converting junctions to BED for %s"%sample 146 | if start<=step: 147 | logger.info("--------------------------STEP %s--------------------------"%step) 148 | command="hisat2_jun2bed.py %s/splicesites.tab %s/splicesites.bed " % ( 149 | work_hisat2, work_hisat2) 150 | command="bash -c \"%s\""%command 151 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 152 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout) 153 | else: 154 | logger.info("Skipping step %d: %s"%(step,msg)) 155 | step+=1 156 | 157 | msg = "Clean temp alignment files for %s"%sample 158 | if start<=step: 159 | logger.info("--------------------------STEP %s--------------------------"%step) 160 | command="rm %s/alignments.sam %s/alignments.bam" % (work_hisat2, work_hisat2) 161 | command="bash -c \"%s\""%command 162 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 163 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout) 164 | else: 165 | logger.info("Skipping step %d: %s"%(step,msg)) 166 | step+=1 167 | 168 | 169 | out_hisat2=os.path.join(outdir,"hisat2",sample) 170 | create_dirs([out_hisat2]) 171 | msg="Copy predictions to output directory for %s."%sample 172 | if start<=step: 173 | logger.info("--------------------------STEP %s--------------------------"%step) 174 | if 
os.path.exists("%s/alignments.sorted.bam"%work_hisat2) and \ 175 | os.path.exists("%s/splicesites.tab"%work_hisat2) and \ 176 | os.path.exists("%s/splicesites.bed"%work_hisat2): 177 | command = "cp %s/alignments.sorted.bam %s/alignments.sorted.bam"%( 178 | work_hisat2, out_hisat2) 179 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 180 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout) 181 | command = "cp %s/splicesites.tab %s/splicesites.tab"%( 182 | work_hisat2, out_hisat2) 183 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 184 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout) 185 | command = "cp %s/splicesites.bed %s/splicesites.bed"%( 186 | work_hisat2, out_hisat2) 187 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 188 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout) 189 | else: 190 | logger.info("Skipping step %d: %s"%(step,msg)) 191 | step+=1 192 | 193 | 194 | 195 | alignments_bam = "" 196 | junctions_tab = "" 197 | junctions_bed = "" 198 | if os.path.exists("%s/alignments.sorted.bam"%out_hisat2): 199 | logger.info("HISAT2 was successful!") 200 | logger.info("Output alignment: %s/alignments.sorted.bam"%out_hisat2) 201 | logger.info("Output junction tab: %s/splicesites.tab"%out_hisat2) 202 | logger.info("Output junction bed: %s/splicesites.bed"%out_hisat2) 203 | alignments_bam = "%s/alignments.sorted.bam"%out_hisat2 204 | junctions_tab = "%s/splicesites.tab"%out_hisat2 205 | junctions_bed = "%s/splicesites.bed"%out_hisat2 206 | else: 207 | logger.error("HISAT2 failed!") 208 | return alignments_bam,junctions_tab,junctions_bed 209 | 210 | def run_sr_align(sr_aligner="HISAT2", align_idx=None, 211 | seq_1="", seq_2="", seq_u="", 212 | seq_sra="", ref_gtf="", 213 | hisat2_opts="", hisat2=HISAT2, hisat2_sps=HISAT2_SPS, 214 | samtools=SAMTOOLS, 215 | start=0, sample= "", nthreads=1, 216 | workdir=None, outdir=None, timeout=TIMEOUT,ignore_exceptions=False): 217 | alignments_bam = "" 218 | junctions_tab = "" 219 | junctions_bed = "" 220 | if sr_aligner.upper()=="HISAT2": 221 | try: 222 | alignments_bam, junctions_tab, junctions_bed=run_hisat2(align_idx=align_idx, 223 | seq_1=seq_1, seq_2=seq_2, seq_u=seq_u, 224 | seq_sra=seq_sra, ref_gtf=ref_gtf, 225 | hisat2_opts=hisat2_opts, hisat2=hisat2, hisat2_sps=hisat2_sps, 226 | samtools=samtools, 227 | start=start, sample= sample, nthreads=nthreads, 228 | workdir=workdir, outdir=outdir, timeout=timeout) 229 | except Exception as excp: 230 | logger.error("HISAT2 failed!") 231 | logger.error(excp) 232 | if not ignore_exceptions: 233 | raise 234 | 235 | return alignments_bam, junctions_tab, junctions_bed -------------------------------------------------------------------------------- /src/run_variant.py: -------------------------------------------------------------------------------- 1 | import os 2 | from external_cmd import TimedExternalCmd 3 | from defaults import * 4 | from utils import * 5 | 6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' 7 | logFormatter = logging.Formatter(FORMAT) 8 | logger = logging.getLogger(__name__) 9 | consoleHandler = logging.StreamHandler() 10 | consoleHandler.setFormatter(logFormatter) 11 | logger.addHandler(consoleHandler) 12 | 13 | def run_gatk(alignment="", ref_genome="", knownsites="", 14 | picard=PICARD, gatk=GATK, 15 | java=JAVA, java_opts="", 16 | CleanSam=False, no_BaseRecalibrator=False, 17 | 
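# Each *_opts string below is passed through verbatim to the matching picard/GATK command line.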
AddOrReplaceReadGroups_opts="", MarkDuplicates_opts="", 18 | SplitNCigarReads_opts="", 19 | BaseRecalibrator_opts="", 20 | ApplyBQSR_opts="", HaplotypeCaller_opts="", 21 | VariantFiltration_opts="", 22 | start=0, sample= "", nthreads=1, 23 | workdir=None, outdir=None, timeout=TIMEOUT): 24 | 25 | logger.info("Running variant calling (GATK) for %s"%sample) 26 | if not os.path.exists(alignment): 27 | logger.error("Aborting!") 28 | raise Exception("No alignment file %s"%alignment) 29 | if not os.path.exists(ref_genome): 30 | logger.error("Aborting!") 31 | raise Exception("No reference genome FASTA file %s"%ref_genome) 32 | 33 | 34 | work_gatk=os.path.join(workdir,"gatk",sample) 35 | create_dirs([work_gatk]) 36 | 37 | step=0 38 | if start<=step: 39 | logger.info("--------------------------STEP %s--------------------------"%step) 40 | msg = "Erase GATK work directory for %s"%sample 41 | command="rm -rf %s/*" % ( 42 | work_gatk) 43 | command="bash -c \"%s\""%command 44 | cmd = TimedExternalCmd(command, logger, raise_exception=False) 45 | retcode = cmd.run(msg=msg,timeout=timeout) 46 | step+=1 47 | 48 | gatk_log = os.path.join(work_gatk, "gatk.log") 49 | gatk_log_fd = open(gatk_log, "w") 50 | 51 | 52 | if "SO=" not in AddOrReplaceReadGroups_opts: 53 | AddOrReplaceReadGroups_opts += " SO=coordinate" 54 | if "RGLB=" not in AddOrReplaceReadGroups_opts: 55 | AddOrReplaceReadGroups_opts += " RGLB=lib1" 56 | if "RGPL=" not in AddOrReplaceReadGroups_opts: 57 | AddOrReplaceReadGroups_opts += " RGPL=illumina" 58 | if "RGPU=" not in AddOrReplaceReadGroups_opts: 59 | AddOrReplaceReadGroups_opts += " RGPU=unit1" 60 | if "RGSM=" not in AddOrReplaceReadGroups_opts: 61 | AddOrReplaceReadGroups_opts += " RGSM=%s"%sample 62 | 63 | if "CREATE_INDEX=" not in MarkDuplicates_opts: 64 | MarkDuplicates_opts += " CREATE_INDEX=true" 65 | if "VALIDATION_STRINGENCY=" not in MarkDuplicates_opts: 66 | MarkDuplicates_opts += " VALIDATION_STRINGENCY=SILENT" 67 | 68 | if knownsites: 69 | if not os.path.exists(knownsites): 70 | logger.error("Aborting!") 71 | raise Exception("No VCF knownsites file %s"%knownsites) 72 | if "--known-sites " not in BaseRecalibrator_opts: 73 | BaseRecalibrator_opts += " --known-sites %s"%knownsites 74 | 75 | 76 | 77 | if "--dont-use-soft-clipped-bases " not in HaplotypeCaller_opts: 78 | HaplotypeCaller_opts += " --dont-use-soft-clipped-bases" 79 | if "-stand-call-conf " not in HaplotypeCaller_opts: 80 | HaplotypeCaller_opts += " -stand-call-conf %f"%GATK_HC_STANDCALLCONF 81 | 82 | if "-window " not in VariantFiltration_opts: 83 | VariantFiltration_opts += " -window %d"%GATK_VF_WINDOW 84 | if "-cluster " not in VariantFiltration_opts: 85 | VariantFiltration_opts += " -cluster %d"%GATK_VF_CLUSTER 86 | if "--filter-name FS " not in VariantFiltration_opts: 87 | VariantFiltration_opts += " --filter-name FS -filter 'FS > %f'"%GATK_VF_FSMIN 88 | if "--filter-name QD " not in VariantFiltration_opts: 89 | VariantFiltration_opts += " --filter-name QD -filter 'QD < %f'"%GATK_VF_QDMAX 90 | 91 | if "-Xms" not in java_opts: 92 | java_opts += " %s"%JAVA_XMS 93 | if "-Xmx" not in java_opts: 94 | java_opts += " %s"%JAVA_XMG 95 | if "-Djava.io.tmpdir" not in java_opts: 96 | java_opts += " -Djava.io.tmpdir=%s/javatmp/"%(work_gatk) 97 | create_dirs(["%s/javatmp/"%(work_gatk)]) 98 | 99 | msg = "picard CleanSam for %s"%sample 100 | if start<=step: 101 | logger.info("--------------------------STEP %s--------------------------"%step) 102 | if CleanSam: 103 | command="%s %s -cp %s picard.cmdline.PicardCommandLine CleanSam 
I=%s O=%s/alignments_clean.bam" % ( 104 | java, java_opts, picard, alignment,work_gatk ) 105 | command="bash -c \"%s\""%command 106 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 107 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout) 108 | alignment="%s/alignments_clean.bam"%work_gatk 109 | else: 110 | logger.info("Skipping step %d: %s"%(step,msg)) 111 | step+=1 112 | 113 | 114 | msg = "picard AddOrReplaceReadGroups for %s"%sample 115 | if start<=step: 116 | logger.info("--------------------------STEP %s--------------------------"%step) 117 | command="%s %s -cp %s picard.cmdline.PicardCommandLine AddOrReplaceReadGroups I=%s O=%s/rg_added_sorted.bam %s" % ( 118 | java, java_opts, picard, alignment,work_gatk,AddOrReplaceReadGroups_opts) 119 | command="bash -c \"%s\""%command 120 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 121 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout) 122 | else: 123 | logger.info("Skipping step %d: %s"%(step,msg)) 124 | step+=1 125 | 126 | 127 | msg = "picard MarkDuplicates for %s"%sample 128 | if start<=step: 129 | logger.info("--------------------------STEP %s--------------------------"%step) 130 | command="%s %s -cp %s picard.cmdline.PicardCommandLine MarkDuplicates I=%s/rg_added_sorted.bam O=%s/dedupped.bam %s M=%s/output.metrics" % ( 131 | java, java_opts, picard, work_gatk,work_gatk,MarkDuplicates_opts,work_gatk) 132 | command="bash -c \"%s\""%command 133 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 134 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout) 135 | else: 136 | logger.info("Skipping step %d: %s"%(step,msg)) 137 | step+=1 138 | 139 | 140 | msg = "GATK SplitNCigarReads for %s"%sample 141 | if start<=step: 142 | logger.info("--------------------------STEP %s--------------------------"%step) 143 | command="%s %s -jar %s SplitNCigarReads -R %s -I %s/dedupped.bam -O %s/split.bam %s" % ( 144 | java, java_opts, gatk, ref_genome,work_gatk,work_gatk,SplitNCigarReads_opts) 145 | command="bash -c \"%s\""%command 146 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 147 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout) 148 | else: 149 | logger.info("Skipping step %d: %s"%(step,msg)) 150 | step+=1 151 | 152 | split_bam="%s/split.bam"%work_gatk 153 | 154 | if not no_BaseRecalibrator: 155 | msg = "GATK BaseRecalibrator for %s"%sample 156 | if start<=step: 157 | logger.info("--------------------------STEP %s--------------------------"%step) 158 | command="%s %s -jar %s BaseRecalibrator -R %s -I %s -O %s/recal_data.table %s" % ( 159 | java, java_opts, gatk, ref_genome,split_bam,work_gatk,BaseRecalibrator_opts) 160 | command="bash -c \"%s\""%command 161 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 162 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout) 163 | else: 164 | logger.info("Skipping step %d: %s"%(step,msg)) 165 | step+=1 166 | 167 | msg = "GATK ApplyBQSR for %s"%sample 168 | if start<=step: 169 | logger.info("--------------------------STEP %s--------------------------"%step) 170 | command="%s %s -jar %s ApplyBQSR -R %s -I %s -bqsr %s/recal_data.table -O %s/bqsr.bam %s" % ( 171 | java, java_opts, gatk, ref_genome,split_bam,work_gatk,work_gatk,ApplyBQSR_opts) 172 | command="bash -c \"%s\""%command 173 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 174 | 
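# ApplyBQSR emits the recalibrated BAM (bqsr.bam) that replaces split.bam as the HaplotypeCaller input below.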
retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout) 175 | else: 176 | logger.info("Skipping step %d: %s"%(step,msg)) 177 | step+=1 178 | split_bam="%s/bqsr.bam"%work_gatk 179 | else: 180 | msg = "GATK BaseRecalibrator for %s"%sample 181 | logger.info("Skipping step %d: %s"%(step,msg)) 182 | step+=1 183 | msg = "GATK ApplyBQSR for %s"%sample 184 | logger.info("Skipping step %d: %s"%(step,msg)) 185 | step+=1 186 | 187 | msg = "GATK HaplotypeCaller for %s"%sample 188 | if start<=step: 189 | logger.info("--------------------------STEP %s--------------------------"%step) 190 | command="%s %s -jar %s HaplotypeCaller -R %s -I %s -O %s/variants.vcf %s" % ( 191 | java, java_opts, gatk, ref_genome,split_bam,work_gatk,HaplotypeCaller_opts) 192 | command="bash -c \"%s\""%command 193 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 194 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout) 195 | else: 196 | logger.info("Skipping step %d: %s"%(step,msg)) 197 | step+=1 198 | 199 | msg = "GATK VariantFiltration for %s"%sample 200 | if start<=step: 201 | logger.info("--------------------------STEP %s--------------------------"%step) 202 | command="%s %s -jar %s VariantFiltration -R %s -V %s/variants.vcf -O %s/variants_filtered.vcf %s" % ( 203 | java, java_opts, gatk, ref_genome,work_gatk,work_gatk,VariantFiltration_opts) 204 | command="bash -c \"%s\""%command 205 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 206 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout) 207 | else: 208 | logger.info("Skipping step %d: %s"%(step,msg)) 209 | step+=1 210 | 211 | 212 | out_gatk=os.path.join(outdir,"gatk",sample) 213 | create_dirs([out_gatk]) 214 | msg="Copy predictions to output directory for %s."%sample 215 | if start<=step: 216 | logger.info("--------------------------STEP %s--------------------------"%step) 217 | if os.path.exists("%s/variants_filtered.vcf"%work_gatk): 218 | command = "cp %s/variants_filtered.vcf %s/variants_filtered.vcf"%( 219 | work_gatk, out_gatk) 220 | cmd = TimedExternalCmd(command, logger, raise_exception=True) 221 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout) 222 | else: 223 | logger.info("Skipping step %d: %s"%(step,msg)) 224 | step+=1 225 | 226 | variants = "" 227 | if os.path.exists("%s/variants_filtered.vcf"%out_gatk): 228 | logger.info("GATK was successful!") 229 | logger.info("Output variants: %s/variants_filtered.vcf"%out_gatk) 230 | variants = "%s/variants_filtered.vcf"%out_gatk 231 | else: 232 | logger.error("GATK failed!") 233 | return variants 234 | 235 | def run_variant(variant_caller="GATK", alignment="", 236 | ref_genome="", knownsites="", 237 | picard=PICARD, gatk=GATK, 238 | java=JAVA, java_opts="", 239 | CleanSam=False, no_BaseRecalibrator=False, 240 | AddOrReplaceReadGroups_opts="", MarkDuplicates_opts="", 241 | SplitNCigarReads_opts="", 242 | BaseRecalibrator_opts="", 243 | ApplyBQSR_opts="", HaplotypeCaller_opts="", 244 | VariantFiltration_opts="", 245 | start=0, sample= "", nthreads=1, 246 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False): 247 | variants="" 248 | if variant_caller.upper()=="GATK": 249 | try: 250 | variants=run_gatk(alignment=alignment, 251 | ref_genome=ref_genome, knownsites=knownsites, 252 | picard=picard, gatk=gatk, 253 | java=java, java_opts=java_opts, 254 | CleanSam=CleanSam, 255 | no_BaseRecalibrator=no_BaseRecalibrator, 256 | 
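# the remaining tool-specific option strings are forwarded unchanged: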
AddOrReplaceReadGroups_opts=AddOrReplaceReadGroups_opts, 257 | MarkDuplicates_opts=MarkDuplicates_opts, 258 | SplitNCigarReads_opts=SplitNCigarReads_opts, 259 | BaseRecalibrator_opts=BaseRecalibrator_opts, 260 | ApplyBQSR_opts=ApplyBQSR_opts, HaplotypeCaller_opts=HaplotypeCaller_opts, 261 | VariantFiltration_opts=VariantFiltration_opts, 262 | start=start, sample= sample, nthreads=nthreads, 263 | workdir=workdir, outdir=outdir, timeout=timeout) 264 | except Exception as excp: 265 | logger.error("GATK failed!") 266 | logger.error(excp) 267 | if not ignore_exceptions: 268 | raise 269 | return variants 270 | 271 | 272 | 273 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | def create_dirs(dirlist): 8 | for dirname in dirlist: 9 | if not os.path.isdir(dirname): 10 | logger.info("Creating directory %s" % (dirname)) 11 | os.makedirs(dirname) 12 | 13 | -------------------------------------------------------------------------------- /test/A1_1.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/A1_1.fq.gz -------------------------------------------------------------------------------- /test/A1_2.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/A1_2.fq.gz -------------------------------------------------------------------------------- /test/A2_1.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/A2_1.fq.gz -------------------------------------------------------------------------------- /test/A2_2.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/A2_2.fq.gz -------------------------------------------------------------------------------- /test/B1_1.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/B1_1.fq.gz -------------------------------------------------------------------------------- /test/B1_2.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/B1_2.fq.gz -------------------------------------------------------------------------------- /test/B2_1.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/B2_1.fq.gz -------------------------------------------------------------------------------- /test/B2_2.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/B2_2.fq.gz -------------------------------------------------------------------------------- /test/C_long.fa.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/C_long.fa.gz -------------------------------------------------------------------------------- /test/C_short.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/C_short.fa.gz -------------------------------------------------------------------------------- /test/C_short_1.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/C_short_1.fq.gz -------------------------------------------------------------------------------- /test/C_short_2.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/C_short_2.fq.gz -------------------------------------------------------------------------------- /test/GRCh37_genes_pos.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/GRCh37_genes_pos.bed.gz -------------------------------------------------------------------------------- /test/GRCh37_strand_pos.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/GRCh37_strand_pos.bed.gz -------------------------------------------------------------------------------- /test/GRCh38.21.gpd.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/GRCh38.21.gpd.gz -------------------------------------------------------------------------------- /test/GRCh38_genes_pos.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/GRCh38_genes_pos.bed.gz -------------------------------------------------------------------------------- /test/GRCh38_strand_pos.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/GRCh38_strand_pos.bed.gz -------------------------------------------------------------------------------- /test/hg19.known.21.gpd.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/hg19.known.21.gpd.gz -------------------------------------------------------------------------------- /test/test_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir example_small 4 | cd example_small 5 | 6 | echo "Download reference genome (chromosome 21) FASTA file" 7 | wget ftp://ftp.ensembl.org/pub/release-75//fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.21.fa.gz 8 | 9 | echo "Unzip reference genome (chromosome 21) FASTA file" 10 | gunzip Homo_sapiens.GRCh37.75.dna.chromosome.21.fa.gz 
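# Note: the steps below assume run_rnacocktail.py (installed via setup.py) and samtools are already on PATH.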
11 | 12 | echo "Download reference annotation GTF file" 13 | wget ftp://ftp.ensembl.org/pub/release-75//gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz 14 | 15 | echo "Unzip reference annotation GTF file" 16 | gunzip Homo_sapiens.GRCh37.75.gtf.gz 17 | 18 | echo "Restrict GTF to chromosome 21" 19 | awk '{if ($1==21) print}' Homo_sapiens.GRCh37.75.gtf > Homo_sapiens.GRCh37.75.chromosome.21.gtf 20 | 21 | 22 | echo "Download HISAT2 binaries" 23 | wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-2.0.5-Linux_x86_64.zip 24 | 25 | echo "Unzip HISAT2 binaries" 26 | unzip hisat2-2.0.5-Linux_x86_64.zip 27 | 28 | echo "Index genome with HISAT2" 29 | ./hisat2-2.0.5/hisat2-build Homo_sapiens.GRCh37.75.dna.chromosome.21.fa Homo_sapiens.GRCh37.75.dna.chromosome.21.HISAT2 30 | 31 | echo "Test alignment step using HISAT2" 32 | run_rnacocktail.py align --align_idx Homo_sapiens.GRCh37.75.dna.chromosome.21.HISAT2 --outdir out --workdir work --ref_gtf Homo_sapiens.GRCh37.75.chromosome.21.gtf --1 ../A1_1.fq.gz --2 ../A1_2.fq.gz --hisat2 hisat2-2.0.5/hisat2 --hisat2_sps hisat2-2.0.5/hisat2_extract_splice_sites.py --samtools samtools --sample A 33 | 34 | echo "Download StringTie binaries" 35 | wget http://ccb.jhu.edu/software/stringtie/dl/stringtie-1.3.3.Linux_x86_64.tar.gz 36 | 37 | echo "Untar StringTie binaries" 38 | tar -xzvf stringtie-1.3.3.Linux_x86_64.tar.gz 39 | 40 | echo "Test reconstruction step using StringTie" 41 | run_rnacocktail.py reconstruct --alignment_bam work/hisat2/A/alignments.sorted.bam --outdir out --workdir work --ref_gtf Homo_sapiens.GRCh37.75.chromosome.21.gtf --stringtie stringtie-1.3.3.Linux_x86_64/stringtie --sample A 42 | --------------------------------------------------------------------------------
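A natural follow-on check is the variant-calling step implemented in src/run_variant.py. The sketch below shows how it could be appended to test_run.sh; the variant subcommand and flag names are assumptions inferred from the run_variant()/run_gatk() parameters and should be verified against run_rnacocktail.py --help, and picard.jar/gatk.jar are placeholder paths for separately downloaded Picard and GATK4 jars.

echo "Test variant calling step using GATK (sketch; flag names inferred from src/run_variant.py)"
# GATK requires a FASTA index and sequence dictionary for the reference genome first:
samtools faidx Homo_sapiens.GRCh37.75.dna.chromosome.21.fa
java -jar picard.jar CreateSequenceDictionary R=Homo_sapiens.GRCh37.75.dna.chromosome.21.fa
run_rnacocktail.py variant --alignment out/hisat2/A/alignments.sorted.bam --ref_genome Homo_sapiens.GRCh37.75.dna.chromosome.21.fa --picard picard.jar --gatk gatk.jar --sample A --outdir out --workdir work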