├── .gitignore
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── analysis_scripts
│   ├── alignment
│   │   ├── README.md
│   │   ├── RNACocktail-Alignment-Analysis.ipynb
│   │   └── RNACocktail-Alignment-Analysis.py
│   ├── denovo
│   │   ├── README.md
│   │   ├── RNACocktail-Denovo-Analysis.ipynb
│   │   └── RNACocktail-Denovo-Analysis.py
│   ├── diff
│   │   ├── README.md
│   │   ├── RNACocktail-DIFF-Analysis.ipynb
│   │   └── RNACocktail-DIFF-Analysis.py
│   ├── editing
│   │   ├── README.md
│   │   ├── RNACocktail-Editing-Analysis.ipynb
│   │   └── RNACocktail-Editing-Analysis.py
│   ├── fusion
│   │   ├── README.md
│   │   ├── RNACocktail-Fusion-Analysis.ipynb
│   │   └── RNACocktail-Fusion-Analysis.py
│   ├── quantification
│   │   ├── README.md
│   │   ├── RNACocktail-Quant-Analysis.ipynb
│   │   └── RNACocktail-Quant-Analysis.py
│   ├── reconstruction
│   │   ├── README.md
│   │   ├── RNACocktail-Reconstruction-Analysis.ipynb
│   │   └── RNACocktail-Reconstruction-Analysis.py
│   └── variant
│       ├── README.md
│       ├── RNACocktail-Variant-Analysis.ipynb
│       └── RNACocktail-Variant-Analysis.py
├── docker
│   └── Dockerfile
├── ez_setup.py
├── index.html
├── scripts
│   ├── gpd2gtf.py
│   ├── hisat2_jun2bed.py
│   └── run_rnacocktail.py
├── setup.py
├── src
│   ├── __init__.py
│   ├── _version.py
│   ├── defaults.py
│   ├── external_cmd.py
│   ├── main.py
│   ├── run_diff.py
│   ├── run_dnv_assemebly.py
│   ├── run_editing.py
│   ├── run_fusion.py
│   ├── run_lr_align.py
│   ├── run_lr_correct.py
│   ├── run_lr_fusion.py
│   ├── run_lr_reconstruct.py
│   ├── run_quantify.py
│   ├── run_reconstruct.py
│   ├── run_sr_align.py
│   ├── run_variant.py
│   └── utils.py
└── test
    ├── A1_1.fq.gz
    ├── A1_2.fq.gz
    ├── A2_1.fq.gz
    ├── A2_2.fq.gz
    ├── B1_1.fq.gz
    ├── B1_2.fq.gz
    ├── B2_1.fq.gz
    ├── B2_2.fq.gz
    ├── C_long.fa.gz
    ├── C_short.fa.gz
    ├── C_short_1.fq.gz
    ├── C_short_2.fq.gz
    ├── GRCh37_genes_pos.bed.gz
    ├── GRCh37_strand_pos.bed.gz
    ├── GRCh38.21.gpd.gz
    ├── GRCh38_genes_pos.bed.gz
    ├── GRCh38_strand_pos.bed.gz
    ├── docker_test.sh
    ├── hg19.known.21.gpd.gz
    └── test_run.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 |
3 | .idea/**
4 |
5 | /build
6 |
7 | /RNACocktail_Pipeline.egg-info
8 |
9 | /dist
10 |
11 | /test/example*
12 |
13 | /test/*.jar
14 |
15 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | RNACocktail (c) 2016 by Roche Sequencing Solutions, Inc. All rights reserved.
2 | RNACocktail is licensed under Apache License Version 2.0.
3 | -------------------------------------------------------------
4 |
5 | The script "gpd2gtf.py" is modified from the original code from
6 | https://github.com/jason-weirather/Au-public/blob/master/gold/gpd2gtf.py
7 | available under Apache License Version 2.0.
8 |
9 | -------------------------------------------------------------
10 |
11 | Apache License
12 | Version 2.0, January 2004
13 | http://www.apache.org/licenses/
14 |
15 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
16 |
17 | 1. Definitions.
18 |
19 | "License" shall mean the terms and conditions for use, reproduction,
20 | and distribution as defined by Sections 1 through 9 of this document.
21 |
22 | "Licensor" shall mean the copyright owner or entity authorized by
23 | the copyright owner that is granting the License.
24 |
25 | "Legal Entity" shall mean the union of the acting entity and all
26 | other entities that control, are controlled by, or are under common
27 | control with that entity. For the purposes of this definition,
28 | "control" means (i) the power, direct or indirect, to cause the
29 | direction or management of such entity, whether by contract or
30 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
31 | outstanding shares, or (iii) beneficial ownership of such entity.
32 |
33 | "You" (or "Your") shall mean an individual or Legal Entity
34 | exercising permissions granted by this License.
35 |
36 | "Source" form shall mean the preferred form for making modifications,
37 | including but not limited to software source code, documentation
38 | source, and configuration files.
39 |
40 | "Object" form shall mean any form resulting from mechanical
41 | transformation or translation of a Source form, including but
42 | not limited to compiled object code, generated documentation,
43 | and conversions to other media types.
44 |
45 | "Work" shall mean the work of authorship, whether in Source or
46 | Object form, made available under the License, as indicated by a
47 | copyright notice that is included in or attached to the work
48 | (an example is provided in the Appendix below).
49 |
50 | "Derivative Works" shall mean any work, whether in Source or Object
51 | form, that is based on (or derived from) the Work and for which the
52 | editorial revisions, annotations, elaborations, or other modifications
53 | represent, as a whole, an original work of authorship. For the purposes
54 | of this License, Derivative Works shall not include works that remain
55 | separable from, or merely link (or bind by name) to the interfaces of,
56 | the Work and Derivative Works thereof.
57 |
58 | "Contribution" shall mean any work of authorship, including
59 | the original version of the Work and any modifications or additions
60 | to that Work or Derivative Works thereof, that is intentionally
61 | submitted to Licensor for inclusion in the Work by the copyright owner
62 | or by an individual or Legal Entity authorized to submit on behalf of
63 | the copyright owner. For the purposes of this definition, "submitted"
64 | means any form of electronic, verbal, or written communication sent
65 | to the Licensor or its representatives, including but not limited to
66 | communication on electronic mailing lists, source code control systems,
67 | and issue tracking systems that are managed by, or on behalf of, the
68 | Licensor for the purpose of discussing and improving the Work, but
69 | excluding communication that is conspicuously marked or otherwise
70 | designated in writing by the copyright owner as "Not a Contribution."
71 |
72 | "Contributor" shall mean Licensor and any individual or Legal Entity
73 | on behalf of whom a Contribution has been received by Licensor and
74 | subsequently incorporated within the Work.
75 |
76 | 2. Grant of Copyright License. Subject to the terms and conditions of
77 | this License, each Contributor hereby grants to You a perpetual,
78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
79 | copyright license to reproduce, prepare Derivative Works of,
80 | publicly display, publicly perform, sublicense, and distribute the
81 | Work and such Derivative Works in Source or Object form.
82 |
83 | 3. Grant of Patent License. Subject to the terms and conditions of
84 | this License, each Contributor hereby grants to You a perpetual,
85 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
86 | (except as stated in this section) patent license to make, have made,
87 | use, offer to sell, sell, import, and otherwise transfer the Work,
88 | where such license applies only to those patent claims licensable
89 | by such Contributor that are necessarily infringed by their
90 | Contribution(s) alone or by combination of their Contribution(s)
91 | with the Work to which such Contribution(s) was submitted. If You
92 | institute patent litigation against any entity (including a
93 | cross-claim or counterclaim in a lawsuit) alleging that the Work
94 | or a Contribution incorporated within the Work constitutes direct
95 | or contributory patent infringement, then any patent licenses
96 | granted to You under this License for that Work shall terminate
97 | as of the date such litigation is filed.
98 |
99 | 4. Redistribution. You may reproduce and distribute copies of the
100 | Work or Derivative Works thereof in any medium, with or without
101 | modifications, and in Source or Object form, provided that You
102 | meet the following conditions:
103 |
104 | (a) You must give any other recipients of the Work or
105 | Derivative Works a copy of this License; and
106 |
107 | (b) You must cause any modified files to carry prominent notices
108 | stating that You changed the files; and
109 |
110 | (c) You must retain, in the Source form of any Derivative Works
111 | that You distribute, all copyright, patent, trademark, and
112 | attribution notices from the Source form of the Work,
113 | excluding those notices that do not pertain to any part of
114 | the Derivative Works; and
115 |
116 | (d) If the Work includes a "NOTICE" text file as part of its
117 | distribution, then any Derivative Works that You distribute must
118 | include a readable copy of the attribution notices contained
119 | within such NOTICE file, excluding those notices that do not
120 | pertain to any part of the Derivative Works, in at least one
121 | of the following places: within a NOTICE text file distributed
122 | as part of the Derivative Works; within the Source form or
123 | documentation, if provided along with the Derivative Works; or,
124 | within a display generated by the Derivative Works, if and
125 | wherever such third-party notices normally appear. The contents
126 | of the NOTICE file are for informational purposes only and
127 | do not modify the License. You may add Your own attribution
128 | notices within Derivative Works that You distribute, alongside
129 | or as an addendum to the NOTICE text from the Work, provided
130 | that such additional attribution notices cannot be construed
131 | as modifying the License.
132 |
133 | You may add Your own copyright statement to Your modifications and
134 | may provide additional or different license terms and conditions
135 | for use, reproduction, or distribution of Your modifications, or
136 | for any such Derivative Works as a whole, provided Your use,
137 | reproduction, and distribution of the Work otherwise complies with
138 | the conditions stated in this License.
139 |
140 | 5. Submission of Contributions. Unless You explicitly state otherwise,
141 | any Contribution intentionally submitted for inclusion in the Work
142 | by You to the Licensor shall be under the terms and conditions of
143 | this License, without any additional terms or conditions.
144 | Notwithstanding the above, nothing herein shall supersede or modify
145 | the terms of any separate license agreement you may have executed
146 | with Licensor regarding such Contributions.
147 |
148 | 6. Trademarks. This License does not grant permission to use the trade
149 | names, trademarks, service marks, or product names of the Licensor,
150 | except as required for reasonable and customary use in describing the
151 | origin of the Work and reproducing the content of the NOTICE file.
152 |
153 | 7. Disclaimer of Warranty. Unless required by applicable law or
154 | agreed to in writing, Licensor provides the Work (and each
155 | Contributor provides its Contributions) on an "AS IS" BASIS,
156 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
157 | implied, including, without limitation, any warranties or conditions
158 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
159 | PARTICULAR PURPOSE. You are solely responsible for determining the
160 | appropriateness of using or redistributing the Work and assume any
161 | risks associated with Your exercise of permissions under this License.
162 |
163 | 8. Limitation of Liability. In no event and under no legal theory,
164 | whether in tort (including negligence), contract, or otherwise,
165 | unless required by applicable law (such as deliberate and grossly
166 | negligent acts) or agreed to in writing, shall any Contributor be
167 | liable to You for damages, including any direct, indirect, special,
168 | incidental, or consequential damages of any character arising as a
169 | result of this License or out of the use or inability to use the
170 | Work (including but not limited to damages for loss of goodwill,
171 | work stoppage, computer failure or malfunction, or any and all
172 | other commercial damages or losses), even if such Contributor
173 | has been advised of the possibility of such damages.
174 |
175 | 9. Accepting Warranty or Additional Liability. While redistributing
176 | the Work or Derivative Works thereof, You may choose to offer,
177 | and charge a fee for, acceptance of support, warranty, indemnity,
178 | or other liability obligations and/or rights consistent with this
179 | License. However, in accepting such obligations, You may act only
180 | on Your own behalf and on Your sole responsibility, not on behalf
181 | of any other Contributor, and only if You agree to indemnify,
182 | defend, and hold each Contributor harmless for any liability
183 | incurred by, or claims asserted against, such Contributor by reason
184 | of your accepting any such warranty or additional liability.
185 |
186 | END OF TERMS AND CONDITIONS
187 |
188 | APPENDIX: How to apply the Apache License to your work.
189 |
190 | To apply the Apache License to your work, attach the following
191 | boilerplate notice, with the fields enclosed by brackets "{}"
192 | replaced with your own identifying information. (Don't include
193 | the brackets!) The text should be enclosed in the appropriate
194 | comment syntax for the file format. We also recommend that a
195 | file or class name and description of purpose be included on the
196 | same "printed page" as the copyright notice for easier
197 | identification within third-party archives.
198 |
199 | Copyright {yyyy} {name of copyright owner}
200 |
201 | Licensed under the Apache License, Version 2.0 (the "License");
202 | you may not use this file except in compliance with the License.
203 | You may obtain a copy of the License at
204 |
205 | http://www.apache.org/licenses/LICENSE-2.0
206 |
207 | Unless required by applicable law or agreed to in writing, software
208 | distributed under the License is distributed on an "AS IS" BASIS,
209 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
210 | See the License for the specific language governing permissions and
211 | limitations under the License.
212 |
213 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE.txt
2 | include ez_setup.py
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | RNACocktail: A comprehensive framework for accurate and efficient RNA-Seq analysis
2 |
3 | See http://bioinform.github.io/rnacocktail/ for help and downloads.
4 |
--------------------------------------------------------------------------------
/analysis_scripts/alignment/README.md:
--------------------------------------------------------------------------------
1 | RNACocktail Alignment Analysis
2 | ===========
3 |
4 | ### [Read it online here](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/bioinform/rnacocktail/master/analysis_scripts/alignment/RNACocktail-Alignment-Analysis.ipynb)
5 |
--------------------------------------------------------------------------------
/analysis_scripts/alignment/RNACocktail-Alignment-Analysis.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 | get_ipython().magic(u'pylab inline')
7 |
8 |
9 | # In[2]:
10 |
11 | import pybedtools
12 | import pickle
13 | from matplotlib_venn import venn3, venn3_circles,venn3_unweighted,venn2
14 | import seaborn as sns
15 | from pandas import DataFrame
16 | import os
17 | import csv
18 | import matplotlib.patches as patches
19 | import pysam
20 |
21 |
22 | # # Initialization
23 |
24 | # In[3]:
25 |
26 | methods=["Tophat","STAR","HISAT2"]
27 | sample="NA12878"
28 | reliable_est_bed="/path/to/reliable/EST/junctions.bed"
29 |
30 |
31 | # # Predictions
32 |
33 | # In[4]:
34 |
35 | bed_files={'Tophat':'/path/to/TopHat/junctions.bed',
36 | 'STAR':'/path/to/STAR/SJ.out.tab',
37 | 'HISAT2':'/path/to/HISAT/splicesites.txt',
38 | }
39 | bam_files={'Tophat':'/path/to/TopHat/alignments.bam',
40 | 'STAR':'/path/to/STAR/alignments.bam',
41 | 'HISAT2':'/path/to/HISAT2/alignments.bam',
42 | }
43 |
44 |
45 | # In[ ]:
46 |
47 |
48 |
49 |
50 | # # Functions
51 |
52 | # In[5]:
53 |
54 | def find_stats(bamfile,statfile):
55 | sam_file = pysam.Samfile(bamfile, "rb")
56 | seq={"1":[],"2":[]}
57 | current_qname=""
58 | uniqmap_uniqmap=0
59 | uniqmap_multimap=0
60 | multimap_multimap=0
61 | uniqmap_unmap=0
62 | multimap_unmap=0
63 | unmap_unmap=0
64 | cnts=0
65 | for line in sam_file:
66 | qname=line.qname
67 | if current_qname=="":
68 | current_qname=qname
69 | if qname!=current_qname:
70 | uniqed_multi_un={}
71 | for fs in ["1","2"]:
72 | NHs=map(lambda x:x[1],seq[fs])
73 | if len(set(NHs))==1:
74 | NH=NHs[0]
75 | if NH==1:
76 | uniqed_multi_un[fs]=0
77 | elif NH==-1:
78 | uniqed_multi_un[fs]=2
79 | else:
80 | uniqed_multi_un[fs]=1
81 | if uniqed_multi_un["1"]==0 and uniqed_multi_un["2"]==0:
82 | uniqmap_uniqmap+=1
83 | elif (uniqed_multi_un["1"]==0 and uniqed_multi_un["2"]==1) or (
84 | uniqed_multi_un["1"]==1 and uniqed_multi_un["2"]==0):
85 | uniqmap_multimap+=1
86 | elif (uniqed_multi_un["1"]==1 and uniqed_multi_un["2"]==1):
87 | multimap_multimap+=1
88 | elif (uniqed_multi_un["1"]==0 and uniqed_multi_un["2"]==2) or (
89 | uniqed_multi_un["1"]==2 and uniqed_multi_un["2"]==0):
90 | uniqmap_unmap+=1
91 | elif (uniqed_multi_un["1"]==1 and uniqed_multi_un["2"]==2) or (
92 | uniqed_multi_un["1"]==2 and uniqed_multi_un["2"]==1):
93 | multimap_unmap+=1
94 | elif (uniqed_multi_un["1"]==2 and uniqed_multi_un["2"]==2):
95 | unmap_unmap+=1
96 | else:
97 | print "ERRR3 ", line
98 | aaaa
99 | current_qname=qname
100 | seq={"1":[],"2":[]}
101 |
102 | flag=np.binary_repr(line.flag,12)
103 | tags=dict(line.get_tags())
104 | NH=-1 if "NH" not in tags else tags["NH"]
105 | mpd=flag[-3]=="0"
106 | pmpd=flag[-4]=="0"
107 | first=flag[-7]=="1"
108 | second=flag[-8]=="1"
109 | if not (first ^ second):
110 | print "ERRR1 ", line
111 | aaaa
112 |
113 | if (not mpd) and NH>0:
114 | print "ERRR1 ", line
115 | aaaa
116 |
117 | fs="1" if first else "2"
118 | seq[fs].append([flag,NH,mpd,pmpd])
119 | cnts+=1
120 | with open(statfile, 'wb') as csvfile:
121 | spamwriter = csv.writer(csvfile, delimiter='\t',
122 | quotechar='|', quoting=csv.QUOTE_MINIMAL)
123 | spamwriter.writerow(["uniqmap_uniqmap", "uniqmap_multimap", "multimap_multimap", "uniqmap_unmap", "multimap_unmap", "unmap_unmap","total","cnts"])
124 | spamwriter.writerow([uniqmap_uniqmap, uniqmap_multimap, multimap_multimap, uniqmap_unmap, multimap_unmap, unmap_unmap,
125 | sum([uniqmap_uniqmap, uniqmap_multimap, multimap_multimap, uniqmap_unmap, multimap_unmap, unmap_unmap]),cnts])
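
# A minimal, self-contained sketch of the SAM FLAG bits that find_stats above
# decodes via np.binary_repr(line.flag,12): bit 0x4 marks an unmapped read,
# 0x8 an unmapped mate, 0x40 first-in-pair and 0x80 second-in-pair.
def decode_flag(flag):
    return {"unmapped": bool(flag & 0x4),
            "mate_unmapped": bool(flag & 0x8),
            "first_in_pair": bool(flag & 0x40),
            "second_in_pair": bool(flag & 0x80)}

# e.g. decode_flag(99) -> first in pair, both mates mapped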
126 |
127 | def find_matchstats(bamfile,matchstatfile):
128 | sam_file = pysam.Samfile(bamfile, "rb")
129 | match_stats={}
130 | for line in sam_file:
131 | if line.cigar:
132 | codes={}
133 | for k,v in line.cigar:
134 | if k not in codes:
135 | codes[k]=0
136 | codes[k]+=v
137 | for k,v in codes.iteritems():
138 | if k not in match_stats:
139 | match_stats[k]={}
140 | if v not in match_stats[k]:
141 | match_stats[k][v]=0
142 | match_stats[k][v]+=1
143 | pickle.dump(match_stats,open(matchstatfile,"w"))
144 |
145 | def find_NMstats(bamfile,NMstatfile):
146 | sam_file = pysam.Samfile(bamfile, "rb")
147 | NM_stats={}
148 | for line in sam_file:
149 | unmapped=(line.flag/4)%2==1
150 | if unmapped:
151 | continue
152 | tags=dict(line.tags)
153 | if "NM" in tags:
154 | nm=tags["NM"]
155 | if nm not in NM_stats:
156 | NM_stats[nm]=0
157 | NM_stats[nm]+=1
158 | elif "nM" in tags:
159 | nm=tags["nM"]
160 | if nm not in NM_stats:
161 | NM_stats[nm]=0
162 | NM_stats[nm]+=1
163 | else:
164 | print tags
165 | raise ValueError("alignment record has neither NM nor nM tag")
166 | pickle.dump(NM_stats,open(NMstatfile,"w"))
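
# The fallback above reflects tag conventions: "NM" is the standard SAM tag
# for edit distance to the reference, while STAR emits "nM" (mismatches per
# pair). A minimal sketch of the same lookup, given a dict of optional tags:
def edit_distance(tags):
    return tags["NM"] if "NM" in tags else tags.get("nM")

# e.g. edit_distance({"nM": 2}) -> 2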
167 |
168 |
169 | # # Analysis
170 |
171 | # In[6]:
172 |
173 | est_junctions_reliable=pybedtools.BedTool(reliable_est_bed)
174 |
175 |
176 | # In[7]:
177 |
178 | all_beds={}
179 | for method,bedfile in bed_files.iteritems():
180 | mybed=pybedtools.BedTool(bedfile)
181 | if method == "STAR":
182 | mybed=mybed.filter(lambda x: (int(x[2])-int(x[1]))>1).each(lambda x:[x[0],int(x[1])+1,x[2]]).saveas()
183 | elif method == "HISAT2":
184 | mybed=mybed.each(lambda x:[x[0],int(x[1])+1,x[2]]).saveas()
185 | elif method == "Tophat":
186 | mybed=mybed.each(lambda x:[x[0],int(x[1])+int(x[10].split(",")[0]),int(x[2])-int(x[10].split(",")[1])]).saveas()
187 | all_beds[method]=mybed.each(lambda x:["chr%s"%x[0],x[1],x[2]]).saveas()
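
# The three aligners report junctions differently, so the cell above
# normalizes them to a common interval before intersection: STAR's SJ.out.tab
# and HISAT2's splice sites get a +1 start shift, while TopHat's BED12
# junctions are trimmed by their anchor block sizes (column 11) so that only
# the intron itself remains. A sketch of the TopHat case, assuming a BED12
# row with blockSizes "38,62":
def tophat_junction_to_intron(row):
    left_anchor, right_anchor = map(int, row[10].split(",")[:2])
    return [row[0], int(row[1]) + left_anchor, int(row[2]) - right_anchor]

# e.g. tophat_junction_to_intron(["chr1","1000","2000"] + [None]*7 + ["38,62"])
#      -> ["chr1", 1038, 1938]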
188 |
189 |
190 | # In[8]:
191 |
192 | for method,bamfile in bam_files.iteritems():
193 | statfile=bamfile+".mystats"
194 | if os.path.exists(bamfile):
195 | if not os.path.exists(statfile):
196 | find_stats(bamfile,statfile)
197 |
198 |
199 | # In[9]:
200 |
201 | for method,bamfile in bam_files.iteritems():
202 | statfile=bamfile+".mystats_match"
203 | if os.path.exists(bamfile):
204 | if not os.path.exists(statfile):
205 | find_matchstats(bamfile,statfile)
206 |
207 |
208 | # In[10]:
209 |
210 | for method,bamfile in bam_files.iteritems():
211 | statfile=bamfile+".mystats_NM"
212 | if os.path.exists(bamfile):
213 | if not os.path.exists(statfile):
214 | print sample,method
215 | find_NMstats(bamfile,statfile)
216 |
217 |
218 | # In[11]:
219 |
220 | def parse_my_stats(stat_file):
221 |     mystats={}
222 |     with open(stat_file, 'r') as csv_f:
223 |         spamreader = csv.reader(csv_f, delimiter='\t', quotechar='|')
224 |         cnt=0
225 |         for row in spamreader:
226 |             if cnt==0:
227 |                 keys=row
228 |                 cnt=1
229 |             else:
230 |                 vals=row
231 |                 mystats={x[0]:int(x[1]) for x in zip(keys,vals)}
232 |     # single exit point: returns {} if the file had no data rows
233 |     return mystats
234 |
235 |
236 | # In[12]:
237 |
238 | alignment_stats={}
239 | for method,bed in all_beds.iteritems():
240 | alignment_stats[method]={}
241 | L=len(bed)
242 | L_est_reliable=len(bed.intersect(est_junctions_reliable,f=0.99,u=True,r=True))
243 | alignment_stats[method].update({"n_junctions":L, "n_est_reliable":L_est_reliable, "r_est_reliable":round(float(L_est_reliable)/float(L),2)})
244 |
245 |
246 | # In[13]:
247 |
248 | for method,bamfile in bam_files.iteritems():
249 | statfile=bamfile+".mystats"
250 | mystats=parse_my_stats(statfile)
251 | alignment_stats[method].update(mystats)
252 |
253 |
254 | # In[14]:
255 |
256 | for method,bamfile in bam_files.iteritems():
257 | statfile=bamfile+".mystats_match"
258 | mystats=pickle.load(open(statfile))
259 | alignment_stats[method].update({"match_stats":mystats})
260 |
261 |
262 | # In[15]:
263 |
264 | for method,bamfile in bam_files.iteritems():
265 | statfile=bamfile+".mystats_NM"
266 | mystats=pickle.load(open(statfile))
267 | alignment_stats[method].update({"NM":mystats})
268 |
269 |
270 | # In[16]:
271 |
272 | intersect_3methods={i:{} for i in range(8)}
273 | for iii in range(8):
274 | if iii==0:
275 | continue
276 | i=iii%2
277 | j=(iii/2)%2
278 | k=(iii/4)%2
279 | bed1=all_beds[methods[0]]
280 | bed2=all_beds[methods[1]]
281 | bed3=all_beds[methods[2]]
282 | if i==1:
283 | bed=bed1
284 | elif j==1:
285 | bed=bed2
286 | elif k==1:
287 | bed=bed3
288 | bed=bed.intersect(bed1,f=0.99,u=True if i==1 else False,v=True if i==0 else False,r=True)
289 | bed=bed.intersect(bed2,f=0.99,u=True if j==1 else False,v=True if j==0 else False,r=True)
290 | bed=bed.intersect(bed3,f=0.99,u=True if k==1 else False,v=True if k==0 else False,r=True)
291 | L=len(bed)
292 | L_est_reliable=len(bed.intersect(est_junctions_reliable,f=0.99,u=True,r=True))
293 | intersect_3methods[iii].update({"n_junctions":L, "n_est_reliable":L_est_reliable, "r_est_reliable":round(float(L_est_reliable)/float(L),2)})
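
# Each subset index iii in 1..7 above is read as a 3-bit mask over
# (methods[0], methods[1], methods[2]); a set bit means "junction present in
# that method", so e.g. iii=5 (101) is "in Tophat and HISAT2 but not STAR".
# A sketch of the decoding used by the loop:
def subset_mask(iii):
    return (iii % 2, (iii // 2) % 2, (iii // 4) % 2)

# e.g. subset_mask(5) -> (1, 0, 1)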
294 |
295 |
296 | # ## Plots
297 |
298 | # ## junction validation
299 |
300 | # In[17]:
301 |
302 | sns.set(style="white",font_scale=1.5)
303 | fig, ax = plt.subplots(figsize=(8,2))
304 | bin_labels=["Reliable" , "Not Reliable"]
305 | A=[]
306 | B=[]
307 | res=[]
308 | labels=[]
309 | my_colors=sns.color_palette("Set1",n_colors=10)
310 | for jjj,method in enumerate(methods):
311 | A.append(alignment_stats[method]["n_junctions"])
312 | B.append(alignment_stats[method]["n_est_reliable"])
313 | labels.append(method)
314 | res.append(np.array(A))
315 | res.append(np.array(B))
316 | my_data=DataFrame(np.array(res).transpose(),index=labels,columns=bin_labels[::-1])
317 | for ii,b in enumerate(bin_labels[::-1]):
318 | cg=sns.barplot(data=my_data,x=b,y=labels,label=b, color=my_colors[ii],ax=ax)
319 | for i,ytick in enumerate(cg.get_yticklabels()):
320 | ytick.set_fontsize(12)
321 | ax.set_xlabel("Number of Junctions")
322 | ax.set_xticks(range(0,600000,200000))
323 | ax.set_yticks(range(len(labels)))
324 | ax.set_xticklabels(["%sk"%(x/1000) if x>0 else "0" for x in range(0,600000,200000)])
325 |
326 | ax.set_xlim([0,500000])
327 | ax.set_title("Validation rate of splicing junctions on dbEST",fontsize=16)
328 | sns.despine(left=True)
329 | handles, labels = ax.get_legend_handles_labels()
330 | # reverse the order
331 | ax.legend(handles[::-1], labels[::-1],bbox_to_anchor=(0.85, 0.65, 0.5, .3),
332 | loc=1,ncol=1,
333 | mode="expand", borderaxespad=0.,frameon=False,fontsize=14)
334 |
335 |
336 | # In[18]:
337 |
338 | sns.set(style="white",font_scale=2.2)
339 | fig, ax = plt.subplots(figsize=(10,10))
340 | keys=["r_est","r_est_reliable"]
341 | labels=["% of EST matches","% Reliable EST matches"]
342 | index = np.arange(len(methods))
343 | bar_width = 0.2
344 | opacity = 0.5
345 | my_colors=sns.color_palette("Set2",n_colors=10)
346 | v = venn3(subsets=[intersect_3methods[k]['n_junctions'] for k in range(1,8)],
347 | set_labels = ('A','B','C'),ax=ax,alpha=0.6,set_colors=my_colors[0:3])
348 | for c in range(1,8):
349 | i=c%2
350 | j=(c/2)%2
351 | k=(c/4)%2
352 | v.get_label_by_id('%d%d%d'%(i,j,k)).set_text("%d%%"%(
353 | intersect_3methods[c]['r_est_reliable']*100))
354 | v.get_label_by_id('A').set_text('TopHat\n%s,%03d\n(%d%%)'%(alignment_stats['Tophat']['n_junctions']/1000,
355 | alignment_stats['Tophat']['n_junctions']%1000,
356 | alignment_stats['Tophat']['r_est_reliable']*100))
357 | v.get_label_by_id('B').set_text('STAR\n%s,%03d\n(%d%%)'%(alignment_stats['STAR']['n_junctions']/1000,
358 | alignment_stats['STAR']['n_junctions']%1000,
359 | alignment_stats['STAR']['r_est_reliable']*100))
360 | v.get_label_by_id('C').set_text('HISAT2\n%s,%03d\n(%d%%)'%(alignment_stats['HISAT2']['n_junctions']/1000,
361 | alignment_stats['HISAT2']['n_junctions']%1000,
362 | alignment_stats['HISAT2']['r_est_reliable']*100))
363 | for labe_id in ["A","B","C"]:
364 | v.get_label_by_id(labe_id).set_fontsize(25)
365 | ax.set_title(sample,fontsize=25)
366 |
367 | for labe_id in ["A","B","C","110","101","111","011"]:
368 | v.get_patch_by_id(labe_id).set_linewidth(0)
369 |
370 | ax.legend(["Only TopHat","Only STAR","Only TopHat & STAR","Only HISAT2",
371 | "Only TopHat & HISAT2","Only STAR & HISAT2","TopHat & STAR & HISAT2"],bbox_to_anchor=(0, 1.1, 1.2, .3),
372 | loc=0,ncol=2,
373 | mode="expand", borderaxespad=0.,frameon=False)
374 |
375 |
376 |
377 | # ## Read mapping analysis
378 |
379 | # In[19]:
380 |
381 | sns.set(style="white",font_scale=1.2)
382 | colors=[4]
383 | nt=["A","C","G","T"]
384 | etypes=[]
385 | for i in nt:
386 | for j in nt:
387 | if i!=j:
388 | etypes.append(i+j)
389 | print etypes
390 | bin_labels=["Both mates uniquely mapped","Both mates multi-mapped", "One mate uniquely mapped, one multi-mapped",
391 | "One mate uniquely mapped, one unmapped","One mate multi-mapped, one unmapped", "Both mates unmapped"]
392 | keys=['uniqmap_uniqmap','multimap_multimap', 'uniqmap_multimap', 'uniqmap_unmap', 'multimap_unmap', 'unmap_unmap']
393 | my_colors=sns.color_palette("Set3",n_colors=10)
394 |
395 | fig, axes = plt.subplots(1,3,figsize=(17,2))
396 | ax=axes[0]
397 | res=[]
398 | labels=[]
399 | for method in methods:
400 | if method not in alignment_stats:
401 | continue
402 | if "uniqmap_uniqmap" in alignment_stats[method]:
403 | myres=[alignment_stats[method][k]/float(alignment_stats[method]["total"])*100 for k in keys][::-1]
404 | myres=[sum(myres[i:]) for i in range(len(myres))]
405 | res.append(myres)
406 | label=method
407 | labels.append(label)
408 | my_data=DataFrame(np.array(res),index=labels,columns=bin_labels)
409 | for ii,b in enumerate(bin_labels):
410 | cg=sns.barplot(data=my_data,x=b,y=labels,label=b, color=my_colors[ii],ax=ax)
411 |
412 | ax.set_xlabel("% of fragments")
413 | ax.set_xlim([0,100])
414 | sns.despine(left=True)
415 | handles, labels = ax.get_legend_handles_labels()
416 | # reverse the order
417 | ax.legend(handles[::-1], labels,bbox_to_anchor=(-0.4, 1, 1.52, .3),
418 | loc=0,ncol=2,
419 | mode="expand", borderaxespad=0.,frameon=False,fontsize=12)
420 | plt.tight_layout()
421 |
422 |
423 | ax=axes[1]
424 | bin_labels=["1","2-3","4-6","7-10","11-20",">20"]
425 | bins=[1,3,6,10,20,1000]
426 |
427 | codes=[4]
428 | res=[]
429 | labels=[]
430 | for method in methods:
431 | if method not in alignment_stats:
432 | continue
433 | if "match_stats" not in alignment_stats[method]:
434 | continue
435 | if set(alignment_stats[method]["match_stats"].keys())&set(codes):
436 | my_res=[]
437 | for b in bins[::-1]:
438 | my_res.append(sum([v for code in set(alignment_stats[method]["match_stats"].keys())&set(codes)
439 | for k,v in alignment_stats[method]["match_stats"][code].iteritems() if (
440 | k<=b)])/float(sum(alignment_stats[method]["NM"].values()))*100)
441 | my_res=my_res
442 | res.append(my_res)
443 | label=method
444 | labels.append(label)
445 | else:
446 | my_res=[]
447 | for b in bins:
448 | my_res.append(0)
449 | my_res=my_res
450 | res.append(my_res)
451 | label=method
452 | labels.append(label)
453 |
454 | my_data=DataFrame(np.array(res),index=labels,columns=bin_labels)
455 | for ii,b in enumerate(bin_labels):
456 | cg=sns.barplot(data=my_data,x=b,y=labels,label=b, color=my_colors[ii],ax=ax)
457 |
458 | ax.set_yticklabels([])
459 |
460 | ax.set_xlabel("% of mapped fragments")
461 | sns.despine(left=True)
462 | handles, labels = ax.get_legend_handles_labels()
463 | ax.legend(handles[::-1], labels,bbox_to_anchor=(0.2, 1, .6, .3),
464 | loc=0,ncol=3,
465 | mode="expand", borderaxespad=0.,frameon=False,fontsize=12, title="Number of soft clipped bases")
466 | plt.tight_layout()
467 |
468 |
469 |
470 |
471 | ax=axes[2]
472 |
473 | bin_labels=["1","2","3-4","5-6","7-9",">9"]
474 | bins=[1,2,4,6,9,1000]
475 | res=[]
476 | labels=[]
477 | for method in methods:
478 | if method not in alignment_stats:
479 | continue
480 | if "NM" not in alignment_stats[method]:
481 | continue
482 | my_res=[]
483 | for b in bins[::-1]:
484 | my_res.append(sum([v/float(sum(alignment_stats[method]["NM"].values()))*100
485 | for k,v in alignment_stats[method]["NM"].iteritems() if (
486 | k<=b)]))
--------------------------------------------------------------------------------
/analysis_scripts/diff/RNACocktail-DIFF-Analysis.py:
--------------------------------------------------------------------------------
164 | cnt+=1
165 | if kk:
166 | diff_g_res_taq[k]=res[list(kk)[0]]
167 | print len(diff_g_res_taq),cnt
168 |
169 |
170 | # In[64]:
171 |
172 | diff_g_res_ercc={k:res[k] for k in (set(res.keys())&set(ercc_control.keys()))}
173 | print len(diff_g_res_ercc)
174 |
175 |
176 | # ## Plots
177 |
178 | # In[65]:
179 |
180 | taq_corr={}
181 | my_res=diff_g_res_taq
182 | taq_genes=TAQ_control.keys()
183 | x=[my_res[k][0] if k in my_res else 0 for k in taq_genes]
184 |
185 | x=map(lambda i:max(i,-14),x)
186 | x=map(lambda i:min(i,13),x)
187 | y=[TAQ_control[k]["logfc"] for k in taq_genes]
188 | taq_corr=find_corr(x,y)
189 | print taq_corr
190 |
191 |
192 | # In[66]:
193 |
194 | sns.set(style="white",font_scale=3)
195 | logFC_cutoffs=np.arange(0.5,2.5,0.5)
196 | AUC_TAQ={}
197 | for logFC_cutoff in logFC_cutoffs:
198 | SNs=[0]
199 | SPs=[1]
200 | prev_SP=0
201 | my_res=diff_g_res_taq
202 | for pval_cutof in sorted(map(lambda x:x[1],my_res.values())):
203 | taq_genes=TAQ_control.keys()
204 | T=set(filter(lambda x:abs(TAQ_control[x]["logfc"])>=logFC_cutoff,taq_genes))
205 | F=set(taq_genes)-set(T)
206 | homos=set(filter(lambda x:sign(my_res[x][0])==sign(TAQ_control[x]["logfc"]),my_res.keys()))
207 | P=set(filter(lambda x:my_res[x][1]<=pval_cutof,my_res.keys()))
208 | N=set(filter(lambda x:my_res[x][1]>pval_cutof,my_res.keys()))
209 | N=N|(set(taq_genes)-(P|N))
210 |
211 | TP=T&P&homos
212 | FP=(P&F)|(P&(T-homos))
213 | TN=F&N
214 | FN=T&N
215 | SN=len(TP)/float(len(TP)+len(FN)+0.00001)
216 | SP=len(TN)/float(len(TN)+len(FP)+0.00001)
217 | if SPs[-1]>0.7:
218 | SNs.append(SN)
219 | SPs.append(SP)
220 | prev_SP=SP
221 | SP_1=SPs[-1]
222 | SP_2=SPs[-2]
223 | SN_1=SNs[-1]
224 | SN_2=SNs[-2]
225 | SP=0.7
226 | SN=(SN_2-SN_1)/(SP_2-SP_1+0.0000001)*(SP-SP_1)+SN_1
227 | SNs[-1]=SN
228 | SPs[-1]=SP
229 | AUC_TAQ[logFC_cutoff]=metrics.auc(1-np.array(SPs),SNs)
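
# AUC_TAQ above is a partial AUC ("AUC-30"): the ROC curve is only traced
# while specificity stays above 0.7 (FPR <= 0.3), with the last point
# linearly interpolated onto SP=0.7 before integrating. A minimal sketch of
# that idea with the trapezoid rule (skipping the interpolation step),
# assuming SPs decreases from 1.0:
import numpy as np

def auc_30(SPs, SNs, sp_floor=0.7):
    fpr = 1 - np.array(SPs)
    tpr = np.array(SNs)
    keep = fpr <= (1 - sp_floor)
    return np.trapz(tpr[keep], fpr[keep])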
230 |
231 |
232 |
233 | # In[104]:
234 |
235 | logFC_cutoff=0.5
236 | pval_cutof=0.05
237 | sns.set(style="white",font_scale=2)
238 | SNs=[0]
239 | SPs=[1]
240 | my_res=diff_g_res_taq
241 | for pval_cutof in sorted(map(lambda x:x[1],my_res.values())):
242 | taq_genes=TAQ_control.keys()
243 | T=set(filter(lambda x:abs(TAQ_control[x]["logfc"])>=logFC_cutoff,taq_genes))
244 | F=set(taq_genes)-set(T)
245 | homos=set(filter(lambda x:sign(my_res[x][0])==sign(TAQ_control[x]["logfc"]),my_res.keys()))
246 | P=set(filter(lambda x:my_res[x][1]<=pval_cutof,my_res.keys()))
247 | N=set(filter(lambda x:my_res[x][1]>pval_cutof,my_res.keys()))
248 | N=N|(set(taq_genes)-(P|N))
249 | TP=T&P&homos
250 | FP=(P&F)|(P&(T-homos))
251 | TN=F&N
252 | FN=T&N
253 | SN=len(TP)/float(len(TP)+len(FN))
254 | SP=len(TN)/float(len(TN)+len(FP))
255 | SNs.append(SN)
256 | SPs.append(SP)
257 | plot(1-np.array(SPs),SNs)
258 | xlabel("FPR (1-Specificity)")
259 | ylabel("TPR (Sensitivity)")
260 | title("ROC analysis of qRT-PCR measured genes",fontsize=18)
261 |
262 |
263 | # In[106]:
264 |
265 | logFC_cutoff=0.5
266 | pval_cutof=0.05
267 | sns.set(style="white",font_scale=2)
268 | x=logFC_cutoffs
269 | y=[AUC_TAQ[w] for w in logFC_cutoffs]
270 | plot(x,y)
271 | xlabel("log2-fold change threshold")
272 | ylabel("AUC-30")
273 | title("AUC-30 vs. log2-fold change for qRT-PCR experiment",fontsize=18)
274 |
275 |
276 | # In[80]:
277 |
278 | taq_corr
279 |
280 |
281 | # In[108]:
282 |
283 | sns.set(style="white",font_scale=1.5)
284 | my_data=DataFrame([["DESeq2+Salmon-SMEM",taq_corr["spearman"][0],"Spearman rank correlation"],
285 | ["DESeq2+Salmon-SMEM",taq_corr["RMSD"],"RMSD"],
286 | ["DESeq2+Salmon-SMEM",AUC_TAQ[0.5],"AUC-30"]]
287 | ,
288 | columns=["tool","score","Measure"])
289 | fig, axes = plt.subplots(1,3,figsize=(16,5))
290 | for iii,key in enumerate(["Spearman rank correlation","RMSD","AUC-30"]):
291 | ax=axes[iii]
292 | my_data_=my_data[my_data["Measure"]==key]
293 | my_data_=my_data_.sort_values(by='score', ascending=[1 if key=="RMSD" else 0])
294 |
295 | cg=sns.stripplot(y="tool", x="score",data=my_data_,size=10, hue="Measure", orient="h",edgecolor="gray",ax=ax)
296 | ax.set_ylabel("")
297 | ax.set_xlabel(key)
298 | ax.legend([])
299 | ax.xaxis.grid(False)
300 | if iii==0:
301 | ax.set_xticks(np.arange(0.65,1,.1))
302 | ax.set_xlim([0.65,0.95])
303 | elif iii==1:
304 | ax.set_xticks(np.arange(1.5,4,1))
305 | ax.set_xlim([1.5,3.5])
306 | elif iii==2:
307 | ax.set_xticks(np.arange(0.08,0.24,.04))
308 | ax.set_xlim([0.08,0.2])
309 | ax.yaxis.grid(True)
310 | sns.despine(bottom=True)
311 | sns.despine(top=True)
312 | sns.despine(right=True)
313 | sns.despine(left=True)
314 | plt.tight_layout()
315 |
316 |
317 | # In[91]:
318 |
319 | ercc_corr={}
320 | my_res=diff_g_res_ercc
321 | ercc_genes=ercc_control.keys()
322 | x=[my_res[k][0] if k in my_res else 0 for k in ercc_genes ]
323 | x=map(lambda i:max(i,-14),x)
324 | x=map(lambda i:min(i,13),x)
325 | y=[ercc_control[k]["logfc"] for k in ercc_genes ]
326 | print len(x),len(y)
327 | ercc_corr=find_corr(x,y)
328 |
329 |
330 | # In[94]:
331 |
332 | logFC_cutoff=0.5
333 | pval_cutof=0.05
334 | sns.set(style="white",font_scale=3)
335 | SNs=[0]
336 | SPs=[1]
337 | my_res=diff_g_res_ercc
338 | for thr in (np.arange(0,3,0.1).tolist()+range(3,100))[::-1]:
339 | pval_cutof=10**-thr
340 | ercc_genes=ercc_control.keys()
341 | T=set(filter(lambda x:abs(ercc_control[x]["logfc"])>0,ercc_genes))
342 | F=set(ercc_genes)-set(T)
343 | homos=set(filter(lambda x:sign(my_res[x][0])==sign(ercc_control[x]["logfc"]),my_res.keys()))
344 | P=set(filter(lambda x:my_res[x][1]<=pval_cutof,my_res.keys()))
345 | N=set(filter(lambda x:my_res[x][1]>pval_cutof,my_res.keys()))
346 | N=N|(set(ercc_genes)-(P|N))
347 | TP=T&P&homos
348 | FP=(P&F)|(P&(T-homos))
349 | TN=F&N
350 | FN=T&N
351 | SN=len(TP)/float(len(TP)+len(FN)+0.0001)
352 | SP=len(TN)/float(len(TN)+len(FP)+0.0001)
353 | if SPs[-1]>0.7:
354 | SNs.append(SN)
355 | SPs.append(SP)
356 | SP_1=SPs[-1]
357 | SP_2=SPs[-2]
358 | SN_1=SNs[-1]
359 | SN_2=SNs[-2]
360 | SP=0.7
361 | SN=(SN_2-SN_1)/(SP_2-SP_1+0.0000001)*(SP-SP_1)+SN_1
362 | SNs[-1]=SN
363 | SPs[-1]=SP
364 | AUC_ERCC=metrics.auc(1-np.array(SPs),SNs)
365 |
366 |
367 | # In[105]:
368 |
369 | logFC_cutoff=0.5
370 | pval_cutof=0.05
371 | sns.set(style="white",font_scale=2)
372 | SPs_ERCC={}
373 | SNs_ERCC={}
374 | SNs=[0]
375 | SPs=[1]
376 | my_res=diff_g_res_ercc
377 | for thr in (np.arange(0,3,0.1).tolist()+range(3,100))[::-1]:
378 | pval_cutof=10**-thr
379 | ercc_genes=ercc_control.keys()
380 | T=set(filter(lambda x:abs(ercc_control[x]["logfc"])>0,ercc_genes))
381 | F=set(ercc_genes)-set(T)
382 | homos=set(filter(lambda x:sign(my_res[x][0])==sign(ercc_control[x]["logfc"]),my_res.keys()))
383 | P=set(filter(lambda x:my_res[x][1]<=pval_cutof,my_res.keys()))
384 | N=set(filter(lambda x:my_res[x][1]>pval_cutof,my_res.keys()))
385 | N=N|(set(ercc_genes)-(P|N))
386 | TP=T&P&homos
387 | FP=(P&F)|(P&(T-homos))
388 | TN=F&N
389 | FN=T&N
390 | SN=len(TP)/float(len(TP)+len(FN)+0.0001)
391 | SP=len(TN)/float(len(TN)+len(FP)+0.0001)
392 | SNs.append(SN)
393 | SPs.append(SP)
394 | plot(1-np.array(SPs),SNs)
395 | xlabel("FPR (1-Specificity)")
396 | ylabel("TPR (Sensitivity)")
397 | title("ROC analysis of ERCC genes",fontsize=18)
398 |
399 |
400 | # In[109]:
401 |
402 | sns.set(style="white",font_scale=1.5)
403 | my_data=DataFrame([["DESeq2+Salmon-SMEM",ercc_corr["spearman"][0],"Spearman rank correlation"],
404 | ["DESeq2+Salmon-SMEM",ercc_corr["RMSD"],"RMSD"],
405 | ["DESeq2+Salmon-SMEM",AUC_ERCC,"AUC-30"]]
406 | ,
407 | columns=["tool","score","Measure"])
408 | fig, axes = plt.subplots(1,3,figsize=(16,5))
409 | for iii,key in enumerate(["Spearman rank correlation","RMSD","AUC-30"]):
410 | ax=axes[iii]
411 | my_data_=my_data[my_data["Measure"]==key]
412 | my_data_=my_data_.sort_values(by='score', ascending=[1 if key=="RMSD" else 0])
413 |
414 | cg=sns.stripplot(y="tool", x="score",data=my_data_,size=10, hue="Measure", orient="h",edgecolor="gray",ax=ax)
415 | ax.set_ylabel("")
416 | ax.set_xlabel(key)
417 | ax.legend([])
418 | ax.xaxis.grid(False)
419 | if iii==0:
420 | ax.set_xticks(np.arange(0.55,0.95,.1))
421 | ax.set_xlim([0.55,0.9])
422 | elif iii==1:
423 | ax.set_xticks(np.arange(0.5,3.5,.5))
424 | ax.set_xlim([0.5,3])
425 | elif iii==2:
426 | ax.set_xticks(np.arange(0.05,0.3,.05))
427 | ax.set_xlim([0.05,0.25])
428 | ax.yaxis.grid(True)
429 | sns.despine(bottom=True)
430 | sns.despine(top=True)
431 | sns.despine(right=True)
432 | sns.despine(left=True)
433 | plt.tight_layout()
434 |
435 |
--------------------------------------------------------------------------------
/analysis_scripts/editing/README.md:
--------------------------------------------------------------------------------
1 | RNACocktail Editing Analysis
2 | ===========
3 |
4 | ### [Read it online here](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/bioinform/rnacocktail/master/analysis_scripts/editing/RNACocktail-Editing-Analysis.ipynb)
5 |
--------------------------------------------------------------------------------
/analysis_scripts/editing/RNACocktail-Editing-Analysis.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 | get_ipython().magic(u'pylab inline')
7 |
8 |
9 | # In[2]:
10 |
11 | import numpy as np
12 | import os
13 | import glob
14 | import pickle
15 | from operator import itemgetter
16 | from Bio import SeqIO
17 | import csv
18 | import scipy
19 | from scipy import stats
20 | import pybedtools
21 | from matplotlib_venn import venn3, venn3_circles,venn3_unweighted,venn2
22 | import seaborn as sns
23 | from pandas import DataFrame
24 | import matplotlib.patches as patches
25 |
26 |
27 | # # Initialization
28 |
29 | # In[3]:
30 |
31 | tool="HISAT2"
32 | sample="NA12878"
33 | callers="GATK"
34 | assemblers="StringTie"
35 | editor="GIREMI"
36 |
37 |
38 | varsim_jar="/path/to/VarSim.jar"
39 | NIST_HC_nonDB="/path/to/NIST_HC_nonDB.vcf"
40 | NIST_HC_vcf="/path/to/NIST_HC.vcf"
41 | b37_regions="/path/to/b37_regions"
42 | b37_rmask_bed="/path/to/b37.rmask.bed"
43 |
44 |
45 |
46 |
47 | # # Predictions
48 |
49 | # In[4]:
50 |
51 |
52 |
53 | pred_file="/path/to/giremi_out_good.txt.res"
54 | pred_file_pcnt_hidden={i:"/path/to/giremi_out_good_%s.txt.res"%i
55 | for i in range(10,110,10)}
56 |
57 |
58 | # # Functions
59 |
60 | # In[5]:
61 |
62 | def parse_giremi(outfile):
63 | preds=[]
64 | with open(outfile,"r") as csv_file:
65 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|')
66 | cnt=0
67 | for row in spamreader:
68 | if cnt==0:
69 | cnt=1
70 | continue
71 | preds.append(row)
72 | return preds
73 |
74 |
75 | # In[6]:
76 |
77 | def parse_ga(outfile):
78 | preds=[]
79 | llrs=[]
80 | with open(outfile,"r") as csv_file:
81 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|')
82 | cnt=0
83 | for row in spamreader:
84 | x=row
85 | if int(x[1])==int(x[5]) and int(x[1])==(int(x[4])+1) and (x[2]==x[9]) and (x[3]==x[10]):
86 | keys=x[11].split(":")
87 | vals=x[12].split(":")
88 | if "AD" not in keys:
89 | cnts=["1","1"]
90 | else:
91 | cnts=vals[keys.index("AD")].split(",")
92 | if int(cnts[0])==0:
93 | continue
94 | preds.append([x[0],x[1],x[2],x[3],x[6],x[7],cnts[0],cnts[1]])
95 | print len(preds)
96 | return preds
97 |
98 |
99 | # In[7]:
100 |
101 | def vcf_to_bed(vcf_file,all_otherfields=False,otherfields=[]):
102 | with open(vcf_file,"r") as csv_file:
103 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|')
104 | intervals=[]
105 | for row in spamreader:
106 | if row[0].startswith("#"):  # skip VCF header lines such as ##fileformat and #CHROM
107 | continue
108 | if all_otherfields:
109 | otherfields=range(2,len(row))
110 | if otherfields:
111 | intervals.append(pybedtools.Interval(row[0],int(row[1])-1,int(row[1]),otherfields=[row[i]
112 | for i in otherfields]))
113 | else:
114 | intervals.append(pybedtools.Interval(row[0],int(row[1])-1,int(row[1])))
115 | return pybedtools.BedTool(intervals)
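
# VCF positions are 1-based while BED intervals are 0-based and half-open,
# hence the (pos-1, pos) single-base interval built above; for example a
# variant at VCF position 1000 becomes:
example_interval = pybedtools.Interval("1", 999, 1000)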
116 |
117 |
118 | # In[8]:
119 |
120 | def find_etype(ref,alt,strand):
121 | revnt={"A":"T","C":"G","T":"A","G":"C"}
122 | if strand=="-":
123 | alt=revnt[alt]
124 | ref=revnt[ref]
125 | return ref+alt
126 |
127 |
128 | # In[9]:
129 |
130 | def find_er(ref,alt,strand,counts):
131 | revnt={"A":"T","C":"G","T":"A","G":"C"}
132 | id_n={"A":0,"C":1,"G":2,"T":3}
133 | counts=map(int,counts)
134 | if strand=="-":
135 | alt=revnt[alt]
136 | ref=revnt[ref]
137 | eratio=int(counts[id_n[alt]]/float(sum(counts))*100)
138 | return eratio
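
# Both helpers re-orient minus-strand mismatches into transcript coordinates,
# so a genomic T>C call on "-" counts as an A>G editing event; find_er
# additionally expects per-base read counts ordered A,C,G,T and returns the
# editing level as an integer percentage:
assert find_etype("T", "C", "-") == "AG"
assert find_er("T", "C", "-", ["9", "0", "3", "0"]) == 25  # 3 of 12 reads edited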
139 |
140 |
141 |
142 |
143 |
144 | # In[10]:
145 |
146 | revnt={"A":"T","C":"G","T":"A","G":"C"}
147 | def giremi_to_vcf(giremi_list,vcf_file):
148 | with open(vcf_file,"w") as csv_file:
149 | spamwriter = csv.writer(csv_file, delimiter='\t', quotechar='|')
150 | for x in giremi_list:
151 | if int(x[22])==0:
152 | continue
153 | ref,alt=x[17][0],x[17][1]
154 | strand=x[3]
155 | if strand=="-":
156 | alt=revnt[alt]
157 | ref=revnt[ref]
158 | spamwriter.writerow([x[1],x[2],".",ref,alt,".","PASS","."])
159 |
160 |
161 | # In[11]:
162 |
163 | revnt={"A":"T","C":"G","T":"A","G":"C"}
164 | def ga_to_vcf(ga_list,vcf_file):
165 | with open(vcf_file,"w") as csv_file:
166 | spamwriter = csv.writer(csv_file, delimiter='\t', quotechar='|')
167 | for x in ga_list:
168 | spamwriter.writerow([x[0],x[1],".",x[2],x[3],".","PASS","."])
169 |
170 |
171 | # In[12]:
172 |
173 | revnt={"A":"T","C":"G","T":"A","G":"C"}
174 |
175 |
176 | # In[13]:
177 |
178 | Alu_regions=pybedtools.BedTool(b37_rmask_bed
179 | ).filter(lambda x: "Alu" in x.name).merge().sort()
180 | print len(Alu_regions)
181 |
182 |
183 | # In[14]:
184 |
185 | reps=["repeats_b37_duplicates.bed","repeats_b37_Low_complexity.bed","repeats_b37_SINE.bed",
186 | "repeats_b37_duplicates_unique.bed", "repeats_b37_Satellite.bed", "repeats_b37_LINE.bed", "repeats_b37_Simple_repeat.bed"]
187 | rep_regions=pybedtools.BedTool([])
188 | for rep in reps:
189 | rep_regions=rep_regions.cat("%s/%s"%(b37_regions,rep))
190 | rep_regions=rep_regions.sort().merge()
191 |
192 |
193 | # In[15]:
194 |
195 | nonAlu_rep_regions=rep_regions.subtract(Alu_regions).sort()
196 |
197 |
198 | # In[16]:
199 |
200 | vcf_file="%s.vcf"%pred_file
201 | editor_pred=parse_giremi(pred_file)
202 | giremi_to_vcf(editor_pred,vcf_file)
203 | editor_bed=vcf_to_bed(vcf_file,all_otherfields=True)
204 | cmd="java -jar %s vcfcompare -true_vcf %s -prefix %s.NISTHCnonDB %s"%(varsim_jar,NIST_HC_nonDB,pred_file,vcf_file)
205 | if not os.path.exists("%s.NISTHCnonDB_TP.vcf"%(pred_file)):
206 | a=os.system(cmd)
207 | print cmd
208 | if a!=0:
209 | print a
210 |
211 |
212 | # In[17]:
213 |
214 | pred_edited={}
215 | edit_bed=pybedtools.BedTool([pybedtools.Interval(x[1],int(x[2])-1,int(x[2]),x[17],find_er(x[17][0],x[17][1],x[3],x[18:22]))
216 | for x in editor_pred if int(x[22])>0])
217 | for region,region_bed in [["Alu",Alu_regions],["nonAlu-reps",nonAlu_rep_regions],["nonreps",""],["all",""]]:
218 | if region in ["Alu","nonAlu-reps"]:
219 | my_edit_bed=edit_bed.window(region_bed,w=0,u=True)
220 | elif region=="nonreps":
221 | my_edit_bed=edit_bed.window(Alu_regions,w=0,v=True)
222 | my_edit_bed=my_edit_bed.window(nonAlu_rep_regions,w=0,v=True)
223 | elif region=="all":
224 | my_edit_bed=edit_bed.sort()
225 | edit_types=[x[3] for x in my_edit_bed]
226 | edit_ratios=[x[4] for x in my_edit_bed]
227 | vcf_file="%s.NISTHCnonDB_TP.vcf"%pred_file
228 | NIST_errors=len(vcf_to_bed(vcf_file))
229 | pred_edited[region]={
230 | "dist":{etype:edit_types.count(etype) for etype in set(edit_types)},
231 | "ratio":edit_ratios,
232 | "types":edit_types,
233 | "errors":NIST_errors
234 | }
235 |
236 |
237 | # In[18]:
238 |
239 | sns.set(style="white",font_scale=1.5)
240 | colors=[4]
241 | nt=["A","C","G","T"]
242 | etypes=[]
243 | for i in nt:
244 | for j in nt:
245 | if i!=j:
246 | etypes.append(i+j)
247 | rgn_name={"Alu": "Alu","nonAlu-reps":"Repetitive non-Alu","nonreps":"Nonrepetitive"}
248 | bin_labels=[r"A$\rightarrow$G",r"T$\rightarrow$C",r"C$\rightarrow$T",r"G$\rightarrow$A","Other Mismatches"]
249 | my_palette=sns.color_palette("Set3",n_colors=10)
250 | fig, ax = plt.subplots(figsize=(9,1.4))
251 | res=[]
252 | labels=[]
253 | n={}
254 | for rrr,rgn in enumerate(["Alu","nonAlu-reps","nonreps"]):
255 | my_dist=pred_edited
256 | if set(my_dist[rgn]["dist"].keys())-set(etypes):
257 | raise ValueError("unexpected edit type outside the 12 possible mismatches")
258 | z=[my_dist[rgn]["dist"][k] if k in
259 | my_dist[rgn]["dist"] else 0
260 | for k in etypes]
261 |
262 | sz=sum(z)+0.000001
263 | z=map(lambda x:round(x/float(sz),4)*100,z)
264 | z=[z[1],z[10],z[5],z[6],z[0]+sum(z[2:5])+sum(z[7:10])+z[11]]
265 | res_bin=[sum(z),sum(z[:4]),sum(z[:3]),sum(z[:2]),z[0]]
266 | res.append(res_bin)
267 | label="%s: %s"%(rgn_name[rgn], tool.replace("Tophat","TopHat"))
268 | n[label]=int(sz)
269 | labels.append(label)
270 | my_data=DataFrame(np.array(res),index=labels,columns=bin_labels)
271 | for ii,b in enumerate(bin_labels):
272 | cg=sns.barplot(data=my_data,x=b,y=labels,label=b, color=my_palette[ii],ax=ax)
273 | for ii,label in enumerate(labels):
274 | ax.text(101,ii+.25,"%d,%03d"%(n[label]/1000,n[label]%1000) if n[label]>=1000 else n[label] ,fontsize=12)
275 | ax.set_xlabel("% of Edits")
276 | ax.set_xlim([0,100])
277 | sns.despine(left=True)
278 | handles, labels = ax.get_legend_handles_labels()
279 | # reverse the order
280 | ax.legend(handles[::-1], labels,bbox_to_anchor=(1.2, 0.7, .5, .3),
281 | loc=0,ncol=1,
282 | mode="expand", borderaxespad=0.,frameon=False,fontsize=12)
283 |
284 |
285 | # In[19]:
286 |
287 | nist_editor_pred={}
288 | nist_editor_bed={}
289 | nist_editor_out={}
290 | for x in range(0,110,10):
291 | if x==0:
292 | path=pred_file
293 | else:
294 | path=pred_file_pcnt_hidden[x]
295 | vcf_file="%s.vcf"%path
296 | nist_editor_out[x]=path
297 | nist_editor_pred[x]=parse_giremi(path)
298 | giremi_to_vcf(nist_editor_pred[x],vcf_file)
299 | nist_editor_bed[x]=vcf_to_bed(vcf_file,all_otherfields=True)
300 | cmd="java -jar %s vcfcompare -true_vcf %s -prefix %s.NISTHC_%s %s"%(varsim_jar,NIST_HC_vcf,path,x,vcf_file)
301 | if not os.path.exists("%s.NISTHC_%s_TP.vcf"%(path,x)):
302 | a=os.system(cmd)
303 | print cmd
304 | if a!=0:
305 | print a
306 |
307 |
308 | # In[20]:
309 |
310 | FDR={"all_calls":[],"FPs":[],"FDR":[],"AG":[]}
311 | for x in range(0,110,10):
312 | g=nist_editor_out[x]
313 | vcf_file="%s.NISTHC_%s_TP.vcf"%(g,x)
314 | NIST_errors=len(vcf_to_bed(vcf_file))
315 | all_calls=len(vcf_to_bed(vcf_file="%s.vcf"%g))
316 | fdr=NIST_errors/float(all_calls)*100
317 | FDR["all_calls"].append(all_calls)
318 | FDR["FPs"].append(NIST_errors)
319 | FDR["FDR"].append(fdr)
320 | edit_bed=pybedtools.BedTool([pybedtools.Interval(w[1],int(w[2])-1,int(w[2]),w[17],find_er(w[17][0],w[17][1],w[3],w[18:22]))
321 | for w in nist_editor_pred[x] if int(w[22])>0])
322 | edit_types=[w[3] for w in edit_bed]
323 | dist={etype:edit_types.count(etype) for etype in set(edit_types)}
324 | FDR["AG"].append((dist["AG"])/float((all_calls))*100)
325 |
326 |
327 | # In[21]:
328 |
329 | sns.set(style="white",font_scale=1.5)
330 | fig, axes = plt.subplots(1,3,figsize=(18,5))
331 | hiddens=range(0,110,10)
332 | for iii,key in enumerate(["FDR","all_calls","AG"]):
333 | ax=axes[iii]
334 | rects1 = ax.plot(hiddens,FDR[key],alpha=0.8,
335 | label="%s: %s"%(editor,tool), linewidth=3)
336 | ax.set_xticks(range(0,110,10))
337 | ax.set_xlabel("Proportion of hidden SNPs (%)")
338 | if key=="FDR":
339 | ax.set_yticks(range(0,50,10))
340 | ax.set_ylim([0,40])
341 | ax.set_ylabel(r"FDR (%)")
342 | elif key=="all_calls":
343 | ax.set_yticks(range(0,9000,2000))
344 | ax.set_ylim([0,8000])
345 | ax.set_ylabel(r"Number of predicted RNA editing events")
346 | elif key=="AG":
347 | ax.set_yticks(range(75,105,5))
348 | ax.set_ylim([75,100])
349 | ax.set_ylabel(r"Proportion of A$\rightarrow$G events (%)")
350 |
351 | ax.legend(bbox_to_anchor=(0.4,1.1, 2, .102), loc=1,ncol=1,
352 | mode="expand", borderaxespad=0.,frameon=False,fontsize=18)
353 | plt.tight_layout()
354 |
355 |
356 | # In[22]:
357 |
358 | sns.set(style="white",font_scale=1.5)
359 | fig, ax = plt.subplots(figsize=(10,8))
360 | levels=np.arange(0,100,10)
361 | my_dist=pred_edited
362 | etypes=my_dist['all']["types"]
363 | ratios=np.array(map(int,my_dist['all']["ratio"]))
364 | E=[]
365 | for level in levels:
366 | es=[etypes[i] for i in range(len(ratios)) if ratios[i]>level]
367 | E.append((es.count("AG")+es.count("TC"))/float(len(es)+0.00001)*100)
368 | rects1 = ax.plot(levels,E, alpha=0.8,
369 | label="%s: %s"%(editor,tool), linewidth=3)
370 | ax.set_ylim([40,100])
371 | ax.set_xlabel("Minimum editing level (%)")
372 | ax.set_ylabel(r"Proportion of A$\rightarrow$G/T$\rightarrow$C events (%)")
373 | plt.tight_layout()
374 | ax.legend(bbox_to_anchor=(-.1, 1.15, 1.2, .102), loc=2,ncol=4,
375 | mode="expand", borderaxespad=0.,frameon=False,fontsize=12)
376 |
377 |
378 | # In[ ]:
379 |
380 |
381 |
382 |
--------------------------------------------------------------------------------
/analysis_scripts/fusion/README.md:
--------------------------------------------------------------------------------
1 | RNACocktail Fusion Analysis
2 | ===========
3 |
4 | ### [Read it online here](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/bioinform/rnacocktail/master/analysis_scripts/fusion/RNACocktail-Fusion-Analysis.ipynb)
5 |
--------------------------------------------------------------------------------
/analysis_scripts/fusion/RNACocktail-Fusion-Analysis.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 | get_ipython().magic(u'pylab inline')
7 |
8 |
9 | # In[2]:
10 |
11 | import numpy as np
12 | import os
13 | import glob
14 | import pickle
15 | from operator import itemgetter
16 | from Bio import SeqIO
17 | import csv
18 | import scipy
19 | from scipy import stats
20 | import pybedtools
21 | from matplotlib_venn import venn3, venn3_circles,venn3_unweighted,venn2
22 | import seaborn as sns
23 | from pandas import DataFrame
24 |
25 |
26 | # # Initialization
27 |
28 | # In[11]:
29 |
30 | tools=["IDP-fusion","FusionCatcher"]
31 | sample="MCF7"
32 |
33 | gencode_gtf="/path/to/gencode.v19.annotation.gtf"
34 | gold_set="/path/to/idp_gold_set.txt"
35 |
36 |
37 | # # Prediction
38 |
39 | # In[4]:
40 |
41 | pred_file={}
42 |
43 |
44 | pred_file["FusionCatcher"]="/path/to/final-list_candidate-fusion-genes.txt"
45 | pred_file["IDP-fusion"]="/path/to/preds.txt"
46 |
47 |
48 | # # Functions
49 |
50 | # In[5]:
51 |
52 | def parse_fusion(predfile,tool):
53 | preds=[]
54 |
55 | if tool=="FusionCatcher":
56 | with open(predfile,"r") as csv_file:
57 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|')
58 | cnt=0
59 | for row in spamreader:
60 | if cnt==0:
61 | cnt+=1
62 | continue
63 | if cnt==1:
64 | preds.append([row[0],row[1],row[8].split(":")[0],
65 | row[8].split(":")[1],row[9].split(":")[0],row[9].split(":")[1]])
66 | elif tool=="IDP-fusion":
67 | with open(predfile,"r") as csv_file:
68 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|')
69 | cnt=0
70 | for row in spamreader:
71 | if cnt==0:
72 | cnt+=1
73 | continue
74 | if cnt==1:
75 | if not row[0]:
76 | continue
77 | preds.append([row[0].split("-")[0],row[0].split("-")[1],
78 | row[9].split("chr")[1],
79 | row[10],
80 | row[13].split("chr")[1],
81 | row[14]
82 | ])
83 | else:
84 | print "NO file ", tool
85 |
86 |
87 | Fs=set([])
88 | nonredundant_preds=[]
89 | for pred in preds:
90 | g1,g2=pred[0:2]
91 | fs="%s:%s"%(g1,g2)
92 | if fs not in Fs:
93 | nonredundant_preds.append(pred)
94 | Fs.add(fs)
95 |
96 | return nonredundant_preds
97 |
98 |
99 |
100 | # In[6]:
101 |
102 | def parse_gold(goldfile):
103 | gs=[]
104 | with open(goldfile,"r") as csv_file:
105 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|')
106 | for row in spamreader:
107 | gs.append(row)
108 |
109 | genes=set([x for w in gs for x in w])
110 | coord={}
111 | for gene in genes:
112 | if gene[0:3]=="chr":
113 | c=gene.split(":")[0][3:]
114 | p1=gene.split(":")[1].split("-")[0]
115 | p2=gene.split(":")[1].split("-")[1] if "-" in gene else str(int(p1)+1)
116 | coord[gene]=[c,p1,p2]
117 |
118 | with open(gencode_gtf,"r") as csv_file:
119 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|')
120 | for row in spamreader:
121 | if row[0][0]=="#":
122 | continue
123 | if row[2]=="gene":
124 | gene_info = {k.split()[0]:k.split()[1][1:-1] for k in ' '.join(row[8:]).split(";")[:-1]}
125 | name=gene_info["gene_name"]
126 | if name in genes:
127 | if name in coord:
128 | print "DUP",name
129 | coord[name]=[row[0],row[3],row[4]]
130 |
131 | gs=map(lambda x:x+coord[x[0]]+coord[x[1]],gs)
132 | return gs,coord
133 |
134 | gs,coord=parse_gold(gold_set)
135 | genes_gs=set([x for w in gs for x in w[0:2]])
136 | gs_dict={}
137 | for g in gs:
138 | if g[0] not in gs_dict:
139 | gs_dict[g[0]]={}
140 | if g[1] not in gs_dict[g[0]]:
141 | gs_dict[g[0]][g[1]]=[]
142 | gs_dict[g[0]][g[1]].append(g[2:])
143 | if g[1] not in gs_dict:
144 | gs_dict[g[1]]={}
145 | if g[0] not in gs_dict[g[1]]:
146 | gs_dict[g[1]][g[0]]=[]
147 | gs_dict[g[1]][g[0]].append(g[2:])  # symmetric entry for the reversed gene pair
148 |
149 | intervals=[]
150 | processed_gs=set([])
151 | for g in gs:
152 | if g[0] not in processed_gs:
153 | intervals.append(pybedtools.Interval(chrom=g[2],start=int(g[3]),end=int(g[4]),name=g[0]))
154 | processed_gs.add(g[0])
155 | if g[1] not in processed_gs:
156 | intervals.append(pybedtools.Interval(chrom=g[5],start=int(g[6]),end=int(g[7]),name=g[1]))
157 | processed_gs.add(g[1])
158 |
159 | gs_bed=pybedtools.BedTool(intervals).sort()
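
# parse_gold pulls gene coordinates out of GENCODE's 9th GTF column, which
# holds ';'-separated key "value" attribute pairs; a minimal sketch of that
# dict comprehension on a hypothetical attribute string:
attrs = 'gene_id "ENSG00000000001.1"; gene_name "BCAS3";'
gene_info_example = {k.split()[0]: k.split()[1][1:-1] for k in attrs.split(";")[:-1]}
assert gene_info_example["gene_name"] == "BCAS3"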
160 |
161 |
162 |
163 | # In[7]:
164 |
165 | def evaluate(pred):
166 | tp=0
167 | fp=0
168 | for fusion in pred:
169 | g1,g2,c1,p1,c2,p2=fusion
170 | if g1 not in genes_gs:
171 | my_bed1=pybedtools.BedTool([pybedtools.Interval(chrom=c1,start=int(p1),end=int(p1)+1,name=g1)])
172 | matches1=my_bed1.window(gs_bed,w=0)
173 | if len(matches1)>1:
174 | raise ValueError("breakpoint overlaps more than one gold-set gene")
175 | elif len(matches1)==1:
176 | g1=matches1[0][9]
177 | else:
178 | fp+=1
179 | continue
180 |
181 | if g2 not in genes_gs:
182 | my_bed2=pybedtools.BedTool([pybedtools.Interval(chrom=c2,start=int(p2),end=int(p2)+1,name=g2)])
183 | matches2=my_bed2.window(gs_bed,w=0)
184 | if len(matches2)>1:
185 |                 raise ValueError("gene %s overlaps multiple gold-set genes"%g2)
186 | elif len(matches2)==1:
187 | g2=matches2[0][9]
188 | else:
189 | fp+=1
190 | continue
191 |
192 |
193 | if g2 in gs_dict[g1]:
194 | tp+=1
195 | else:
196 | fp+=1
197 |
198 |     print fp,tp,len(pred)-fp-tp  # FP, TP, remainder (sanity check; should be 0)
199 | return fp,tp
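# evaluate() first tries to rescue gene symbols absent from the gold set by
# intersecting the reported breakpoint with the gold-set gene intervals in
# gs_bed; a call counts as a TP only if the (possibly rescued) pair exists in
# gs_dict. The counts feed the summary below: PR = TP/(TP+FP) (plus a tiny
# epsilon) and SN = TP/len(gs).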
200 |
201 |
202 | # In[8]:
203 |
204 | preds={}
205 | for tool in tools:
206 | pred=parse_fusion(pred_file[tool],tool)
207 | preds[tool]=pred
208 |
209 |
210 | # In[9]:
211 |
212 | performance={}
213 | for tool in tools:
214 | fp,tp=evaluate(preds[tool])
215 | performance[tool]={"FP":fp,"TP":tp,"PR":tp/float(tp+fp+0.0001),"SN":tp/float(len(gs))}
216 | print tool,performance[tool]
217 |
218 |
219 | # In[10]:
220 |
221 | sns.set(style="white",font_scale=2)
222 | fig, ax = plt.subplots(figsize=(6,6))
223 | x=[]
224 | y=[]
225 | for tool in tools:
226 | x=(performance[tool]["SN"]*100)
227 | y=(performance[tool]["PR"]*100)
228 | label=tool
229 |     ax.plot(x, y, label=label, linestyle="",
230 |             marker="o", markersize=25)
231 | ax.set_yticks(range(0,70,10))
232 | ax.set_ylim([0,60])
233 | ax.set_xticks(range(20,55,5))
234 | ax.set_xlim([20,50])
235 | ax.set_xlabel("Sensitivity (%)")
236 | ax.set_ylabel("Precision (%)")
237 |
238 | ax.legend(bbox_to_anchor=(1, 0.8, 1.1, 0.1), loc=1,ncol=1,
239 | mode="expand", borderaxespad=0.,frameon=False)
240 |
241 |
242 | # In[ ]:
243 |
244 |
245 |
246 |
247 | # In[ ]:
248 |
249 |
250 |
251 |
--------------------------------------------------------------------------------
/analysis_scripts/quantification/README.md:
--------------------------------------------------------------------------------
1 | RNACocktail Quantification Analysis
2 | ===========
3 |
4 | ### [Read it online here](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/bioinform/rnacocktail/master/analysis_scripts/quantification/RNACocktail-Quant-Analysis.ipynb)
5 |
--------------------------------------------------------------------------------
/analysis_scripts/quantification/RNACocktail-Quant-Analysis.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 | get_ipython().magic(u'pylab inline')
7 |
8 |
9 | # In[2]:
10 |
11 | import numpy as np
12 | import os
13 | import glob
14 | import pickle
15 | from operator import itemgetter
16 | from Bio import SeqIO
17 | import csv
18 | import scipy
19 | from scipy import stats
20 | import statsmodels.api as sm
21 | import seaborn as sns
22 |
23 |
24 | # # Initialization
25 |
26 | # In[3]:
27 |
28 | housekeeping_file="/path/to/housekeeping.txt"
29 |
30 | samples=["SEQC_A1","SEQC_A2"]
31 |
32 |
33 | # # Predictions
34 |
35 | # In[4]:
36 |
37 | quant_file={}
38 |
39 | quant_file["SEQC_A1"]="/path/to/quant.sf"
40 | quant_file["SEQC_A2"]="/path/to/quant.sf"
41 |
42 |
43 | # # Functions
44 |
45 | # In[5]:
46 |
47 | def parse_quant_results(res_file):
48 | mat=[]
49 | with open(res_file, 'r') as csv_f:
50 | spamreader = csv.reader(csv_f, delimiter='\t', quotechar='|')
51 | cnt=-1
52 | for row in spamreader:
53 | cnt+=1
54 | if cnt==0:
55 | continue
56 | mat.append([row[0],int(row[1]),float(row[2]),float(row[4]),float(row[3])])
57 | return mat
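# Assumes Salmon's quant.sf column layout (Name, Length, EffectiveLength, TPM,
# NumReads): each entry becomes [name, length, eff_length, num_reads, TPM], so
# the w[4] used below is TPM, later normalized by the housekeeping median.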
58 |
59 |
60 | # In[6]:
61 |
62 | def find_corr(x,y):
63 | corr={}
64 | [r,p]=scipy.stats.spearmanr(np.log2(x), np.log2(y))
65 | corr['spearman_log']=[r,p]
66 | return corr
67 |
68 |
69 |
70 | # In[7]:
71 |
72 | with open(housekeeping_file) as csv_file:
73 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|')
74 | hk_genes=[]
75 | hk_transcripts=[]
76 | for row in spamreader:
77 | hk_genes.append(row[0])
78 | hk_transcripts.append(row[1])
79 | hk_genes=set(hk_genes)
80 | hk_transcripts=set(hk_transcripts)
81 | print len(hk_genes),len(hk_transcripts)
82 |
83 |
84 | # ## Read Assembly transcripts
85 |
86 | # In[8]:
87 |
88 | quant_stats={}
89 | median_hk_trans={}
90 | for sample in samples:
91 | quant_stats[sample]=parse_quant_results(quant_file[sample])
92 | x_dict={w[0]:w[4] for w in quant_stats[sample] if w[0]}
93 | hks=[x_dict[k] for k in set(x_dict.keys())&hk_transcripts]
94 | median_hk_trans[sample]=np.median(hks)
95 |
96 |
97 | # ## Plots
98 |
99 | # In[9]:
100 |
101 | pscnt=1
102 | sample1,sample2=samples
103 | labels=[]
104 | my_data=[]
105 | res1=quant_stats[sample1]
106 | res2=quant_stats[sample2]
107 | x_dict={w[0]:w[4]/float(median_hk_trans[sample1]) for w in res1}
108 | y_dict={w[0]:w[4]/float(median_hk_trans[sample2]) for w in res2}
109 | keys=list(set(x_dict.keys())|set(y_dict.keys()))
110 | keys=filter(lambda x:"ENST" in x,keys)
111 | x=np.array(map(lambda w:x_dict[w] if w in x_dict else 0,keys))
112 | y=np.array(map(lambda w:y_dict[w] if w in y_dict else 0,keys))
113 | x=x+0.5
114 | y=y+0.5
115 | f4=find((np.multiply((y>pscnt),(x>pscnt)))
116 | +(np.multiply((y>pscnt),(x<=pscnt)))
117 | +(np.multiply((y<=pscnt),(x>pscnt))))
118 | w=(np.log2(x[f4])-np.log2(y[f4]))
119 | w=filter(lambda x: abs(x)>=0.000, w)  # threshold 0.0 keeps all values; raise it to impose a minimum |log2FC|
120 | logfc_data=w
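# f4 keeps transcripts whose normalized abundance (after the 0.5 pseudocount)
# exceeds pscnt in at least one replicate; logfc_data holds their log2
# replicate-to-replicate ratios, which the violin plot below summarizes.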
121 |
122 |
123 | # In[10]:
124 |
125 | import seaborn as sns
126 | sns.set(style="whitegrid",font_scale=2)
127 | fig, ax = plt.subplots(figsize=(1,4))
128 | my_data=logfc_data
129 | cg=sns.violinplot(data=my_data, palette="Set3" , bw=0.2, cut=10,
130 | linewidth=1,scale="area",inner="quartile",saturation=0.75,gridsize=500)
131 | ax.set_xticklabels(labels,rotation=90)
132 | ax.set_ylim([-1.5,1.5])
133 | ax.set_yticks(np.arange(-1,2,1))
134 | sns.despine(left=True, bottom=True)
135 | ax.set_title("Percentage of expression \n disagreement between replicates",fontsize=12)
136 |
137 |
138 | # In[11]:
139 |
140 | miss_diff_pscnt={}
141 | sample1,sample2=samples
142 | res1=quant_stats[sample1]
143 | res2=quant_stats[sample2]
144 | x_dict={w[0]:w[4]/float(median_hk_trans[sample1]) for w in res1}
145 | y_dict={w[0]:w[4]/float(median_hk_trans[sample2]) for w in res2}
146 | keys=list(set(x_dict.keys())|set(y_dict.keys()))
147 | keys=filter(lambda x:"ENST" in x,keys)
148 | x=np.array(map(lambda w:x_dict[w] if w in x_dict else 0,keys))
149 | y=np.array(map(lambda w:y_dict[w] if w in y_dict else 0,keys))
150 | for pscnt in np.arange(0,5,0.1):
151 | zz=find((np.multiply((y<=pscnt),(x<=pscnt))))
152 | gg=find((np.multiply((y>pscnt),(x>pscnt)))+
153 | (np.multiply((y>pscnt),(x<=pscnt)))+
154 | (np.multiply((x>pscnt),(y<=pscnt))))
155 | lfc=(np.log2(x[gg]+0.5)-np.log2(y[gg]+0.5))
156 | miss_diff_pscnt[pscnt]=[sum(np.multiply(abs(lfc)>1,lfc>=0)),
157 | sum(np.multiply(abs(lfc)>1,lfc<0)),
158 | sum(abs(lfc)<=1),
159 | len(zz)
160 | ]
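# For each exclusion threshold pscnt, the four counts stored are:
# [log2FC > 1, log2FC < -1, |log2FC| <= 1, excluded (both replicates <= pscnt)];
# the next cell plots % excluded against % expression disagreement.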
161 |
162 |
163 | # In[12]:
164 |
165 | sns.set(style="white",font_scale=2)
166 | fig, ax = plt.subplots(figsize=(14,8))
167 | x=[]
168 | y=[]
169 | for pscnt in np.arange(0,5,0.1):
170 | md=miss_diff_pscnt[pscnt]
171 | x.append(md[3]/float(sum(md))*100)
172 | y.append(sum(md[0:2])/float(sum(md[0:4]))*100)
173 | ax.plot(x,y, alpha=0.8,linewidth=2)
174 | ax.set_xlabel("% Excluded")
175 | ax.set_ylabel("% Expression disagreement")
176 | ax.spines['top'].set_visible(False)
177 | ax.spines['right'].set_visible(False)
178 | ax.get_xaxis().tick_bottom()
179 | ax.get_yaxis().tick_left()
180 |
181 |
--------------------------------------------------------------------------------
/analysis_scripts/reconstruction/README.md:
--------------------------------------------------------------------------------
1 | RNACocktail Transcriptome Reconstruction Analysis
2 | ===========
3 |
4 | ### [Read it online here](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/bioinform/rnacocktail/master/analysis_scripts/reconstruction/RNACocktail-Reconstruction-Analysis.ipynb)
5 |
--------------------------------------------------------------------------------
/analysis_scripts/variant/README.md:
--------------------------------------------------------------------------------
1 | RNACocktail Variant Analysis
2 | ===========
3 |
4 | ### [Read it online here](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/bioinform/rnacocktail/master/analysis_scripts/variant/RNACocktail-Variant-Analysis.ipynb)
5 |
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:18.04
2 |
3 |
4 | ENV RNACOCKTAIL_VERSION 0.3.2
5 | ENV R_VERSION 3.6.1-3bionic
6 | ENV DEBIAN_FRONTEND noninteractive
7 | ENV DEBCONF_NONINTERACTIVE_SEEN true
8 | ENV SAMTOOLS_VERSION 1.2
9 | ENV BEDTOOLS2_VERSION 2.29.0
10 | ENV PYBEDTOOLS_VERSION 0.8.0
11 | ENV PYSAM_VERSION 0.15.0
12 | ENV HISAT2_VERSION 2.1.0
13 | ENV STRINGTIE_VERSION 2.0.4
14 | ENV SALMON_VERSION 0.11.0
15 | ENV OASES_VERSION 0.2.09
16 | ENV VELVET_VERSION 1.2.10
17 | ENV SUBREAD_VERSION 2.0.0
18 | ENV LORDEC_VERSION 0.9
19 | ENV STAR_VERSION 2.7.2b
20 | ENV PICARD_VERSION 2.19.0
21 | ENV HTSLIB_VERSION 1.9
22 | ENV GIREMI_VERSION 0.2.1
23 | ENV BIOPYTHON_VERSION 1.74
24 | ENV OPENPYXL_VERSION 2.6.4
25 | ENV XLRD_VERSION 1.1.0
26 | ENV BOWTIE_VERSION 1.2.3
27 | ENV BOWTIE2_VERSION 2.3.5.1
28 | ENV BWA_VERSION 0.7.17
29 | ENV SRA_VERSION 2.9.6
30 | ENV COREUTILS_VERSION 8.27
31 | ENV PIGZ_VERSION 2.4
32 | ENV GMAP_VERSION 2019-09-12
33 | ENV BBMAP_VERSION 38.44
34 | ENV FUSIONCATCHER_VERSION 1.20
35 | ENV GFFREAD_VERSION 0.11.5
36 | ENV IDPFUSION_VERSION 1.1.1
37 | ENV GATK_VERSION 4.1.4.0
38 |
39 | RUN apt-get update && \
40 | apt-get install -y --fix-missing build-essential zlib1g-dev unzip libncurses5-dev curl wget python python-pip python-dev cmake libboost-all-dev libxml2-dev libcurl4-gnutls-dev software-properties-common apt-transport-https default-jre default-jdk less vim libtbb-dev git tabix
41 |
42 | RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
43 | RUN add-apt-repository 'deb [arch=amd64,i386] https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/'
44 | RUN apt-get update
45 | RUN apt-get install -y --fix-missing r-base=${R_VERSION} r-recommended=${R_VERSION}
46 | RUN apt-get install -y --fix-missing --allow-downgrades r-base-core=${R_VERSION}
47 |
48 | RUN echo 'local({r <- getOption("repos"); r["CRAN"] <- "http://cran.r-project.org"; options(repos=r)})' > ~/.Rprofile
49 | RUN R -e 'install.packages("BiocManager"); BiocManager::install(); BiocManager::install("DESeq2"); BiocManager::install("tximport"); BiocManager::install("readr");'
50 |
51 | ADD https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 /opt/samtools-${SAMTOOLS_VERSION}.tar.bz2
52 | RUN cd /opt && tar -xjvf samtools-${SAMTOOLS_VERSION}.tar.bz2 && cd samtools-${SAMTOOLS_VERSION} && make && make install && cd /opt && rm -rf samtools*
53 |
54 | ADD https://github.com/arq5x/bedtools2/releases/download/v${BEDTOOLS2_VERSION}/bedtools-${BEDTOOLS2_VERSION}.tar.gz /opt/bedtools-${BEDTOOLS2_VERSION}.tar.gz
55 | RUN cd /opt && tar -zxvf bedtools-${BEDTOOLS2_VERSION}.tar.gz && cd bedtools2 && make && make install && cd /opt && rm -rf bedtools*
56 |
57 | RUN wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-${HISAT2_VERSION}-Linux_x86_64.zip -O /opt/hisat2-${HISAT2_VERSION}-Linux_x86_64.zip && cd /opt && unzip hisat2-${HISAT2_VERSION}-Linux_x86_64.zip && cp -p /opt/hisat2-${HISAT2_VERSION}/hisat2* /usr/local/bin && cd /opt && rm -rf hisat2*
58 |
59 | ADD https://github.com/gpertea/stringtie/archive/v${STRINGTIE_VERSION}.tar.gz /opt/stringtie-${STRINGTIE_VERSION}.Linux_x86_64.tar.gz
60 | RUN cd /opt && tar -zxvf stringtie-${STRINGTIE_VERSION}.Linux_x86_64.tar.gz && cd stringtie-${STRINGTIE_VERSION} && make && cp -p /opt/stringtie-${STRINGTIE_VERSION}/stringtie /usr/local/bin && cd /opt && rm -rf stringtie*
61 |
62 | ADD https://github.com/COMBINE-lab/salmon/releases/download/v${SALMON_VERSION}/salmon-${SALMON_VERSION}-linux_x86_64.tar.gz /opt/salmon-${SALMON_VERSION}-linux_x86_64.tar.gz
63 | RUN cd /opt && tar -zxvf salmon-${SALMON_VERSION}-linux_x86_64.tar.gz && cp -p /opt/salmon-*/bin/salmon /usr/local/bin && cp -p /opt/salmon-*/lib/* /usr/local/lib && cd /opt && rm -rf salmon*
64 |
65 | ADD https://github.com/dzerbino/oases/archive/${OASES_VERSION}.tar.gz /opt/${OASES_VERSION}.tar.gz
66 | RUN cd /opt && tar -zxvf ${OASES_VERSION}.tar.gz && rm -rf /opt/oases-${OASES_VERSION}/velvet /opt/${OASES_VERSION}.tar.gz
67 |
68 | ADD https://www.ebi.ac.uk/~zerbino/velvet/velvet_${VELVET_VERSION}.tgz /opt/velvet_${VELVET_VERSION}.tgz
69 | RUN cd /opt && tar -zxvf velvet_${VELVET_VERSION}.tgz && cd velvet_${VELVET_VERSION} && make OPENMP=1 && mv /opt/velvet_${VELVET_VERSION} /opt/oases-${OASES_VERSION}/velvet && cd /opt/oases-${OASES_VERSION} && make OPENMP=1 && cp -p /opt/oases-${OASES_VERSION}/oases /usr/local/bin && cp -p /opt/oases-${OASES_VERSION}/velvet/velvet* /usr/local/bin && rm -rf /opt/velvet_${VELVET_VERSION}.tgz
70 | RUN rm -rf /opt/oases-${OASES_VERSION}/velvet/* && rm -rf /opt/oases-${OASES_VERSION}/velvet/.gitignore && rm -rf /opt/oases-${OASES_VERSION}/* && rm -rf /opt/oases*
71 |
72 | RUN wget http://downloads.sourceforge.net/project/subread/subread-${SUBREAD_VERSION}/subread-${SUBREAD_VERSION}-Linux-x86_64.tar.gz -O /opt/subread-${SUBREAD_VERSION}-Linux-x86_64.tar.gz && cd /opt && tar -zxvf subread-${SUBREAD_VERSION}-Linux-x86_64.tar.gz && cp -p /opt/subread-${SUBREAD_VERSION}-Linux-x86_64/bin/featureCounts /usr/local/bin && cd /opt && rm -rf subread*
73 |
74 | ADD https://gite.lirmm.fr/lordec/lordec-releases/uploads/710113d83c210b6989ccfbdbafa89234/lordec-bin_${LORDEC_VERSION}_linux64.tar.bz2 /opt/lordec-bin_${LORDEC_VERSION}_linux64.tar.bz2
75 | RUN cd /opt && tar xjf lordec-bin_${LORDEC_VERSION}_linux64.tar.bz2 && cd lordec-bin_${LORDEC_VERSION}_linux64 && cp -p /opt/lordec-bin_${LORDEC_VERSION}_linux64/lordec* /usr/local/bin && chmod -R 777 /usr/local/bin/lordec* && chown -R root /usr/local/bin/lordec* && chgrp -R root /usr/local/bin/lordec* && cd /opt && rm -rf lordec*
76 |
77 | ADD https://github.com/alexdobin/STAR/archive/${STAR_VERSION}.tar.gz /opt/STAR_${STAR_VERSION}.tar.gz
78 | RUN cd /opt && tar -zxvf STAR_${STAR_VERSION}.tar.gz && cp -p /opt/STAR-${STAR_VERSION}/bin/Linux_x86_64_static/* /usr/local/bin && cd /opt && rm -rf STAR*
79 |
80 | ADD https://github.com/broadinstitute/picard/releases/download/${PICARD_VERSION}/picard.jar /opt/picard.jar
81 | RUN cd /opt && cp -p picard.jar /usr/local/bin && chmod 755 /usr/local/bin/picard.jar && cd /opt && rm -rf picard*
82 |
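# Note: the ENV below redefines HTSLIB_VERSION (1.9 above); the htslib 1.3 tree
# built next is kept under /opt, presumably for GIREMI, which needs htslib at runtime.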
83 | ENV HTSLIB_VERSION 1.3
84 |
85 | ADD https://github.com/samtools/htslib/releases/download/${HTSLIB_VERSION}/htslib-${HTSLIB_VERSION}.tar.bz2 /opt/htslib-${HTSLIB_VERSION}.tar.bz2
86 | RUN cd /opt && tar xjf htslib-${HTSLIB_VERSION}.tar.bz2 && cd htslib-${HTSLIB_VERSION} && ./configure && make && rm -rf /opt/htslib-${HTSLIB_VERSION}.tar.bz2
87 |
88 | ADD https://github.com/zhqingit/giremi/archive/v${GIREMI_VERSION}.tar.gz /opt/giremi-${GIREMI_VERSION}.tar.gz
89 | RUN cd /opt && tar -zxvf giremi-${GIREMI_VERSION}.tar.gz && cp -p giremi-${GIREMI_VERSION}/giremi* /usr/local/bin && chmod -R 777 /usr/local/bin/giremi* && cd /opt && rm -rf giremi-*
90 |
91 | RUN pip install --upgrade pip
92 | RUN pip install pybedtools==${PYBEDTOOLS_VERSION} pysam==${PYSAM_VERSION} biopython==${BIOPYTHON_VERSION} openpyxl==${OPENPYXL_VERSION} xlrd==${XLRD_VERSION} numpy pandas scipy
93 |
94 | RUN wget https://sourceforge.net/projects/bowtie-bio/files/bowtie/${BOWTIE_VERSION}/bowtie-${BOWTIE_VERSION}-linux-x86_64.zip -O /opt/bowtie-${BOWTIE_VERSION}-linux-x86_64.zip
95 | RUN cd /opt && unzip bowtie-${BOWTIE_VERSION}-linux-x86_64.zip && cp -p /opt/bowtie-${BOWTIE_VERSION}-linux-x86_64/bowtie* /usr/local/bin && cd /opt && rm -rf bowtie*
96 |
97 | RUN wget https://sourceforge.net/projects/bowtie-bio/files/bowtie2/${BOWTIE2_VERSION}/bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip -O /opt/bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip
98 | RUN cd /opt && unzip bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip && cp -p /opt/bowtie2-${BOWTIE2_VERSION}-linux-x86_64/bowtie2* /usr/local/bin && cd /opt && rm -rf bowtie2*
99 |
100 | RUN wget https://sourceforge.net/projects/bio-bwa/files/bwa-${BWA_VERSION}.tar.bz2/download -O /opt/bwa-${BWA_VERSION}.tar.bz2
101 | RUN cd /opt && tar xjf bwa-${BWA_VERSION}.tar.bz2 && cd bwa-${BWA_VERSION} && make && cp -p /opt/bwa-${BWA_VERSION}/bwa /usr/local/bin && cd /opt && rm -rf bwa*
102 |
103 | ADD https://github.com/ndaniel/seqtk/archive/1.2-r101c.tar.gz /opt/seqtk-1.2-r101c.tar.gz
104 | RUN cd /opt && tar -zxvf /opt/seqtk-1.2-r101c.tar.gz && cd seqtk-1.2-r101c && make && cp -p /opt/seqtk-1.2-r101c/seqtk /usr/local/bin && cd /opt && rm -rf seqtk*
105 |
106 | ADD http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/blat/blat /usr/local/bin/blat
107 | RUN chmod 755 /usr/local/bin/blat
108 |
109 | ADD http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/faToTwoBit /usr/local/bin/faToTwoBit
110 | RUN chmod 755 /usr/local/bin/faToTwoBit
111 |
112 | ADD http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver /usr/local/bin/liftOver
113 | RUN chmod 755 /usr/local/bin/liftOver
114 |
115 | ADD https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/${SRA_VERSION}/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz /opt/sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz
116 | RUN cd /opt && tar -zxvf sratoolkit.${SRA_VERSION}-ubuntu64.tar.gz && cp -Rp /opt/sratoolkit.${SRA_VERSION}-ubuntu64/bin/* /usr/local/bin/ && cd /opt && rm -rf sratoolkit*
117 |
118 | ADD http://ftp.gnu.org/gnu/coreutils/coreutils-${COREUTILS_VERSION}.tar.xz /opt/coreutils-${COREUTILS_VERSION}.tar.xz
119 | RUN cd /opt && tar -xJf coreutils-${COREUTILS_VERSION}.tar.xz && cd coreutils-${COREUTILS_VERSION} && ./configure FORCE_UNSAFE_CONFIGURE=1 && make && make install && cd /opt && rm -rf coreutils*
120 |
121 | ADD https://github.com/madler/pigz/archive/v${PIGZ_VERSION}.tar.gz /opt/pigz-${PIGZ_VERSION}.tar.gz
122 | RUN cd /opt && tar -zxvf pigz-${PIGZ_VERSION}.tar.gz && cd pigz-${PIGZ_VERSION} && make && cp -p /opt/pigz-${PIGZ_VERSION}/pigz /usr/local/bin && cd /opt && rm -rf pigz*
123 |
124 | ADD http://research-pub.gene.com/gmap/src/gmap-gsnap-${GMAP_VERSION}.tar.gz /opt/gmap-gsnap-${GMAP_VERSION}.tar.gz
125 | RUN cd /opt && tar -zxvf gmap-gsnap-${GMAP_VERSION}.tar.gz && cd gmap-${GMAP_VERSION} && ./configure && make && make install && cd /opt && rm -rf gmap*
126 |
127 | ENV PATH $PATH:/opt/bbmap/
128 |
129 | RUN wget https://sourceforge.net/projects/bbmap/files/BBMap_${BBMAP_VERSION}.tar.gz -O /opt/BBMap_${BBMAP_VERSION}.tar.gz
130 | RUN cd /opt && tar -xzvf BBMap_${BBMAP_VERSION}.tar.gz
131 |
132 | ENV PATH $PATH:/opt/fusioncatcher_v${FUSIONCATCHER_VERSION}/bin/
133 |
134 | RUN wget https://github.com/ndaniel/fusioncatcher/releases/download/${FUSIONCATCHER_VERSION}/fusioncatcher_v${FUSIONCATCHER_VERSION}.zip -O /opt/fusioncatcher_v${FUSIONCATCHER_VERSION}.zip && cd /opt && unzip fusioncatcher_v${FUSIONCATCHER_VERSION}.zip && cp -p /opt/fusioncatcher_v${FUSIONCATCHER_VERSION}/bin/sam2psl.py /usr/local/bin && cp -p /opt/fusioncatcher_v${FUSIONCATCHER_VERSION}/bin/FC /opt/fusioncatcher_v${FUSIONCATCHER_VERSION}/bin/fusioncatcher
135 |
136 |
137 | ADD http://ccb.jhu.edu/software/stringtie/dl/gffread-${GFFREAD_VERSION}.Linux_x86_64.tar.gz /opt/gffread-${GFFREAD_VERSION}.Linux_x86_64.tar.gz
138 | RUN cd /opt && tar -xzvf gffread-${GFFREAD_VERSION}.Linux_x86_64.tar.gz && cp -p /opt/gffread-${GFFREAD_VERSION}.Linux_x86_64/gffread /usr/local/bin && rm -rf /opt/gffread*
139 |
140 | RUN cd /opt/ && git clone https://github.com/bioinform/IDP.git && cd IDP && git checkout a5d2d624ab8e4545feff3f51d264931b440d0b53
141 |
142 | ADD http://augroup.org/IDP-fusion/files/IDP-fusion_${IDPFUSION_VERSION}.tar.gz /opt/IDP-fusion_${IDPFUSION_VERSION}.tar.gz
143 | RUN cd /opt && tar -xzvf IDP-fusion_${IDPFUSION_VERSION}.tar.gz && rm -rf /opt/IDP-fusion_${IDPFUSION_VERSION}.tar.gz
144 |
145 | RUN wget https://github.com/broadinstitute/gatk/releases/download/${GATK_VERSION}/gatk-${GATK_VERSION}.zip -O /opt/gatk-${GATK_VERSION}.zip && cd /opt && unzip gatk-${GATK_VERSION}.zip && chmod -R 777 /opt/gatk-${GATK_VERSION}
146 |
147 | RUN pip install https://github.com/bioinform/rnacocktail/archive/v${RNACOCKTAIL_VERSION}.tar.gz
148 |
149 | VOLUME /work_dir
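# Build/run sketch (image tag and host path are hypothetical):
#   docker build -t rnacocktail:0.3.2 .
#   docker run -v /path/to/data:/work_dir rnacocktail:0.3.2 run_rnacocktail.py --help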
150 |
151 |
152 |
--------------------------------------------------------------------------------
/ez_setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Bootstrap setuptools installation
3 |
4 | To use setuptools in your package's setup.py, include this
5 | file in the same directory and add this to the top of your setup.py::
6 |
7 | from ez_setup import use_setuptools
8 | use_setuptools()
9 |
10 | To require a specific version of setuptools, set a download
11 | mirror, or use an alternate download directory, simply supply
12 | the appropriate options to ``use_setuptools()``.
13 |
14 | This file can also be run as a script to install or upgrade setuptools.
15 | """
16 | import os
17 | import shutil
18 | import sys
19 | import tempfile
20 | import zipfile
21 | import optparse
22 | import subprocess
23 | import platform
24 | import textwrap
25 | import contextlib
26 |
27 | from distutils import log
28 |
29 | try:
30 | from urllib.request import urlopen
31 | except ImportError:
32 | from urllib2 import urlopen
33 |
34 | try:
35 | from site import USER_SITE
36 | except ImportError:
37 | USER_SITE = None
38 |
39 | DEFAULT_VERSION = "12.0.4"
40 | DEFAULT_URL = "https://pypi.python.org/packages/source/s/setuptools/"
41 |
42 | def _python_cmd(*args):
43 | """
44 | Return True if the command succeeded.
45 | """
46 | args = (sys.executable,) + args
47 | return subprocess.call(args) == 0
48 |
49 |
50 | def _install(archive_filename, install_args=()):
51 | with archive_context(archive_filename):
52 | # installing
53 | log.warn('Installing Setuptools')
54 | if not _python_cmd('setup.py', 'install', *install_args):
55 | log.warn('Something went wrong during the installation.')
56 | log.warn('See the error message above.')
57 | # exitcode will be 2
58 | return 2
59 |
60 |
61 | def _build_egg(egg, archive_filename, to_dir):
62 | with archive_context(archive_filename):
63 | # building an egg
64 | log.warn('Building a Setuptools egg in %s', to_dir)
65 | _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir)
66 | # returning the result
67 | log.warn(egg)
68 | if not os.path.exists(egg):
69 | raise IOError('Could not build the egg.')
70 |
71 |
72 | class ContextualZipFile(zipfile.ZipFile):
73 | """
74 | Supplement ZipFile class to support context manager for Python 2.6
75 | """
76 |
77 | def __enter__(self):
78 | return self
79 |
80 | def __exit__(self, type, value, traceback):
81 | self.close()
82 |
83 | def __new__(cls, *args, **kwargs):
84 | """
85 | Construct a ZipFile or ContextualZipFile as appropriate
86 | """
87 | if hasattr(zipfile.ZipFile, '__exit__'):
88 | return zipfile.ZipFile(*args, **kwargs)
89 | return super(ContextualZipFile, cls).__new__(cls)
90 |
91 |
92 | @contextlib.contextmanager
93 | def archive_context(filename):
94 | # extracting the archive
95 | tmpdir = tempfile.mkdtemp()
96 | log.warn('Extracting in %s', tmpdir)
97 | old_wd = os.getcwd()
98 | try:
99 | os.chdir(tmpdir)
100 | with ContextualZipFile(filename) as archive:
101 | archive.extractall()
102 |
103 | # going in the directory
104 | subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
105 | os.chdir(subdir)
106 | log.warn('Now working in %s', subdir)
107 | yield
108 |
109 | finally:
110 | os.chdir(old_wd)
111 | shutil.rmtree(tmpdir)
112 |
113 |
114 | def _do_download(version, download_base, to_dir, download_delay):
115 | egg = os.path.join(to_dir, 'setuptools-%s-py%d.%d.egg'
116 | % (version, sys.version_info[0], sys.version_info[1]))
117 | if not os.path.exists(egg):
118 | archive = download_setuptools(version, download_base,
119 | to_dir, download_delay)
120 | _build_egg(egg, archive, to_dir)
121 | sys.path.insert(0, egg)
122 |
123 | # Remove previously-imported pkg_resources if present (see
124 | # https://bitbucket.org/pypa/setuptools/pull-request/7/ for details).
125 | if 'pkg_resources' in sys.modules:
126 | del sys.modules['pkg_resources']
127 |
128 | import setuptools
129 | setuptools.bootstrap_install_from = egg
130 |
131 |
132 | def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
133 | to_dir=os.curdir, download_delay=15):
134 | to_dir = os.path.abspath(to_dir)
135 | rep_modules = 'pkg_resources', 'setuptools'
136 | imported = set(sys.modules).intersection(rep_modules)
137 | try:
138 | import pkg_resources
139 | except ImportError:
140 | return _do_download(version, download_base, to_dir, download_delay)
141 | try:
142 | pkg_resources.require("setuptools>=" + version)
143 | return
144 | except pkg_resources.DistributionNotFound:
145 | return _do_download(version, download_base, to_dir, download_delay)
146 | except pkg_resources.VersionConflict as VC_err:
147 | if imported:
148 | msg = textwrap.dedent("""
149 | The required version of setuptools (>={version}) is not available,
150 | and can't be installed while this script is running. Please
151 | install a more recent version first, using
152 | 'easy_install -U setuptools'.
153 |
154 | (Currently using {VC_err.args[0]!r})
155 | """).format(VC_err=VC_err, version=version)
156 | sys.stderr.write(msg)
157 | sys.exit(2)
158 |
159 | # otherwise, reload ok
160 | del pkg_resources, sys.modules['pkg_resources']
161 | return _do_download(version, download_base, to_dir, download_delay)
162 |
163 | def _clean_check(cmd, target):
164 | """
165 | Run the command to download target. If the command fails, clean up before
166 | re-raising the error.
167 | """
168 | try:
169 | subprocess.check_call(cmd)
170 | except subprocess.CalledProcessError:
171 | if os.access(target, os.F_OK):
172 | os.unlink(target)
173 | raise
174 |
175 | def download_file_powershell(url, target):
176 | """
177 | Download the file at url to target using Powershell (which will validate
178 | trust). Raise an exception if the command cannot complete.
179 | """
180 | target = os.path.abspath(target)
181 | ps_cmd = (
182 | "[System.Net.WebRequest]::DefaultWebProxy.Credentials = "
183 | "[System.Net.CredentialCache]::DefaultCredentials; "
184 | "(new-object System.Net.WebClient).DownloadFile(%(url)r, %(target)r)"
185 | % vars()
186 | )
187 | cmd = [
188 | 'powershell',
189 | '-Command',
190 | ps_cmd,
191 | ]
192 | _clean_check(cmd, target)
193 |
194 | def has_powershell():
195 | if platform.system() != 'Windows':
196 | return False
197 | cmd = ['powershell', '-Command', 'echo test']
198 | with open(os.path.devnull, 'wb') as devnull:
199 | try:
200 | subprocess.check_call(cmd, stdout=devnull, stderr=devnull)
201 | except Exception:
202 | return False
203 | return True
204 |
205 | download_file_powershell.viable = has_powershell
206 |
207 | def download_file_curl(url, target):
208 | cmd = ['curl', url, '--silent', '--output', target]
209 | _clean_check(cmd, target)
210 |
211 | def has_curl():
212 | cmd = ['curl', '--version']
213 | with open(os.path.devnull, 'wb') as devnull:
214 | try:
215 | subprocess.check_call(cmd, stdout=devnull, stderr=devnull)
216 | except Exception:
217 | return False
218 | return True
219 |
220 | download_file_curl.viable = has_curl
221 |
222 | def download_file_wget(url, target):
223 | cmd = ['wget', url, '--quiet', '--output-document', target]
224 | _clean_check(cmd, target)
225 |
226 | def has_wget():
227 | cmd = ['wget', '--version']
228 | with open(os.path.devnull, 'wb') as devnull:
229 | try:
230 | subprocess.check_call(cmd, stdout=devnull, stderr=devnull)
231 | except Exception:
232 | return False
233 | return True
234 |
235 | download_file_wget.viable = has_wget
236 |
237 | def download_file_insecure(url, target):
238 | """
239 | Use Python to download the file, even though it cannot authenticate the
240 | connection.
241 | """
242 | src = urlopen(url)
243 | try:
244 | # Read all the data in one block.
245 | data = src.read()
246 | finally:
247 | src.close()
248 |
249 | # Write all the data in one block to avoid creating a partial file.
250 | with open(target, "wb") as dst:
251 | dst.write(data)
252 |
253 | download_file_insecure.viable = lambda: True
254 |
255 | def get_best_downloader():
256 | downloaders = (
257 | download_file_powershell,
258 | download_file_curl,
259 | download_file_wget,
260 | download_file_insecure,
261 | )
262 | viable_downloaders = (dl for dl in downloaders if dl.viable())
263 | return next(viable_downloaders, None)
264 |
265 | def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
266 | to_dir=os.curdir, delay=15, downloader_factory=get_best_downloader):
267 | """
268 | Download setuptools from a specified location and return its filename
269 |
270 | `version` should be a valid setuptools version number that is available
271 | as an sdist for download under the `download_base` URL (which should end
272 | with a '/'). `to_dir` is the directory where the egg will be downloaded.
273 | `delay` is the number of seconds to pause before an actual download
274 | attempt.
275 |
276 | ``downloader_factory`` should be a function taking no arguments and
277 | returning a function for downloading a URL to a target.
278 | """
279 | # making sure we use the absolute path
280 | to_dir = os.path.abspath(to_dir)
281 | zip_name = "setuptools-%s.zip" % version
282 | url = download_base + zip_name
283 | saveto = os.path.join(to_dir, zip_name)
284 | if not os.path.exists(saveto): # Avoid repeated downloads
285 | log.warn("Downloading %s", url)
286 | downloader = downloader_factory()
287 | downloader(url, saveto)
288 | return os.path.realpath(saveto)
289 |
290 | def _build_install_args(options):
291 | """
292 | Build the arguments to 'python setup.py install' on the setuptools package
293 | """
294 | return ['--user'] if options.user_install else []
295 |
296 | def _parse_args():
297 | """
298 | Parse the command line for options
299 | """
300 | parser = optparse.OptionParser()
301 | parser.add_option(
302 | '--user', dest='user_install', action='store_true', default=False,
303 | help='install in user site package (requires Python 2.6 or later)')
304 | parser.add_option(
305 | '--download-base', dest='download_base', metavar="URL",
306 | default=DEFAULT_URL,
307 | help='alternative URL from where to download the setuptools package')
308 | parser.add_option(
309 | '--insecure', dest='downloader_factory', action='store_const',
310 | const=lambda: download_file_insecure, default=get_best_downloader,
311 | help='Use internal, non-validating downloader'
312 | )
313 | parser.add_option(
314 | '--version', help="Specify which version to download",
315 | default=DEFAULT_VERSION,
316 | )
317 | options, args = parser.parse_args()
318 | # positional arguments are ignored
319 | return options
320 |
321 | def main():
322 | """Install or upgrade setuptools and EasyInstall"""
323 | options = _parse_args()
324 | archive = download_setuptools(
325 | version=options.version,
326 | download_base=options.download_base,
327 | downloader_factory=options.downloader_factory,
328 | )
329 | return _install(archive, _build_install_args(options))
330 |
331 | if __name__ == '__main__':
332 | sys.exit(main())
333 |
--------------------------------------------------------------------------------
/scripts/gpd2gtf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | ############################################################################
4 | #This script is modified from the original code by Kin Fai Au
5 | #Obtained from https://github.com/jason-weirather/Au-public/blob/master/gold/gpd2gtf.py
6 | #Available under Apache License Version 2.0
7 | ############################################################################
8 |
9 | import sys
10 | import math
11 |
12 | ### generate_transcript_list
13 | ############################
14 | def generate_transcript_list(gpd_file, transcript_list):
15 |
16 | for line in gpd_file:
17 |
18 | if (line[0] == '#'):
19 | continue
20 |
21 | fields = line.split()
22 | num_exons = int(fields[8])
23 |
24 | start_pos_list = fields[9].split(',')
25 | end_pos_list = fields[10].split(',')
26 |
27 | exon_pos = [0] * num_exons
28 | for i in range(num_exons):
29 | exon_pos[i] = [start_pos_list[i], end_pos_list[i]]
30 |
31 | transcript_list.append([fields[0], fields[1], fields[2], fields[3], exon_pos])
32 |
33 |
34 | ### generate_FPKM_dict
35 | #######################
36 | def generate_FPKM_dict(FPKM_file, FPKM_dict):
37 |
38 | for line in FPKM_file:
39 | fields = line.split()
40 | FPKM_dict[fields[0]] = fields[1]
41 |
42 |
43 |
44 | ### generate_gtf_format
45 | #######################
46 | def generate_gtf_format(gtf_file, transcript_list, FPKM_dict, source):
47 |
48 |
49 | for line in transcript_list:
50 | exon_pos = line[4]
51 | # transcript line
52 |
53 | # chr name
54 | gtf_file.write(line[2] + '\t' + source + '\t' + "transcript" + '\t')
55 | # start-end pos, score
56 | gtf_file.write("%s"%(int(exon_pos[0][0])+1) + '\t' + exon_pos[-1][1] + '\t' + '*' + '\t')
57 | # Direction
58 | gtf_file.write(line[3] + '\t' + '.' + '\t')
59 |
60 |         if line[1] in FPKM_dict:
61 | FPKM = FPKM_dict[line[1]]
62 | else:
63 | FPKM = '*'
64 | attribute_1 = 'gene_id "' + line[0] + '"; transcript_id "' + line[1] + '"; '
65 | attribute_2 = ('FPKM "' + FPKM + '"; frac "' + '*' + '"; conf_lo "' + '*' + '"; ' +
66 | 'conf_hi "' + '*' + '"; cov "' + '*' + '";\n')
67 |
68 | gtf_file.write(attribute_1)
69 | gtf_file.write(attribute_2)
70 |
71 | num_exons = len(exon_pos)
72 | for i in range(num_exons):
73 | # chr name
74 | gtf_file.write(line[2] + '\t' + source + '\t' + "exon" + '\t')
75 | # start-end pos, score
76 | gtf_file.write("%s"%(int(exon_pos[i][0])+1) + '\t' + exon_pos[i][1] + '\t' + '*' + '\t')
77 | # Direction
78 | gtf_file.write(line[3] + '\t' + '.' + '\t')
79 | gtf_file.write(attribute_1)
80 | gtf_file.write('exon_number "' + str(i+1) + '"; ')
81 | gtf_file.write(attribute_2)
82 |
83 |
84 |
85 | ### Main
86 | ########
87 | def main():
88 | gpd_file = open(sys.argv[1], 'r')
89 | FPKM_file = open(sys.argv[2], 'r')
90 | gtf_file = open(sys.argv[3], 'w')
91 | source = sys.argv[4]
92 |
93 | transcript_list = []
94 | FPKM_dict = dict()
95 | generate_transcript_list(gpd_file, transcript_list)
96 | generate_FPKM_dict(FPKM_file, FPKM_dict)
97 | generate_gtf_format(gtf_file, transcript_list, FPKM_dict, source)
98 |
99 | gpd_file.close()
100 | gtf_file.close()
101 |
102 |
103 | if __name__ == '__main__':
104 | main()
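# Usage sketch: gpd2gtf.py <transcripts.gpd> <FPKM.txt> <out.gtf> <source_label>
# (argument order matches main() above; the FPKM file maps transcript id -> FPKM)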
105 |
--------------------------------------------------------------------------------
/scripts/hisat2_jun2bed.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import sys
4 | import os
5 |
6 | if len(sys.argv) >= 3:
7 | HISATJun_filename = sys.argv[1]
8 | bed_filename = sys.argv[2]
9 | else:
10 | print("usage: python hisat2_jun2bed.py HISAT2_splicesites.txt junction.bed")
11 | sys.exit(1)
12 |
13 | jun_s = set()
14 | junction=open(HISATJun_filename,'r')
15 | output_file = open(bed_filename,'w')
16 | for line in junction:
17 | if line[0:5]=='track':
18 | continue
19 | else:
20 | line_list=line.strip().split("\t")
21 | leftpos=str(int(line_list[1]))
22 | rightpos=str(int(line_list[2]))
23 | locus = "___".join([line_list[0],leftpos,rightpos,line_list[3]])
24 | jun_s.add(locus)
25 |
26 | output_file.write("track name=junctions description=\"HISAT2 junctions\"\n")
27 | i=0
28 | for locus in jun_s:
29 | output_ls = []
30 | locus_ls = locus.split("___")
31 | chr_name = locus_ls[0]
32 | int_start = int(locus_ls[1])-51
33 | if int_start<=0:
34 | start = "1"
35 | width_start = str(49+int_start)
36 | else:
37 | start = str(int_start)
38 | width_start = "50"
39 | end = str( int(locus_ls[2]) + 50 )
40 | distance = str( int(locus_ls[2]) - int(locus_ls[1])+51 )
41 |
42 | sign = locus_ls[3]
43 |
44 | name = "HISAT" + str(i)
45 |
46 | i += 1
47 | output_ls = [chr_name,start,end,name,"50",sign,start,end,"0,0,0","2",width_start+",50","0,"+distance]
48 | output_file.write( '\t'.join(output_ls) + "\n" )
49 | junction.close()
50 | output_file.close()
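# Output rows are BED12 junction records in the style of TopHat's junctions.bed
# (a presumed convention here): two ~50 bp blocks flanking the splice site, with
# block sizes/starts in the last two columns, matching the 50-base paddings above.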
51 |
52 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | version = "Unknown"
4 | for line in open("src/_version.py"):
5 | if line.startswith("__version__"):
6 | version = line.strip().split("=")[1].strip().replace('"', '')
7 |
8 | print version
9 | setup(
10 | name='RNACocktail Pipeline',
11 | version=version,
12 | description='RNACocktail: A comprehensive framework for accurate and efficient RNA-Seq analysis',
13 | author='Roche Sequencing Solutions, Inc',
14 | author_email='bina.rd@roche.com',
15 | url='https://github.com/bioinform/rnacocktail',
16 | packages=find_packages(),
17 | install_requires=["pysam", "pybedtools"],
18 | scripts=['scripts/run_rnacocktail.py','scripts/hisat2_jun2bed.py',
19 | 'scripts/gpd2gtf.py']
20 | )
21 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | from _version import __version__
2 |
--------------------------------------------------------------------------------
/src/_version.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.3.2"
2 |
--------------------------------------------------------------------------------
/src/defaults.py:
--------------------------------------------------------------------------------
1 | MODES = set(["align", "reconstruct", "denovo",
2 | "quantify", "diff", "long_correct", "long_align",
3 | "long_reconstruct", "long_fusion", "variant", "editing", "fusion","all"])
4 | SR_ALIGNERS = set(["HISAT2"])
5 | RECONSTRUCTORS = set(["StringTie"])
6 | QUANTIFIERS = set(["Salmon-SMEM"])
7 | DIFFS = set(["DESeq2"])
8 | DNV_ASSEMBLERS = set(["Oases"])
9 | LR_CORRECTORS = set(["LoRDEC"])
10 | LR_ALIGNERS= set(["STARlong"])
11 | LR_RECONSTRUCTORS= set(["IDP"])
12 | LR_FUSION= set(["IDP-fusion"])
13 | variant_caller= set(["GATK"])
14 | editing_caller= set(["GIREMI"])
15 | fusion_caller= set(["FusionCatcher"])
16 | TIMEOUT = 10000000 # in seconds
17 |
18 |
19 | SALMON_LIBTYPE = "IU"
20 | SALMON_SMEM_k = 19
21 | DESeq2_MINCNT = 2
22 | DESeq2_ALPHA = 0.05
23 | DNV_HASH = 25
24 | DNV_FORMAT = "fasta"
25 | DNV_READTYPE = "short"
26 | STARLONG_DEFAULTS = {"outSAMattributes": "NH HI NM MD", "readNameSeparator": "space",
27 | "outFilterMultimapScoreRange": "1", "outFilterMismatchNmax": "2000",
28 | "scoreGapNoncan": "-20", "scoreGapGCAG":"-4", "scoreGapATAC":"-8",
29 | "scoreDelOpen": "-1", "scoreDelBase": "-1", "scoreInsOpen": "-1", "scoreInsBase": "-1",
30 | "alignEndsType": "Local", "seedSearchStartLmax": "50", "seedPerReadNmax": "100000",
31 | "seedPerWindowNmax": "1000", "alignTranscriptsPerReadNmax": "100000",
32 | "alignTranscriptsPerWindowNmax": "10000"}
33 |
34 |
35 | GATK_SN_OPT = ""
36 |
37 | GATK_HC_STANDCALLCONF = 20.0
38 | GATK_HC_STANDEMITCONF = 20.0
39 | GATK_HC_OPT = (("-stand-call-conf %f " % GATK_HC_STANDCALLCONF) if GATK_HC_STANDCALLCONF else "") + \
40 | "--dont-use-soft-clipped-bases "
41 |
42 | GATK_VF_WINDOW = 35
43 | GATK_VF_CLUSTER = 3
44 | GATK_VF_FSMIN = 30.0
45 | GATK_VF_QDMAX = 2.0
46 | GATK_VF_OPT = (("-window %d " % GATK_VF_WINDOW) if GATK_VF_WINDOW else "") + \
47 | (("-cluster %d " % GATK_VF_CLUSTER) if GATK_VF_CLUSTER else "") + \
48 | (("--filter-name FS -filter 'FS > %f' " % GATK_VF_FSMIN) if GATK_VF_FSMIN else "") + \
49 | (("--filter-name QD -filter 'QD < %f' " % GATK_VF_QDMAX) if GATK_VF_QDMAX else "")
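# With the defaults above, GATK_VF_OPT renders as:
#   -window 35 -cluster 3 --filter-name FS -filter 'FS > 30.000000' --filter-name QD -filter 'QD < 2.000000'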
50 |
51 | JAVA_XMS = "-Xms1g"
52 | JAVA_XMG = "-Xmx5g"
53 | JAVA_OPT= "%s %s"%(JAVA_XMS,JAVA_XMG)
54 |
55 |
56 | HISAT2 = "hisat2"
57 | HISAT2_SPS = "hisat2_extract_splice_sites.py"
58 | SAMTOOLS = "samtools"
59 | STRINGTIE = "stringtie"
60 | SALMON = "salmon"
61 | R_CMD = "R"
62 | FEATURECOUNTS = "featureCounts"
63 | VELVETG = "velvetg"
64 | VELVETH = "velveth"
65 | OASES = "oases"
66 | LORDEC = "lordec-correct"
67 | STARLONG = "STARlong"
68 | SAM2PSL = "sam2psl.py"
69 | IDP = "runIDP.py"
70 | IDPFUSION = "runIDP.py"
71 | GMAP="gmap"
72 | STAR_DIR = "/usr/local/bin"
73 | BOWTIE2_DIR = "/usr/local/bin"
74 | PICARD = "picard.jar"
75 | GATK = "GenomeAnalysisTK.jar"
76 | JAVA = "java"
77 | GIREMI = "giremi"
78 | HTSLIB = ""
79 | FUSIONCATCHER= "fusioncatcher"
--------------------------------------------------------------------------------
/src/external_cmd.py:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | #This script is modified from the original code
3 | #obtained from https://github.com/bioinform/metasv/blob/master/metasv/external_cmd.py
4 | #Copyright (c) 2014, Bina Technologies inc.
5 | ############################################################################
6 |
7 |
8 | import time
9 | import shlex
10 | import subprocess
11 | from threading import Timer
12 | import unittest
13 | import os
14 | from utils import *
15 |
16 | class TimedExternalCmd:
17 | def __init__(self, cmd, logger, raise_exception=False, env_dict={}):
18 | self.cmd = shlex.split(cmd)
19 | self.p = None
20 | self.did_timeout = False
21 | self.logger = logger
22 | self.raise_exception = raise_exception
23 | self.env_dict = env_dict
24 | def enforce_timeout(self):
25 | self.p.terminate()
26 | self.did_timeout = True
27 | def run(self, cmd_log_fd_out=None, cmd_log_fd_err=None, cmd_log="", msg="", timeout=None):
28 | self.logger.info("Task: %s " % (msg))
29 | self.logger.info("Running \"%s\" " % (" ".join(self.cmd)))
30 | cmd_log_fd_err = cmd_log_fd_err or cmd_log_fd_out
31 | if self.env_dict:
32 | my_env = os.environ.copy()
33 | for k,v in self.env_dict.iteritems():
34 | my_env[k] = v
35 | self.p = subprocess.Popen(self.cmd, stderr=cmd_log_fd_err, stdout=cmd_log_fd_out, env=my_env)
36 | else:
37 | self.p = subprocess.Popen(self.cmd, stderr=cmd_log_fd_err, stdout=cmd_log_fd_out)
38 |
39 | start_time = time.time()
40 | if timeout:
41 | t = Timer(timeout, self.enforce_timeout)
42 | t.start()
43 | self.p.wait()
44 | if timeout:
45 | t.cancel()
46 | if self.did_timeout:
47 | if not self.raise_exception:
48 | self.logger.error("Timed out after %d seconds.", timeout)
49 | return None
50 | else:
51 | self.logger.error("Aborting!")
52 | raise Exception("Timed out after %d seconds."%timeout)
53 | retcode = self.p.returncode
54 | if retcode == 0:
55 | self.logger.info("Done %s " % msg)
56 | else:
57 | if self.raise_exception:
58 | self.logger.info("Returned code %d (%g seconds)" % (retcode, time.time() - start_time))
59 | self.logger.error("Aborting!")
60 | if cmd_log:
61 | raise Exception("Failed %s. Log file: %s" % (msg,cmd_log))
62 | else:
63 | raise Exception(msg)
64 | self.logger.info("Returned code %d (%g seconds)" % (retcode, time.time() - start_time))
65 | return retcode
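# Usage sketch (assumes a configured logger; see the tests below):
#   cmd = TimedExternalCmd("sleep 1", logger)
#   retcode = cmd.run(msg="example sleep", timeout=10)  # returns None on timeout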
66 |
67 |
68 | class TestTimedExternalCmd(unittest.TestCase):
69 | def test_run_complete(self):
70 | cmd = TimedExternalCmd("sleep 1", self.logger)
71 | self.assertEqual(cmd.run(timeout = 2), 0)
72 | self.assertFalse(cmd.did_timeout)
73 | return
74 |
75 | def test_run_timeout(self):
76 | start_tick = time.time()
77 | cmd = TimedExternalCmd("sleep 2", self.logger)
78 | cmd.run(timeout = 1)
79 | run_time = time.time() - start_tick
80 | self.assertTrue(cmd.did_timeout)
81 | self.assertAlmostEqual(run_time, 1, delta=0.2)
82 | return
83 |
84 | def test_run_no_timeout(self):
85 | cmd = TimedExternalCmd("sleep 1", self.logger)
86 | retcode = cmd.run()
87 | self.assertEqual(cmd.run(), 0)
88 | self.assertFalse(cmd.did_timeout)
89 | return
90 |
91 | def test_run_fail(self):
92 | cmd = TimedExternalCmd("sleep 1 2 3", self.logger)
93 | retcode = cmd.run(timeout = 1)
94 | self.assertIsNotNone(retcode)
95 | self.assertIsNot(retcode, 0)
96 | return
97 |
98 | logger = None
99 |
100 |
101 | if __name__ == '__main__':
102 | TestTimedExternalCmd.logger = logging.getLogger(__name__)
103 | unittest.main()
104 |
--------------------------------------------------------------------------------
/src/run_dnv_assemebly.py:
--------------------------------------------------------------------------------
1 | import os
2 | from external_cmd import TimedExternalCmd
3 | from defaults import *
4 | from utils import *
5 |
6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
7 | logFormatter = logging.Formatter(FORMAT)
8 | logger = logging.getLogger(__name__)
9 | consoleHandler = logging.StreamHandler()
10 | consoleHandler.setFormatter(logFormatter)
11 | logger.addHandler(consoleHandler)
12 |
13 | def run_oases(assmebly_hash=DNV_HASH,
14 | seq_1="", seq_2="", seq_u="", seq_i="",
15 | file_format=DNV_FORMAT, read_type=DNV_READTYPE,
16 | oases=OASES, velvetg=VELVETG, velveth=VELVETH,
17 | oases_opts="", velvetg_opts="", velveth_opts="",
18 | start=0, sample= "", nthreads=1,
19 | workdir=None, outdir=None, timeout=TIMEOUT):
20 |
21 | logger.info("Running de novo assembly (OASES) for %s"%sample)
22 |
23 | if seq_1 and seq_2:
24 | for s1 in seq_1.split(","):
25 | if not os.path.exists(s1):
26 | logger.error("Aborting!")
27 | raise Exception("No Mate 1 sequence file %s"%s1)
28 | for s2 in seq_2.split(","):
29 | if not os.path.exists(s2):
30 | logger.error("Aborting!")
31 | raise Exception("No Mate 2 sequence file %s"%s2)
32 | seq_argument="-separate %s %s"%(seq_1,seq_2)
33 | elif seq_u:
34 | seq_argument=seq_u
35 | for su in seq_u.split(","):
36 | if not os.path.exists(su):
37 | logger.error("Aborting!")
38 | raise Exception("No unpaired sequence file %s"%su)
39 |
40 | elif seq_i:
41 | seq_argument=seq_i
42 | for sr in seq_i.split(","):
43 |             if not os.path.exists(sr):
44 | logger.error("Aborting!")
45 | raise Exception("No sra sequence file %s"%sr)
46 |
47 | work_oases=os.path.join(workdir,"oases",sample)
48 | create_dirs([work_oases])
49 |
50 | step=0
51 | if start<=step:
52 | logger.info("--------------------------STEP %s--------------------------"%step)
53 | msg = "Erase Oases work directory for %s"%sample
54 | command="rm -rf %s/*" % (
55 | work_oases)
56 | command="bash -c \"%s\""%command
57 | cmd = TimedExternalCmd(command, logger, raise_exception=False)
58 | retcode = cmd.run(msg=msg, timeout=timeout)
59 | step+=1
60 |
61 | oases_log = os.path.join(work_oases, "oases.log")
62 | oases_log_fd = open(oases_log, "w")
63 |
64 |
65 | seq_argument="-%s -%s %s "%(file_format,read_type,seq_argument)
66 |
67 | msg = "velveth for %s"%sample
68 | if start<=step:
69 | logger.info("--------------------------STEP %s--------------------------"%step)
70 | command="%s %s %d %s %s" % (
71 | velveth, work_oases, assmebly_hash, velveth_opts, seq_argument)
72 | command="bash -c \"%s\""%command
73 | cmd = TimedExternalCmd(command, logger, raise_exception=True, env_dict={"OMP_NUM_THREADS":str(nthreads)})
74 | retcode = cmd.run(cmd_log_fd_out=oases_log_fd, cmd_log=oases_log, msg=msg, timeout=timeout)
75 | else:
76 | logger.info("Skipping step %d: %s"%(step,msg))
77 | step+=1
78 |
79 |
80 | msg = "velvetg for %s"%sample
81 | if start<=step:
82 | logger.info("--------------------------STEP %s--------------------------"%step)
83 | command="%s %s %s -read_trkg yes " % (
84 | velvetg, work_oases, velvetg_opts)
85 | command="bash -c \"%s\""%command
86 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
87 | retcode = cmd.run(cmd_log_fd_out=oases_log_fd, cmd_log=oases_log, msg=msg, timeout=timeout)
88 | else:
89 | logger.info("Skipping step %d: %s"%(step,msg))
90 | step+=1
91 |
92 | msg = "oases for %s"%sample
93 | if start<=step:
94 | logger.info("--------------------------STEP %s--------------------------"%step)
95 | command="%s %s %s " % (
96 | oases, work_oases, oases_opts)
97 | command="bash -c \"%s\""%command
98 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
99 | retcode = cmd.run(cmd_log_fd_out=oases_log_fd, cmd_log=oases_log, msg=msg, timeout=timeout)
100 | else:
101 | logger.info("Skipping step %d: %s"%(step,msg))
102 | step+=1
103 |
104 | out_oases=os.path.join(outdir,"oases",sample)
105 | create_dirs([out_oases])
106 | msg="Copy predictions to output directory for %s."%sample
107 | if start<=step:
108 | logger.info("--------------------------STEP %s--------------------------"%step)
109 | if os.path.exists("%s/transcripts.fa"%work_oases):
110 | command = "cp %s/transcripts.fa %s/transcripts.fa"%(
111 | work_oases, out_oases)
112 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
113 | retcode = cmd.run(cmd_log_fd_out=oases_log_fd, cmd_log=oases_log, msg=msg, timeout=timeout)
114 | else:
115 | logger.info("Skipping step %d: %s"%(step,msg))
116 | step+=1
117 |
118 |
119 | transcripts = ""
120 | if os.path.exists("%s/transcripts.fa"%out_oases):
121 |         logger.info("Oases was successful!")
122 | logger.info("Output transcripts: %s/transcripts.fa"%out_oases)
123 | transcripts = "%s/transcripts.fa"%out_oases
124 | else:
125 | logger.info("Oases failed!")
126 | return transcripts
127 |
128 | def run_dnv_assemebly(assembler="Oases", assmebly_hash=DNV_HASH,
129 | seq_1="", seq_2="", seq_u="", seq_i="",
130 | file_format=DNV_FORMAT, read_type=DNV_READTYPE,
131 | oases=OASES, velvetg=VELVETG, velveth=VELVETH,
132 | oases_opts="", velvetg_opts="", velveth_opts="",
133 | start=0, sample= "", nthreads=1,
134 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False):
135 | transcripts=""
136 | if assembler.upper()=="OASES":
137 | try:
138 | transcripts=run_oases(assmebly_hash=assmebly_hash,
139 | seq_1=seq_1, seq_2=seq_2, seq_u=seq_u, seq_i=seq_i,
140 | file_format=file_format, read_type=read_type,
141 | oases=oases, velvetg=velvetg, velveth=velveth,
142 | oases_opts=oases_opts, velvetg_opts=velvetg_opts, velveth_opts=velveth_opts,
143 | start=start, sample= sample, nthreads=nthreads,
144 | workdir=workdir, outdir=outdir, timeout=timeout)
145 | except Exception as excp:
146 | logger.info("Oases failed!")
147 | logger.error(excp)
148 | if not ignore_exceptions:
149 | raise Exception(excp)
150 | return transcripts
--------------------------------------------------------------------------------
/src/run_editing.py:
--------------------------------------------------------------------------------
1 | import os
2 | from external_cmd import TimedExternalCmd
3 | from defaults import *
4 | from utils import *
5 | import pysam
6 | import sys
7 | import csv
8 | import pybedtools
9 |
10 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
11 | logFormatter = logging.Formatter(FORMAT)
12 | logger = logging.getLogger(__name__)
13 | consoleHandler = logging.StreamHandler()
14 | consoleHandler.setFormatter(logFormatter)
15 | logger.addHandler(consoleHandler)
16 |
17 | def filter_multi_chr_alignments(in_file,out_file):
18 |     current_read=""
19 | chrms=set([])
20 | reads=[]
21 | infile=pysam.AlignmentFile(in_file, "rb")
22 | outfile=pysam.AlignmentFile(out_file, "wb",template=infile)
23 | for read in infile:
24 |         if read.qname !=current_read:
25 |             if current_read!="":
26 | if len(chrms)==1:
27 | for r in reads:
28 | outfile.write(r)
29 |             current_read=read.qname
30 | chrms=set([read.tid])
31 | reads=[read]
32 | else:
33 | chrms.add(read.tid)
34 | reads.append(read)
35 | if len(chrms)==1:
36 | for r in reads:
37 | outfile.write(r)
38 | outfile.close()
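# Note: this filter expects a name-sorted BAM (all alignments of a read adjacent)
# and writes a read's alignments only when they all map to a single chromosome;
# run_giremi() below name-sorts the input before calling it.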
39 |
40 |
41 |
42 |
43 | def fix_SNV_no(feature):
44 | return pybedtools.Interval(feature.chrom, feature.start, feature.end, name="SNV",
45 | score=feature.score, strand=".",otherfields=[".","."])
46 |
47 | def merge_info_SNV(feature):
48 | pos=round(min(abs(int(feature[9])-feature.start),
49 | abs(int(feature[10])-feature.start))/float(int(feature[10])-int(feature[9])+1)*100)
50 | isin=1 if ( feature.start>= int(feature[9]) and feature.start<=int(feature[10])) else -1
51 | pos=pos*isin
52 | name="%s,%s"%(feature[3],feature[11])
53 | otherfields= [str(pos),feature[12]]
54 | return pybedtools.Interval(chrom=feature.chrom,start=feature.start,end=feature.end,name=name,
55 | score=feature.score,strand=feature[13],otherfields=otherfields)
56 |
57 | def find_SNV_strands(strand_pos_bed,genes_pos_bed,input_annotated_vcf,output_annotated_bed):
58 |
59 | final_fwd=pybedtools.BedTool(strand_pos_bed).filter(lambda x:x.strand=="+").sort()
60 | final_rev=pybedtools.BedTool(strand_pos_bed).filter(lambda x:x.strand=="-").sort()
61 |
62 | vcf_intervals=[]
63 | with open(input_annotated_vcf, 'rb') as csvfile:
64 | spamreader = csv.reader(csvfile, delimiter='\t', quotechar='|')
65 | for x in spamreader:
66 | if x[0][0]=="#":
67 | continue
68 | if x[6]!="PASS":
69 | continue
70 | if len(x[3])!=1 or len(x[4])!=1:
71 | continue
72 |
73 | gt=x[9].split(":")[0]
74 | gt=gt.split("|") if "|" in gt else gt.split("/")
75 | if gt[0]==gt[1]:
76 | continue
77 |
78 | vcf_intervals.append(pybedtools.Interval(x[0], int(x[1])-1, int(x[1]), name="SNV",
79 | score=1 if "DB" in x[7] else 0, strand=".",otherfields=[".","."]))
80 | SNV=pybedtools.BedTool(vcf_intervals).sort().saveas()
81 |
82 |
83 |
84 |
85 | for w in [0,10,50,100,200,400,800,1000]:
86 | if w==0:
87 | SNV_no=SNV
88 | SNV_fwd=SNV_no.window(final_fwd,w=w).each(merge_info_SNV).sort()
89 | if len(SNV_fwd)>0:
90 | SNV_fwd=SNV_fwd.groupby(g=[1,2,3],c=[4,5,6,7,8],o="first,first,first,max,min")
91 | SNV_fwd1=SNV_no.window(final_fwd,w=w,v=True)
92 | SNV_fwd=SNV_fwd.cat(SNV_fwd1,postmerge=False).sort()
93 |
94 | SNV_rev=SNV_no.window(final_rev,w=w).each(merge_info_SNV).sort()
95 | if len(SNV_rev)>0:
96 | SNV_rev=SNV_rev.groupby(g=[1,2,3],c=[4,5,6,7,8],o="first,first,first,max,min")
97 | SNV_rev1=SNV_no.window(final_rev,w=w,v=True)
98 | SNV_rev=SNV_rev.cat(SNV_rev1,postmerge=False).sort()
99 | SNV_final=SNV_fwd.cat(SNV_rev,postmerge=False).sort()
100 | if len(SNV_final)>0:
101 | SNV_final=SNV_final.groupby(g=[1,2,3],c=[4,5,6,7,8],o="collapse,first,collapse,collapse,collapse")
102 |
103 | SNV_good_=SNV_final.filter(lambda x:len(set(x[5].split(","))-set("."))==1).sort()
104 | SNV_no=SNV_final.filter(lambda x:len(set(x[5].split(","))-set("."))==0).each(fix_SNV_no).sort()
105 | SNV_bad_=SNV_final.filter(lambda x:len(set(x[5].split(","))-set("."))>1).sort()
106 |
107 | if w==0:
108 | SNV_good=SNV_good_
109 | SNV_bad=SNV_bad_
110 | else:
111 | SNV_good=SNV_good.cat(SNV_good_,postmerge=False).sort()
112 | SNV_no=SNV_no.cat(SNV_bad_,postmerge=False).sort()
113 |
114 |
115 | SNV_annotated=[]
116 | cnt=0
117 | for i in SNV_good:
118 | name=list(set(i.name.split(","))-set(["SNV"]))[0]
119 | strand=list(set(i.strand.split(","))-set(["."]))
120 | strand=strand[0]
121 | SNV_annotated.append(pybedtools.Interval(chrom=i.chrom,start=i.start,end=i.end,name=name,
122 | score=i.score,strand=strand))
123 | for i in SNV_no:
124 | SNV_annotated.append(pybedtools.Interval(chrom=i.chrom,start=i.start,end=i.end,name="SNV%d"%cnt,
125 | score=i.score,strand="."))
126 | cnt+=1
127 | SNV_output_annotated_bed=pybedtools.BedTool(SNV_annotated).sort()
128 |
129 | Intes=SNV_output_annotated_bed.window(genes_pos_bed,v=True).each(lambda x:
130 | pybedtools.Interval(x[0],int(x[1]),int(x[2]),"Inte",x[4],"#")).sort()
131 | Genes=SNV_output_annotated_bed.window(genes_pos_bed,u=True)
132 | SNV_output_annotated_bed=Intes.cat(Genes,postmerge=False).sort().saveas(output_annotated_bed)
133 |
134 |
135 |
136 | def run_giremi(alignment="", variant="",
137 | strand_pos="", genes_pos="",
138 | ref_genome="", knownsites="",
139 | giremi_dir="", htslib_dir="",
140 | samtools=SAMTOOLS, gatk=GATK,
141 | java=JAVA, giremi_opts="", java_opts="",
142 | VariantAnnotator_opts="",
143 | start=0, sample= "", nthreads=1,
144 | workdir=None, outdir=None, timeout=TIMEOUT):
145 |
146 |
147 | logger.info("Running RNA editing detection (GIREMI) for %s"%sample)
148 | if not os.path.exists(alignment):
149 | logger.error("Aborting!")
150 | raise Exception("No alignment file %s"%alignment)
151 | if not os.path.exists(variant):
152 | logger.error("Aborting!")
153 | raise Exception("No variant VCF file %s"%variant)
154 | if not os.path.exists(strand_pos):
155 | logger.error("Aborting!")
156 | raise Exception("No strand position BED file %s"%strand_pos)
157 | if not os.path.exists(genes_pos):
158 | logger.error("Aborting!")
159 | raise Exception("No genes position BED file %s"%genes_pos)
160 | if not os.path.exists(ref_genome):
161 | logger.error("Aborting!")
162 | raise Exception("No reference genome FASTA file %s"%ref_genome)
163 | if not os.path.exists(knownsites):
164 | logger.error("Aborting!")
165 | raise Exception("No VCF knownsites file %s"%knownsites)
166 | if giremi_dir:
167 | if not os.path.exists(giremi_dir):
168 | logger.error("Aborting!")
169 | raise Exception("No GIREMI directory %s"%giremi_dir)
170 |
171 | work_giremi=os.path.join(workdir,"giremi",sample)
172 | create_dirs([work_giremi])
173 |
174 | tmp_dir = ""
175 | if "-Xms" not in java_opts:
176 | java_opts += " %s"%JAVA_XMS
177 | if "-Xmx" not in java_opts:
178 | java_opts += " %s"%JAVA_XMG
179 | if "-Djava.io.tmpdir" not in java_opts:
180 | java_opts += " -Djava.io.tmpdir=%s/javatmp/"%(work_giremi)
181 | tmp_dir="%s/javatmp/"%(work_giremi)
182 |
183 |
184 | step=0
185 | if start<=step:
186 | logger.info("--------------------------STEP %s--------------------------"%step)
187 | msg = "Erase GIREMI work directory for %s"%sample
188 | command="rm -rf %s/*" % (
189 | work_giremi)
190 | command="bash -c \"%s\""%command
191 | cmd = TimedExternalCmd(command, logger, raise_exception=False)
192 | retcode = cmd.run(msg=msg,timeout=timeout)
193 | step+=1
194 |
195 | giremi_log = os.path.join(work_giremi, "giremi.log")
196 | giremi_log_fd = open(giremi_log, "w")
197 |
198 | if tmp_dir:
199 | create_dirs([tmp_dir])
200 |
201 | msg = "Sort BAM by name for %s"%sample
202 | if start<=step:
203 | logger.info("--------------------------STEP %s--------------------------"%step)
204 | command="%s sort -n -@ %d -T %s/alignments.name_sorted -o %s/alignments.name_sorted.bam %s" % (
205 | samtools, nthreads, work_giremi, work_giremi, alignment)
206 | command="bash -c \"%s\""%command
207 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
208 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout)
209 | else:
210 | logger.info("Skipping step %d: %s"%(step,msg))
211 | step+=1
212 |
213 |
214 | msg = "Filter alignments mapped to multiple chromosomes for %s"%sample
215 | if start<=step:
216 | logger.info("--------------------------STEP %s--------------------------"%step)
217 | logger.info(msg)
218 | filter_multi_chr_alignments("%s/alignments.name_sorted.bam"%work_giremi,"%s/alignments.chr_unique.bam"%work_giremi)
219 | else:
220 | logger.info("Skipping step %d: %s"%(step,msg))
221 | step+=1
222 |
223 | msg = "Sort BAM by pos for %s"%sample
224 | if start<=step:
225 | logger.info("--------------------------STEP %s--------------------------"%step)
226 | command="%s sort -@ %d -T %s/alignments.pos_sorted -o %s/alignments.pos_sorted.bam %s/alignments.chr_unique.bam" % (
227 | samtools, nthreads, work_giremi, work_giremi, work_giremi)
228 | command="bash -c \"%s\""%command
229 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
230 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout)
231 | else:
232 | logger.info("Skipping step %d: %s"%(step,msg))
233 | step+=1
234 |
235 | msg = "GATK IndexFeatureFile for %s"%sample
236 | if start<=step:
237 | logger.info("--------------------------STEP %s--------------------------"%step)
238 | command="%s %s -jar %s IndexFeatureFile -F %s" % (
239 | java, java_opts, gatk, variant)
240 | command="bash -c \"%s\""%command
241 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
242 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout)
243 | else:
244 | logger.info("Skipping step %d: %s"%(step,msg))
245 | step+=1
246 |
247 |
248 | msg = "GATK VariantAnnotator for %s"%sample
249 | if start<=step:
250 | logger.info("--------------------------STEP %s--------------------------"%step)
251 | command="%s %s -jar %s VariantAnnotator -R %s -V %s -L %s -O %s/annotated.vcf --dbsnp %s %s" % (
252 | java, java_opts, gatk, ref_genome,variant,variant,work_giremi,knownsites,VariantAnnotator_opts)
253 | command="bash -c \"%s\""%command
254 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
255 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout)
256 | else:
257 | logger.info("Skipping step %d: %s"%(step,msg))
258 | step+=1
259 |
260 | msg="Find variant strands for %s"%sample
261 | if start<=step:
262 | logger.info("--------------------------STEP %s--------------------------"%step)
263 | logger.info(msg)
264 | find_SNV_strands(strand_pos, genes_pos, "%s/annotated.vcf"%work_giremi, "%s/SNV_annotated.bed"%work_giremi)
265 | else:
266 | logger.info("Skipping step %d: %s"%(step,msg))
267 | step+=1
268 |
269 | if htslib_dir:
270 | if "LD_LIBRARY_PATH" in os.environ:
271 | os.environ["LD_LIBRARY_PATH"] += ":%s/"%htslib_dir
272 | else:
273 | os.environ["LD_LIBRARY_PATH"] = htslib_dir
274 |
275 | if giremi_dir:
276 | os.environ["PATH"] += ":%s/"%giremi_dir
277 |
278 | msg = "Run GIREMI for %s"%sample
279 | if start<=step:
280 | logger.info("--------------------------STEP %s--------------------------"%step)
281 | command="cd %s && %s %s -f %s -l %s/SNV_annotated.bed -o %s/giremi_out.txt %s/alignments.pos_sorted.bam" % (
282 | giremi_dir,GIREMI, giremi_opts, os.path.abspath(ref_genome), os.path.abspath(work_giremi), os.path.abspath(work_giremi),os.path.abspath(work_giremi))
283 | command="bash -c \"%s\""%command
284 | cmd = TimedExternalCmd(command, logger, raise_exception=False)
285 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout)
286 | else:
287 | logger.info("Skipping step %d: %s"%(step,msg))
288 | step+=1
289 |
290 |
291 | if os.path.exists("%s/giremi_out.txt"%work_giremi) and not os.path.exists("%s/giremi_out.txt.res"%work_giremi):
292 |
293 | msg="Identify N variants for %s"%sample
294 | if start<=step:
295 | logger.info("--------------------------STEP %s--------------------------"%step)
296 | logger.info(msg)
297 | with open("%s/giremi_out.txt"%work_giremi) as csv_file_i:
298 | spamreader = csv.reader(csv_file_i, delimiter='\t', quotechar='|')
299 | with open("%s/N.bed"%work_giremi, 'wb') as csvfile_o:
300 | spamwriter = csv.writer(csvfile_o, delimiter='\t',
301 | quotechar='|', quoting=csv.QUOTE_MINIMAL)
302 | for row in spamreader:
303 | if (row[5]=="N" or row[8]=="N"):
304 | spamwriter.writerow([row[0],int(row[1])-1,row[1]])
305 | else:
306 | logger.info("Skipping step %d: %s"%(step,msg))
307 | step+=1
308 |
309 | cnt=len(pybedtools.BedTool("%s/N.bed"%work_giremi))
310 | if cnt>0:
311 | msg="Remove N variants for %s"%sample
312 | if start<=step:
313 | logger.info("--------------------------STEP %s--------------------------"%step)
314 | logger.info(msg)
315 | pybedtools.BedTool("%s/SNV_annotated.bed"%work_giremi).intersect(
316 | "%s/N.bed"%work_giremi,r=True, f=1, v=True).saveas("%s/SNV_annotated_filtered.bed"%work_giremi)
317 | else:
318 | logger.info("Skipping step %d: %s"%(step,msg))
319 | step+=1
320 |
321 | msg = "Rerun GIREMI for %s"%sample
322 | if start<=step:
323 | logger.info("--------------------------STEP %s--------------------------"%step)
324 | if os.path.exists("%s/SNV_annotated_filtered.bed"%work_giremi):
325 | command="cd %s && %s %s -f %s -l %s/SNV_annotated_filtered.bed -o %s/giremi_out.txt %s/alignments.pos_sorted.bam" % (
326 | giremi_dir,GIREMI, giremi_opts, os.path.abspath(ref_genome), os.path.abspath(work_giremi), os.path.abspath(work_giremi),os.path.abspath(work_giremi))
327 | command="bash -c \"%s\""%command
328 | cmd = TimedExternalCmd(command, logger, raise_exception=False)
329 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout)
330 | else:
331 | logger.info("No file %s/SNV_annotated_filtered.bed"%work_giremi)
332 | else:
333 | logger.info("Skipping step %d: %s"%(step,msg))
334 | step+=1
335 | else:
336 | step+=2
337 | else:
338 | step+=3
339 |
340 | out_giremi=os.path.join(outdir,"giremi",sample)
341 | create_dirs([out_giremi])
342 | msg="Copy predictions to output directory for %s."%sample
343 | if start<=step:
344 | logger.info("--------------------------STEP %s--------------------------"%step)
345 | if os.path.exists("%s/giremi_out.txt.res"%work_giremi):
346 | command = "cp %s/giremi_out.txt.res %s/giremi_out.txt.res"%(
347 | work_giremi, out_giremi)
348 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
349 | retcode = cmd.run(cmd_log_fd_out=giremi_log_fd, cmd_log=giremi_log, msg=msg, timeout=timeout)
350 | else:
351 | logger.info("Skipping step %d: %s"%(step,msg))
352 | step+=1
353 |
354 |
355 | edits = ""
356 | if os.path.exists("%s/giremi_out.txt.res"%out_giremi):
357 | logger.info("GIREMI was successfull!")
358 | logger.info("Output edits: %s/giremi_out.txt.res"%out_giremi)
359 | edits = "%s/giremi_out.txt.res"%out_giremi
360 | else:
361 | logger.info("GIREMI failed!")
362 | return edits
363 |
364 | def run_editing(editing_caller="GIREMI", alignment="", variant="",
365 | strand_pos="", genes_pos="",
366 | ref_genome="", knownsites="",
367 | giremi_dir="", htslib_dir="",
368 | samtools=SAMTOOLS, gatk=GATK,
369 | java=JAVA, giremi_opts="", java_opts="",
370 | VariantAnnotator_opts="",
371 | start=0, sample= "", nthreads=1,
372 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False):
373 | edits=""
374 |
375 | if editing_caller.upper()=="GIREMI":
376 | try:
377 | edits=run_giremi(alignment=alignment, variant=variant,
378 | strand_pos=strand_pos, genes_pos=genes_pos,
379 | ref_genome=ref_genome, knownsites=knownsites,
380 | giremi_dir=giremi_dir, htslib_dir=htslib_dir,
381 | samtools=samtools, gatk=gatk,
382 | java=java, giremi_opts=giremi_opts, java_opts=java_opts,
383 | VariantAnnotator_opts=VariantAnnotator_opts,
384 | start=start, sample= sample, nthreads=nthreads,
385 | workdir=workdir, outdir=outdir, timeout=timeout)
386 | except Exception as excp:
387 | logger.info("GIREMI failed!")
388 | logger.error(excp)
389 | if not ignore_exceptions:
390 | raise Exception(excp)
391 |
392 | return edits
393 |
394 |
395 |
396 |
--------------------------------------------------------------------------------
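A minimal usage sketch for the editing module above, assuming the flat imports used throughout src/ (all paths below are hypothetical placeholders):

    # Hypothetical driver for the GIREMI editing step defined above.
    from run_editing import run_editing

    edits = run_editing(
        editing_caller="GIREMI",
        alignment="A1.bam",                  # RNA-seq alignment for the sample
        variant="A1.vcf",                    # variants called on the same sample
        strand_pos="GRCh38_strand_pos.bed",
        genes_pos="GRCh38_genes_pos.bed",
        ref_genome="GRCh38.fa",
        knownsites="dbsnp.vcf",
        giremi_dir="/opt/giremi", htslib_dir="/opt/htslib",
        sample="A1", nthreads=4,
        workdir="work", outdir="out",
        ignore_exceptions=True,              # log a failure instead of raising
    )
    if edits:
        print("RNA-editing calls: %s" % edits)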
/src/run_fusion.py:
--------------------------------------------------------------------------------
1 | import os
2 | from external_cmd import TimedExternalCmd
3 | from defaults import *
4 | from utils import *
5 |
6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
7 | logFormatter = logging.Formatter(FORMAT)
8 | logger = logging.getLogger(__name__)
9 | consoleHandler = logging.StreamHandler()
10 | consoleHandler.setFormatter(logFormatter)
11 | logger.addHandler(consoleHandler)
12 |
13 | def run_fusioncatcher(data_dir="", input="", start=0,
14 | fusioncatcher=FUSIONCATCHER, fusioncatcher_opts="",
15 | sample= "", nthreads=1,
16 | workdir=None, outdir=None, timeout=TIMEOUT):
17 |
18 |
19 | logger.info("Running RNA fusion detection (FusionCatcher) for %s"%sample)
20 | if not os.path.exists(data_dir):
21 | logger.error("Aborting!")
22 | raise Exception("No data directory %s"%data_dir)
23 |
24 |
25 | work_fusioncatcher=os.path.join(workdir,"fusioncatcher",sample)
26 | create_dirs([work_fusioncatcher])
27 | fusioncatcher_log = os.path.join(work_fusioncatcher, "fusioncatcher.log")
28 | fusioncatcher_log_fd = open(fusioncatcher_log, "w")
29 |
30 | if nthreads>1:
31 | if "-p " not in fusioncatcher_opts:
32 | fusioncatcher_opts += " -p %d"%nthreads
33 | msg = "Run FusionCatcher for %s"%sample
34 |     command="%s -d %s -i %s %s --start %d -o %s" % (
35 |         fusioncatcher, data_dir, input, fusioncatcher_opts, start, work_fusioncatcher)
36 |     command="bash -c \"%s\""%command
37 |     cmd = TimedExternalCmd(command, logger, raise_exception=True)
38 |     retcode = cmd.run(cmd_log_fd_out=fusioncatcher_log_fd, cmd_log=fusioncatcher_log, msg=msg, timeout=timeout)
39 |
40 | out_fusioncatcher=os.path.join(outdir,"fusioncatcher",sample)
41 | create_dirs([out_fusioncatcher])
42 | msg="Copy predictions to output directory for %s."%sample
43 | if os.path.exists("%s/final-list_candidate-fusion-genes.txt"%work_fusioncatcher):
44 | command = "cp %s/final-list_candidate-fusion-genes.txt %s/final-list_candidate-fusion-genes.txt"%(
45 | work_fusioncatcher, out_fusioncatcher)
46 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
47 | retcode = cmd.run(cmd_log_fd_out=fusioncatcher_log_fd, cmd_log=fusioncatcher_log, msg=msg, timeout=timeout)
48 |
49 | fusions = ""
50 | if os.path.exists("%s/final-list_candidate-fusion-genes.txt"%out_fusioncatcher):
51 | logger.info("FusionCatcher was successfull!")
52 | logger.info("Output fusions: %s/final-list_candidate-fusion-genes.txt"%out_fusioncatcher)
53 | fusions = "%s/final-list_candidate-fusion-genes.txt"%out_fusioncatcher
54 | else:
55 | logger.info("FusionCatcher failed!")
56 | return fusions
57 |
58 |
59 | def run_fusion(fusion_caller="FusionCatcher",
60 | data_dir="", input="", start=0,
61 | fusioncatcher=FUSIONCATCHER, fusioncatcher_opts="",
62 | sample= "", nthreads=1,
63 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False):
64 | fusions=""
65 | if fusion_caller.upper()=="FUSIONCATCHER":
66 | try:
67 | fusions=run_fusioncatcher(data_dir=data_dir, input=input, start=start,
68 | fusioncatcher=fusioncatcher, fusioncatcher_opts=fusioncatcher_opts,
69 | sample= sample, nthreads=nthreads,
70 | workdir=workdir, outdir=outdir, timeout=timeout)
71 | except Exception as excp:
72 | logger.info("FusionCatcher failed!")
73 | logger.error(excp)
74 | if not ignore_exceptions:
75 | raise Exception(excp)
76 | return fusions
77 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
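A minimal sketch of calling the fusion module above (hypothetical paths; data_dir must point to a prebuilt FusionCatcher reference bundle):

    from run_fusion import run_fusion

    fusions = run_fusion(
        fusion_caller="FusionCatcher",
        data_dir="/opt/fusioncatcher/data/current",  # hypothetical bundle path
        input="A1_1.fq.gz,A1_2.fq.gz",               # comma-separated FASTQs
        sample="A1", nthreads=8,
        workdir="work", outdir="out",
        ignore_exceptions=True,
    )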
/src/run_lr_align.py:
--------------------------------------------------------------------------------
1 | import os
2 | from external_cmd import TimedExternalCmd
3 | from defaults import *
4 | from utils import *
5 |
6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
7 | logFormatter = logging.Formatter(FORMAT)
8 | logger = logging.getLogger(__name__)
9 | consoleHandler = logging.StreamHandler()
10 | consoleHandler.setFormatter(logFormatter)
11 | logger.addHandler(consoleHandler)
12 |
13 | def run_starlong(long="",
14 | genome_dir="", ref_gtf="",
15 | starlong=STARLONG, sam2psl=SAM2PSL,samtools=SAMTOOLS,
16 | starlong_opts="",
17 | start=0, sample= "", nthreads=1,
18 | workdir=None, outdir=None, timeout=TIMEOUT):
19 |
20 | logger.info("Running long read alignment (STARlong) for %s"%sample)
21 |     if not os.path.exists(os.path.join(genome_dir,"SAindex")):
22 |         logger.error("Aborting!")
23 |         raise Exception("No SAindex file in genome directory %s"%genome_dir)
24 |
25 | if long:
26 | if not os.path.exists(long):
27 | logger.error("Aborting!")
28 | raise Exception("No long read sequence file %s"%long)
29 |
30 | work_starlong=os.path.join(workdir,"starlong",sample)
31 | create_dirs([work_starlong])
32 |
33 | step=0
34 | if start<=step:
35 | logger.info("--------------------------STEP %s--------------------------"%step)
36 | msg = "Erase STARlong work directory for %s"%sample
37 | command="rm -rf %s/*" % (
38 | work_starlong)
39 | command="bash -c \"%s\""%command
40 | cmd = TimedExternalCmd(command, logger, raise_exception=False)
41 | retcode = cmd.run(msg=msg,timeout=timeout)
42 | step+=1
43 |
44 | starlong_log = os.path.join(work_starlong, "starlong.log")
45 | starlong_log_fd = open(starlong_log, "w")
46 |
47 |
48 |
49 | if ref_gtf:
50 | if not os.path.exists(ref_gtf):
51 | logger.error("Aborting!")
52 | raise Exception("No reference GTF file %s"%ref_gtf)
53 |
54 | if "--outSAMattrRGline" not in starlong_opts:
55 | starlong_opts += " --outSAMattrRGline ID:STARlong SM:%s"%sample
56 | if "--runThreadN " not in starlong_opts:
57 | starlong_opts += " --runThreadN %d"%nthreads
58 | if ref_gtf:
59 | starlong_opts += " --sjdbGTFfile %s"%ref_gtf
60 | for k,v in STARLONG_DEFAULTS.iteritems():
61 | if k not in starlong_opts:
62 | starlong_opts += " --%s %s"%(k,v)
63 |
64 |
65 | msg = "STARlong for %s"%sample
66 | if start<=step:
67 | logger.info("--------------------------STEP %s--------------------------"%step)
68 | command="%s --runMode alignReads %s --genomeDir %s --readFilesIn %s --outFileNamePrefix %s/" % (
69 | starlong, starlong_opts, genome_dir, long, work_starlong )
70 | command="bash -c \"%s\""%command
71 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
72 | retcode = cmd.run(cmd_log_fd_out=starlong_log_fd, cmd_log=starlong_log, msg=msg, timeout=timeout)
73 | else:
74 | logger.info("Skipping step %d: %s"%(step,msg))
75 | step+=1
76 |
77 |
78 | msg = "converting SAM to PSL for %s"%sample
79 | if start<=step:
80 | logger.info("--------------------------STEP %s--------------------------"%step)
81 | command="%s -i %s/Aligned.out.sam -o %s/Aligned.out.psl" % (
82 | sam2psl, work_starlong, work_starlong)
83 | command="bash -c \"%s\""%command
84 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
85 | retcode = cmd.run(cmd_log_fd_out=starlong_log_fd, cmd_log=starlong_log, msg=msg, timeout=timeout)
86 | else:
87 | logger.info("Skipping step %d: %s"%(step,msg))
88 | step+=1
89 |
90 | msg = "converting SAM to BAM for %s"%sample
91 | if start<=step:
92 | logger.info("--------------------------STEP %s--------------------------"%step)
93 | command="%s view -Su %s/Aligned.out.sam -@ %d -o %s/Aligned.out.bam" % (
94 | samtools, work_starlong, nthreads, work_starlong)
95 | command="bash -c \"%s\""%command
96 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
97 | retcode = cmd.run(cmd_log_fd_out=starlong_log_fd, cmd_log=starlong_log, msg=msg, timeout=timeout)
98 | else:
99 | logger.info("Skipping step %d: %s"%(step,msg))
100 | step+=1
101 |
102 | #
103 | # msg = "Clean temp alignment files for %s"%sample
104 | # if start<=step:
105 | # logger.info("--------------------------STEP %s--------------------------"%step)
106 | # command="rm %s/Aligned.out.sam" % (work_starlong)
107 | # command="bash -c \"%s\""%command
108 | # cmd = TimedExternalCmd(command, logger, raise_exception=True)
109 | # retcode = cmd.run(cmd_log_fd_out=starlong_log_fd, cmd_log=starlong_log, msg=msg, timeout=timeout)
110 | # else:
111 | # logger.info("Skipping step %d: %s"%(step,msg))
112 | # step+=1
113 |
114 |
115 | out_starlong=os.path.join(outdir,"starlong",sample)
116 | create_dirs([out_starlong])
117 | msg="Copy predictions to output directory for %s."%sample
118 | if start<=step:
119 | logger.info("--------------------------STEP %s--------------------------"%step)
120 | if os.path.exists("%s/Aligned.out.psl"%work_starlong):
121 | command = "cp %s/Aligned.out.psl %s/Aligned.out.psl"%(
122 | work_starlong, out_starlong)
123 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
124 | retcode = cmd.run(cmd_log_fd_out=starlong_log_fd, cmd_log=starlong_log, msg=msg, timeout=timeout)
125 | else:
126 | logger.info("Skipping step %d: %s"%(step,msg))
127 | step+=1
128 |
129 |
130 | alignments_psl = ""
131 | if os.path.exists("%s/Aligned.out.psl"%out_starlong):
132 | logger.info("STARlong was successfull!")
133 | logger.info("Output alignment: %s/Aligned.out.psl"%out_starlong)
134 | alignments_psl = "%s/Aligned.out.psl"%out_starlong
135 | else:
136 | logger.info("STARlong failed!")
137 | return alignments_psl
138 |
139 | def run_lr_align(long_aligner="STARlong", long="",
140 | genome_dir="", ref_gtf="",
141 | starlong=STARLONG, sam2psl=SAM2PSL, samtools=SAMTOOLS,
142 | starlong_opts="",
143 | start=0, sample= "", nthreads=1,
144 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False):
145 | alignments_psl=""
146 | if long_aligner.upper()=="STARLONG":
147 | try:
148 | alignments_psl=run_starlong(genome_dir=genome_dir, ref_gtf=ref_gtf,
149 | long=long, starlong=starlong, sam2psl=sam2psl, samtools=samtools,
150 | starlong_opts=starlong_opts,
151 | start=start, sample= sample, nthreads=nthreads,
152 | workdir=workdir, outdir=outdir, timeout=timeout)
153 | except Exception as excp:
154 | logger.info("STARlong failed!")
155 | logger.error(excp)
156 | if not ignore_exceptions:
157 | raise Exception(excp)
158 | return alignments_psl
--------------------------------------------------------------------------------
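A minimal sketch of the long-read alignment entry point above (hypothetical paths; genome_dir must be a STAR index directory containing SAindex):

    from run_lr_align import run_lr_align

    psl = run_lr_align(
        long_aligner="STARlong",
        long="long_corrected.fa",     # e.g. the LoRDEC output below
        genome_dir="star_index/",
        ref_gtf="genes.gtf",          # optional; adds --sjdbGTFfile to STARlong
        sample="C", nthreads=8,
        workdir="work", outdir="out",
    )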
/src/run_lr_correct.py:
--------------------------------------------------------------------------------
1 | import os
2 | from external_cmd import TimedExternalCmd
3 | from defaults import *
4 | from utils import *
5 |
6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
7 | logFormatter = logging.Formatter(FORMAT)
8 | logger = logging.getLogger(__name__)
9 | consoleHandler = logging.StreamHandler()
10 | consoleHandler.setFormatter(logFormatter)
11 | logger.addHandler(consoleHandler)
12 |
13 | def run_lordec(kmer=23,
14 | solid=3, long="", short="",
15 | lordec=LORDEC, lordec_opts="",
16 | start=0, sample= "", nthreads=1,
17 | workdir=None, outdir=None, timeout=TIMEOUT):
18 |
19 | logger.info("Running long read error correction (LoRDEC) for %s"%sample)
20 | if not os.path.exists(long):
21 | logger.error("Aborting!")
22 | raise Exception("No long read sequence file %s"%long)
23 |
24 | if not os.path.exists(short):
25 | logger.error("Aborting!")
26 | raise Exception("No short read sequence file %s"%short)
27 |
28 | work_lordec=os.path.join(workdir,"lordec",sample)
29 | create_dirs([work_lordec])
30 |
31 | step=0
32 | if start<=step:
33 | logger.info("--------------------------STEP %s--------------------------"%step)
34 | msg = "Erase LoRDEC work directory for %s"%sample
35 | command="rm -rf %s/*" % (
36 | work_lordec)
37 | command="bash -c \"%s\""%command
38 | cmd = TimedExternalCmd(command, logger, raise_exception=False)
39 | retcode = cmd.run(msg=msg,timeout=timeout)
40 | step+=1
41 |
42 | lordec_log = os.path.join(work_lordec, "lordec.log")
43 | lordec_log_fd = open(lordec_log, "w")
44 | ksps = ""
45 |
46 | if "-T " not in lordec_opts:
47 | lordec_opts += " -T %d"%nthreads
48 |
49 | msg = "LoRDEC for %s"%sample
50 | if start<=step:
51 | logger.info("--------------------------STEP %s--------------------------"%step)
52 | command="%s %s -k %d -s %d -i %s -2 %s -O %s -o %s/long_corrected.fa" % (
53 | lordec, lordec_opts, kmer, solid, long, short, work_lordec, work_lordec)
54 | command="bash -c \"%s\""%command
55 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
56 | retcode = cmd.run(cmd_log_fd_out=lordec_log_fd, cmd_log=lordec_log, msg=msg, timeout=timeout)
57 | else:
58 | logger.info("Skipping step %d: %s"%(step,msg))
59 | step+=1
60 |
61 | out_lordec=os.path.join(outdir,"lordec",sample)
62 | create_dirs([out_lordec])
63 | msg="Copy predictions to output directory for %s."%sample
64 | if start<=step:
65 | logger.info("--------------------------STEP %s--------------------------"%step)
66 | if os.path.exists("%s/long_corrected.fa"%work_lordec):
67 | command = "cp %s/long_corrected.fa %s/long_corrected.fa"%(
68 | work_lordec, out_lordec)
69 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
70 | retcode = cmd.run(cmd_log_fd_out=lordec_log_fd, cmd_log=lordec_log, msg=msg, timeout=timeout)
71 | else:
72 | logger.info("Skipping step %d: %s"%(step,msg))
73 | step+=1
74 |
75 |
76 | corrected = ""
77 | if os.path.exists("%s/long_corrected.fa"%out_lordec):
78 | logger.info("LoRDEC was successfull!")
79 | logger.info("Output corrected reads: %s/long_corrected.fa"%out_lordec)
80 | corrected = "%s/long_corrected.fa"%out_lordec
81 | else:
82 | logger.info("LoRDEC failed!")
83 | return corrected
84 |
85 | def run_lr_correct(long_corrector="LoRDEC", kmer=23,
86 | solid=3, long="", short="",
87 | lordec=LORDEC, lordec_opts="",
88 | start=0, sample= "", nthreads=1,
89 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False):
90 | corrected=""
91 | if long_corrector.upper()=="LORDEC":
92 | try:
93 | corrected=run_lordec(kmer=kmer, solid=solid, long=long, short=short,
94 | lordec=lordec, lordec_opts=lordec_opts,
95 | start=start, sample= sample, nthreads=nthreads,
96 | workdir=workdir, outdir=outdir, timeout=timeout)
97 | except Exception as excp:
98 | logger.info("LoRDEC failed!")
99 | logger.error(excp)
100 | if not ignore_exceptions:
101 | raise Exception(excp)
102 | return corrected
--------------------------------------------------------------------------------
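A minimal sketch of the error-correction entry point above (hypothetical paths; kmer and solid are shown at their module defaults):

    from run_lr_correct import run_lr_correct

    corrected = run_lr_correct(
        long_corrector="LoRDEC",
        kmer=23, solid=3,
        long="C_long.fa.gz",          # long reads to correct
        short="C_short.fa.gz",        # short reads supplying solid k-mers
        sample="C", nthreads=8,
        workdir="work", outdir="out",
    )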
/src/run_lr_fusion.py:
--------------------------------------------------------------------------------
1 | import os
2 | from external_cmd import TimedExternalCmd
3 | from defaults import *
4 | from utils import *
5 | import csv
6 | import re
7 |
8 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
9 | logFormatter = logging.Formatter(FORMAT)
10 | logger = logging.getLogger(__name__)
11 | consoleHandler = logging.StreamHandler()
12 | consoleHandler.setFormatter(logFormatter)
13 | logger.addHandler(consoleHandler)
14 |
15 | def sort_gpd(in_file,out_file,order_chrs=dict([("%s"%k,k) for k in range(1,23)]+[("MT",23),("X",24),("Y",25)]+[
16 | ("chr%s"%k,k) for k in range(1,23)]+[("chrM",23),("chrX",24),("chrY",25)])):
17 | with open(in_file) as csv_file:
18 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|')
19 | rows=[]
20 | for row in spamreader:
21 | rows.append(row)
22 | others_chrs=sorted(set(map(lambda x:x[2],rows))-set(order_chrs.keys()))
23 | if others_chrs:
24 |         order_chrs=dict(order_chrs); max_id=max(order_chrs.values())  # copy before extending: avoid mutating the shared default dict
25 | for i,c in enumerate(others_chrs):
26 | order_chrs[c]=max_id+i+1
27 | sorted_rows=sorted(rows,key=lambda x: (order_chrs[x[2]],int(x[4])))
28 | with open(out_file, 'wb') as csvfile:
29 | spamwriter = csv.writer(csvfile, delimiter='\t',
30 | quotechar='|', quoting=csv.QUOTE_MINIMAL)
31 | spamwriter.writerows(sorted_rows)
32 |
33 |
34 |
35 | CIGAR_MATCH = 0
36 | CIGAR_INS = 1
37 | CIGAR_DEL = 2
38 | CIGAR_SOFTCLIP = 4
39 | CIGAR_EQUAL = 7
40 | CIGAR_DIFF = 8
41 | CIGAR_PATTERN = re.compile(r'([0-9]+)([MIDNSHPX=])')
42 | CIGAR_OP_DICT = {op: index for index, op in enumerate("MIDNSHP=X")}
43 | CIGAR_OP_DICT_rev = {index: op for index, op in enumerate("MIDNSHP=X")}
44 | CIGAR_REFERENCE_OPS = [CIGAR_MATCH, CIGAR_DEL, CIGAR_EQUAL, CIGAR_DIFF]
45 |
46 | def cigarstring_to_tuple(cigarstring):
47 | return tuple((CIGAR_OP_DICT[op], int(length)) for length, op in CIGAR_PATTERN.findall(cigarstring))
48 |
49 |
50 | def run_idpfusion(alignment="", short_junction="", long_alignment="",mode_number=0,
51 | short_fasta="", long_fasta="",
52 | ref_genome="", ref_all_gpd="", ref_gpd="", uniqueness_bedgraph="",
53 | genome_bowtie2_idx="", transcriptome_bowtie2_idx="",
54 | read_length=100,
55 | idpfusion_cfg="", idpfusion=IDPFUSION, samtools=SAMTOOLS,
56 | gmap=GMAP, gmap_idx="", star_dir=STAR_DIR, bowtie2_dir=BOWTIE2_DIR,
57 | start=0, sample= "", nthreads=1,
58 | workdir=None, outdir=None, timeout=TIMEOUT):
59 |
60 | logger.info("Running long read fusion Detection (IDP-fusion) for %s"%sample)
61 | if not os.path.exists(alignment):
62 | logger.error("Aborting!")
63 | raise Exception("No input short read alignment BAM/SAM file %s"%alignment)
64 | if not os.path.exists(short_junction):
65 | logger.error("Aborting!")
66 | raise Exception("No input short read junction BED file %s"%short_junction)
67 |
68 | if idpfusion_cfg:
69 | if not os.path.exists(idpfusion_cfg):
70 | logger.error("Aborting!")
71 | raise Exception("No input .cfg file %s"%idpfusion_cfg)
72 |
73 |
74 |
75 | if mode_number>0:
76 | start=4
77 |
78 | work_idpfusion="%s/idpfusion/%s/"%(workdir,sample)
79 | create_dirs([work_idpfusion])
80 |
81 | step=0
82 | if start<=step:
83 | logger.info("--------------------------STEP %s--------------------------"%step)
84 | msg = "Erase IDP-fusion work directory for %s"%sample
85 | command="rm -rf %s/*" % (
86 | work_idpfusion)
87 | command="bash -c \"%s\""%command
88 | cmd = TimedExternalCmd(command, logger, raise_exception=False)
89 | retcode = cmd.run(msg=msg,timeout=timeout)
90 | step+=1
91 |
92 |
93 |
94 | idpfusion_log = os.path.join(work_idpfusion, "idpfusion.log")
95 | idpfusion_log_fd = open(idpfusion_log, "w")
96 |
97 | msg = "converting BAM to SAM for %s"%sample
98 |     if start<=step:
99 |         logger.info("--------------------------STEP %s--------------------------"%step)
100 | if alignment.endswith('.bam'):
101 | command = "%s view -h -o %s/alignments.sam %s " % (samtools,work_idpfusion,alignment)
102 | command="bash -c \"%s\""%command
103 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
104 | retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd, cmd_log=idpfusion_log, msg=msg, timeout=timeout)
105 | alignment = "%s/alignments.sam"%(work_idpfusion)
106 | else:
107 | logger.info("Skipping step %d: %s"%(step,msg))
108 | step+=1
109 |
110 |
111 | msg = "Fix soft-clipped reads in SAM for %s"%sample
112 |     if start<=step:
113 |         logger.info("--------------------------STEP %s--------------------------"%step)
114 | logger.info("Task: %s"%msg)
115 | corrected_alignment = "%s/alignments_corrected.sam"%(work_idpfusion)
116 | with open(alignment,"r") as csv_file_i:
117 | with open(corrected_alignment,"w") as csv_file_o:
118 | spamreader = csv.reader(csv_file_i, delimiter='\t', quotechar='|')
119 | spamwriter = csv.writer(csv_file_o, delimiter='\t',
120 | quotechar='|', quoting=csv.QUOTE_MINIMAL)
121 | for row in spamreader:
122 | if row[0][0]=="@":
123 | spamwriter.writerow(row)
124 | continue
125 | if row[5]=="*":
126 | continue
127 | if "S" in row[5]:
128 | cigartuple=cigarstring_to_tuple(row[5])
129 | if cigartuple[0][0]==4:
130 | row[9]=row[9][cigartuple[0][1]:]
131 | row[10]=row[10][cigartuple[0][1]:]
132 | cigartuple=cigartuple[1:]
133 | if cigartuple[-1][0]==4:
134 | row[9]=row[9][:-cigartuple[-1][1]]
135 | row[10]=row[10][:-cigartuple[-1][1]]
136 | cigartuple=cigartuple[:-1]
137 | row[5]="".join(["%d%s"%(x[1],CIGAR_OP_DICT_rev[x[0]]) for x in cigartuple])
138 | spamwriter.writerow(row)
139 | alignment=corrected_alignment
140 | else:
141 | logger.info("Skipping step %d: %s"%(step,msg))
142 | step+=1
143 |
144 |
145 | msg = "Fix junction bed for %s"%sample
146 |     if start<=step:
147 |         logger.info("--------------------------STEP %s--------------------------"%step)
148 | logger.info("Task: %s"%msg)
149 | corrected_junction = "%s/splicesites_corrected.bed"%(work_idpfusion)
150 | with open(short_junction,"r") as csv_file_i:
151 | with open(corrected_junction,"w") as csv_file_o:
152 | spamreader = csv.reader(csv_file_i, delimiter='\t', quotechar='|')
153 | spamwriter = csv.writer(csv_file_o, delimiter='\t',
154 | quotechar='|', quoting=csv.QUOTE_MINIMAL)
155 | for row in spamreader:
156 | if len(row)<4:
157 | spamwriter.writerow(row)
158 | continue
159 | if "]" in row[3]:
160 | spamwriter.writerow(row)
161 | continue
162 | row[3]="(2)[2_2](2/0)"
163 | spamwriter.writerow(row)
164 | short_junction=corrected_junction
165 | else:
166 | logger.info("Skipping step %d: %s"%(step,msg))
167 | step+=1
168 |
169 |
170 | msg = "Preparing run.cfg for %s"%sample
171 | if start<=step:
172 | logger.info("--------------------------STEP %s--------------------------"%step)
173 | logger.info("Task: %s"%msg)
174 | if idpfusion_cfg:
175 | msg = "copy IDP-fusion .cfg file for %s"%sample
176 | command="cp %s %s/run.cfg" % (
177 | idpfusion_cfg, work_idpfusion)
178 | command="bash -c \"%s\""%command
179 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
180 | retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd, cmd_log=idpfusion_log, msg=msg, timeout=timeout)
181 | else:
182 | f=open("%s/run.cfg"%work_idpfusion, 'w')
183 | f.close()
184 |
185 | cgf_dict={}
186 | with open("%s/run.cfg"%work_idpfusion , 'r') as cfg_file:
187 | for line in cfg_file:
188 | line = line.strip()
189 | if line=='':
190 | continue
191 | if "=" in line and not line[0]=='#' :
192 | k,v=line.split("=")
193 | k=k.strip()
194 | v=v.strip()
195 | cgf_dict[k]=v
196 |
197 |
198 | with open("%s/run.cfg"%work_idpfusion , 'w') as cfg_file:
199 | for k,v in cgf_dict.iteritems():
200 | cfg_file.write("%s = %s \n"%(k,v))
201 | if "temp_foldername" not in cgf_dict:
202 | cfg_file.write("temp_foldername = %s/tmp/ \n"%work_idpfusion)
203 | if "output_foldername" not in cgf_dict:
204 | cfg_file.write("output_foldername = %s/out/ \n"%work_idpfusion)
205 | if "Nthread" not in cgf_dict:
206 | cfg_file.write("Nthread = %d \n"%nthreads)
207 | if "LR_psl_pathfilename" not in cgf_dict:
208 | if long_alignment and os.path.exists(long_alignment):
209 | cfg_file.write("LR_psl_pathfilename = %s \n"%long_alignment)
210 | if "LR_pathfilename" not in cgf_dict:
211 | cfg_file.write("LR_pathfilename = %s \n"%long_fasta)
212 | if "SR_sam_pathfilename" not in cgf_dict:
213 | cfg_file.write("SR_sam_pathfilename = %s \n"%alignment)
214 | if "SR_jun_pathfilename" not in cgf_dict:
215 | cfg_file.write("SR_jun_pathfilename = %s \n"%short_junction)
216 | if "SR_pathfilename" not in cgf_dict:
217 | cfg_file.write("SR_pathfilename = %s \n"%short_fasta)
218 | if "SR_aligner_choice" not in cgf_dict:
219 | cfg_file.write("SR_aligner_choice = STAR \n")
220 | if "star_path" not in cgf_dict:
221 | cfg_file.write("star_path = %s \n"%star_dir)
222 | if "gmap_executable_pathfilename" not in cgf_dict:
223 | cfg_file.write("gmap_executable_pathfilename = %s \n"%gmap)
224 | if "gmap_index_pathfoldername" not in cgf_dict:
225 | cfg_file.write("gmap_index_pathfoldername = %s \n"%gmap_idx)
226 | if "genome_bowtie2_index_pathfilename" not in cgf_dict:
227 | cfg_file.write("genome_bowtie2_index_pathfilename = %s \n"%genome_bowtie2_idx)
228 | if "transcriptome_bowtie2_index_pathfilename" not in cgf_dict:
229 | cfg_file.write("transcriptome_bowtie2_index_pathfilename = %s \n"%transcriptome_bowtie2_idx)
230 | if "allref_annotation_pathfilename" not in cgf_dict:
231 | cfg_file.write("allref_annotation_pathfilename = %s \n"%ref_all_gpd)
232 | if "ref_annotation_pathfilename" not in cgf_dict:
233 | cfg_file.write("ref_annotation_pathfilename = %s \n"%ref_gpd)
234 | if "genome_pathfilename" not in cgf_dict:
235 | cfg_file.write("genome_pathfilename = %s \n"%ref_genome)
236 | if "estimator_choice" not in cgf_dict:
237 | cfg_file.write("estimator_choice = MAP \n")
238 | if "FPR" not in cgf_dict:
239 | cfg_file.write("FPR = 0.1 \n")
240 | if "Njun_limit" not in cgf_dict:
241 | cfg_file.write("Njun_limit = 10 \n")
242 | if "Niso_limit" not in cgf_dict:
243 | cfg_file.write("Niso_limit = 20 \n")
244 | if "L_exon_limit" not in cgf_dict:
245 | cfg_file.write("L_exon_limit = 1700 \n")
246 | if "L_min_intron" not in cgf_dict:
247 | cfg_file.write("L_min_intron = 68 \n")
248 | if "Bfile_Npt" not in cgf_dict:
249 | cfg_file.write("Bfile_Npt = 50 \n")
250 | if "Bfile_Nbin" not in cgf_dict:
251 | cfg_file.write("Bfile_Nbin = 5 \n")
252 | if "min_LR_overlap_len" not in cgf_dict:
253 | cfg_file.write("min_LR_overlap_len = 100 \n")
254 | if "LR_fusion_point_err_margin" not in cgf_dict:
255 | cfg_file.write("LR_fusion_point_err_margin = 100 \n")
256 | if "min_LR_fusion_point_search_distance" not in cgf_dict:
257 | cfg_file.write("min_LR_fusion_point_search_distance = 20 \n")
258 | if "uniq_LR_alignment_margin_perc" not in cgf_dict:
259 | cfg_file.write("uniq_LR_alignment_margin_perc = 20 \n")
260 | if "Niso_fusion_limit" not in cgf_dict:
261 | cfg_file.write("Niso_fusion_limit = 1000 \n")
262 | if "psl_type" not in cgf_dict:
263 | cfg_file.write("psl_type = 0 \n")
264 | if "read_length" not in cgf_dict:
265 | cfg_file.write("read_length = %d \n"%read_length)
266 | if "min_junction_overlap_len" not in cgf_dict:
267 | cfg_file.write("min_junction_overlap_len = 10 \n")
268 | if "I_refjun_isoformconstruction" not in cgf_dict:
269 | cfg_file.write("I_refjun_isoformconstruction = 1 \n")
270 | if "I_ref5end_isoformconstruction" not in cgf_dict:
271 | cfg_file.write("I_ref5end_isoformconstruction = 1 \n")
272 | if "I_ref3end_isoformconstruction" not in cgf_dict:
273 | cfg_file.write("I_ref3end_isoformconstruction = 1 \n")
274 | if "fusion_mode" not in cgf_dict:
275 | cfg_file.write("fusion_mode = 1 \n")
276 | if "uniqueness_bedGraph_pathfilename" not in cgf_dict:
277 | cfg_file.write("uniqueness_bedGraph_pathfilename = %s \n"%uniqueness_bedgraph)
278 | if "exon_construction_junction_span" not in cgf_dict:
279 | cfg_file.write("exon_construction_junction_span = 1 \n")
280 | if "aligner_choice" not in cgf_dict:
281 | cfg_file.write("aligner_choice = gmap \n")
282 | if "aligner_choice" not in cgf_dict:
283 | cfg_file.write("aligner_choice = gmap \n")
284 | if "three_primer" not in cgf_dict:
285 | cfg_file.write("three_primer = \n")
286 | if "five_primer" not in cgf_dict:
287 | cfg_file.write("five_primer = \n")
288 | else:
289 | logger.info("Skipping step %d: %s"%(step,msg))
290 | step+=1
291 |
292 | if star_dir:
293 | os.environ["PATH"] += ":%s/"%star_dir
294 | if bowtie2_dir:
295 | os.environ["PATH"] += ":%s/"%bowtie2_dir
296 |
297 |
298 | msg = "IDP-fusion for %s"%sample
299 | if start<=step:
300 | logger.info("--------------------------STEP %s--------------------------"%step)
301 | command="%s %s/run.cfg %d" % (
302 | idpfusion, work_idpfusion, mode_number)
303 | command="bash -c \"%s\""%command
304 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
305 | retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd, cmd_log=idpfusion_log, msg=msg, timeout=timeout)
306 | else:
307 | logger.info("Skipping step %d: %s"%(step,msg))
308 | step+=1
309 |
310 | msg = "Convert transcript GPD file to GTF for %s"%sample
311 | if start<=step:
312 | logger.info("--------------------------STEP %s--------------------------"%step)
313 | if os.path.exists("%s/out/isoform.gpd"%work_idpfusion):
314 | sort_gpd("%s/out/isoform.gpd"%work_idpfusion,"%s/out/isoform_sorted.gpd"%work_idpfusion)
315 | command="gpd2gtf.py \
316 | %s/out/isoform_sorted.gpd %s/out/isoform.exp %s/out/isoform.gtf IDP"%(work_idpfusion,work_idpfusion,work_idpfusion)
317 | command="bash -c \"%s\""%command
318 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
319 | retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd, cmd_log=idpfusion_log, msg=msg, timeout=timeout)
320 | else:
321 | logger.info("Skipping step %d: %s"%(step,msg))
322 | step+=1
323 |
324 | out_idpfusion=os.path.join(outdir,"idpfusion",sample)
325 | create_dirs([out_idpfusion])
326 | msg="Copy predictions to output directory for %s."%sample
327 | if start<=step:
328 | logger.info("--------------------------STEP %s--------------------------"%step)
329 | if os.path.exists("%s/out/fusion_report.tsv"%work_idpfusion):
330 | command = "cp %s/out/fusion_report.tsv %s/fusion_report.tsv"%(
331 | work_idpfusion, out_idpfusion)
332 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
333 | retcode = cmd.run(cmd_log_fd_out=idpfusion_log_fd, cmd_log=idpfusion_log, msg=msg, timeout=timeout)
334 | else:
335 | logger.info("Skipping step %d: %s"%(step,msg))
336 | step+=1
337 |
338 |
339 |
340 | fusions = ""
341 | if os.path.exists("%s/fusion_report.tsv"%out_idpfusion):
342 | logger.info("IDP-fusion was successfull!")
343 | logger.info("Output fusions: %s/fusion_report.tsv"%out_idpfusion)
344 | fusions = "%s/fusion_report.tsv"%out_idpfusion
345 | else:
346 | logger.info("IDP-fusion failed!")
347 | return fusions
348 |
349 | def run_lr_fusion(long_fusion_caller="IDP-fusion", alignment="",
350 | short_junction="", long_alignment="", mode_number=0,
351 | short_fasta="", long_fasta="",
352 | ref_genome="", ref_all_gpd="", ref_gpd="", uniqueness_bedgraph="",
353 | genome_bowtie2_idx="", transcriptome_bowtie2_idx="",
354 | read_length=100,
355 | idpfusion_cfg="", idpfusion=IDPFUSION, samtools=SAMTOOLS,
356 | gmap=GMAP, gmap_idx="", star_dir=STAR_DIR, bowtie2_dir=BOWTIE2_DIR,
357 | start=0, sample= "", nthreads=1,
358 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False):
359 | fusions = ""
360 | if long_fusion_caller.upper()=="IDP-FUSION":
361 | try:
362 | fusions=run_idpfusion(alignment=alignment,
363 | short_junction=short_junction, long_alignment=long_alignment,
364 | mode_number=mode_number,
365 | short_fasta=short_fasta, long_fasta=long_fasta,
366 | ref_genome=ref_genome, ref_all_gpd=ref_all_gpd,
367 | ref_gpd=ref_gpd, uniqueness_bedgraph=uniqueness_bedgraph,
368 | genome_bowtie2_idx=genome_bowtie2_idx, transcriptome_bowtie2_idx=transcriptome_bowtie2_idx,
369 | read_length=read_length,
370 | idpfusion_cfg=idpfusion_cfg, idpfusion=idpfusion, samtools=samtools,
371 | gmap=gmap, gmap_idx=gmap_idx, star_dir=star_dir,
372 | bowtie2_dir=bowtie2_dir,
373 | start=start, sample= sample, nthreads=nthreads,
374 | workdir=workdir, outdir=outdir, timeout=timeout)
375 | except Exception as excp:
376 | logger.info("IDP-fusion failed!")
377 | logger.error(excp)
378 | if not ignore_exceptions:
379 | raise Exception(excp)
380 | return fusions
--------------------------------------------------------------------------------
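A minimal sketch of the long-read fusion entry point above; the reference arguments mirror the run.cfg keys written by run_idpfusion (all paths hypothetical):

    from run_lr_fusion import run_lr_fusion

    fusions = run_lr_fusion(
        long_fusion_caller="IDP-fusion",
        alignment="short_reads.bam",          # converted to SAM in step 1
        short_junction="splicesites.bed",
        long_alignment="long_reads.psl",
        short_fasta="short.fa", long_fasta="long.fa",
        ref_genome="GRCh38.fa",
        ref_all_gpd="all.gpd", ref_gpd="known.gpd",
        uniqueness_bedgraph="uniqueness.bedGraph",
        genome_bowtie2_idx="bt2/genome",
        transcriptome_bowtie2_idx="bt2/transcriptome",
        gmap_idx="gmap_index/",
        sample="C", nthreads=8,
        workdir="work", outdir="out",
    )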
/src/run_lr_reconstruct.py:
--------------------------------------------------------------------------------
1 | import os
2 | from external_cmd import TimedExternalCmd
3 | from defaults import *
4 | from utils import *
5 | import csv
6 |
7 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
8 | logFormatter = logging.Formatter(FORMAT)
9 | logger = logging.getLogger(__name__)
10 | consoleHandler = logging.StreamHandler()
11 | consoleHandler.setFormatter(logFormatter)
12 | logger.addHandler(consoleHandler)
13 |
14 | def sort_gpd(in_file,out_file,order_chrs=dict([("%s"%k,k) for k in range(1,23)]+[("MT",23),("X",24),("Y",25)]+[
15 | ("chr%s"%k,k) for k in range(1,23)]+[("chrM",23),("chrX",24),("chrY",25)])):
16 | with open(in_file) as csv_file:
17 | spamreader = csv.reader(csv_file, delimiter='\t', quotechar='|')
18 | rows=[]
19 | for row in spamreader:
20 | rows.append(row)
21 | others_chrs=sorted(set(map(lambda x:x[2],rows))-set(order_chrs.keys()))
22 | if others_chrs:
23 |         order_chrs=dict(order_chrs); max_id=max(order_chrs.values())  # copy before extending: avoid mutating the shared default dict
24 | for i,c in enumerate(others_chrs):
25 | order_chrs[c]=max_id+i+1
26 | sorted_rows=sorted(rows,key=lambda x: (order_chrs[x[2]],int(x[4])))
27 | with open(out_file, 'wb') as csvfile:
28 | spamwriter = csv.writer(csvfile, delimiter='\t',
29 | quotechar='|', quoting=csv.QUOTE_MINIMAL)
30 | spamwriter.writerows(sorted_rows)
31 |
32 |
33 |
34 | def run_idp(alignment="", short_junction="", long_alignment="",mode_number=0,
35 | ref_genome="", ref_all_gpd="", ref_gpd="",read_length=100,
36 | idp_cfg="", idp=IDP, samtools=SAMTOOLS,
37 | start=0, sample= "", nthreads=1,
38 | workdir=None, outdir=None, timeout=TIMEOUT):
39 |
40 | logger.info("Running long-read transcriptome reconstruction (IDP) for %s"%sample)
41 | if not os.path.exists(alignment):
42 | logger.error("Aborting!")
43 | raise Exception("No input short read alignment BAM/SAM file %s"%alignment)
44 | if not os.path.exists(short_junction):
45 | logger.error("Aborting!")
46 | raise Exception("No input short read junction BED file %s"%short_junction)
47 | if not os.path.exists(long_alignment):
48 | logger.error("Aborting!")
49 | raise Exception("No input long read alignment PSL file %s"%long_alignment)
50 |
51 | if idp_cfg:
52 | if not os.path.exists(idp_cfg):
53 | logger.error("Aborting!")
54 | raise Exception("No input .cfg file %s"%idp_cfg)
55 |
56 |
57 |
58 | if mode_number>0:
59 | start=4
60 |
61 | work_idp="%s/idp/%s/"%(workdir,sample)
62 | create_dirs([work_idp])
63 |
64 | step=0
65 | if start<=step:
66 | logger.info("--------------------------STEP %s--------------------------"%step)
67 | msg = "Erase IDP work directory for %s"%sample
68 | command="rm -rf %s/*" % (
69 | work_idp)
70 | command="bash -c \"%s\""%command
71 | cmd = TimedExternalCmd(command, logger, raise_exception=False)
72 | retcode = cmd.run(msg=msg,timeout=timeout)
73 | step+=1
74 |
75 |
76 |
77 | idp_log = os.path.join(work_idp, "idp.log")
78 | idp_log_fd = open(idp_log, "w")
79 |
80 | msg = "converting BAM to SAM for %s"%sample
81 |     if start<=step:
82 |         logger.info("--------------------------STEP %s--------------------------"%step)
83 | if alignment.endswith('.bam'):
84 | command = "%s view -h -o %s/alignments.sam %s " % (samtools,work_idp,alignment)
85 | command="bash -c \"%s\""%command
86 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
87 | retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout)
88 | alignment = "%s/alignments.sam"%(work_idp)
89 | else:
90 | logger.info("Skipping step %d: %s"%(step,msg))
91 | step+=1
92 |
93 |
94 | msg = "Preparing run.cfg for %s"%sample
95 | if start<=step:
96 | logger.info("--------------------------STEP %s--------------------------"%step)
97 | if idp_cfg:
98 | msg = "copy IDP .cfg file for %s"%sample
99 | command="cp %s %s/run.cfg" % (
100 | idp_cfg, work_idp)
101 | command="bash -c \"%s\""%command
102 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
103 | retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout)
104 | else:
105 | f=open("%s/run.cfg"%work_idp, 'w')
106 | f.close()
107 |
108 | cgf_dict={}
109 | with open("%s/run.cfg"%work_idp , 'r') as cfg_file:
110 | for line in cfg_file:
111 | line = line.strip()
112 | if line=='':
113 | continue
114 | if "=" in line and not line[0]=='#' :
115 | k,v=line.split("=")
116 | k=k.strip()
117 | v=v.strip()
118 | cgf_dict[k]=v
119 |
120 | with open("%s/run.cfg"%work_idp , 'w') as cfg_file:
121 | for k,v in cgf_dict.iteritems():
122 | cfg_file.write("%s = %s \n"%(k,v))
123 | if "temp_foldername" not in cgf_dict:
124 | cfg_file.write("temp_foldername = %s/tmp/ \n"%work_idp)
125 | if "output_foldername" not in cgf_dict:
126 | cfg_file.write("output_foldername = %s/out/ \n"%work_idp)
127 | if "Nthread" not in cgf_dict:
128 | cfg_file.write("Nthread = %d \n"%nthreads)
129 | if "LR_psl_pathfilename" not in cgf_dict:
130 | cfg_file.write("LR_psl_pathfilename = %s \n"%long_alignment)
131 | if "SR_sam_pathfilename" not in cgf_dict:
132 | cfg_file.write("SR_sam_pathfilename = %s \n"%alignment)
133 | if "SR_jun_pathfilename" not in cgf_dict:
134 | cfg_file.write("SR_jun_pathfilename = %s \n"%short_junction)
135 | if "genome_pathfilename" not in cgf_dict:
136 | cfg_file.write("genome_pathfilename = %s \n"%ref_genome)
137 | if "allref_annotation_pathfilename" not in cgf_dict:
138 | cfg_file.write("allref_annotation_pathfilename = %s \n"%ref_all_gpd)
139 | if "ref_annotation_pathfilename" not in cgf_dict:
140 | cfg_file.write("ref_annotation_pathfilename = %s \n"%ref_gpd)
141 | if "estimator_choice" not in cgf_dict:
142 | cfg_file.write("estimator_choice = MLE \n")
143 | if "FPR" not in cgf_dict:
144 | cfg_file.write("FPR = 0.05 \n")
145 | if "Njun_limit" not in cgf_dict:
146 | cfg_file.write("Njun_limit = 10 \n")
147 | if "Niso_limit" not in cgf_dict:
148 | cfg_file.write("Niso_limit = 100 \n")
149 | if "aligner_choice" not in cgf_dict:
150 | cfg_file.write("aligner_choice = gmap \n")
151 | if "exon_construction_junction_span" not in cgf_dict:
152 | cfg_file.write("exon_construction_junction_span = 1 \n")
153 | if "read_length" not in cgf_dict:
154 | cfg_file.write("read_length = %d \n"%read_length)
155 | else:
156 | logger.info("Skipping step %d: %s"%(step,msg))
157 | step+=1
158 |
159 |
160 |
161 | msg = "IDP for %s"%sample
162 | if start<=step:
163 | logger.info("--------------------------STEP %s--------------------------"%step)
164 | command="%s %s/run.cfg %d" % (
165 | idp, work_idp, mode_number)
166 | command="bash -c \"%s\""%command
167 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
168 | retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout)
169 | else:
170 | logger.info("Skipping step %d: %s"%(step,msg))
171 | step+=1
172 |
173 | msg = "Convert transcript GPD file to GTF for %s"%sample
174 | if start<=step:
175 | logger.info("--------------------------STEP %s--------------------------"%step)
176 | if os.path.exists("%s/out/isoform.gpd"%work_idp):
177 | sort_gpd("%s/out/isoform.gpd"%work_idp,"%s/out/isoform_sorted.gpd"%work_idp)
178 | command="gpd2gtf.py \
179 | %s/out/isoform_sorted.gpd %s/out/isoform.exp %s/out/isoform.gtf IDP"%(work_idp,work_idp,work_idp)
180 | command="bash -c \"%s\""%command
181 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
182 | retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout)
183 | else:
184 | logger.info("Skipping step %d: %s"%(step,msg))
185 | step+=1
186 |
187 | out_idp=os.path.join(outdir,"idp",sample)
188 | create_dirs([out_idp])
189 | msg="Copy predictions to output directory for %s."%sample
190 | if start<=step:
191 | logger.info("--------------------------STEP %s--------------------------"%step)
192 | if os.path.exists("%s/out/isoform.gtf"%work_idp) and \
193 | os.path.exists("%s/out/isoform.exp"%work_idp):
194 | command = "cp %s/out/isoform.gtf %s/isoform.gtf"%(
195 | work_idp, out_idp)
196 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
197 | retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout)
198 |
199 | command = "cp %s/out/isoform.exp %s/isoform.exp"%(
200 | work_idp, out_idp)
201 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
202 | retcode = cmd.run(cmd_log_fd_out=idp_log_fd, cmd_log=idp_log, msg=msg, timeout=timeout)
203 | else:
204 | logger.info("Skipping step %d: %s"%(step,msg))
205 | step+=1
206 |
207 |
208 |
209 | transcripts = ""
210 | abundances = ""
211 | if os.path.exists("%s/isoform.gtf"%out_idp) and \
212 | os.path.exists("%s/isoform.exp"%out_idp):
213 | logger.info("IDP was successfull!")
214 | logger.info("Output isoforms: %s/isoform.gtf"%out_idp)
215 | logger.info("Output expressions: %s/isoform.exp"%out_idp)
216 | transcripts = "%s/isoform.gtf"%out_idp
217 | abundances = "%s/isoform.exp"%out_idp
218 | else:
219 | logger.info("IDP failed!")
220 | return transcripts,abundances
221 |
222 | def run_lr_reconstruct(long_reconstructor="IDP", alignment="",
223 | short_junction="", long_alignment="", mode_number=0,
224 | ref_genome="", ref_all_gpd="", ref_gpd="", read_length=100,
225 | idp_cfg="", idp=IDP, samtools=SAMTOOLS,
226 | start=0, sample= "", nthreads=1,
227 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False):
228 | transcripts = ""
229 | abundances = ""
230 | if long_reconstructor.upper()=="IDP":
231 | try:
232 | transcripts,abundances=run_idp(alignment=alignment,
233 | short_junction=short_junction, long_alignment=long_alignment,
234 | mode_number=mode_number,
235 | ref_genome=ref_genome, ref_all_gpd=ref_all_gpd, ref_gpd=ref_gpd,
236 | read_length=read_length,
237 | idp_cfg=idp_cfg, idp=idp, samtools=samtools,
238 | start=start, sample= sample, nthreads=nthreads,
239 | workdir=workdir, outdir=outdir, timeout=timeout)
240 | except Exception as excp:
241 | logger.info("IDP failed!")
242 | logger.error(excp)
243 | if not ignore_exceptions:
244 | raise Exception(excp)
245 | return transcripts,abundances
--------------------------------------------------------------------------------
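A minimal sketch of the IDP reconstruction entry point above (hypothetical paths; mode_number=0 runs the full pipeline, while mode_number>0 forces start=4 as coded above):

    from run_lr_reconstruct import run_lr_reconstruct

    transcripts, abundances = run_lr_reconstruct(
        long_reconstructor="IDP",
        alignment="short_reads.bam",
        short_junction="splicesites.bed",
        long_alignment="long_reads.psl",
        ref_genome="GRCh38.fa",
        ref_all_gpd="all.gpd", ref_gpd="known.gpd",
        read_length=100,
        sample="C", nthreads=8,
        workdir="work", outdir="out",
    )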
/src/run_quantify.py:
--------------------------------------------------------------------------------
1 | import os
2 | from external_cmd import TimedExternalCmd
3 | from defaults import *
4 | from utils import *
5 |
6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
7 | logFormatter = logging.Formatter(FORMAT)
8 | logger = logging.getLogger(__name__)
9 | consoleHandler = logging.StreamHandler()
10 | consoleHandler.setFormatter(logFormatter)
11 | logger.addHandler(consoleHandler)
12 |
13 | def run_salmon_smem(quantifier_idx=None,
14 | seq_1="", seq_2="", seq_u="",
15 | salmon_k=SALMON_SMEM_k, libtype="",
16 | salmon_smem_opts="", salmon=SALMON,
17 | start=0, sample= "", nthreads=1, unzip=False,
18 | workdir=None, outdir=None, timeout=TIMEOUT):
19 |
20 | logger.info("Running quantification (Salmon-SMEM) for %s"%sample)
21 | if not os.path.exists(quantifier_idx):
22 | logger.error("Aborting!")
23 | raise Exception("No Salmon FMD index directory %s"%quantifier_idx)
24 |
25 | if seq_1 and seq_2:
26 | for s1 in seq_1.split(","):
27 | if not os.path.exists(s1):
28 | logger.error("Aborting!")
29 | raise Exception("No Mate 1 sequence file %s"%s1)
30 |
31 | for s2 in seq_2.split(","):
32 | if not os.path.exists(s2):
33 | logger.error("Aborting!")
34 | raise Exception("No Mate 2 sequence file %s"%s2)
35 |
36 | if unzip:
37 | seq_argument="-1 <(gunzip -c %s) -2 <(gunzip -c %s)"%(" ".join(seq_1.split(","))," ".join(seq_2.split(",")))
38 | else:
39 | if "," in seq_1:
40 | seq_1="<(cat %s)"%(" ".join(seq_1.split(",")))
41 | if "," in seq_2:
42 | seq_2="<(cat %s)"%(" ".join(seq_2.split(",")))
43 | seq_argument="-1 %s -2 %s"%(seq_1,seq_2)
44 | elif seq_u:
45 | if unzip:
46 | seq_argument="-r <(gunzip -c %s)"%(" ".join(seq_u.split(",")))
47 | elif "," in seq_u:
48 |         elif "," in seq_u:
49 |             seq_argument="-r <(cat %s)"%(" ".join(seq_u.split(",")))
49 | else:
50 | seq_argument="-r %s"%(seq_u)
51 | for su in seq_u.split(","):
52 | if not os.path.exists(su):
53 | logger.error("Aborting!")
54 | raise Exception("No unpaired sequence file %s"%su)
55 |
56 |
57 | work_salmon_smem=os.path.join(workdir,"salmon_smem",sample)
58 | create_dirs([work_salmon_smem])
59 |
60 | step=0
61 | if start<=step:
62 | logger.info("--------------------------STEP %s--------------------------"%step)
63 | msg = "Erase Salmon-SMEM work directory for %s"%sample
64 | command="rm -rf %s/*" % (
65 | work_salmon_smem)
66 | command="bash -c \"%s\""%command
67 | cmd = TimedExternalCmd(command, logger, raise_exception=False)
68 | retcode = cmd.run(msg=msg,timeout=timeout)
69 | step+=1
70 |
71 |
72 | salmon_smem_log = os.path.join(work_salmon_smem, "salmon_smem.log")
73 | salmon_smem_log_fd = open(salmon_smem_log, "w")
74 |
75 | if "-p " not in salmon_smem_opts:
76 | salmon_smem_opts += " -p %d"%nthreads
77 |
78 | salmon_smem_opts += " -k %d"%salmon_k
79 | salmon_smem_opts += " -l %s"%libtype
80 |
81 | msg = "Salmon-SMEM for %s"%sample
82 | if start<=step:
83 | logger.info("--------------------------STEP %s--------------------------"%step)
84 | command="%s quant -i %s %s %s -o %s" % (
85 | salmon, quantifier_idx, salmon_smem_opts, seq_argument,work_salmon_smem )
86 | command="bash -c \"%s\""%command
87 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
88 | retcode = cmd.run(cmd_log_fd_out=salmon_smem_log_fd, cmd_log=salmon_smem_log, msg=msg, timeout=timeout)
89 | else:
90 | logger.info("Skipping step %d: %s"%(step,msg))
91 | step+=1
92 |
93 |
94 | out_salmon_smem=os.path.join(outdir,"salmon_smem",sample)
95 | create_dirs([out_salmon_smem])
96 | msg="Copy predictions to output directory for %s."%sample
97 | if start<=step:
98 | logger.info("--------------------------STEP %s--------------------------"%step)
99 | if os.path.exists("%s/quant.sf"%work_salmon_smem):
100 | command = "cp %s/quant.sf %s/quant.sf"%(
101 | work_salmon_smem, out_salmon_smem)
102 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
103 | retcode = cmd.run(cmd_log_fd_out=salmon_smem_log_fd, cmd_log=salmon_smem_log, msg=msg, timeout=timeout)
104 | else:
105 | logger.info("Skipping step %d: %s"%(step,msg))
106 | step+=1
107 |
108 |
109 | quant = ""
110 | if os.path.exists("%s/quant.sf"%out_salmon_smem):
111 | logger.info("Salmon-SMEM was successfull!")
112 | logger.info("Output expressions: %s/quant.sf"%out_salmon_smem)
113 | quant = "%s/quant.sf"%out_salmon_smem
114 | else:
115 | logger.info("Salmon-SMEM failed!")
116 | return quant
117 |
118 | def run_quantify(quantifier="Salmon-SMEM", quantifier_idx=None,
119 | seq_1="", seq_2="", seq_u="",
120 | salmon_k=SALMON_SMEM_k, libtype="",
121 | salmon_smem_opts="", salmon=SALMON,
122 | start=0, sample= "", nthreads=1, unzip=False,
123 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False):
124 | quant=""
125 | if quantifier.upper()=="SALMON-SMEM":
126 | try:
127 | quant=run_salmon_smem(quantifier_idx=quantifier_idx,
128 | seq_1=seq_1, seq_2=seq_2, seq_u=seq_u,
129 | salmon_k=salmon_k, libtype=libtype,
130 | salmon_smem_opts=salmon_smem_opts, salmon=salmon,
131 | start=start, sample= sample, nthreads=nthreads, unzip=unzip,
132 | workdir=workdir, outdir=outdir, timeout=timeout)
133 | except Exception as excp:
134 | logger.info("Salmon-SMEM failed!")
135 | logger.error(excp)
136 | if not ignore_exceptions:
137 | raise Exception(excp)
138 | return quant
--------------------------------------------------------------------------------
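A minimal sketch of the quantification entry point above (hypothetical paths; the libtype string follows Salmon's library-type notation, e.g. "IU" for unstranded paired-end reads):

    from run_quantify import run_quantify

    quant = run_quantify(
        quantifier="Salmon-SMEM",
        quantifier_idx="salmon_fmd_index/",  # prebuilt Salmon FMD index
        seq_1="A1_1.fq.gz", seq_2="A1_2.fq.gz",
        libtype="IU",
        unzip=True,                          # stream gzipped FASTQs via gunzip
        sample="A1", nthreads=8,
        workdir="work", outdir="out",
    )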
/src/run_reconstruct.py:
--------------------------------------------------------------------------------
1 | import os
2 | from external_cmd import TimedExternalCmd
3 | from defaults import *
4 | from utils import *
5 |
6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
7 | logFormatter = logging.Formatter(FORMAT)
8 | logger = logging.getLogger(__name__)
9 | consoleHandler = logging.StreamHandler()
10 | consoleHandler.setFormatter(logFormatter)
11 | logger.addHandler(consoleHandler)
12 |
13 | def run_stringtie(alignment_bam="",ref_gtf="",
14 | stringtie_opts="", stringtie=STRINGTIE,
15 | start=0, sample= "", nthreads=1,
16 | workdir=None, outdir=None, timeout=TIMEOUT):
17 |
18 | logger.info("Running transcriptome reconstruction (StringTie) for %s"%sample)
19 | if not os.path.exists(alignment_bam):
20 | logger.error("Aborting!")
21 | raise Exception("No input alignment BAM file %s"%alignment_bam)
22 |
23 | work_stringtie="%s/stringtie/%s/"%(workdir,sample)
24 | create_dirs([work_stringtie])
25 | step=0
26 | if start<=step:
27 | logger.info("--------------------------STEP %s--------------------------"%step)
28 | msg = "Erase StringTie work directory for %s"%sample
29 | command="rm -rf %s/*" % (
30 | work_stringtie)
31 | command="bash -c \"%s\""%command
32 | cmd = TimedExternalCmd(command, logger, raise_exception=False)
33 | retcode = cmd.run(msg=msg,timeout=timeout)
34 | step+=1
35 | stringtie_log = os.path.join(work_stringtie, "stringtie.log")
36 | stringtie_log_fd = open(stringtie_log, "w")
37 |
38 |     if ref_gtf:
39 |         if not os.path.exists(ref_gtf):
40 |             logger.error("Aborting!")
41 |             raise Exception("No reference GTF file %s"%ref_gtf)
42 |         # GTF exists; use it to guide the StringTie assembly.
43 |         stringtie_opts += " -G %s"%ref_gtf
44 | 
45 | if "-p " not in stringtie_opts:
46 | stringtie_opts += " -p %d"%nthreads
47 |
48 | msg = "StringTie for %s"%sample
49 | if start<=step:
50 | logger.info("--------------------------STEP %s--------------------------"%step)
51 | command="%s %s %s -o %s/transcripts.gtf -A %s/gene_abund.tab -v" % (
52 | stringtie, alignment_bam, stringtie_opts, work_stringtie, work_stringtie)
53 | command="bash -c \"%s\""%command
54 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
55 | retcode = cmd.run(cmd_log_fd_out=stringtie_log_fd, cmd_log=stringtie_log, msg=msg, timeout=timeout)
56 | else:
57 | logger.info("Skipping step %d: %s"%(step,msg))
58 | step+=1
59 |
60 | out_stringtie=os.path.join(outdir,"stringtie",sample)
61 | create_dirs([out_stringtie])
62 | msg="Copy predictions to output directory for %s."%sample
63 | if start<=step:
64 | logger.info("--------------------------STEP %s--------------------------"%step)
65 | if os.path.exists("%s/transcripts.gtf"%work_stringtie) and \
66 | os.path.exists("%s/gene_abund.tab"%work_stringtie):
67 | command = "cp %s/transcripts.gtf %s/transcripts.gtf"%(
68 | work_stringtie, out_stringtie)
69 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
70 | retcode = cmd.run(cmd_log_fd_out=stringtie_log_fd, cmd_log=stringtie_log, msg=msg, timeout=timeout)
71 |
72 | command = "cp %s/gene_abund.tab %s/gene_abund.tab"%(
73 | work_stringtie, out_stringtie)
74 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
75 | retcode = cmd.run(cmd_log_fd_out=stringtie_log_fd, cmd_log=stringtie_log, msg=msg, timeout=timeout)
76 | else:
77 | logger.info("Skipping step %d: %s"%(step,msg))
78 | step+=1
79 |
80 |
81 | transcripts = ""
82 | abundances = ""
83 | if os.path.exists("%s/transcripts.gtf"%out_stringtie) and \
84 | os.path.exists("%s/gene_abund.tab"%out_stringtie):
85 |         logger.info("StringTie was successful!")
86 | logger.info("Output isoforms: %s/transcripts.gtf"%out_stringtie)
87 | logger.info("Output expressions: %s/gene_abund.tab"%out_stringtie)
88 | transcripts = "%s/transcripts.gtf"%out_stringtie
89 | abundances = "%s/gene_abund.tab"%out_stringtie
90 | else:
91 | logger.info("StringTie failed!")
92 | return transcripts,abundances
93 |
94 | def run_reconstruct(reconstructor="StringTie", alignment_bam="",
95 | ref_gtf="",
96 | stringtie_opts="", stringtie=STRINGTIE,
97 | start=0, sample= "", nthreads=1,
98 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False):
99 | transcripts = ""
100 | abundances = ""
101 | if reconstructor.upper()=="STRINGTIE":
102 | try:
103 | transcripts,abundances=run_stringtie(alignment_bam=alignment_bam,
104 | ref_gtf=ref_gtf,
105 | stringtie_opts=stringtie_opts, stringtie=stringtie,
106 | start=start, sample= sample, nthreads=nthreads,
107 | workdir=workdir, outdir=outdir, timeout=timeout)
108 | except Exception as excp:
109 | logger.info("StringTie failed!")
110 | logger.error(excp)
111 | if not ignore_exceptions:
112 | raise Exception(excp)
113 | return transcripts,abundances
--------------------------------------------------------------------------------
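
Note: run_reconstruct above can be driven directly from Python; a sketch with placeholder paths (the BAM path mirrors the work/hisat2/<sample>/ layout produced by run_sr_align).

    from run_reconstruct import run_reconstruct

    # Placeholder paths; annotation.gtf is optional and enables -G guidance.
    transcripts, abundances = run_reconstruct(
        reconstructor="StringTie",
        alignment_bam="work/hisat2/A/alignments.sorted.bam",
        ref_gtf="annotation.gtf",
        sample="A", nthreads=4,
        workdir="work", outdir="out",
    )
    print(transcripts)  # out/stringtie/A/transcripts.gtf on success
    print(abundances)   # out/stringtie/A/gene_abund.tab on success
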
/src/run_sr_align.py:
--------------------------------------------------------------------------------
1 | import os
2 | from external_cmd import TimedExternalCmd
3 | from defaults import *
4 | from utils import *
5 |
6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
7 | logFormatter = logging.Formatter(FORMAT)
8 | logger = logging.getLogger(__name__)
9 | consoleHandler = logging.StreamHandler()
10 | consoleHandler.setFormatter(logFormatter)
11 | logger.addHandler(consoleHandler)
12 |
13 |
14 | def run_hisat2(align_idx=None,
15 | seq_1="", seq_2="", seq_u="",
16 | seq_sra="", ref_gtf="",
17 | hisat2_opts="", hisat2=HISAT2, hisat2_sps=HISAT2_SPS,
18 | samtools=SAMTOOLS,
19 | start=0, sample= "", nthreads=1,
20 | workdir=None, outdir=None, timeout=TIMEOUT):
21 |
22 | logger.info("Running alignment (HISAT2) for %s"%sample)
23 | if not os.path.exists(align_idx+".1.ht2"):
24 | logger.error("Aborting!")
25 | raise Exception("No HISAT index file %s.1.ht2"%align_idx)
26 |
27 | if seq_1 and seq_2:
28 | for s1 in seq_1.split(","):
29 | if not os.path.exists(s1):
30 | logger.error("Aborting!")
31 | raise Exception("No Mate 1 sequence file %s"%s1)
32 | for s2 in seq_2.split(","):
33 | if not os.path.exists(s2):
34 | logger.error("Aborting!")
35 | raise Exception("No Mate 2 sequence file %s"%s2)
36 | seq_argument="-1 %s -2 %s"%(seq_1,seq_2)
37 | elif seq_u:
38 | seq_argument="-U %s"%(seq_u)
39 | for su in seq_u.split(","):
40 | if not os.path.exists(su):
41 | logger.error("Aborting!")
42 | raise Exception("No unpaired sequence file %s"%su)
43 |
44 |     elif seq_sra:
45 |         # --sra-acc takes SRA accession IDs that HISAT2 fetches itself;
46 |         # accessions are not local files, so no existence check applies.
47 |         seq_argument="--sra-acc %s"%(seq_sra)
48 |         for sr in seq_sra.split(","):
49 |             logger.info("Using SRA accession %s"%sr)
50 |     else:
51 |         raise Exception("No input sequences given (seq_1/seq_2, seq_u, or seq_sra)")
52 | work_hisat2=os.path.join(workdir,"hisat2",sample)
53 | create_dirs([work_hisat2])
54 |
55 | step=0
56 | if start<=step:
57 | logger.info("--------------------------STEP %s--------------------------"%step)
58 | msg = "Erase HISAT2 work directory for %s"%sample
59 | command="rm -rf %s/*" % (
60 | work_hisat2)
61 | command="bash -c \"%s\""%command
62 | cmd = TimedExternalCmd(command, logger, raise_exception=False)
63 | retcode = cmd.run(msg=msg,timeout=timeout)
64 | step+=1
65 |
66 | hisat2_log = os.path.join(work_hisat2, "hisat2.log")
67 | hisat2_log_fd = open(hisat2_log, "w")
68 |
69 | ksps = ""
70 | msg = "Prepare known-splicesites for %s"%sample
71 | if start<=step:
72 | logger.info("--------------------------STEP %s--------------------------"%step)
73 | if ref_gtf:
74 | if not os.path.exists(ref_gtf):
75 | logger.error("Aborting!")
76 | raise Exception("No reference GTF file %s"%ref_gtf)
77 | else:
78 | ksps = ref_gtf.strip() + "known-splicesite.txt"
79 | if os.path.exists(ksps):
80 | logger.info("Will use the precomputed %s as --known-splicesite-infile for HISAT2"%ksps)
81 | else:
82 | msg="compute --known-splicesite-infile for HISAT2"
83 | ksps = os.path.join(work_hisat2, "known-splicesite.txt")
84 | ksps_fd = open(ksps, "w")
85 |
86 | command="%s %s" % (hisat2_sps,ref_gtf)
87 | command="bash -c \"%s\""%command
88 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
89 | retcode = cmd.run(cmd_log_fd_out=ksps_fd, msg=msg, timeout=timeout)
90 | else:
91 | logger.info("Skipping step %d: %s"%(step,msg))
92 | step+=1
93 |
94 |
95 |
96 | if "--dta " not in hisat2_opts:
97 | hisat2_opts += " --dta"
98 | if "--rg-id " not in hisat2_opts:
99 | hisat2_opts += " --rg-id hisat2"
100 | if "--rg " not in hisat2_opts:
101 | hisat2_opts += " --rg SM:%s"%sample
102 | if "--threads " not in hisat2_opts:
103 | hisat2_opts += " --threads %d"%nthreads
104 | if ksps:
105 | hisat2_opts += " --known-splicesite-infile %s"%ksps
106 |
107 | msg = "HISAT2 for %s"%sample
108 | if start<=step:
109 | logger.info("--------------------------STEP %s--------------------------"%step)
110 | command="%s %s -x %s %s -S %s/alignments.sam --novel-splicesite-outfile %s/splicesites.tab" % (
111 | hisat2, hisat2_opts, align_idx, seq_argument,work_hisat2, work_hisat2 )
112 | command="bash -c \"%s\""%command
113 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
114 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout)
115 | else:
116 | logger.info("Skipping step %d: %s"%(step,msg))
117 | step+=1
118 |
119 |     msg = "Converting SAM to BAM for %s"%sample
120 | if start<=step:
121 | logger.info("--------------------------STEP %s--------------------------"%step)
122 | command="%s view -Su %s/alignments.sam -@ %d -o %s/alignments.bam" % (
123 | samtools, work_hisat2, nthreads, work_hisat2)
124 | command="bash -c \"%s\""%command
125 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
126 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout)
127 | else:
128 | logger.info("Skipping step %d: %s"%(step,msg))
129 | step+=1
130 |
131 |     msg = "Sorting BAM for %s"%sample
132 | if start<=step:
133 | logger.info("--------------------------STEP %s--------------------------"%step)
134 | command="%s sort -@ %d -T %s/alignments.sorted -o %s/alignments.sorted.bam %s/alignments.bam " % (
135 | samtools, nthreads, work_hisat2, work_hisat2, work_hisat2)
136 | command="bash -c \"%s\""%command
137 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
138 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout)
139 | else:
140 | logger.info("Skipping step %d: %s"%(step,msg))
141 | step+=1
142 |
143 |
144 |
145 | msg = "Converting junctions to BED for %s"%sample
146 | if start<=step:
147 | logger.info("--------------------------STEP %s--------------------------"%step)
148 | command="hisat2_jun2bed.py %s/splicesites.tab %s/splicesites.bed " % (
149 | work_hisat2, work_hisat2)
150 | command="bash -c \"%s\""%command
151 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
152 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout)
153 | else:
154 | logger.info("Skipping step %d: %s"%(step,msg))
155 | step+=1
156 |
157 | msg = "Clean temp alignment files for %s"%sample
158 | if start<=step:
159 | logger.info("--------------------------STEP %s--------------------------"%step)
160 | command="rm %s/alignments.sam %s/alignments.bam" % (work_hisat2, work_hisat2)
161 | command="bash -c \"%s\""%command
162 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
163 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout)
164 | else:
165 | logger.info("Skipping step %d: %s"%(step,msg))
166 | step+=1
167 |
168 |
169 | out_hisat2=os.path.join(outdir,"hisat2",sample)
170 | create_dirs([out_hisat2])
171 | msg="Copy predictions to output directory for %s."%sample
172 | if start<=step:
173 | logger.info("--------------------------STEP %s--------------------------"%step)
174 | if os.path.exists("%s/alignments.sorted.bam"%work_hisat2) and \
175 | os.path.exists("%s/splicesites.tab"%work_hisat2) and \
176 | os.path.exists("%s/splicesites.bed"%work_hisat2):
177 | command = "cp %s/alignments.sorted.bam %s/alignments.sorted.bam"%(
178 | work_hisat2, out_hisat2)
179 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
180 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout)
181 | command = "cp %s/splicesites.tab %s/splicesites.tab"%(
182 | work_hisat2, out_hisat2)
183 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
184 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout)
185 | command = "cp %s/splicesites.bed %s/splicesites.bed"%(
186 | work_hisat2, out_hisat2)
187 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
188 | retcode = cmd.run(cmd_log_fd_out=hisat2_log_fd, cmd_log=hisat2_log, msg=msg, timeout=timeout)
189 | else:
190 | logger.info("Skipping step %d: %s"%(step,msg))
191 | step+=1
192 |
193 |
194 |
195 | alignments_bam = ""
196 | junctions_tab = ""
197 | junctions_bed = ""
198 | if os.path.exists("%s/alignments.sorted.bam"%out_hisat2):
199 |         logger.info("HISAT2 was successful!")
200 | logger.info("Output alignment: %s/alignments.sorted.bam"%out_hisat2)
201 | logger.info("Output junction tab: %s/splicesites.tab"%out_hisat2)
202 | logger.info("Output junction bed: %s/splicesites.bed"%out_hisat2)
203 | alignments_bam = "%s/alignments.sorted.bam"%out_hisat2
204 | junctions_tab = "%s/splicesites.tab"%out_hisat2
205 | junctions_bed = "%s/splicesites.bed"%out_hisat2
206 | else:
207 | logger.info("HISAT2 failed!")
208 | return alignments_bam,junctions_tab,junctions_bed
209 |
210 | def run_sr_align(sr_aligner="HISAT2", align_idx=None,
211 | seq_1="", seq_2="", seq_u="",
212 | seq_sra="", ref_gtf="",
213 | hisat2_opts="", hisat2=HISAT2, hisat2_sps=HISAT2_SPS,
214 | samtools=SAMTOOLS,
215 | start=0, sample= "", nthreads=1,
216 | workdir=None, outdir=None, timeout=TIMEOUT,ignore_exceptions=False):
217 | alignments_bam = ""
218 | junctions_tab = ""
219 | junctions_bed = ""
220 | if sr_aligner.upper()=="HISAT2":
221 |         try:
222 | alignments_bam, junctions_tab, junctions_bed=run_hisat2(align_idx=align_idx,
223 | seq_1=seq_1, seq_2=seq_2, seq_u=seq_u,
224 | seq_sra=seq_sra, ref_gtf=ref_gtf,
225 | hisat2_opts=hisat2_opts, hisat2=hisat2, hisat2_sps=hisat2_sps,
226 | samtools=samtools,
227 | start=start, sample= sample, nthreads=nthreads,
228 | workdir=workdir, outdir=outdir, timeout=timeout)
229 | except Exception as excp:
230 | logger.info("HISAT2 failed!")
231 | logger.error(excp)
232 | if not ignore_exceptions:
233 | raise Exception(excp)
234 |
235 | return alignments_bam, junctions_tab, junctions_bed
--------------------------------------------------------------------------------
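
Note: every stage in run_hisat2 is gated by the shared resume idiom (run only when start <= step, otherwise log a skip), so a run that died after the alignment itself can be resumed without redoing it. A sketch of such a resumed call, with placeholder inputs mirroring test/test_run.sh; start=3 skips the erase, splice-site, and HISAT2 stages and resumes at the SAM-to-BAM conversion.

    from run_sr_align import run_sr_align

    # Placeholder index/FASTQ/GTF paths; start=3 assumes steps 0-2 already ran
    # in this workdir, so alignments.sam is still present there.
    bam, junc_tab, junc_bed = run_sr_align(
        sr_aligner="HISAT2",
        align_idx="Homo_sapiens.GRCh37.75.dna.chromosome.21.HISAT2",
        seq_1="A1_1.fq.gz", seq_2="A1_2.fq.gz",
        ref_gtf="Homo_sapiens.GRCh37.75.chromosome.21.gtf",
        start=3,
        sample="A", nthreads=4,
        workdir="work", outdir="out",
    )
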
/src/run_variant.py:
--------------------------------------------------------------------------------
1 | import os
2 | from external_cmd import TimedExternalCmd
3 | from defaults import *
4 | from utils import *
5 |
6 | FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
7 | logFormatter = logging.Formatter(FORMAT)
8 | logger = logging.getLogger(__name__)
9 | consoleHandler = logging.StreamHandler()
10 | consoleHandler.setFormatter(logFormatter)
11 | logger.addHandler(consoleHandler)
12 |
13 | def run_gatk(alignment="", ref_genome="", knownsites="",
14 | picard=PICARD, gatk=GATK,
15 | java=JAVA, java_opts="",
16 | CleanSam=False, no_BaseRecalibrator=False ,
17 | AddOrReplaceReadGroups_opts="", MarkDuplicates_opts="",
18 | SplitNCigarReads_opts="",
19 | BaseRecalibrator_opts="",
20 | ApplyBQSR_opts="", HaplotypeCaller_opts="",
21 | VariantFiltration_opts="",
22 | start=0, sample= "", nthreads=1,
23 | workdir=None, outdir=None, timeout=TIMEOUT):
24 |
25 | logger.info("Running variant calling (GATK) for %s"%sample)
26 | if not os.path.exists(alignment):
27 | logger.error("Aborting!")
28 | raise Exception("No alignment file %s"%alignment)
29 | if not os.path.exists(ref_genome):
30 | logger.error("Aborting!")
31 | raise Exception("No reference genome FASTA file %s"%ref_genome)
32 |
33 |
34 | work_gatk=os.path.join(workdir,"gatk",sample)
35 | create_dirs([work_gatk])
36 |
37 | step=0
38 | if start<=step:
39 | logger.info("--------------------------STEP %s--------------------------"%step)
40 | msg = "Erase GATK work directory for %s"%sample
41 | command="rm -rf %s/*" % (
42 | work_gatk)
43 | command="bash -c \"%s\""%command
44 | cmd = TimedExternalCmd(command, logger, raise_exception=False)
45 | retcode = cmd.run(msg=msg,timeout=timeout)
46 | step+=1
47 |
48 | gatk_log = os.path.join(work_gatk, "gatk.log")
49 | gatk_log_fd = open(gatk_log, "w")
50 |
51 |
52 | if "SO=" not in AddOrReplaceReadGroups_opts:
53 | AddOrReplaceReadGroups_opts += " SO=coordinate"
54 | if "RGLB=" not in AddOrReplaceReadGroups_opts:
55 | AddOrReplaceReadGroups_opts += " RGLB=lib1"
56 | if "RGPL=" not in AddOrReplaceReadGroups_opts:
57 | AddOrReplaceReadGroups_opts += " RGPL=illumina"
58 | if "RGPU=" not in AddOrReplaceReadGroups_opts:
59 | AddOrReplaceReadGroups_opts += " RGPU=unit1"
60 | if "RGSM=" not in AddOrReplaceReadGroups_opts:
61 | AddOrReplaceReadGroups_opts += " RGSM=%s"%sample
62 |
63 | if "CREATE_INDEX=" not in MarkDuplicates_opts:
64 | MarkDuplicates_opts += " CREATE_INDEX=true"
65 | if "VALIDATION_STRINGENCY=" not in MarkDuplicates_opts:
66 | MarkDuplicates_opts += " VALIDATION_STRINGENCY=SILENT"
67 |
68 | if knownsites:
69 | if not os.path.exists(knownsites):
70 | logger.error("Aborting!")
71 | raise Exception("No VCF knownsites file %s"%knownsites)
72 | if "--known-sites " not in BaseRecalibrator_opts:
73 | BaseRecalibrator_opts += " --known-sites %s"%knownsites
74 |
75 |
76 |
77 | if "--dont-use-soft-clipped-bases " not in HaplotypeCaller_opts:
78 | HaplotypeCaller_opts += " --dont-use-soft-clipped-bases"
79 | if "-stand-call-conf " not in HaplotypeCaller_opts:
80 | HaplotypeCaller_opts += " -stand-call-conf %f"%GATK_HC_STANDCALLCONF
81 |
82 | if "-window " not in VariantFiltration_opts:
83 | VariantFiltration_opts += " -window %d"%GATK_VF_WINDOW
84 | if "-cluster " not in VariantFiltration_opts:
85 | VariantFiltration_opts += " -cluster %d"%GATK_VF_CLUSTER
86 | if "--filter-name FS " not in VariantFiltration_opts:
87 | VariantFiltration_opts += " --filter-name FS -filter 'FS > %f'"%GATK_VF_FSMIN
88 | if "--filter-name QD " not in VariantFiltration_opts:
89 | VariantFiltration_opts += " --filter-name QD -filter 'QD < %f'"%GATK_VF_QDMAX
90 |
91 | if "-Xms" not in java_opts:
92 | java_opts += " %s"%JAVA_XMS
93 | if "-Xmx" not in java_opts:
94 | java_opts += " %s"%JAVA_XMG
95 | if "-Djava.io.tmpdir" not in java_opts:
96 | java_opts += " -Djava.io.tmpdir=%s/javatmp/"%(work_gatk)
97 | create_dirs(["%s/javatmp/"%(work_gatk)])
98 |
99 | msg = "picard CleanSam for %s"%sample
100 | if start<=step:
101 | logger.info("--------------------------STEP %s--------------------------"%step)
102 | if CleanSam:
103 | command="%s %s -cp %s picard.cmdline.PicardCommandLine CleanSam I=%s O=%s/alignments_clean.bam" % (
104 | java, java_opts, picard, alignment,work_gatk )
105 | command="bash -c \"%s\""%command
106 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
107 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout)
108 | alignment="%s/alignments_clean.bam"%work_gatk
109 | else:
110 | logger.info("Skipping step %d: %s"%(step,msg))
111 | step+=1
112 |
113 |
114 | msg = "picard AddOrReplaceReadGroups for %s"%sample
115 | if start<=step:
116 | logger.info("--------------------------STEP %s--------------------------"%step)
117 | command="%s %s -cp %s picard.cmdline.PicardCommandLine AddOrReplaceReadGroups I=%s O=%s/rg_added_sorted.bam %s" % (
118 | java, java_opts, picard, alignment,work_gatk,AddOrReplaceReadGroups_opts)
119 | command="bash -c \"%s\""%command
120 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
121 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout)
122 | else:
123 | logger.info("Skipping step %d: %s"%(step,msg))
124 | step+=1
125 |
126 |
127 | msg = "picard MarkDuplicates for %s"%sample
128 | if start<=step:
129 | logger.info("--------------------------STEP %s--------------------------"%step)
130 | command="%s %s -cp %s picard.cmdline.PicardCommandLine MarkDuplicates I=%s/rg_added_sorted.bam O=%s/dedupped.bam %s M=%s/output.metrics" % (
131 | java, java_opts, picard, work_gatk,work_gatk,MarkDuplicates_opts,work_gatk)
132 | command="bash -c \"%s\""%command
133 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
134 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout)
135 | else:
136 | logger.info("Skipping step %d: %s"%(step,msg))
137 | step+=1
138 |
139 |
140 | msg = "GATK SplitNCigarReads for %s"%sample
141 | if start<=step:
142 | logger.info("--------------------------STEP %s--------------------------"%step)
143 | command="%s %s -jar %s SplitNCigarReads -R %s -I %s/dedupped.bam -O %s/split.bam %s" % (
144 | java, java_opts, gatk, ref_genome,work_gatk,work_gatk,SplitNCigarReads_opts)
145 | command="bash -c \"%s\""%command
146 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
147 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout)
148 | else:
149 | logger.info("Skipping step %d: %s"%(step,msg))
150 | step+=1
151 |
152 | split_bam="%s/split.bam"%work_gatk
153 |
154 | if not no_BaseRecalibrator:
155 | msg = "GATK BaseRecalibrator for %s"%sample
156 | if start<=step:
157 | logger.info("--------------------------STEP %s--------------------------"%step)
158 | command="%s %s -jar %s BaseRecalibrator -R %s -I %s -O %s/recal_data.table %s" % (
159 | java, java_opts, gatk, ref_genome,split_bam,work_gatk,BaseRecalibrator_opts)
160 | command="bash -c \"%s\""%command
161 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
162 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout)
163 | else:
164 | logger.info("Skipping step %d: %s"%(step,msg))
165 | step+=1
166 |
167 | msg = "GATK ApplyBQSR for %s"%sample
168 | if start<=step:
169 | logger.info("--------------------------STEP %s--------------------------"%step)
170 |             command="%s %s -jar %s ApplyBQSR -R %s -I %s -bqsr %s/recal_data.table -O %s/bqsr.bam %s" % (
171 | java, java_opts, gatk, ref_genome,split_bam,work_gatk,work_gatk,ApplyBQSR_opts)
172 | command="bash -c \"%s\""%command
173 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
174 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout)
175 | else:
176 | logger.info("Skipping step %d: %s"%(step,msg))
177 | step+=1
178 |         split_bam="%s/bqsr.bam"%work_gatk
179 | else:
180 | msg = "GATK BaseRecalibrator for %s"%sample
181 | logger.info("Skipping step %d: %s"%(step,msg))
182 | step+=1
183 | msg = "GATK ApplyBQSR for %s"%sample
184 | logger.info("Skipping step %d: %s"%(step,msg))
185 | step+=1
186 |
187 | msg = "GATK HaplotypeCaller for %s"%sample
188 | if start<=step:
189 | logger.info("--------------------------STEP %s--------------------------"%step)
190 | command="%s %s -jar %s HaplotypeCaller -R %s -I %s -O %s/variants.vcf %s" % (
191 | java, java_opts, gatk, ref_genome,split_bam,work_gatk,HaplotypeCaller_opts)
192 | command="bash -c \"%s\""%command
193 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
194 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout)
195 | else:
196 | logger.info("Skipping step %d: %s"%(step,msg))
197 | step+=1
198 |
199 | msg = "GATK VariantFiltration for %s"%sample
200 | if start<=step:
201 | logger.info("--------------------------STEP %s--------------------------"%step)
202 | command="%s %s -jar %s VariantFiltration -R %s -V %s/variants.vcf -O %s/variants_filtered.vcf %s" % (
203 | java, java_opts, gatk, ref_genome,work_gatk,work_gatk,VariantFiltration_opts)
204 | command="bash -c \"%s\""%command
205 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
206 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout)
207 | else:
208 | logger.info("Skipping step %d: %s"%(step,msg))
209 | step+=1
210 |
211 |
212 | out_gatk=os.path.join(outdir,"gatk",sample)
213 | create_dirs([out_gatk])
214 | msg="Copy predictions to output directory for %s."%sample
215 | if start<=step:
216 | logger.info("--------------------------STEP %s--------------------------"%step)
217 | if os.path.exists("%s/variants_filtered.vcf"%work_gatk):
218 | command = "cp %s/variants_filtered.vcf %s/variants_filtered.vcf"%(
219 | work_gatk, out_gatk)
220 | cmd = TimedExternalCmd(command, logger, raise_exception=True)
221 | retcode = cmd.run(cmd_log_fd_out=gatk_log_fd, cmd_log=gatk_log, msg=msg, timeout=timeout)
222 | else:
223 | logger.info("Skipping step %d: %s"%(step,msg))
224 | step+=1
225 |
226 | variants = ""
227 | if os.path.exists("%s/variants_filtered.vcf"%out_gatk):
228 |         logger.info("GATK was successful!")
229 | logger.info("Output variants: %s/variants_filtered.vcf"%out_gatk)
230 | variants = "%s/variants_filtered.vcf"%out_gatk
231 | else:
232 | logger.info("GATK failed!")
233 | return variants
234 |
235 | def run_variant(variant_caller="GATK", alignment="",
236 | ref_genome="", knownsites="",
237 | picard=PICARD, gatk=GATK,
238 | java=JAVA, java_opts="",
239 | CleanSam=False, no_BaseRecalibrator=False,
240 | AddOrReplaceReadGroups_opts="", MarkDuplicates_opts="",
241 | SplitNCigarReads_opts="",
242 | BaseRecalibrator_opts="",
243 | ApplyBQSR_opts="", HaplotypeCaller_opts="",
244 | VariantFiltration_opts="",
245 | start=0, sample= "", nthreads=1,
246 | workdir=None, outdir=None, timeout=TIMEOUT, ignore_exceptions=False):
247 | variants=""
248 | if variant_caller.upper()=="GATK":
249 | try:
250 | variants=run_gatk(alignment=alignment,
251 | ref_genome=ref_genome, knownsites=knownsites,
252 | picard=picard, gatk=gatk,
253 | java=java, java_opts=java_opts,
254 | CleanSam=CleanSam,
255 | no_BaseRecalibrator=no_BaseRecalibrator,
256 | AddOrReplaceReadGroups_opts=AddOrReplaceReadGroups_opts,
257 | MarkDuplicates_opts=MarkDuplicates_opts,
258 | SplitNCigarReads_opts=SplitNCigarReads_opts,
259 | BaseRecalibrator_opts=BaseRecalibrator_opts,
260 | ApplyBQSR_opts=ApplyBQSR_opts, HaplotypeCaller_opts=HaplotypeCaller_opts,
261 | VariantFiltration_opts=VariantFiltration_opts,
262 | start=start, sample= sample, nthreads=nthreads,
263 | workdir=workdir, outdir=outdir, timeout=timeout)
264 | except Exception as excp:
265 | logger.info("GATK failed!")
266 | logger.error(excp)
267 | if not ignore_exceptions:
268 | raise Exception(excp)
269 | return variants
270 |
271 |
272 |
273 |
--------------------------------------------------------------------------------
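
Note: run_gatk injects defaults into each tool's option string only when the caller has not supplied them (substring checks such as "-window " not in VariantFiltration_opts), then shells out one Picard/GATK stage at a time. A sketch of an end-to-end call through the run_variant wrapper; all paths are placeholders, and the jar locations come from defaults.py (PICARD, GATK).

    from run_variant import run_variant

    # Placeholder paths; knownsites is optional and, when given, is passed
    # to BaseRecalibrator as --known-sites.
    vcf = run_variant(
        variant_caller="GATK",
        alignment="out/hisat2/A/alignments.sorted.bam",
        ref_genome="GRCh37.fa",
        knownsites="dbsnp.vcf",
        CleanSam=True,              # run picard CleanSam first
        no_BaseRecalibrator=False,  # keep the BQSR stages
        sample="A", workdir="work", outdir="out",
    )
    print(vcf)  # out/gatk/A/variants_filtered.vcf on success
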
/src/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 |
5 | logger = logging.getLogger(__name__)
6 |
7 | def create_dirs(dirlist):
8 | for dirname in dirlist:
9 | if not os.path.isdir(dirname):
10 | logger.info("Creating directory %s" % (dirname))
11 | os.makedirs(dirname)
12 |
13 |
--------------------------------------------------------------------------------
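
Note: create_dirs uses a check-then-create pattern that can race if two stages create the same directory concurrently; a race-free sketch under Python 3 (os.makedirs gained exist_ok in 3.2, so it is not available on the Python 2 interpreter this codebase otherwise targets).

    import logging
    import os

    logger = logging.getLogger(__name__)

    def create_dirs_atomic(dirlist):
        # exist_ok=True tolerates a directory that appears between the
        # check and the create, unlike the isdir() guard above.
        for dirname in dirlist:
            logger.info("Creating directory %s" % dirname)
            os.makedirs(dirname, exist_ok=True)
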
/test/A1_1.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/A1_1.fq.gz
--------------------------------------------------------------------------------
/test/A1_2.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/A1_2.fq.gz
--------------------------------------------------------------------------------
/test/A2_1.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/A2_1.fq.gz
--------------------------------------------------------------------------------
/test/A2_2.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/A2_2.fq.gz
--------------------------------------------------------------------------------
/test/B1_1.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/B1_1.fq.gz
--------------------------------------------------------------------------------
/test/B1_2.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/B1_2.fq.gz
--------------------------------------------------------------------------------
/test/B2_1.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/B2_1.fq.gz
--------------------------------------------------------------------------------
/test/B2_2.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/B2_2.fq.gz
--------------------------------------------------------------------------------
/test/C_long.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/C_long.fa.gz
--------------------------------------------------------------------------------
/test/C_short.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/C_short.fa.gz
--------------------------------------------------------------------------------
/test/C_short_1.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/C_short_1.fq.gz
--------------------------------------------------------------------------------
/test/C_short_2.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/C_short_2.fq.gz
--------------------------------------------------------------------------------
/test/GRCh37_genes_pos.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/GRCh37_genes_pos.bed.gz
--------------------------------------------------------------------------------
/test/GRCh37_strand_pos.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/GRCh37_strand_pos.bed.gz
--------------------------------------------------------------------------------
/test/GRCh38.21.gpd.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/GRCh38.21.gpd.gz
--------------------------------------------------------------------------------
/test/GRCh38_genes_pos.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/GRCh38_genes_pos.bed.gz
--------------------------------------------------------------------------------
/test/GRCh38_strand_pos.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/GRCh38_strand_pos.bed.gz
--------------------------------------------------------------------------------
/test/hg19.known.21.gpd.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bioinform/rnacocktail/9a4ddee62dcfbcf3c1dfd6c3dfffd4b66e1f76e1/test/hg19.known.21.gpd.gz
--------------------------------------------------------------------------------
/test/test_run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e  # abort on the first failing command
3 | mkdir -p example_small
4 | cd example_small
5 |
6 | echo "Download reference genome (chromosome 21) FASTA file"
7 | wget ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.21.fa.gz
8 |
9 | echo "Unzip reference genome (chromosome 21) FASTA file"
10 | gunzip Homo_sapiens.GRCh37.75.dna.chromosome.21.fa.gz
11 |
12 | echo "Download reference annotation GTF file"
13 | wget ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz
14 |
15 | echo "Unzip reference annotation GTF file"
16 | gunzip Homo_sapiens.GRCh37.75.gtf.gz
17 |
18 | echo "Restrict GTF to chromosome 21"
19 | awk '$1 == 21' Homo_sapiens.GRCh37.75.gtf > Homo_sapiens.GRCh37.75.chromosome.21.gtf
20 |
21 |
22 | echo "Download HISAT2 binaries"
23 | wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-2.0.5-Linux_x86_64.zip
24 |
25 | echo "Unzip HISAT2 binaries"
26 | unzip hisat2-2.0.5-Linux_x86_64.zip
27 |
28 | echo "Index genome with HISAT2"
29 | ./hisat2-2.0.5/hisat2-build Homo_sapiens.GRCh37.75.dna.chromosome.21.fa Homo_sapiens.GRCh37.75.dna.chromosome.21.HISAT2
30 |
31 | echo "Test alignment step using HISAT2"
32 | run_rnacocktail.py align --align_idx Homo_sapiens.GRCh37.75.dna.chromosome.21.HISAT2 --outdir out --workdir work --ref_gtf Homo_sapiens.GRCh37.75.chromosome.21.gtf --1 ../A1_1.fq.gz --2 ../A1_2.fq.gz --hisat2 hisat2-2.0.5/hisat2 --hisat2_sps hisat2-2.0.5/hisat2_extract_splice_sites.py --samtools samtools --sample A
33 |
34 | echo "Download StringTie binaries"
35 | wget http://ccb.jhu.edu/software/stringtie/dl/stringtie-1.3.3.Linux_x86_64.tar.gz
36 |
37 | echo "Untar StringTie binaries"
38 | tar -xzvf stringtie-1.3.3.Linux_x86_64.tar.gz
39 |
40 | echo "Test reconstruction step using StringTie"
41 | run_rnacocktail.py reconstruct --alignment_bam work/hisat2/A/alignments.sorted.bam --outdir out --workdir work --ref_gtf Homo_sapiens.GRCh37.75.chromosome.21.gtf --stringtie stringtie-1.3.3.Linux_x86_64/stringtie --sample A
42 |
--------------------------------------------------------------------------------
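
Note: the two CLI steps in test_run.sh map directly onto the Python entry points; a sketch that drives the same smoke test through the API, assuming the reference, HISAT2 index, and tool binaries were prepared exactly as in the script above.

    from run_sr_align import run_sr_align
    from run_reconstruct import run_reconstruct

    gtf = "Homo_sapiens.GRCh37.75.chromosome.21.gtf"

    bam, _, _ = run_sr_align(
        align_idx="Homo_sapiens.GRCh37.75.dna.chromosome.21.HISAT2",
        seq_1="../A1_1.fq.gz", seq_2="../A1_2.fq.gz",
        ref_gtf=gtf,
        hisat2="hisat2-2.0.5/hisat2",
        hisat2_sps="hisat2-2.0.5/hisat2_extract_splice_sites.py",
        sample="A", workdir="work", outdir="out",
        ignore_exceptions=True,
    )
    if bam:  # empty string means the alignment stage failed
        run_reconstruct(
            alignment_bam=bam,
            ref_gtf=gtf,
            stringtie="stringtie-1.3.3.Linux_x86_64/stringtie",
            sample="A", workdir="work", outdir="out",
        )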