├── CGC
├── ORF1_list.txt
├── ORF2_list.txt
├── make_ORF1_and_intact_table.py
├── make_ORF1_and_intact_table_stranded.py
├── make_l1pa1to4table.py
├── make_l1pa1to4table_stranded.py
├── median_template_and_pairs.py
├── read_or_pair_overlap_bed_and_unmapped.py
├── report_l1_exp_counts.py
├── report_l1_exp_counts_unstranded.py
├── total_orf1_and_orf2.py
└── total_orf1_and_orf2_stranded.py
├── Dockerfile
├── L1EM.yml
├── L1EM
├── G_of_R.py
├── G_of_R_single_unstranded.py
├── G_of_R_unstranded.py
└── L1EM.py
├── LICENSE.txt
├── README.md
├── annotation
├── L1EM.400.bed
└── mm39.L1EM.bed
├── generate_L1EM_fasta_and_index.sh
├── generate_mm39_L1EM_fasta_and_index.sh
├── manual.md
├── parameters.sh
├── run_L1EM.sh
├── run_L1EM_fortcga.sh
├── run_L1EM_mm39.sh
├── run_L1EM_mm39_unstranded.sh
├── run_L1EM_unstranded.sh
├── run_L1EM_unstranded_fromdocker.sh
├── run_L1EM_withlessmemory.sh
└── utilities
├── L1EM_readpairs.py
├── filtered_and_normalized_active_l1md.py
├── filtered_and_normalized_active_l1md_unstranded.py
├── filtered_and_normalized_l1hs.py
├── filtered_and_normalized_l1hs_unstranded.py
├── median_template.py
├── read_or_pair_overlap_bed.py
├── report_l1_exp_counts.py
├── report_l1_exp_counts_clip.py
├── report_l1_exp_counts_unstranded.py
├── report_l1hs_transcription.py
└── report_l1hs_transcription_unstranded.py
/CGC/ORF1_list.txt:
--------------------------------------------------------------------------------
1 | L1HS.1.chrX:141421202-141427246
2 | L1HS.1.chr2:172315270-172321297
3 | L1HS.1.chr17:70458956-70464987
4 | L1HS.1.chr15:82882881-82888919
5 | L1HS.1.chr14:63116706-63122735
6 | L1HS.1.chr13:29641706-29647706
7 | L1HS.1.chr12:126299023-126305038
8 | L1PA3.1.chr12:13391606-13397632
9 | L1HS.1.chr11:95436216-95442246
10 | L1HS.1.chr10:98782941-98788971
11 | L1HS.1.chrX:11935296-11941314
12 | L1HS.1.chr7:111243515-111249546
13 | L1HS.1.chr7:96846650-96852680
14 | L1HS.1.chr7:66286853-66292884
15 | L1HS.1.chr7:49680245-49686300
16 | L1HS.1.chr6:24811657-24817706
17 | L1HS.1.chr5:109259387-109265418
18 | L1HS.1.chr5:104518587-104524616
19 | L1HS.1.chr4:136293494-136299546
20 | L1PA2.1.chr4:128213789-128219796
21 | L1HS.1.chr4:79966907-79972933
22 | L1HS.1.chr4:70328906-70334307
23 | L1HS.1.chr4:21159390-21165421
24 | L1HS.1.chr3:89460825-89466856
25 | L1PA2.1.chr3:81051389-81057413
26 | L1HS.1.chr1:237019467-237025494
27 | L1HS.1.chr1:180866811-180872843
28 | L1HS.1.chr1:84052389-84058406
29 | L1HS.1.chr1:104770247-104776278
30 | L1HS.1.chr6:86000000-86005073
31 | L1HS.1.chr22:28663283-28669315
32 | L1HS.1.chr11:78677772-78683802
33 | L1HS.1.chr10:19088601-19094618
34 | L1HS.1.chr9:90149604-90155634
35 | L1PA2.1.chr8:91558668-91564687
36 | L1HS.1.chr3:46783105-46789138
37 | L1HS.1.chr1:174590323-174596379
38 | L1HS.1.chr22:48985761-48991792
39 | L1HS.1.chr17:70544788-70550795
40 | L1HS.1.chrX:155516016-155522048
41 | L1HS.1.chrX:83059584-83065637
42 | L1HS.1.chr9:112798107-112804159
43 | L1HS.1.chr9:94113535-94119565
44 | L1HS.1.chr8:72875538-72881588
45 | L1HS.1.chr5:173402796-173408828
46 | L1HS.1.chr20:12801017-12807044
47 | L1HS.1.chr18:37819737-37825798
48 | L1HS.1.chr16:68583448-68589505
49 | L1HS.1.chr16:33952564-33958612
50 | L1HS.1.chrX:130517377-130523407
51 | L1HS.1.chrX:11707248-11713279
52 | L1HS.1.chr8:134070756-134076773
53 | L1HS.1.chr4:166569976-166576007
54 | L1HS.1.chr4:87347103-87353146
55 | L1HS.1.chr3:130628808-130634065
56 | L1HS.1.chr2:71411474-71417501
57 | L1HS.1.chr1:118852351-118858380
58 | L1HS.1.chr20:55859566-55865521
59 | L1HS.1.chr16:83637252-83643296
60 | L1HS.1.chr15:83450804-83456834
61 | L1HS.1.chr10:5245354-5251383
62 | L1HS.1.chr9:110791097-110797129
63 | L1HS.1.chr8:125582886-125588889
64 | L1HS.1.chr7:141920659-141926712
65 | L1HS.1.chr7:25041860-25047891
66 | L1HS.1.chr5:156061919-156067966
67 | L1HS.1.chr4:90675739-90681757
68 | L1HS.1.chr4:59078847-59084877
69 | L1HS.1.chr3:163236941-163242962
70 | L1HS.1.chr3:22050867-22053197
71 | L1HS.1.chr2:148188745-148194773
72 | L1HS.1.chr2:4733729-4739760
73 | L1HS.1.chr15:70729744-70735160
74 | L1HS.1.chr1:121532230-121538261
75 | L1HS.1.chr12:73283667-73289668
76 | L1HS.1.chr6:51874783-51880802
77 | L1HS.1.chr2:112503812-112509845
78 | L1HS.1.chr13:108510472-108516495
79 | L1HS.1.chr11:93136638-93142673
80 | L1HS.1.chr11:24327951-24334001
81 | L1HS.1.chr6:133020691-133026746
82 | L1HS.1.chr4:98592435-98598463
83 | L1HS.1.chr4:23614771-23620793
84 | L1HS.1.chr3:159095379-159101394
85 | L1HS.1.chr16:9584490-9590522
86 | L1HS.1.chr10:33510845-33516876
87 | L1HS.1.chrX:106469285-106475319
88 | L1HS.1.chr4:79704552-79710581
89 | L1HS.1.chr3:158019676-158025704
90 | L1PA2.1.chr3:63211708-63217714
91 | L1HS.1.chr2:166988454-166994509
92 | L1HS.1.chr13:31302314-31308370
93 | L1HS.1.chr12:74874868-74880901
94 | L1HS.1.chr7:30439242-30445274
95 | L1HS.1.chr6:72988654-72994686
96 | L1HS.1.chr4:166755895-166761908
97 | L1HS.1.chr4:79937715-79943746
98 | L1HS.1.chr2:102566355-102572385
99 | L1PA2.1.chr18:59403939-59409970
100 | L1HS.1.chr12:3500000-3505228
101 | L1HS.1.chr11:93420986-93427031
102 | L1HS.1.chr11:90400067-90406098
103 | L1HS.1.chr11:36551606-36557636
104 | L1HS.1.chr8:128453002-128459020
105 | L1HS.1.chr5:166966760-166972815
106 | L1HS.1.chr5:146609485-146615534
107 | L1HS.1.chr3:109199872-109205903
108 | L1PA3.1.chrX:64252345-64258375
109 | L1HS.1.chr6:2417774-2423803
110 | L1HS.1.chr5:102189483-102194435
111 | L1PA2.1.chr5:39787652-39793671
112 | L1HS.1.chr4:169515501-169521532
113 | L1HS.1.chr4:78105735-78111765
114 | L1PA2.1.chr4:55619153-55625181
115 | L1HS.1.chr3:136479056-136485103
116 | L1HS.1.chr3:116359999-116366026
117 | L1PA2.1.chr2:106130892-106136925
118 | L1PA2.1.chr1:71888203-71894235
119 | L1HS.1.chr15:87509891-87515920
120 | L1HS.1.chr10:109812437-109818457
121 | L1HS.1.chr10:105775520-105781551
122 | L1PA2.1.chrX:42888370-42894396
123 | L1HS.1.chr7:93787624-93793679
124 | L1HS.1.chr5:13416497-13422525
125 | L1HS.1.chr2:11000000-11002136
126 | L1PA2.1.chr10:78088450-78094479
127 | L1HS.1.chrX:73380991-73387013
128 | L1HS.1.chr7:97613656-97619688
129 | L1HS.1.chr5:79778884-79784938
130 | L1HS.1.chr4:61939927-61945962
131 | L1PA2.1.chr2:43660471-43666500
132 | L1PA2.1.chr1:93790652-93796681
133 | L1HS.1.chr1:68736693-68740136
134 | L1HS.1.chr14:30684809-30690837
135 | L1HS.1.chr12:54788573-54794627
136 | L1HS.1.chrX:26314417-26320446
137 | L1HS.1.chr6:112703745-112709778
138 | L1HS.1.chr6:70010347-70016552
139 | L1PA2.1.chr6:44870634-44876665
140 | L1HS.1.chr5:119684785-119690814
141 | L1HS.1.chr5:32824614-32827992
142 | L1HS.1.chr2:193212420-193218448
143 | L1HS.1.chr20:7116194-7122199
144 | L1PA2.1.chr11:116570827-116576273
145 | L1HS.1.chr11:82155865-82161891
146 | L1HS.1.chrY:5606144-5612199
147 | L1HS.1.chr4:78347980-78354013
148 | L1HS.1.chr4:15841546-15847572
149 | L1PA2.1.chr3:43064774-43070790
150 | L1HS.1.chr1:67078891-67084915
151 | L1HS.1.chr18:5684668-5687891
152 | L1HS.1.chr16:18821266-18827058
153 | L1HS.1.chr14:79308933-79314061
154 | L1HS.1.chr11:109177494-109183526
155 | L1HS.1.chr9:95697585-95703604
156 | L1HS.1.chr7:113776122-113782152
157 | L1HS.1.chr5:15906515-15912550
158 | L1HS.1.chr4:19077911-19083929
159 | L1HS.1.chr3:90169567-90175598
160 | L1HS.1.chr20:23426108-23432140
161 | L1HS.1.chr16:54042096-54048145
162 | L1HS.1.chrY:4948913-4954938
163 | L1HS.1.chrX:66180696-66186728
164 | L1HS.1.chrX:54118685-54124744
165 | L1HS.1.chr5:152886441-152892473
166 | L1HS.1.chr5:102131356-102137385
167 | L1HS.1.chr4:74717539-74723587
168 | L1HS.1.chr2:169248623-169254656
169 | L1HS.1.chr1:218009227-218015252
170 | L1PA2.1.chr1:176256085-176262110
171 | L1HS.1.chr1:34566055-34572105
172 | L1PA2.1.chr11:87047304-87053192
173 | L1HS.1.chrX:64013267-64019286
174 | L1HS.1.chr5:58384174-58390206
175 | L1HS.1.chr5:34147845-34154031
176 | L1HS.1.chr4:52538471-52544498
177 | L1HS.1.chr1:80939203-80945257
178 | L1HS.1.chr18:70746549-70752581
179 | L1PA2.1.chr15:71174139-71180152
180 | L1HS.1.chr7:110707004-110713024
181 | L1HS.1.chr6:117102131-117108163
182 | L1HS.1.chr4:91978211-91984413
183 | L1HS.1.chr1:197707714-197713746
184 | L1PA2.1.chr15:58125731-58131761
185 | L1HS.1.chrX:119435468-119441493
186 | L1HS.1.chr5:160709608-160715639
187 | L1HS.1.chr4:119948726-119954758
188 | L1HS.1.chr4:14755114-14761144
189 | L1HS.1.chr3:77763677-77769678
190 | L1HS.1.chr2:175481951-175487994
191 | L1HS.1.chr2:16593725-16599758
192 | L1HS.1.chr12:69773410-69779441
193 | L1PA2.1.chr9:120055235-120061264
194 | L1PA2.1.chr8:97295603-97301657
195 | L1PA2.1.chr8:58914690-58920717
196 | L1HS.1.chr7:63148831-63154859
197 | L1HS.1.chr7:61837998-61844054
198 | L1HS.1.chr4:111894801-111900831
199 | L1HS.1.chr3:103556537-103562569
200 | L1HS.1.chr3:79129777-79133955
201 | L1HS.1.chr3:26398017-26404045
202 | L1PA2.1.chr3:12028021-12033291
203 | L1HS.1.chr2:213567231-213573262
204 | L1HS.1.chr1:196219370-196225402
205 | L1HS.1.chr15:54926081-54932099
206 | L1HS.1.chr11:99602687-99608113
207 | L1HS.1.chr11:31315654-31321680
208 | L1HS.1.chr7:70197328-70203357
209 | L1PA2.1.chr3:137633714-137639732
210 | L1HS.1.chr2:196905587-196911636
211 | L1HS.1.chr2:86655238-86661268
212 | L1HS.1.chr1:187343764-187349794
213 | L1HS.1.chr1:71513698-71519742
214 | L1PA2.1.chr16:61801455-61807489
215 | L1PA2.1.chr11:14715908-14721938
216 | L1HS.1.chrX:83542396-83548420
217 | L1HS.1.chr1:193717837-193723892
218 | L1HS.1.chr1:113497220-113500000
219 | L1HS.1.chr1:86679080-86685111
220 | L1HS.1.chr16:16840517-16846556
221 | L1HS.1.chr5:133583288-133589299
222 | L1PA2.1.chr5:65164017-65170048
223 | L1HS.1.chr1:209913771-209919823
224 | L1PA2.1.chr12:112621197-112627228
225 | L1PA2.1.chr12:92313998-92320023
226 | L1HS.1.chr12:38799646-38805673
227 | L1PA2.1.chrY:17060920-17066963
228 | L1HS.1.chrX:96057824-96063842
229 | L1HS.1.chrX:50019456-50025505
230 | L1HS.1.chr5:152076868-152082891
231 | L1HS.1.chr3:108749400-108755425
232 | L1HS.1.chr18:47660373-47666427
233 | L1PA2.1.chr13:39000817-39006875
234 | L1HS.1.chr12:51562631-51568657
235 | L1PA2.1.chr8:93405812-93411825
236 | L1HS.1.chr5:166141191-166145692
237 | L1HS.1.chr5:153070982-153077008
238 | L1HS.1.chr5:81616090-81622140
239 | L1HS.1.chr4:93638307-93644337
240 | L1HS.1.chr2:153007766-153013796
241 | L1PA2.1.chrX:98687494-98693514
242 | L1HS.1.chr5:177772245-177778274
243 | L1PA2.1.chr3:158634523-158640540
244 | L1HS.1.chrX:23238516-23244575
245 | L1PA7.1.chr9:113437560-113443590
246 | L1HS.1.chr9:83049539-83055571
247 | L1PA2.1.chr8:40432212-40438240
248 | L1HS.1.chr6:156324980-156331010
249 | L1HS.1.chr4:93608283-93614338
250 | L1HS.1.chr4:57562316-57568347
251 | L1HS.1.chr18:50343959-50349987
252 | L1HS.1.chr18:535701-541755
253 | L1PA2.1.chr11:94232524-94238528
254 | L1HS.1.chrY:9941130-9947151
255 | L1PA2.1.chr6:99823597-99829594
256 | L1PA2.1.chr4:143100259-143106289
257 | L1HS.1.chr4:106571057-106577070
258 | L1PA2.1.chr21:35493766-35499791
259 | L1HS.1.chr14:70547290-70553322
260 | L1PA2.1.chr9:101102144-101108174
261 | L1HS.1.chr8:135875862-135881890
262 | L1PA2.1.chr8:68362478-68367911
263 | L1PA2.1.chr3:26384735-26390767
264 | L1HS.1.chr1:247687173-247693204
265 | L1PA2.1.chr1:174377791-174383815
266 | L1HS.1.chr12:44108220-44114234
267 | L1PA2.1.chr10:117832895-117838887
268 | L1HS.1.chr9:28111895-28117865
269 | L1PA2.1.chr3:53365276-53371325
270 | L1PA2.1.chr2:222149601-222155632
271 | L1PA2.1.chr2:165485934-165491963
272 | L1PA2.1.chr11:60532161-60538190
273 | L1HS.1.chrX:56695884-56701916
274 | L1HS.1.chr8:136438074-136444105
275 | L1PA2.1.chr5:152340020-152346052
276 | L1PA2.1.chr4:27375687-27381719
277 | L1PA2.1.chr1:242045561-242051585
278 | L1PA2.1.chr1:192500584-192506612
279 | L1PA2.1.chr1:78845456-78851474
280 | L1HS.1.chr17:9615985-9622015
281 | L1HS.1.chrX:142477849-142483853
282 | L1PA2.1.chrX:50060143-50066175
283 | L1PA3.1.chr8:61115375-61121394
284 | L1PA2.1.chr8:10932425-10938427
285 | L1HS.1.chr3:3963076-3969110
286 | L1PA2.1.chr2:158522617-158528649
287 | L1PA2.1.chr15:86528094-86534125
288 | L1PA2.1.chr6:104489393-104495424
289 | L1PA2.1.chr2:124139775-124145807
290 | L1PA2.1.chr2:72063975-72069997
291 | L1HS.1.chr18:62906292-62912314
292 | L1HS.1.chr11:125536609-125542640
293 | L1PA2.1.chr10:20751667-20757692
294 | L1PA2.1.chr9:19536200-19542230
295 | L1HS.1.chr8:91522091-91528121
296 | L1PA2.1.chr7:23035734-23039855
297 | L1PA2.1.chr18:8057452-8063463
298 | L1PA2.1.chr8:110952164-110957638
299 | L1PA2.1.chr4:75126805-75132838
300 | L1PA2.1.chr20:8595101-8601127
301 | L1PA2.1.chr4:119274113-119280127
302 | L1PA2.1.chr2:157368535-157374566
303 | L1PA2.1.chr20:42206269-42212317
304 | L1PA2.1.chr18:57719248-57725264
305 | L1PA2.1.chr18:24619042-24625072
306 | L1PA2.1.chr17:32887137-32893184
307 | L1PA2.1.chr14:26629268-26635299
308 | L1PA2.1.chrX:34249185-34253913
309 | L1PA2.1.chr8:98614445-98620471
310 | L1HS.1.chr7:141062014-141068042
311 | L1HS.1.chr5:111302238-111308262
312 | L1PA2.1.chr5:93213145-93219176
313 | L1PA2.1.chr1:40365613-40370869
314 | L1PA2.1.chr13:40734919-40740945
315 | L1PA2.1.chr13:82045349-82051380
316 | L1PA2.1.chr8:98260275-98266293
317 | L1HS.1.chr1:187597671-187603699
318 | L1HS.1.chr15:81995166-82000000
319 | L1HS.1.chr14:51794601-51800632
320 | L1HS.1.chr10:108310130-108316139
321 | L1HS.1.chr8:104739851-104745873
322 | L1PA2.1.chr6:69515143-69521169
323 | L1PA2.1.chr3:119001470-119007490
324 | L1PA2.1.chr18:39565592-39571600
325 | L1PA2.1.chr11:49775119-49781151
326 | L1PA2.1.chr4:100000524-100006553
327 | L1PA2.1.chr3:103220448-103226476
328 | L1PA2.1.chr2:219931818-219937838
329 | L1PA2.1.chr2:173699375-173705410
330 | L1PA2.1.chr15:44252034-44258049
331 | L1PA2.1.chr12:61941440-61947489
332 | L1PA2.1.chr10:80722509-80728544
333 | L1PA2.1.chrX:36465194-36471217
334 | L1PA2.1.chr9:100527228-100533251
335 | L1PA2.1.chr6:8770471-8776512
336 | L1PA2.1.chr4:160574032-160580085
337 | L1PA2.1.chr1:65558564-65564576
338 | L1PA2.1.chr14:58032539-58038561
339 | L1PA2.1.chr13:73640527-73646551
340 | L1PA2.1.chrY:13179085-13185115
341 | L1PA2.1.chrX:130958931-130964957
342 | L1PA2.1.chr7:136414180-136420210
343 | L1PA3.1.chr5:93261035-93267065
344 | L1PA2.1.chr2:128858984-128865016
345 | L1PA2.1.chr1:91211587-91216947
346 | L1PA2.1.chr16:48768571-48774603
347 | L1PA2.1.chrX:76005216-76007849
348 | L1PA2.1.chr8:35528045-35534071
349 | L1PA3.1.chr7:37612053-37618072
350 | L1PA3.1.chr2:122046204-122052249
351 | L1PA2.1.chr2:122012673-122018708
352 | L1PA2.1.chr18:69449559-69455072
353 | L1PA2.1.chr18:24710814-24716841
354 | L1PA2.1.chr12:64195587-64201638
355 | L1PA2.1.chr8:135259101-135265130
356 | L1PA2.1.chr7:141032606-141038609
357 | L1PA2.1.chr3:141757129-141763153
358 | L1PA2.1.chr3:111018716-111024745
359 | L1HS.1.chr2:230337069-230342513
360 | L1HS.1.chr14:45477110-45483169
361 | L1HS.1.chr12:55096256-55102283
362 | L1HS.1.chr3:18516080-18520244
363 | L1PA2.1.chr2:174404458-174410482
364 | L1HS.1.chr19:43864494-43867300
365 | L1PA2.1.chrX:117873996-117880023
366 | L1PA2.1.chrX:107364336-107370349
367 | L1PA2.1.chr9:1223881-1229900
368 | L1PA2.1.chr6:9966044-9972049
369 | L1PA2.1.chr14:30363537-30369568
370 | L1PA2.1.chr12:57646479-57652498
371 | L1PA2.1.chrX:111588704-111594727
372 | L1PA3.1.chr2:799542-805567
373 | L1PA2.1.chr17:55501716-55507741
374 | L1PA3.1.chr12:88846991-88853008
375 | L1PA2.1.chr12:23070636-23076652
376 | L1PA2.1.chrX:19142668-19148700
377 | L1HS.1.chr9:20655632-20658802
378 | L1PA2.1.chr7:16216428-16222457
379 | L1HS.1.chr3:82337499-82339442
380 | L1PA7.1.chr3:27815983-27821983
381 | L1PA2.1.chr2:196521488-196527500
382 | L1PA2.1.chr16:60522745-60528760
383 | L1PA2.1.chr11:75748616-75754649
384 | L1PA2.1.chrX:104026871-104032902
385 | L1PA2.1.chr9:22348829-22354859
386 | L1PA2.1.chr8:1607577-1613555
387 | L1PA2.1.chr6:113531117-113537147
388 | L1PA2.1.chr6:39454557-39460534
389 | L1PA2.1.chr2:133910480-133916504
390 | L1PA2.1.chr17:12449903-12455932
391 | L1PA2.1.chr8:119159398-119165410
392 | L1PA2.1.chr6:113226487-113232515
393 | L1PA2.1.chr3:65509292-65515316
394 | L1PA2.1.chr18:60029678-60035707
395 | L1PA3.1.chr12:47426904-47432928
396 | L1PA2.1.chr10:106844583-106850610
397 | L1PA2.1.chrX:74788952-74795000
398 | L1PA2.1.chrX:65614680-65620735
399 | L1PA2.1.chr7:15547309-15553333
400 | L1PA2.1.chr3:111556203-111562234
401 | L1PA2.1.chr12:55484008-55490018
402 | L1PA2.1.chr8:74942646-74948678
403 | L1PA3.1.chr6:9810750-9816777
404 | L1PA2.1.chr4:186110393-186116420
405 | L1PA2.1.chr2:211219320-211225344
406 | L1PA2.1.chr2:182025225-182031239
407 | L1PA2.1.chr2:62835478-62841498
408 | L1PA2.1.chr12:55344249-55350274
409 | L1PA2.1.chr11:55685638-55691665
410 | L1PA2.1.chr10:111583927-111589948
411 | L1PA2.1.chr8:25730343-25736354
412 | L1PA2.1.chr2:30904198-30910223
413 | L1PA3.1.chr1:49006012-49012044
414 | L1PA2.1.chr13:40356290-40362321
415 | L1PA2.1.chr11:100475720-100481744
416 | L1PA2.1.chr6:100196461-100202490
417 | L1PA2.1.chr20:53503851-53509874
418 | L1PA2.1.chrX:100550262-100555448
419 | L1PA2.1.chrX:79765605-79771625
420 | L1PA2.1.chrX:47783671-47789697
421 | L1PA3.1.chrX:14103920-14109975
422 | L1PA2.1.chr9:119621304-119627350
423 | L1PA2.1.chr7:34169569-34175587
424 | L1HS.1.chr5:123933969-123935867
425 | L1PA2.1.chr5:78866805-78872828
426 | L1PA2.1.chr21:17376038-17381930
427 | L1PA2.1.chr8:35171472-35177508
428 | L1PA2.1.chr2:195067521-195073543
429 | L1PA2.1.chr2:191761849-191767876
430 | L1PA2.1.chr1:58269013-58275032
431 | L1PA2.1.chr15:83383651-83389616
432 | L1PA2.1.chr8:15555981-15562028
433 | L1PA2.1.chr6:156361254-156367276
434 | L1PA2.1.chr4:11235217-11241254
435 | L1PA2.1.chr8:83447142-83453169
436 | L1PA3.1.chr3:70285056-70291083
437 | L1PA2.1.chr22:32294665-32300684
438 | L1PA2.1.chr18:28416544-28422561
439 | L1PA2.1.chrX:117727933-117733746
440 | L1PA2.1.chr7:122278915-122284945
441 | L1PA2.1.chr7:111445694-111451725
442 | L1PA3.1.chr1:159452953-159458976
443 | L1PA2.1.chr13:42424880-42430912
444 | L1PA2.1.chrY:7249559-7255517
445 | L1PA2.1.chr9:70199056-70205081
446 | L1PA2.1.chr8:120644161-120650187
447 | L1PA4.1.chr5:90816413-90822442
448 | L1PA3.1.chr4:167343516-167349561
449 | L1PA3.1.chr4:11143617-11149645
450 | L1PA2.1.chr2:83768302-83774332
451 | L1PA2.1.chr1:115147821-115153959
452 | L1PA3.1.chr1:84228930-84234940
453 | L1HS.1.chr7:12497211-12500000
454 | L1PA3.1.chr6:102752889-102758907
455 | L1PA2.1.chr17:32678257-32684272
456 | L1PA2.1.chr15:93675399-93681428
457 | L1PA2.1.chr12:85312419-85318459
458 | L1PA3.1.chr10:120234299-120240325
459 | L1PA2.1.chr8:84419545-84425573
460 | L1PA3.1.chr6:48737348-48743377
461 | L1PA2.1.chr4:141877346-141883376
462 | L1PA3.1.chr3:40902374-40908432
463 | L1PA2.1.chr12:102827177-102833208
464 | L1PA2.1.chr12:88708857-88714885
465 | L1PA2.1.chr10:84756878-84762878
466 | L1PA2.1.chrX:150417926-150421451
467 | L1PA2.1.chr18:68784834-68790853
468 | L1PA2.1.chr16:36071922-36077950
469 | L1PA4.1.chr8:75278367-75284402
470 | L1PA2.1.chr7:85249854-85255879
471 | L1PA2.1.chr2:209746180-209752208
472 | L1PA3.1.chr2:72015949-72021960
473 | L1PA2.1.chr1:163639993-163646040
474 | L1PA2.1.chr15:81870667-81876699
475 | L1PA2.1.chr14:40326207-40332209
476 | L1PA2.1.chr9:14663995-14670015
477 | L1PA2.1.chr21:33925606-33931606
478 | L1PA3.1.chr11:4170252-4176276
479 | L1PA2.1.chrX:86969631-86975651
480 | L1PA2.1.chr3:122041911-122047938
481 | L1PA2.1.chr3:34413126-34419172
482 | L1PA2.1.chr3:55046104-55052129
483 | L1PA2.1.chr1:223395534-223401557
484 | L1PA2.1.chr15:49778509-49784518
485 | L1HS.1.chrX:92254241-92256469
486 | L1PA3.1.chr8:59168655-59174679
487 | L1PA2.1.chr7:96552195-96558214
488 | L1PA3.1.chr7:22528296-22534331
489 | L1PA2.1.chr6:82389419-82395425
490 | L1PA2.1.chr5:64709034-64715065
491 | L1PA3.1.chr5:26476048-26482063
492 | L1PA2.1.chr19:55822401-55828429
493 | L1PA2.1.chr6:141814814-141820842
494 | L1PA3.1.chr3:97904737-97910773
495 | L1PA3.1.chr11:32941250-32947256
496 | L1PA2.1.chr8:101678069-101684093
497 | L1PA3.1.chr8:87223230-87229276
498 | L1PA2.1.chr4:64859153-64865171
499 | L1PA2.1.chr20:53472644-53478653
500 | L1PA2.1.chr10:117079038-117085063
501 | L1PA3.1.chrX:80405241-80411257
502 | L1PA2.1.chr9:28348134-28354162
503 | L1PA3.1.chr6:44754479-44760501
504 | L1PA2.1.chr5:30817969-30823996
505 | L1PA2.1.chr2:158351231-158357242
506 | L1PA2.1.chr1:177633927-177639946
507 | L1PA2.1.chr18:34552378-34558395
508 | L1PA2.1.chr15:20311030-20317051
509 | L1PA3.1.chr10:35477-41492
510 | L1PA2.1.chrX:113337822-113343853
511 | L1PA3.1.chr8:129324372-129329792
512 | L1PA2.1.chr6:86806663-86812682
513 | L1PA2.1.chr6:22174840-22180874
514 | L1PA2.1.chr5:139005423-139011486
515 | L1PA2.1.chr4:75401194-75407226
516 | L1PA2.1.chr18:34331115-34337159
517 | L1PA2.1.chr15:81668513-81674504
518 | L1PA2.1.chr11:26965118-26971134
519 | L1PA2.1.chr8:40647061-40653096
520 | L1PA2.1.chr7:91903682-91909706
521 | L1PA3.1.chr7:34906239-34912252
522 | L1PA3.1.chr6:133142073-133148104
523 | L1PA2.1.chr6:112813021-112819047
524 | L1PA2.1.chr6:91047151-91053161
525 | L1PA2.1.chr5:78452185-78458210
526 | L1PA3.1.chr4:53564637-53570664
527 | L1PA3.1.chr3:164702605-164708638
528 | L1PA2.1.chr3:58816278-58822304
529 | L1PA3.1.chr2:188123561-188129537
530 | L1PA3.1.chr18:35807-41823
531 | L1PA2.1.chr14:57112210-57118214
532 | L1PA2.1.chr8:126313241-126319040
533 | L1PA3.1.chr4:93589053-93595080
534 | L1PA2.1.chr2:153719447-153725466
535 | L1PA2.1.chrX:88207885-88213919
536 | L1PA3.1.chr5:133180711-133186159
537 | L1PA3.1.chr3:176827330-176833078
538 | L1PA2.1.chr17:61110229-61116238
539 | L1PA3.1.chr9:9931213-9937220
540 | L1PA2.1.chr7:113742900-113748900
541 | L1PA2.1.chr3:53921747-53927805
542 | L1PA2.1.chr5:123236611-123242616
543 | L1PA2.1.chr1:30438491-30444125
544 | L1PA2.1.chr8:137558584-137564612
545 | L1PA2.1.chr5:58552269-58558283
546 | L1PA2.1.chr3:195087672-195093677
547 | L1PA2.1.chr16:48015465-48021489
548 | L1PA3.1.chr14:88443481-88448908
549 | L1PA3.1.chrX:125608955-125614969
550 | L1PA3.1.chr6:49791502-49797511
551 | L1PA2.1.chr5:43717953-43723974
552 | L1PA2.1.chr16:21042672-21048703
553 | L1PA2.1.chr14:53266406-53271579
554 | L1HS.1.chr13:76612823-76618851
555 | L1PA2.1.chr10:30948036-30953410
556 | L1HS.1.chr7:10168424-10171419
557 | L1PA2.1.chr5:103046752-103052769
558 | L1PA2.1.chr11:42429263-42435286
559 | L1PA3.1.chr9:79177278-79183283
560 | L1PA2.1.chr20:21910853-21916881
561 | L1PA2.1.chr15:100417071-100423096
562 | L1PA2.1.chr9:97580993-97587031
563 | L1PA2.1.chr6:82558403-82564423
564 | L1PA2.1.chr4:167276135-167282161
565 | L1PA2.1.chr11:37240712-37246716
566 | L1PA3.1.chr10:19581231-19587252
567 | L1PA2.1.chr8:120348977-120354404
568 | L1PA3.1.chr4:175019740-175025751
569 | L1PA2.1.chr3:156278973-156284990
570 | L1PA3.1.chr10:31469210-31475223
571 | L1PA2.1.chrX:110816764-110822773
572 | L1PA2.1.chr7:84062409-84068436
573 | L1PA2.1.chr4:104687024-104692901
574 | L1PA2.1.chr2:22984153-22990168
575 | L1PA2.1.chr1:247888322-247894338
576 | L1HS.1.chr1:216485645-216487614
577 | L1PA2.1.chrX:152235997-152242028
578 | L1PA2.1.chrX:87116306-87122337
579 | L1PA3.1.chr20:40619322-40625356
580 | L1PA3.1.chr15:51031621-51037649
581 | L1PA3.1.chr16:86233295-86239310
582 | L1PA3.1.chr10:126676471-126682458
583 | L1PA3.1.chr9:87041882-87047900
584 | L1PA3.1.chr8:86586338-86592366
585 | L1PA2.1.chr8:74866519-74872545
586 | L1PA2.1.chr5:16335410-16341440
587 | L1PA2.1.chr1:185251798-185257805
588 | L1PA2.1.chr5:67736897-67742931
589 | L1PA2.1.chr5:42734714-42740780
590 | L1PA2.1.chr4:104786988-104793014
591 | L1PA3.1.chrX:126104124-126110137
592 | L1PA2.1.chr16:63388077-63394106
593 | L1PA3.1.chr14:43698653-43704668
594 | L1PA2.1.chr9:31294065-31300087
595 | L1PA2.1.chr8:72147447-72153464
596 | L1PA3.1.chr5:79095899-79101910
597 | L1PA3.1.chr3:168764449-168770163
598 | L1PA5.1.chr6:30247968-30253370
599 | L1PA3.1.chr5:152762660-152768698
600 | L1PA3.1.chr8:31851611-31857633
601 | L1PA2.1.chr7:93586902-93591767
602 | L1PA3.1.chr6:6185423-6191251
603 | L1PA2.1.chr14:40348058-40354070
604 | L1PA3.1.chr9:4605799-4611811
605 | L1PA3.1.chr2:117649184-117655215
606 | L1PA2.1.chr1:98588661-98592036
607 | L1PA2.1.chr4:14480751-14486827
608 | L1PA2.1.chr7:50898842-50904874
609 | L1PA3.1.chr3:50973137-50979172
610 | L1PA3.1.chr15:87463738-87469749
611 | L1PA2.1.chr4:75270579-75276607
612 | L1PA2.1.chr11:108260626-108266643
613 | L1PA2.1.chr18:7398117-7403452
614 | L1PA3.1.chr6:160679719-160685751
615 | L1PA3.1.chr5:117338253-117344286
616 | L1PA2.1.chr4:132631286-132637304
617 | L1PA4.1.chr11:88986995-88993027
618 | L1PA2.1.chr8:87624830-87630833
619 | L1PA3.1.chr4:138881278-138887439
620 | L1PA3.1.chr3:159392310-159398352
621 | L1PA2.1.chr2:208811621-208817648
622 | L1HS.1.chr10:37995443-38000000
623 | L1PA2.1.chr2:124407870-124413893
624 | L1PA2.1.chr14:43597900-43602088
625 | L1PA3.1.chrX:69676829-69682848
626 | L1PA3.1.chrX:29981730-29987885
627 | L1PA2.1.chr9:32729016-32735047
628 | L1PA3.1.chr3:189017229-189025537
629 | L1PA3.1.chr15:49259578-49265724
630 | L1PA2.1.chrX:63597195-63603223
631 | L1PA3.1.chr9:79494547-79500000
632 | L1PA2.1.chr6:86021488-86027515
633 | L1PA3.1.chr3:22737735-22743754
634 | L1PA2.1.chr15:56311143-56317177
635 | L1P1.1.chr12:21083799-21088613
636 | L1PA3.1.chr9:139648-145712
637 | L1PA3.1.chr3:135025209-135031249
638 | L1PA3.1.chr5:122509616-122515639
639 | L1PA3.1.chr4:184208227-184214253
640 | L1HS.1.chr16:65690011-65696020
641 | L1PA3.1.chr9:25070956-25076968
642 | L1PA3.1.chrX:148123449-148129475
643 | L1PA2.1.chr8:15576322-15582323
644 | L1PA2.1.chr2:40008598-40014619
645 | L1PA3.1.chr12:87969763-87975178
646 | L1PA3.1.chr6:116638696-116644847
647 | L1PA2.1.chr1:68923486-68926929
648 | L1PA3.1.chr14:43262062-43268079
649 | L1PA3.1.chr11:13793800-13799246
650 | L1HS.1.chr17:69000000-69005148
651 | L1PA2.1.chr14:26673315-26679374
652 | L1PA3.1.chrX:91752800-91758828
653 | L1PA3.1.chr7:8726340-8732368
654 | L1PA3.1.chr10:98541327-98544428
655 | L1PA2.1.chr8:91869518-91875534
656 | L1PA2.1.chrY:22339612-22345645
657 | L1PA2.1.chrX:6933211-6938641
658 | L1PA3.1.chr9:28989909-28996028
659 | L1PA3.1.chr6:71826715-71832734
660 | L1PA3.1.chrX:130567667-130573661
661 | L1PA2.1.chr9:86606053-86612054
662 | L1PA2.1.chr6:102722105-102725773
663 | L1HS.1.chr3:135002247-135005976
664 | L1PA3.1.chrX:13132317-13138466
665 | L1PA3.1.chr8:120190619-120194194
666 | L1PA3.1.chr1:75677750-75683758
667 | L1PA3.1.chr19:42714368-42720306
668 | L1PA2.1.chr15:49354233-49360252
669 | L1PA3.1.chr12:87098406-87104423
670 | L1PA3.1.chr5:79902294-79908448
671 | L1HS.1.chr4:110690112-110693684
672 | L1PA3.1.chr11:4000000-4005742
673 | L1PA2.1.chr8:131770949-131776926
674 | L1PA3.1.chr7:45377515-45383528
675 | L1PA2.1.chr5:28798790-28804814
676 | L1PA3.1.chr4:117921983-117927986
677 | L1PA2.1.chr8:66949103-66955119
678 | L1PA3.1.chr6:54961429-54967437
679 | L1PA3.1.chr5:148610279-148615691
680 | L1PA3.1.chr7:96579968-96585972
681 | L1P1.1.chr6:68912761-68914683
682 | L1PA2.1.chr3:85289930-85295940
683 | L1PA3.1.chr1:94506993-94512223
684 | L1PA3.1.chr10:35858668-35864695
685 | L1PA3.1.chr6:75899741-75905745
686 | L1PA3.1.chr10:105735717-105741873
687 | L1PA3.1.chr9:65709351-65715424
688 | L1PA3.1.chr2:166702725-166708745
689 | L1PA3.1.chr10:9519878-9526029
690 | L1PA3.1.chr4:147434630-147440182
691 | L1PA3.1.chr4:11557447-11563586
692 | L1PA3.1.chr3:133551337-133557486
693 | L1PA4.1.chr3:29993917-29999918
694 | L1PA4.1.chr4:122939633-122945796
695 | L1PA3.1.chr2:113471481-113477545
696 | L1PA3.1.chr8:107889880-107895907
697 | L1P1.1.chr16:59363201-59365889
698 | L1PA2.1.chr13:95733699-95740837
699 | L1PA3.1.chr7:36552965-36559126
700 | L1PA3.1.chr7:87238750-87244161
701 | L1PA2.1.chr5:85657593-85663641
702 | L1HS.1.chr7:145561496-145564595
703 | L1PA3.1.chr14:93413032-93419170
704 | L1PA3.1.chr4:102697355-102702634
705 | L1PA3.1.chr5:75769521-75774874
706 | L1PA3.1.chr2:137309506-137315534
707 | L1PA3.1.chrX:51512321-51518344
708 | L1PA3.1.chrX:32730898-32736929
709 | L1PA3.1.chr6:115009066-115014475
710 | L1PA3.1.chr13:102522020-102528176
711 | L1HS.1.chr9:63913382-63916426
712 | L1PA3.1.chr4:127676608-127682756
713 | L1PA2.1.chr6:141916863-141922880
714 | L1PA3.1.chr3:67147643-67153797
715 | L1PA3.1.chr2:227064885-227070875
716 | L1PA3.1.chr2:154147919-154154081
717 | L1PA2.1.chr4:156284540-156290599
718 | L1PA3.1.chr4:119061901-119068062
719 | L1PA3.1.chr16:47182757-47188783
720 | L1PA3.1.chr4:187050521-187056650
721 | L1PA3.1.chr4:90152884-90159024
722 | L1PA3.1.chr11:113656606-113662757
723 | L1PA3.1.chr10:130316193-130319757
724 | L1PA2.1.chr10:37896970-37902988
725 | L1PA3.1.chr4:167055311-167058094
726 | L1PA3.1.chr9:92900099-92906066
727 | L1PA3.1.chr6:1428614-1434750
728 | L1PA3.1.chr3:809376-815406
729 | L1PA3.1.chr2:240544494-240550522
730 | L1PA3.1.chr20:31332715-31338865
731 | L1PA3.1.chr5:98342992-98349146
732 | L1PA3.1.chrX:56075957-56081346
733 | L1PA3.1.chr2:96378981-96385131
734 | L1PA3.1.chr8:138042576-138048582
735 | L1PA3.1.chrX:42202163-42208183
736 | L1PA3.1.chr8:118548537-118554688
737 | L1PA2.1.chr7:113533338-113538585
738 | L1PA3.1.chr6:94757768-94763915
739 | L1PA3.1.chr3:180448471-180452905
740 | L1HS.1.chr1:103922065-103925398
741 | L1PA2.1.chr7:39007527-39013552
742 | L1PA3.1.chr6:4831172-4836230
743 | L1PA3.1.chr2:172187264-172193414
744 | L1PA3.1.chr1:113633560-113639380
745 | L1PA3.1.chrX:75561270-75567303
746 | L1HS.1.chr6:121162716-121168725
747 | L1P1.1.chr2:31496361-31500000
748 | L1PA3.1.chr13:104340349-104346508
749 | L1PA3.1.chr5:44578944-44584955
750 | L1PA3.1.chr5:15291930-15298100
751 | L1PA3.1.chr4:127860152-127865687
752 | L1PA3.1.chr15:38834949-38841092
753 | L1PA3.1.chrX:116159201-116165340
754 | L1PA3.1.chr8:74594235-74600252
755 | L1PA3.1.chrX:16230649-16236856
756 | L1PA3.1.chr16:65356743-65362738
757 | L1PA3.1.chr7:90634134-90640272
758 | L1PA3.1.chr12:66578267-66584409
759 | L1PA4.1.chr6:122537169-122543352
760 | L1PA3.1.chr6:107854715-107860748
761 | L1PA3.1.chr3:23349243-23355385
762 | L1PA3.1.chrX:108277015-108283183
763 | L1PA3.1.chr5:126903810-126909817
764 | L1PA3.1.chr5:91762133-91768161
765 | L1PA3.1.chr3:63733312-63739364
766 | L1PA2.1.chr16:80086062-80091657
767 | L1PA3.1.chr3:171625080-171632749
768 | L1PA4.1.chr6:70782891-70788469
769 | L1PA4.1.chr2:149501242-149507406
770 | L1PA3.1.chr14:47171654-47177809
771 | L1PA3.1.chr12:105776064-105782085
772 | L1PA3.1.chr15:97372487-97377657
773 | L1HS.1.chrX:21330646-21331772
774 | L1PA3.1.chr8:126341585-126347597
775 | L1PA2.1.chr11:60435417-60441441
776 | L1PA2.1.chr8:48793905-48799930
777 | L1PA2.1.chr3:108257680-108263696
778 | L1PA4.1.chr7:111500001-111505980
779 | L1PA2.1.chr21:40022874-40028842
780 | L1PA2.1.chr8:95552232-95558265
781 | L1PA3.1.chrX:51644758-51650770
782 | L1PA2.1.chr20:39368228-39373484
783 | L1PA3.1.chrX:85146837-85153009
784 | L1PA3.1.chr7:23321058-23327074
785 | L1PA3.1.chr5:99339615-99345764
786 | L1HS.1.chr2:131815264-131816385
787 | L1PA2.1.chr9:74424274-74430304
788 | L1PA3.1.chr7:32708015-32714180
789 | L1PA3.1.chr6:23804047-23810075
790 | L1PA4.1.chr4:63948074-63953638
791 | L1PA2.1.chrY:20367992-20374018
792 | L1PA3.1.chr5:120176046-120182173
793 | L1PA3.1.chr10:45009173-45015196
794 | L1PA3.1.chr4:163207006-163212968
795 | L1PA2.1.chr2:236383666-236389689
796 | L1PA2.1.chr6:71138558-71146364
797 | L1PA3.1.chr3:94846027-94852063
798 | L1PA2.1.chrX:114720516-114726531
799 | L1PA3.1.chr9:12333994-12340033
800 | L1PA2.1.chr5:19161336-19167360
801 | L1PA2.1.chr21:40033161-40039180
802 | L1PA2.1.chr4:106935103-106941147
803 | L1PA3.1.chr12:55081365-55087407
804 | L1PA2.1.chr12:58109257-58115306
805 | L1PA3.1.chr14:39521539-39527458
806 | L1PA2.1.chr8:4854406-4860419
807 | L1PA3.1.chr13:60709752-60715784
808 | L1PA2.1.chr7:32682678-32688715
809 | L1PA3.1.chrX:36170297-36176324
810 | L1PA3.1.chr3:18328056-18331287
811 | L1PA3.1.chr12:59751112-59757266
812 | L1PA3.1.chr3:26239082-26245212
813 | L1PA2.1.chr1:186637331-186643356
814 | L1PA3.1.chr3:61211021-61217154
815 |
--------------------------------------------------------------------------------
/CGC/ORF2_list.txt:
--------------------------------------------------------------------------------
1 | L1HS.1.chr20:7116194-7122199
2 | L1HS.1.chr5:152886441-152892473
3 | L1HS.1.chr15:70729744-70735160
4 | L1HS.1.chr8:125582886-125588889
5 | L1HS.1.chr4:136293494-136299546
6 | L1HS.1.chrX:141421202-141427246
7 | L1HS.1.chr15:54926081-54932099
8 | L1HS.1.chr4:74717539-74723587
9 | L1HS.1.chr8:128453002-128459020
10 | L1HS.1.chr2:4733729-4739760
11 | L1HS.1.chr16:16840517-16846556
12 | L1HS.1.chr9:95697585-95703604
13 | L1HS.1.chr7:30439242-30445274
14 | L1HS.1.chr4:138547723-138552054
15 | L1HS.1.chr11:78677772-78683802
16 | L1HS.1.chr8:134070756-134076773
17 | L1HS.1.chr5:109259387-109265418
18 | L1HS.1.chr4:21159390-21165421
19 | L1HS.1.chr6:2417774-2423803
20 | L1HS.1.chrX:11935296-11941314
21 | L1HS.1.chrX:11707248-11713279
22 | L1HS.1.chr16:18821266-18827058
23 | L1HS.1.chr13:29641706-29647706
24 | L1HS.1.chr8:72875538-72881588
25 | L1HS.1.chr12:126299023-126305038
26 | L1HS.1.chr5:104518587-104524616
27 | L1HS.1.chr3:130628808-130634065
28 | L1HS.1.chr10:105377346-105383377
29 | L1HS.1.chr6:129000000-129004416
30 | L1HS.1.chr4:79937715-79943746
31 | L1HS.1.chr22:28663283-28669315
32 | L1HS.1.chr2:16593725-16599758
33 | L1HS.1.chr18:70746549-70752581
34 | L1HS.1.chr16:33952564-33958612
35 | L1HS.1.chr10:109812437-109818457
36 | L1HS.1.chr10:6369617-6375667
37 | L1HS.1.chr6:156034135-156040165
38 | L1HS.1.chr1:84052389-84058406
39 | L1HS.1.chr18:75846851-75852883
40 | L1HS.1.chr11:93420986-93427031
41 | L1HS.1.chr1:71513698-71519742
42 | L1HS.1.chrX:147653734-147659767
43 | L1HS.1.chr1:247687173-247693204
44 | L1HS.1.chr7:113776122-113782152
45 | L1HS.1.chr4:78347980-78354013
46 | L1HS.1.chr11:93136638-93142673
47 | L1HS.1.chr5:177772245-177778274
48 | L1HS.1.chr4:90675739-90681757
49 | L1HS.1.chr2:196905587-196911636
50 | L1HS.1.chr16:83637252-83643296
51 | L1HS.1.chr16:9584490-9590522
52 | L1HS.1.chr7:141920659-141926712
53 | L1HS.1.chr3:109199872-109205903
54 | L1HS.1.chr1:174590323-174596379
55 | L1HS.1.chr11:95436216-95442246
56 | L1HS.1.chr11:24327951-24334001
57 | L1HS.1.chr9:90149604-90155634
58 | L1HS.1.chr6:19764892-19770918
59 | L1HS.1.chr7:110707004-110713024
60 | L1HS.1.chr6:83333952-83339981
61 | L1HS.1.chr2:86655238-86661268
62 | L1HS.1.chr7:49680245-49686300
63 | L1HS.1.chr6:133020691-133026746
64 | L1HS.1.chr1:86679080-86685111
65 | L1HS.1.chr10:85355506-85361538
66 | L1HS.1.chr8:27113618-27119645
67 | L1HS.1.chr3:103556537-103562569
68 | L1HS.1.chr6:24811657-24817706
69 | L1PA2.1.chr5:132513964-132519996
70 | L1HS.1.chr5:79778884-79784938
71 | L1HS.1.chr3:120573021-120579186
72 | L1HS.1.chr2:175481951-175487994
73 | L1HS.1.chr1:239623498-239629523
74 | L1HS.1.chr14:70547290-70553322
75 | L1HS.1.chrX:54118685-54124744
76 | L1HS.1.chr13:92685561-92691592
77 | L1HS.1.chr1:237019467-237025494
78 | L1HS.1.chr1:80939203-80945257
79 | L1HS.1.chr5:58384174-58390206
80 | L1HS.1.chr5:173402796-173408828
81 | L1HS.1.chr4:16944926-16949113
82 | L1HS.1.chr4:93638307-93644337
83 | L1HS.1.chr3:77763677-77769678
84 | L1HS.1.chr17:9615985-9622015
85 | L1HS.1.chr6:121162716-121168725
86 | L1HS.1.chr22:48985761-48991792
87 | L1HS.1.chrX:23238516-23244575
88 | L1HS.1.chr2:166988454-166994509
89 | L1HS.1.chrX:81841153-81847184
90 | L1PA2.1.chr11:60532161-60538190
91 | L1HS.1.chr4:111894801-111900831
92 | L1HS.1.chr1:180866811-180872843
93 | L1HS.1.chr17:66596579-66602595
94 | L1HS.1.chr6:117102131-117108163
95 | L1PA2.1.chr5:39787652-39793671
96 | L1HS.1.chr4:59078847-59084877
97 | L1HS.1.chr9:28111895-28117865
98 | L1HS.1.chr7:111963193-111969223
99 | L1HS.1.chr5:146609485-146615534
100 | L1HS.1.chr3:159095379-159101394
101 | L1HS.1.chr2:180833661-180839689
102 | L1HS.1.chr7:111243515-111249546
103 | L1HS.1.chr15:87509891-87515920
104 | L1HS.1.chr11:85324758-85330821
105 | L1HS.1.chr10:98782941-98788971
106 | L1HS.1.chr1:187597671-187603699
107 | L1HS.1.chr14:63116706-63122735
108 | L1HS.1.chr1:187343764-187349794
109 | L1HS.1.chr18:13975860-13981891
110 | L1PA2.1.chr1:71888203-71894235
111 | L1HS.1.chr20:11632779-11638837
112 | L1HS.1.chrX:96057824-96063842
113 | L1HS.1.chr4:122652658-122656850
114 | L1HS.1.chr1:195925003-195929320
115 | L1HS.1.chr1:85927067-85933100
116 | L1HS.1.chr18:50343959-50349987
117 | L1HS.1.chr6:72988654-72994686
118 | L1HS.1.chr11:109177494-109183526
119 | L1HS.1.chr8:88685705-88691760
120 | L1HS.1.chr5:111302238-111308262
121 | L1HS.1.chr2:102566355-102572385
122 | L1HS.1.chr5:86510690-86516743
123 | L1HS.1.chr3:132946006-132952034
124 | L1HS.1.chr1:118852351-118858380
125 | L1HS.1.chr10:76586841-76591752
126 | L1HS.1.chrX:151330320-151336351
127 | L1HS.1.chr10:5245354-5251383
128 | L1PA2.1.chr6:115960032-115966060
129 | L1PA2.1.chr12:92313998-92320023
130 | L1HS.1.chrX:155516016-155522048
131 | L1HS.1.chr4:169515501-169521532
132 | L1HS.1.chr7:93787624-93793679
133 | L1HS.1.chr10:19088601-19094618
134 | L1HS.1.chrX:76322775-76328806
135 | L1PA2.1.chrX:28206791-28212789
136 | L1HS.1.chr5:102131356-102137385
137 | L1PA2.1.chr12:90536603-90542635
138 | L1HS.1.chr7:46820756-46825657
139 | L1PA2.1.chr19:37837502-37843533
140 | L1PA2.1.chr10:15915731-15921753
141 | L1HS.1.chr20:12801017-12807044
142 | L1HS.1.chr11:49793154-49797728
143 | L1HS.1.chr18:37819737-37825798
144 | L1HS.1.chrY:5606144-5612199
145 | L1HS.1.chr3:4916534-4922591
146 | L1PA2.1.chr18:59403939-59409970
147 | L1PA2.1.chr15:71174139-71180152
148 | L1HS.1.chrX:142477849-142483853
149 | L1HS.1.chr10:33510845-33516876
150 | L1HS.1.chr11:90400067-90406098
151 | L1HS.1.chr7:63148831-63154859
152 | L1PA2.1.chr5:83316287-83320401
153 | L1HS.1.chr1:209913771-209919823
154 | L1HS.1.chr11:36551606-36557636
155 | L1PA2.1.chr3:187412123-187418152
156 | L1HS.1.chr3:136479056-136485103
157 | L1PA2.1.chr3:81051389-81057413
158 | L1PA2.1.chr18:7966442-7972474
159 | L1PA2.1.chr8:91558668-91564687
160 | L1HS.1.chr3:89460825-89466856
161 | L1PA2.1.chr6:44870634-44876665
162 | L1PA2.1.chr5:45658440-45664470
163 | L1HS.1.chr3:54394322-54400323
164 | L1PA2.1.chr6:72570139-72576167
165 | L1HS.1.chr18:72966526-72972556
166 | L1HS.1.chr3:3963076-3969110
167 | L1PA2.1.chr2:128858984-128865016
168 | L1PA2.1.chr3:177388770-177394751
169 | L1PA2.1.chr10:11731436-11737465
170 | L1PA2.1.chr10:39466259-39470575
171 | L1PA2.1.chr9:19536200-19542230
172 | L1PA2.1.chr6:104489393-104495424
173 | L1HS.1.chrX:83059584-83065637
174 | L1HS.1.chr7:70197328-70203357
175 | L1PA2.1.chr2:173699375-173705410
176 | L1HS.1.chrX:64013267-64019286
177 | L1PA2.1.chrX:103891506-103897537
178 | L1PA2.1.chr4:164553492-164559523
179 | L1PA2.1.chr8:63797384-63803439
180 | L1HS.1.chr12:54788573-54794627
181 | L1PA2.1.chr10:106844583-106850610
182 | L1PA2.1.chr15:51173565-51179009
183 | L1PA2.1.chr8:75444000-75448442
184 | L1PA2.1.chr6:104452399-104457460
185 | L1PA3.1.chr3:137454947-137460983
186 | L1HS.1.chr5:122240435-122244924
187 | L1PA2.1.chr4:102204930-102210958
188 | L1HS.1.chr7:7465092-7471120
189 | L1PA2.1.chr3:155119416-155125444
190 | L1PA2.1.chr16:21042672-21048703
191 | L1PA3.1.chr3:187424407-187428816
192 | L1HS.1.chr16:35608475-35614501
193 | L1PA2.1.chr5:139005423-139011486
194 | L1PA2.1.chr15:93675399-93681428
195 | L1PA2.1.chr2:165485934-165491963
196 | L1PA2.1.chr18:24619042-24625072
197 | L1PA3.1.chr6:48363090-48369117
198 | L1PA2.1.chr3:65509292-65515316
199 | L1PA3.1.chr19:29225779-29231807
200 | L1PA2.1.chr8:120348977-120354404
201 | L1PA2.1.chr12:77173381-77179424
202 | L1PA2.1.chr13:100698082-100704117
203 | L1PA2.1.chr12:64195587-64201638
204 | L1PA2.1.chr2:174269465-174274464
205 | L1PA2.1.chr8:72479440-72485463
206 | L1PA2.1.chr4:14009454-14015486
207 | L1PA2.1.chr13:40356290-40362321
208 | L1PA2.1.chr6:156361254-156367276
209 | L1PA2.1.chr1:174377791-174383815
210 | L1PA2.1.chr4:145369388-145375369
211 | L1HS.1.chr1:104770247-104776278
212 | L1PA2.1.chr13:42424880-42430912
213 | L1PA2.1.chr14:101266199-101272227
214 | L1PA2.1.chr4:158084240-158090272
215 | L1PA2.1.chr5:21107412-21113430
216 | L1PA2.1.chr3:141757129-141763153
217 | L1PA2.1.chr1:49875162-49881175
218 | L1PA2.1.chr18:22529636-22535670
219 | L1PA2.1.chr1:25506585-25512707
220 | L1PA3.1.chr6:107854715-107860748
221 | L1HS.1.chr13:31302314-31308370
222 | L1PA2.1.chr14:26629268-26635299
223 | L1PA2.1.chrX:127116697-127122729
224 | L1PA2.1.chr5:57471563-57475609
225 | L1PA2.1.chr8:131770949-131776926
226 | L1PA2.1.chr1:178314791-178320818
227 | L1PA2.1.chr16:63388077-63394106
228 | L1HS.1.chr4:79704552-79710581
229 | L1PA2.1.chr3:178859948-178865979
230 | L1PA2.1.chr18:40187639-40193657
231 | L1PA3.1.chr18:27279551-27285578
232 | L1PA2.1.chr2:158351231-158357242
233 | L1PA3.1.chr4:120741413-120747472
234 | L1PA2.1.chr12:57646479-57652498
235 | L1PA2.1.chr7:29579936-29585963
236 | L1PA2.1.chr8:72147447-72153464
237 | L1HS.1.chr11:90966271-90972302
238 | L1PA2.1.chrX:47783671-47789697
239 | L1PA2.1.chrX:18105518-18110908
240 | L1PA2.1.chr4:4953897-4959919
241 | L1PA2.1.chr11:107361810-107367839
242 | L1PA2.1.chr1:75383477-75388208
243 | L1PA2.1.chr12:70013065-70019067
244 | L1PA2.1.chr2:76775758-76781758
245 | L1PA3.1.chr6:105716122-105722275
246 | L1PA2.1.chr18:41631742-41637769
247 | L1PA3.1.chr2:4157808-4163833
248 | L1PA3.1.chr2:57587069-57592429
249 | L1PA2.1.chrX:36465194-36471217
250 | L1PA2.1.chr2:192268435-192274450
251 | L1PA3.1.chrX:68736250-68742398
252 | L1PA3.1.chrX:137672117-137678261
253 | L1PA3.1.chr2:48739839-48745890
254 | L1PA2.1.chr2:195067521-195073543
255 | L1PA3.1.chr7:37612053-37618072
256 | L1PA3.1.chr5:3439025-3445063
257 | L1PA2.1.chr3:116203398-116209426
258 | L1PA3.1.chrX:125510365-125515187
259 | L1PA3.1.chr4:97633479-97639503
260 | L1PA2.1.chr20:53472644-53478653
261 | L1PA2.1.chr7:16216428-16222457
262 | L1PA2.1.chr1:177633927-177639946
263 | L1HS.1.chrX:56695884-56701916
264 | L1PA2.1.chr18:35205779-35211809
265 | L1PA2.1.chr15:56311143-56317177
266 | L1PA2.1.chr20:24900605-24906618
267 | L1PA3.1.chr8:2413733-2419762
268 | L1PA3.1.chr8:2301964-2307993
269 | L1PA2.1.chr3:158634523-158640540
270 | L1HS.1.chr1:67078891-67084915
271 | L1PA2.1.chr2:124593139-124599168
272 | L1PA2.1.chr7:43059096-43065116
273 | L1PA3.1.chr5:35568225-35574246
274 | L1PA2.1.chr17:3176530-3182557
275 | L1PA3.1.chr4:65052166-65058198
276 | L1PA3.1.chr2:228229041-228234515
277 | L1PA2.1.chr20:18601523-18606939
278 | L1PA3.1.chr9:133310021-133316045
279 | L1PA2.1.chr2:151698868-151704889
280 | L1PA2.1.chr6:141566105-141572136
281 | L1HS.1.chr1:56365452-56369282
282 | L1HS.1.chr14:30684809-30690837
283 | L1PA2.1.chr16:61801455-61807489
284 | L1PA2.1.chr22:16021017-16027044
285 | L1PA3.1.chr2:188123561-188129537
286 | L1HS.1.chr4:15841546-15847572
287 | L1PA3.1.chr11:89744187-89750239
288 | L1HS.1.chr4:107206672-107210557
289 | L1PA2.1.chr8:58914690-58920717
290 | L1HS.1.chr1:237075264-237081293
291 | L1PA3.1.chr3:135025209-135031249
292 | L1PA2.1.chr5:75642235-75648286
293 | L1PA2.1.chr19:55822401-55828429
294 | L1PA2.1.chr6:103709031-103715056
295 | L1PA2.1.chr10:7137522-7142956
296 | L1PA2.1.chr12:106471865-106477891
297 | L1HS.1.chr20:55859566-55865521
298 | L1PA2.1.chr9:14663995-14670015
299 | L1HS.1.chr5:152076868-152082891
300 | L1PA2.1.chr14:55988182-55993244
301 | L1PA2.1.chr10:18030651-18036675
302 | L1PA2.1.chr2:204991072-204997106
303 | L1PA2.1.chr1:174233266-174239293
304 | L1PA2.1.chr13:82045349-82051380
305 | L1PA2.1.chr15:81797930-81803963
306 | L1PA3.1.chr7:14313260-14319290
307 | L1HS.1.chr18:62906292-62912314
308 | L1PA2.1.chr6:162989737-162995762
309 | L1PA2.1.chr9:1223881-1229900
310 | L1PA2.1.chrX:5480456-5486466
311 | L1PA2.1.chrX:98424325-98430357
312 | L1HS.1.chr2:193212420-193218448
313 | L1PA3.1.chr13:105383251-105388345
314 | L1PA2.1.chr12:80244169-80250184
315 | L1PA2.1.chr1:91211587-91216947
316 | L1PA2.1.chr4:64859153-64865171
317 | L1PA2.1.chr9:21536697-21541948
318 | L1PA3.1.chrX:64252345-64258375
319 | L1PA3.1.chr11:127868667-127872497
320 | L1PA2.1.chr1:82250044-82256069
321 | L1PA2.1.chr3:111556203-111562234
322 | L1PA3.1.chr4:53564637-53570664
323 | L1PA3.1.chr6:136000726-136006368
324 | L1HS.1.chrX:127362223-127368248
325 | L1PA2.1.chr4:65288237-65294261
326 | L1PA3.1.chr10:126881867-126887893
327 | L1PA3.1.chr6:133142073-133148104
328 | L1PA3.1.chr15:97372487-97377657
329 | L1PA3.1.chr11:79552653-79558680
330 | L1PA3.1.chr10:60692960-60698897
331 | L1PA2.1.chr5:51250746-51256770
332 | L1PA2.1.chr11:40585472-40591501
333 | L1PA3.1.chr4:157174892-157180916
334 | L1PA4.1.chr16:72580798-72586932
335 | L1PA2.1.chr7:86340208-86346233
336 | L1HS.1.chr4:135178140-135183747
337 | L1PA3.1.chr8:118548537-118554688
338 | L1PA3.1.chrX:124582570-124588702
339 | L1PA2.1.chr2:137393160-137399190
340 | L1PA4.1.chr10:20291416-20297572
341 |
--------------------------------------------------------------------------------
/CGC/make_ORF1_and_intact_table.py:
--------------------------------------------------------------------------------
import sys
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Build per-sample FPM tables restricted to loci with an intact ORF1, and to
# loci with both ORF1 and ORF2 intact.
# Usage: make_ORF1_and_intact_table.py <exp_prob_pkls_list> <bam_info_list>
#        <orf1_list> <orf2_list> <allowed_runthrough_fraction> <output_orf1> <output_intact>

exp_prob_pkls_list = sys.argv[1]
bam_info_list = sys.argv[2]
orf1_list = sys.argv[3]
orf2_list = sys.argv[4]
allowed_runthrough_fraction = float(sys.argv[5])

output_orf1_name = sys.argv[6]
output_intact_name = sys.argv[7]

# Sets of locus names with an intact ORF1 / ORF2 (one name per line).
orf1_intact = set()
with open(orf1_list) as infile:
    for line in infile:
        orf1_intact.add(line.strip())
orf2_intact = set()
with open(orf2_list) as infile:
    for line in infile:
        orf2_intact.add(line.strip())

# exp_probs[sample] maps transcript name -> relative expression (L1EM output).
exp_probs = dict()
seqs = set()

with open(exp_prob_pkls_list) as infile:
    for line in infile:
        names_file, X_file = line.strip().split('\t')
        # Strip the path and the trailing 16-character pkl suffix to get the sample name.
        sample = names_file.split('/')[-1][:-16]
        # 'with' blocks: the original leaked every file handle it opened.
        with open(names_file, 'rb') as nf, open(X_file, 'rb') as xf:
            exp_probs[sample] = dict(zip(pickle.load(nf), pickle.load(xf)))
        seqs |= set(exp_probs[sample].keys())

l1pa_pairs = dict()
mapped_pairs = dict()

with open(bam_info_list) as infile:
    for line in infile:
        # Sample name: basename minus a 4-character extension.
        sample = line.strip().split('/')[-1][:-4]
        with open(line.strip()) as bf:
            baminfo = bf.readlines()
        mapped_pairs[sample] = int(baminfo[1])
        l1pa_pairs[sample] = int(baminfo[2])

output_orf1 = open(output_orf1_name, 'w')
output_intact = open(output_intact_name, 'w')

# Header row: one column per sample.
header = "locus"
for sample in exp_probs:
    header += "\t" + sample

output_orf1.write(header + '\n')
output_intact.write(header + '\n')

completed = set()

for name in seqs:
    # Transcript names are '<locus>_<category>'; collapse to the locus.
    seq_name = '_'.join(name.split('_')[:-1])
    if seq_name in completed:
        continue
    completed.add(seq_name)
    print_string = seq_name.split('(')[0]
    only_name = seq_name + '_only'
    runon_name = seq_name + '_3prunon'
    runthrough_name = seq_name + '_runthrough'
    # Renamed the inner loop variable: the original reused 'name', shadowing
    # the outer loop variable.
    for sample in exp_probs:
        FPM = 0.0
        runthrough_FPM = 0.0
        # Convert relative expression to fragments per million mapped pairs.
        # float() also guards against Py2 integer division.
        scale = float(l1pa_pairs[sample]) / mapped_pairs[sample] * 10**6
        if only_name in exp_probs[sample]:
            FPM += exp_probs[sample][only_name] * scale
        if runon_name in exp_probs[sample]:
            FPM += exp_probs[sample][runon_name] * scale
        if runthrough_name in exp_probs[sample]:
            runthrough_FPM += exp_probs[sample][runthrough_name] * scale
        # Report the locus only when proper L1 expression dominates runthrough signal.
        if FPM > 0 and FPM / (FPM + runthrough_FPM) > allowed_runthrough_fraction:
            print_string += '\t' + str(FPM)
        else:
            print_string += '\t0.0'
    # '[:-2]' drops the trailing strand suffix to match the ORF-list naming.
    if seq_name.split('(')[0][:-2] in orf1_intact:
        output_orf1.write(print_string + '\n')
    if seq_name.split('(')[0][:-2] in orf1_intact and seq_name.split('(')[0][:-2] in orf2_intact:
        output_intact.write(print_string + '\n')

output_orf1.close()
output_intact.close()
--------------------------------------------------------------------------------
/CGC/make_ORF1_and_intact_table_stranded.py:
--------------------------------------------------------------------------------
import sys
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Stranded version: build per-sample FPM tables restricted to loci with an
# intact ORF1, and to loci with both ORF1 and ORF2 intact.
# Usage: make_ORF1_and_intact_table_stranded.py <exp_prob_pkls_list> <bam_info_list>
#        <orf1_list> <orf2_list> <allowed_runthrough_fraction> <output_orf1> <output_intact>

exp_prob_pkls_list = sys.argv[1]
bam_info_list = sys.argv[2]
orf1_list = sys.argv[3]
orf2_list = sys.argv[4]
allowed_runthrough_fraction = float(sys.argv[5])

output_orf1_name = sys.argv[6]
output_intact_name = sys.argv[7]

# Sets of locus names with an intact ORF1 / ORF2 (one name per line).
orf1_intact = set()
with open(orf1_list) as infile:
    for line in infile:
        orf1_intact.add(line.strip())
orf2_intact = set()
with open(orf2_list) as infile:
    for line in infile:
        orf2_intact.add(line.strip())

# exp_probs[sample] maps transcript name -> relative expression (L1EM output).
exp_probs = dict()
seqs = set()

with open(exp_prob_pkls_list) as infile:
    for line in infile:
        names_file, X_file = line.strip().split('\t')
        # Strip the path and the trailing 16-character pkl suffix to get the sample name.
        sample = names_file.split('/')[-1][:-16]
        with open(names_file, 'rb') as nf, open(X_file, 'rb') as xf:
            exp_probs[sample] = dict(zip(pickle.load(nf), pickle.load(xf)))
        seqs |= set(exp_probs[sample].keys())

l1pa_pairs = dict()
mapped_pairs = dict()

with open(bam_info_list) as infile:
    for line in infile:
        sample = line.strip().split('/')[-1][:-4]
        with open(line.strip()) as bf:
            baminfo = bf.readlines()
        mapped_pairs[sample] = int(baminfo[1])
        l1pa_pairs[sample] = int(baminfo[2])

output_orf1 = open(output_orf1_name, 'w')
output_intact = open(output_intact_name, 'w')

# Header row: one column per sample.
header = "locus"
for sample in exp_probs:
    header += "\t" + sample

output_orf1.write(header + '\n')
output_intact.write(header + '\n')

completed = set()

for name in seqs:
    seq_name = '_'.join(name.split('_')[:-1])
    if seq_name in completed:
        continue
    completed.add(seq_name)
    print_string = seq_name.split('(')[0]
    only_name = seq_name + '_only'
    runon_name = seq_name + '_3prunon'
    senserunthrough_name = seq_name + '_senserunthrough'
    # NOTE(review): antisense runthrough ('_antisenserunthrough') is not counted
    # against the locus here, matching make_l1pa1to4table_stranded.py which also
    # considers only sense runthrough -- confirm this is intended.
    for sample in exp_probs:
        FPM = 0.0
        runthrough_FPM = 0.0
        # Convert relative expression to fragments per million mapped pairs.
        scale = float(l1pa_pairs[sample]) / mapped_pairs[sample] * 10**6
        if only_name in exp_probs[sample]:
            FPM += exp_probs[sample][only_name] * scale
        if runon_name in exp_probs[sample]:
            FPM += exp_probs[sample][runon_name] * scale
        if senserunthrough_name in exp_probs[sample]:
            # Fix: the original indexed the undefined variable 'runthrough_name'
            # here, raising NameError whenever a sense-runthrough transcript
            # was present for the locus.
            runthrough_FPM += exp_probs[sample][senserunthrough_name] * scale
        # Report the locus only when proper L1 expression dominates runthrough signal.
        if FPM > 0 and FPM / (FPM + runthrough_FPM) > allowed_runthrough_fraction:
            print_string += '\t' + str(FPM)
        else:
            print_string += '\t0.0'
    # '[:-2]' drops the trailing strand suffix to match the ORF-list naming.
    if seq_name.split('(')[0][:-2] in orf1_intact:
        output_orf1.write(print_string + '\n')
    if seq_name.split('(')[0][:-2] in orf1_intact and seq_name.split('(')[0][:-2] in orf2_intact:
        output_intact.write(print_string + '\n')

output_orf1.close()
output_intact.close()
--------------------------------------------------------------------------------
/CGC/make_l1pa1to4table.py:
--------------------------------------------------------------------------------
import sys
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Print a table with an active (only + 3' runon) and a passive (runthrough)
# FPM column per sample for every L1HS/L1PA2/L1PA3/L1PA4 locus.
# Usage: make_l1pa1to4table.py <exp_prob_pkls_list> <bam_info_list> > table.txt

exp_prob_pkls_list = sys.argv[1]
bam_info_list = sys.argv[2]

# exp_probs[sample] maps transcript name -> relative expression (L1EM output).
exp_probs = dict()
seqs = set()

with open(exp_prob_pkls_list) as infile:
    for line in infile:
        names_file, X_file = line.strip().split('\t')
        # Strip the path and the trailing 16-character pkl suffix to get the sample name.
        sample = names_file.split('/')[-1][:-16]
        # 'with' blocks: the original leaked every file handle it opened.
        with open(names_file, 'rb') as nf, open(X_file, 'rb') as xf:
            exp_probs[sample] = dict(zip(pickle.load(nf), pickle.load(xf)))
        seqs |= set(exp_probs[sample].keys())

l1pa_pairs = dict()
mapped_pairs = dict()

with open(bam_info_list) as infile:
    for line in infile:
        sample = line.strip().split('/')[-1][:-4]
        with open(line.strip()) as bf:
            baminfo = bf.readlines()
        mapped_pairs[sample] = int(baminfo[1])
        l1pa_pairs[sample] = int(baminfo[2])

# Header row: an active/passive column pair per sample.
header = "locus"
for sample in exp_probs:
    header += "\t" + sample + '-active' + "\t" + sample + '-passive'

print(header)

completed = set()

for name in seqs:
    # Restrict to the four youngest L1 families.
    if name.split('.')[0] not in ['L1HS', 'L1PA2', 'L1PA3', 'L1PA4']:
        continue
    # Transcript names are '<locus>_<category>'; collapse to the locus.
    seq_name = '_'.join(name.split('_')[:-1])
    if seq_name in completed:
        continue
    completed.add(seq_name)
    print_string = seq_name.split('(')[0]
    only_name = seq_name + '_only'
    runon_name = seq_name + '_3prunon'
    runthrough_name = seq_name + '_runthrough'
    # Renamed the inner loop variable: the original reused 'name', shadowing
    # the outer loop variable.
    for sample in exp_probs:
        FPM = 0.0
        runthrough_FPM = 0.0
        # Convert relative expression to fragments per million mapped pairs.
        scale = float(l1pa_pairs[sample]) / mapped_pairs[sample] * 10**6
        if only_name in exp_probs[sample]:
            FPM += exp_probs[sample][only_name] * scale
        if runon_name in exp_probs[sample]:
            FPM += exp_probs[sample][runon_name] * scale
        if runthrough_name in exp_probs[sample]:
            runthrough_FPM += exp_probs[sample][runthrough_name] * scale
        print_string += '\t' + str(FPM) + '\t' + str(runthrough_FPM)
    print(print_string)
--------------------------------------------------------------------------------
/CGC/make_l1pa1to4table_stranded.py:
--------------------------------------------------------------------------------
import sys
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Stranded version: print one FPM column per sample for every
# L1HS/L1PA2/L1PA3/L1PA4 locus, zeroing loci where sense runthrough exceeds
# the allowed fraction of proper expression.
# Usage: make_l1pa1to4table_stranded.py <exp_prob_pkls_list> <bam_info_list> <allowed_rt_fraction> > table.txt

exp_prob_pkls_list = sys.argv[1]
bam_info_list = sys.argv[2]
allowed_rt_fraction = float(sys.argv[3])

# exp_probs[sample] maps transcript name -> relative expression (L1EM output).
exp_probs = dict()
seqs = set()

with open(exp_prob_pkls_list) as infile:
    for line in infile:
        names_file, X_file = line.strip().split('\t')
        # Strip the path and the trailing 16-character pkl suffix to get the sample name.
        sample = names_file.split('/')[-1][:-16]
        # 'with' blocks: the original leaked every file handle it opened.
        with open(names_file, 'rb') as nf, open(X_file, 'rb') as xf:
            exp_probs[sample] = dict(zip(pickle.load(nf), pickle.load(xf)))
        seqs |= set(exp_probs[sample].keys())

l1pa_pairs = dict()
mapped_pairs = dict()

with open(bam_info_list) as infile:
    for line in infile:
        sample = line.strip().split('/')[-1][:-4]
        with open(line.strip()) as bf:
            baminfo = bf.readlines()
        mapped_pairs[sample] = int(baminfo[1])
        l1pa_pairs[sample] = int(baminfo[2])

# Header row: one column per sample.
header = "locus"
for sample in exp_probs:
    header += "\t" + sample

print(header)

completed = set()

for name in seqs:
    # Restrict to the four youngest L1 families.
    if name.split('.')[0] not in ['L1HS', 'L1PA2', 'L1PA3', 'L1PA4']:
        continue
    # Transcript names are '<locus>_<category>'; collapse to the locus.
    seq_name = '_'.join(name.split('_')[:-1])
    if seq_name in completed:
        continue
    completed.add(seq_name)
    print_string = seq_name.split('(')[0]
    only_name = seq_name + '_only'
    runon_name = seq_name + '_3prunon'
    runthrough_name = seq_name + '_senserunthrough'
    # Renamed the inner loop variable: the original reused 'name', shadowing
    # the outer loop variable.
    for sample in exp_probs:
        FPM = 0.0
        runthrough_FPM = 0.0
        # Convert relative expression to fragments per million mapped pairs.
        scale = float(l1pa_pairs[sample]) / mapped_pairs[sample] * 10**6
        if only_name in exp_probs[sample]:
            FPM += exp_probs[sample][only_name] * scale
        if runon_name in exp_probs[sample]:
            FPM += exp_probs[sample][runon_name] * scale
        if runthrough_name in exp_probs[sample]:
            runthrough_FPM += exp_probs[sample][runthrough_name] * scale
        # Zero the locus when runthrough is too large relative to proper expression.
        if runthrough_FPM < allowed_rt_fraction * FPM:
            print_string += '\t' + str(FPM)
        else:
            print_string += '\t0.0'
    print(print_string)
--------------------------------------------------------------------------------
/CGC/median_template_and_pairs.py:
--------------------------------------------------------------------------------
import sys
import pysam
import random
import numpy

"""
Estimate median template length of a bam file and count properly paired reads.

Part of the L1-EM package.

Copyright (C) 2019 Wilson McKerrow

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.

"""

# Usage: median_template_and_pairs.py <bamfile> <fraction>
bamfile = sys.argv[1]
fraction = float(sys.argv[2])  # subsampling rate for template lengths

tlens = list()
n_proper_reads = 0

# Context manager so the bam handle is closed (the original leaked it).
with pysam.AlignmentFile(bamfile) as alignments:
    for read in alignments:
        if read.is_proper_pair:
            n_proper_reads += 1
            # Subsample so tlens stays small on large bams.
            if random.random() < fraction:
                tlens.append(read.template_length)

# template_length is signed (depends on mate orientation); use magnitudes.
print(numpy.median(numpy.abs(tlens)))
# Each proper pair contributes two reads. Fix: integer division so the pair
# count is printed as an int -- on Py3 the original '/' printed e.g. '1000.0',
# which int() in the downstream table scripts cannot parse.
print(n_proper_reads // 2)
--------------------------------------------------------------------------------
/CGC/read_or_pair_overlap_bed_and_unmapped.py:
--------------------------------------------------------------------------------
1 | import pysam
2 | import sys
3 |
4 | """
5 | Extract reads or pairs of reads that overlap a bed file.
6 |
7 | Part of the L1-EM package.
8 |
9 | Copyright (C) 2019 Wilson McKerrow
10 |
11 | This program is free software: you can redistribute it and/or modify
12 | it under the terms of the GNU General Public License as published by
13 | the Free Software Foundation, either version 3 of the License, or
14 | (at your option) any later version.
15 |
16 | This program is distributed in the hope that it will be useful,
17 | but WITHOUT ANY WARRANTY; without even the implied warranty of
18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 | GNU General Public License for more details.
20 |
21 | You should have received a copy of the GNU General Public License
22 | along with this program. If not, see .
23 |
24 | """
25 |
def main():
    """Extract reads overlapping a bed file, plus unmapped/half-mapped pairs.

    argv: bedfile bamfile outbamfile outunmappedbamfile [flanking] [maxNM]

    Pairs with a clean primary alignment inside a bed interval go to
    outbamfile; primary alignments from pairs with an unmapped mate go to
    outunmappedbamfile.
    """
    bedfile = sys.argv[1]
    bamfile = sys.argv[2]
    outbamfile = sys.argv[3]
    outunmappedbamfile = sys.argv[4]
    # Shrink each interval by 'flanking' bases on both sides (default 400).
    if len(sys.argv) > 5:
        flanking = int(sys.argv[5])
    else:
        flanking = 400
    # Maximum edit distance (NM tag) for a read to seed its pair's extraction.
    if len(sys.argv) > 6:
        maxNM = int(sys.argv[6])
    else:
        maxNM = 4

    inbam = pysam.AlignmentFile(bamfile, 'rb')
    outbam = pysam.AlignmentFile(outbamfile, 'wb', template=inbam)
    outunmappedbam = pysam.AlignmentFile(outunmappedbamfile, 'wb', template=inbam)

    # First pass: names of reads with a primary, unclipped, unspliced alignment
    # with few mismatches inside a bed interval.
    read_ids = set()
    with open(bedfile) as bed:
        for line in bed:
            chrom, start, stop = line.strip().split('\t')[:3]
            start = int(start) + flanking
            stop = int(stop) - flanking
            if chrom not in inbam.references:
                continue
            for read in inbam.fetch(chrom, start, stop):
                if read.is_unmapped:
                    continue
                if (not read.is_secondary and not read.is_supplementary
                        and 'S' not in read.cigarstring and 'N' not in read.cigarstring
                        and (not read.has_tag('NM') or read.get_tag('NM') <= maxNM)):
                    read_ids.add(read.query_name)

    # Second pass over the whole bam: write both mates of every selected pair,
    # and route unmapped/half-mapped pairs to the unmapped output. Reopen the
    # file to restart iteration from the beginning.
    inbam.close()
    inbam = pysam.AlignmentFile(bamfile, 'rb')

    for read in inbam:
        if read.query_name in read_ids:
            if not read.is_secondary and not read.is_supplementary:
                outbam.write(read)
        elif read.is_unmapped or read.mate_is_unmapped:
            if not read.is_secondary and not read.is_supplementary:
                outunmappedbam.write(read)

    inbam.close()
    outbam.close()
    # Fix: the original never closed outunmappedbam, which can leave the file
    # truncated (no BGZF EOF block) at interpreter exit.
    outunmappedbam.close()

if __name__ == '__main__':
    main()
79 |
--------------------------------------------------------------------------------
/CGC/report_l1_exp_counts.py:
--------------------------------------------------------------------------------
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle
import sys

"""
Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC
to analyze TCGA data.

Copyright (C) 2019 Wilson McKerrow

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.

"""

# Usage: report_l1_exp_counts.py <names.pkl> <X.pkl> <proper_pairs> <total>
# X_est: transcript name -> estimated relative expression (L1EM output).
with open(sys.argv[1], 'rb') as names_file, open(sys.argv[2], 'rb') as x_file:
    X_est = dict(zip(pickle.load(names_file), pickle.load(x_file)))

proper_pairs_in_original_bam = float(sys.argv[3])

total = float(sys.argv[4])

written_seqs = set()

print("family.category.locus.strand\tonly\t3prunon\tpassive_sense\tpassive_antisense\tantisense")

# Transcript categories reported, in column order (stranded pipeline).
CATEGORY_SUFFIXES = ('_only', '_3prunon', '_senserunthrough', '_antisenserunthrough', '_antisense')

for name in list(X_est.keys()):
    if 'exon' in name:
        continue
    # Transcript names are '<locus>_<category>'; collapse to the locus.
    seq_name = '_'.join(name.split('_')[:-1])
    if seq_name in written_seqs:
        continue
    written_seqs.add(seq_name)
    print_string = seq_name.split('(')[0]
    # dict.get replaces the original's five copy-pasted lookup blocks, which
    # also needlessly mutated X_est by inserting 0.0 entries.
    for suffix in CATEGORY_SUFFIXES:
        # Scale to fragments per million properly paired reads in the original bam.
        fpm = total * X_est.get(seq_name + suffix, 0.0) / proper_pairs_in_original_bam * 10**6
        print_string += '\t' + str(fpm)
    print(print_string)
--------------------------------------------------------------------------------
/CGC/report_l1_exp_counts_unstranded.py:
--------------------------------------------------------------------------------
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle
import sys

"""
Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC
to analyze TCGA data.

Copyright (C) 2019 Wilson McKerrow

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.

"""

# Usage: report_l1_exp_counts_unstranded.py <names.pkl> <X.pkl> <proper_pairs> <total>
# X_est: transcript name -> estimated relative expression (L1EM output).
with open(sys.argv[1], 'rb') as names_file, open(sys.argv[2], 'rb') as x_file:
    X_est = dict(zip(pickle.load(names_file), pickle.load(x_file)))

proper_pairs_in_original_bam = float(sys.argv[3])

total = float(sys.argv[4])

written_seqs = set()

print("family.category.locus.strand\tonly\t3prunon\tpassive")

# Transcript categories reported, in column order (unstranded pipeline).
CATEGORY_SUFFIXES = ('_only', '_3prunon', '_runthrough')

for name in list(X_est.keys()):
    if 'exon' in name:
        continue
    # Transcript names are '<locus>_<category>'; collapse to the locus.
    seq_name = '_'.join(name.split('_')[:-1])
    if seq_name in written_seqs:
        continue
    written_seqs.add(seq_name)
    print_string = seq_name.split('(')[0]
    # dict.get replaces the original's copy-pasted lookup blocks, which also
    # needlessly mutated X_est by inserting 0.0 entries.
    for suffix in CATEGORY_SUFFIXES:
        # Scale to fragments per million properly paired reads in the original bam.
        fpm = total * X_est.get(seq_name + suffix, 0.0) / proper_pairs_in_original_bam * 10**6
        print_string += '\t' + str(fpm)
    print(print_string)
--------------------------------------------------------------------------------
/CGC/total_orf1_and_orf2.py:
--------------------------------------------------------------------------------
import sys
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Sum intact-ORF LINE-1 expression (FPM) per sample and report totals.
# Usage: <pkl list TSV> <bam info list> <ORF1 list> <ORF2 list> <min FPM> <runthrough fraction>
exp_prob_pkls_list = sys.argv[1]        # TSV: names pickle <TAB> X_est pickle, one sample per line
bam_info_list = sys.argv[2]             # list of bam info files (line 2: mapped pairs, line 3: L1 pairs)
orf1_list = sys.argv[3]                 # loci with intact ORF1, one per line
orf2_list = sys.argv[4]                 # loci with intact ORF2, one per line
min_FPM = float(sys.argv[5])            # loci below this expression floor are zeroed
allowed_runthrough_fraction = float(sys.argv[6])  # max tolerated runthrough/(runthrough+expressed)

l1pa_pairs = dict()
mapped_pairs = dict()

# Load the intact-ORF locus lists.
orf1_intact = set()
for line in open(orf1_list):
    orf1_intact.add(line.strip())
orf2_intact = set()
for line in open(orf2_list):
    orf2_intact.add(line.strip())

# Per-sample read-pair totals used to convert relative expression to FPM.
for line in open(bam_info_list):
    name = line.strip().split('/')[-1][:-4]
    baminfo = open(line.strip()).readlines()
    mapped_pairs[name] = int(baminfo[1])
    l1pa_pairs[name] = int(baminfo[2])

print('name\torf1_FPM\tORF2_FPM\tboth_FPM\tL1HS_expression_FPM\tL1HS_all_FPM')

for line in open(exp_prob_pkls_list):
    names_file, X_file = line.strip().split('\t')
    sample_name = names_file.split('/')[-1][:-16]
    exp_prob = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb'))))
    orf1 = 0.0
    orf2 = 0.0
    both = 0.0
    L1HS_exp = 0.0
    L1HS_all = 0.0
    for transcript in exp_prob:
        if 'L1HS' in transcript:
            L1HS_all += exp_prob[transcript]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6
        # Use the '_only' entry as the anchor so each locus is handled once.
        if 'only' not in transcript:
            continue
        seq_name = '_'.join(transcript.split('_')[:-1])
        only_name = seq_name+'_only'
        runon_name = seq_name+'_3prunon'
        runthrough_name = seq_name+'_runthrough'
        FPM = 0.0
        FPM += exp_prob[only_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6
        if runon_name in exp_prob:
            FPM += exp_prob[runon_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6
        if runthrough_name in exp_prob:
            runthrough_FPM = exp_prob[runthrough_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6
        else:
            runthrough_FPM = 0.0
        # Apply the expression floor and the runthrough filter. Guard the ratio
        # against 0/0 when both estimates are zero (reachable when min_FPM <= 0):
        # a locus with no signal trivially passes and contributes nothing anyway.
        combined_FPM = runthrough_FPM + FPM
        FPM *= FPM >= min_FPM and (combined_FPM == 0.0 or runthrough_FPM/combined_FPM <= allowed_runthrough_fraction)
        # Strip the '(...)' suffix and the final two characters to recover the
        # locus name used in the ORF lists -- presumably a strand/copy suffix;
        # TODO confirm against the reference naming convention.
        locus = seq_name.split('(')[0][:-2]
        if locus in orf1_intact:
            orf1 += FPM
        if locus in orf2_intact:
            orf2 += FPM
        if locus in orf1_intact and locus in orf2_intact:
            both += FPM
        if 'L1HS' in seq_name:
            L1HS_exp += FPM
    print(sample_name +'\t'+ str(orf1) +'\t'+ str(orf2) +'\t'+ str(both) +'\t'+ str(L1HS_exp) +'\t'+ str(L1HS_all))
69 |
--------------------------------------------------------------------------------
/CGC/total_orf1_and_orf2_stranded.py:
--------------------------------------------------------------------------------
import sys
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Sum intact-ORF LINE-1 expression (FPM) per sample and report totals (stranded
# version: passive transcription is the '_senserunthrough' category).
# Usage: <pkl list TSV> <bam info list> <ORF1 list> <ORF2 list> <min FPM> <runthrough fraction>
exp_prob_pkls_list = sys.argv[1]        # TSV: names pickle <TAB> X_est pickle, one sample per line
bam_info_list = sys.argv[2]             # list of bam info files (line 2: mapped pairs, line 3: L1 pairs)
orf1_list = sys.argv[3]                 # loci with intact ORF1, one per line
orf2_list = sys.argv[4]                 # loci with intact ORF2, one per line
min_FPM = float(sys.argv[5])            # loci below this expression floor are zeroed
allowed_runthrough_fraction = float(sys.argv[6])  # max tolerated runthrough/(runthrough+expressed)

l1pa_pairs = dict()
mapped_pairs = dict()

# Load the intact-ORF locus lists.
orf1_intact = set()
for line in open(orf1_list):
    orf1_intact.add(line.strip())
orf2_intact = set()
for line in open(orf2_list):
    orf2_intact.add(line.strip())

# Per-sample read-pair totals used to convert relative expression to FPM.
for line in open(bam_info_list):
    name = line.strip().split('/')[-1][:-4]
    baminfo = open(line.strip()).readlines()
    mapped_pairs[name] = int(baminfo[1])
    l1pa_pairs[name] = int(baminfo[2])

print('name\torf1_FPM\tORF2_FPM\tboth_FPM\tL1HS_expression_FPM\tL1HS_all_FPM')

for line in open(exp_prob_pkls_list):
    names_file, X_file = line.strip().split('\t')
    sample_name = names_file.split('/')[-1][:-16]
    exp_prob = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb'))))
    orf1 = 0.0
    orf2 = 0.0
    both = 0.0
    L1HS_exp = 0.0
    L1HS_all = 0.0
    for transcript in exp_prob:
        if 'L1HS' in transcript:
            L1HS_all += exp_prob[transcript]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6
        # Use the '_only' entry as the anchor so each locus is handled once.
        if 'only' not in transcript:
            continue
        seq_name = '_'.join(transcript.split('_')[:-1])
        only_name = seq_name+'_only'
        runon_name = seq_name+'_3prunon'
        runthrough_name = seq_name+'_senserunthrough'
        FPM = 0.0
        FPM += exp_prob[only_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6
        if runon_name in exp_prob:
            FPM += exp_prob[runon_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6
        if runthrough_name in exp_prob:
            runthrough_FPM = exp_prob[runthrough_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6
        else:
            runthrough_FPM = 0.0
        # Apply the expression floor and the runthrough filter. Guard the ratio
        # against 0/0 when both estimates are zero (reachable when min_FPM <= 0):
        # a locus with no signal trivially passes and contributes nothing anyway.
        combined_FPM = runthrough_FPM + FPM
        FPM *= FPM >= min_FPM and (combined_FPM == 0.0 or runthrough_FPM/combined_FPM <= allowed_runthrough_fraction)
        # Strip the '(...)' suffix and the final two characters to recover the
        # locus name used in the ORF lists -- presumably a strand/copy suffix;
        # TODO confirm against the reference naming convention.
        locus = seq_name.split('(')[0][:-2]
        if locus in orf1_intact:
            orf1 += FPM
        if locus in orf2_intact:
            orf2 += FPM
        if locus in orf1_intact and locus in orf2_intact:
            both += FPM
        if 'L1HS' in seq_name:
            L1HS_exp += FPM
    print(sample_name +'\t'+ str(orf1) +'\t'+ str(orf2) +'\t'+ str(both) +'\t'+ str(L1HS_exp) +'\t'+ str(L1HS_all))
69 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM continuumio/miniconda3:4.5.12
2 |
3 | WORKDIR /
4 |
5 | RUN conda install -y --override-channels -c bioconda -c conda-forge -c defaults python=2.7.15 bwa=0.7.17 samtools=1.9 numpy=1.14.3 scipy=1.1.0 pysam=0.15.0 bedtools=2.27.1
6 | RUN git clone https://github.com/FenyoLab/L1EM/
7 |
8 |
--------------------------------------------------------------------------------
/L1EM.yml:
--------------------------------------------------------------------------------
1 | name: L1EM
2 | channels:
3 | - bioconda
4 | - conda-forge
5 | - defaults
6 | dependencies:
7 | - python=2.7.15
8 | - bwa=0.7.17
9 | - samtools=1.9
10 | - numpy=1.14.3
11 | - scipy=1.1.0
12 | - pysam=0.15.0
13 | - bedtools=2.27.1
14 |
15 |
--------------------------------------------------------------------------------
/L1EM/G_of_R.py:
--------------------------------------------------------------------------------
1 | import pysam
2 | import sys
3 | import numpy
4 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
5 | try:
6 | import cPickle as pickle
7 | except ImportError:
8 | import pickle
9 | from scipy import sparse
10 | import datetime
11 | import argparse
12 |
13 | """
14 | This script reads through a bam file resulting from a bwa aln alignment to the L1EM reference.
15 | The output is a sparse matrix in which the rows are reads, the columns are transcripts
16 | and the entries are the likelihood of that read arising from that transcript.
17 | The matrix is pickled and saved. The column names are writted to a text file.
18 |
19 | Part of the L1-EM package.
20 |
21 | Copyright (C) 2019 Wilson McKerrow
22 |
23 | This program is free software: you can redistribute it and/or modify
24 | it under the terms of the GNU General Public License as published by
25 | the Free Software Foundation, either version 3 of the License, or
26 | (at your option) any later version.
27 |
28 | This program is distributed in the hope that it will be useful,
29 | but WITHOUT ANY WARRANTY; without even the implied warranty of
30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 | GNU General Public License for more details.
32 |
33 | You should have received a copy of the GNU General Public License
34 | along with this program. If not, see .
35 |
36 | """
37 |
38 | """
39 | This class stores relevant information about a read's potential alignment as a dictionary
40 | with references names as keys and as list of potential alignments to that reference name
41 | as values.
42 | """
class read_alignments(object):
    """Collects one read's candidate alignments, keyed by reference name.

    Each value is a list of alignment_at_name records (start, strand, likelihood P).
    """
    def __init__(self, alignment, rnames, P):
        ref = rnames[alignment.rname]
        self.alignments = {ref: [alignment_at_name(alignment.reference_start, alignment.is_reverse, P)]}

    # Record another candidate placement from a pysam aligned segment.
    def add(self, alignment, rnames, P):
        record = alignment_at_name(alignment.reference_start, alignment.is_reverse, P)
        self.alignments.setdefault(rnames[alignment.rname], []).append(record)

    # Record a candidate placement parsed from a bwa XA tag (see parseXA).
    def addXA(self, refname, start, is_reverse, P):
        record = alignment_at_name(start, is_reverse, P)
        self.alignments.setdefault(refname, []).append(record)
59 |
60 | # Stores position, strand and likelihood for an alignment.
class alignment_at_name(object):
    """One candidate alignment: reference start, strand, and likelihood.

    P is the alignment likelihood, error_prob ** edit_distance (see callers).
    """
    def __init__(self,start,is_reverse,P):
        # 0-based reference start (pysam reference_start convention).
        self.start = start
        # True when the read aligned to the reverse strand.
        self.is_reverse = is_reverse
        # Likelihood assigned to this alignment.
        self.P = P
66 |
67 | # Read command line arguments
def GetArgs():
    """Parse command line arguments for G_of_R.py.

    Returns a tuple: (bamfile, error_prob, max_start2start_len, reads_per_pickle,
    prefix, NMdiff, insert_mean, flanking, as_start, wiggle, min_len, min_exon_len).

    Note: the original nested ParseArgs defined a Parser(argparse.ArgumentParser)
    subclass that was never instantiated (dead code); it has been removed and the
    needless nesting flattened. Behavior is unchanged.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--bamfile',
                        type=str,
                        required=True,
                        help='Bam to generate alignments from. Required.')
    parser.add_argument('-e', '--error_prob',
                        required=False,
                        default=0.01,
                        type=float,
                        help='Probability of an alignment mismatch. [0.01]')
    parser.add_argument('-m', '--max_start2start_len',
                        required=False,
                        default=500,
                        type=int,
                        help='Maximium distance between read starts to be considered concordant. [500]')
    parser.add_argument('-r', '--reads_per_pickle',
                        required=False,
                        default=12500,
                        type=int,
                        help='Split output into chunks of this many reads. [12500]')
    parser.add_argument('-p', '--prefix',
                        required=False,
                        default='G_of_R',
                        type=str,
                        help='Prefix for output file(s) [G_of_R]')
    parser.add_argument('-n', '--NMdiff',
                        required=False,
                        default=2,
                        type=int,
                        help='Ignore alignments with edit distance that exceed the best alignment by more than this number. [2]')
    parser.add_argument('-i', '--insert_mean',
                        required=True,
                        type=float,
                        help='Median template length. Required.')
    parser.add_argument('--flanking',
                        required=False,
                        default=400,
                        type=int,
                        help='Number of flanking bases included on each end of repeats in reference fasta. [400]')
    parser.add_argument('--as_start',
                        required=False,
                        default=500,
                        type=int,
                        help='Position of the antisense TSS in L1. [500]')
    parser.add_argument('-w', '--wiggle',
                        required=False,
                        default=20,
                        type=int,
                        help='Extend L1 annotation this many bases in both directions. [20]')
    parser.add_argument('--min_len',
                        required=False,
                        default=500,
                        type=int,
                        help='When alignments probabilities are normalized for element length take max of elements length and this value. [500]')
    parser.add_argument('--min_exon_len',
                        required=False,
                        default=100,
                        type=int,
                        help='When alignments probabilities are normalized for exon length take max of elements length and this value. [100]')
    args = parser.parse_args()

    return args.bamfile, args.error_prob, args.max_start2start_len, args.reads_per_pickle, args.prefix, args.NMdiff, args.insert_mean, args.flanking, args.as_start, args.wiggle, args.min_len, args.min_exon_len
141 |
142 | """
143 | Takes as input alignments (read_alignments class) of two paired reads and returns a sparse
144 | row matrix with the likelihoods of all properly paired alignments.
145 | """
def get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len):
    """Score every properly paired placement of a read pair against each transcript model.

    Returns a 1 x (5*nreps) sparse CSR row. The column layout matches the
    *_TE_list.txt written by main(): [0,n) sense runthrough, [n,2n) antisense
    runthrough, [2n,3n) 'only' (proper L1 transcript), [3n,4n) 3' runon,
    [4n,5n) antisense-promoter transcript, where n = nreps. Each contribution
    is P1*P2 divided by an effective transcript length.
    """
    this_G_of_R = numpy.zeros(5*nreps)
    for refname in alignments1.alignments:
        # A pair is concordant only if both mates hit the same reference.
        if refname not in alignments2.alignments:
            continue
        for aln1 in alignments1.alignments[refname]:
            for aln2 in alignments2.alignments[refname]:
                # Mates must map to opposite strands.
                if aln1.is_reverse == aln2.is_reverse:
                    continue
                if max(aln1.start,aln2.start)-min(aln1.start,aln2.start) <= max_start2start_len:
                    # Reference naming: the second dot-field appears to encode
                    # '1' = locus with 5' UTR, '2' = exon sequence -- TODO confirm.
                    has_5pUTR = refname.split('.')[1]=='1'
                    if refname.split('.')[1]=='2':
                        # Exon references contribute only to the 'only' column,
                        # normalized by (length - insert), floored at min_exon_len.
                        this_G_of_R[2*nreps+rnames_index[refname]] += aln1.P*aln2.P/(max(rlens[rnames_index[refname]]-insert_mean,min_exon_len))
                        continue
                    # Fragment is sense-strand when read2 is the reverse-mapped mate.
                    is_sense = not aln2.is_reverse
                    # Pair contained within the element boundaries (with `wiggle` slack)?
                    # References include `flanking` extra bases on each side.
                    within_5p = min(aln1.start,aln2.start) > flanking -wiggle
                    within_3p = max(aln1.start,aln2.start)+read_length < rlens[rnames_index[refname]]-flanking +wiggle
                    # Require at least partial overlap with the element itself.
                    overlap_element = max(aln1.start,aln2.start)+read_length > flanking and min(aln1.start,aln2.start) < rlens[rnames_index[refname]]-flanking
                    if not overlap_element:
                        continue
                    # Sense/antisense runthrough: any overlapping pair qualifies.
                    if is_sense:
                        this_G_of_R[rnames_index[refname]] += aln1.P*aln2.P/(rlens[rnames_index[refname]]-2*flanking+insert_mean+2*wiggle)
                    if not is_sense:
                        this_G_of_R[nreps+rnames_index[refname]] += aln1.P*aln2.P/(rlens[rnames_index[refname]]-2*flanking+insert_mean+2*wiggle)
                    # 'only' transcript: fully contained, sense, locus has a 5' UTR.
                    if within_5p and within_3p and is_sense and has_5pUTR:
                        this_G_of_R[2*nreps+rnames_index[refname]] += aln1.P*aln2.P/max(rlens[rnames_index[refname]]-2*flanking-insert_mean+2*wiggle,min_len)
                    # 3' runon: only the 5' boundary is required.
                    if within_5p and is_sense and has_5pUTR:
                        this_G_of_R[3*nreps+rnames_index[refname]] += aln1.P*aln2.P/(rlens[rnames_index[refname]]-2*flanking+2*wiggle)
                    # Antisense-promoter transcript: antisense pair ending before
                    # the antisense TSS (as_start) of a 5'-UTR-bearing locus.
                    if has_5pUTR and rlens[rnames_index[refname]] > flanking+as_start and max(aln1.start,aln2.start)+read_length < flanking+as_start and (not is_sense) and rlens[rnames_index[refname]] > flanking+as_start:
                        this_G_of_R[4*nreps+rnames_index[refname]] += aln1.P*aln2.P/(as_start+insert_mean+wiggle)
    return sparse.csr_matrix(this_G_of_R)
177 |
178 | # Parse secondary alignments in the XA tag from bwa aln.
# Parse secondary alignments in the XA tag from bwa aln.
def parseXA(alignments,XAtagdict,error_prob,maxNM,reversed):
    """Add each usable XA-tag secondary alignment to `alignments` and return it.

    XA entries look like 'ref,+123,50M,2;...'. Entries whose CIGAR is clipped
    (contains S or H) or whose edit distance exceeds maxNM are skipped.
    The `reversed` flag is unused; strand comes straight from the XA sign
    (a mate-relative interpretation exists only as commented-out history).
    """
    for raw in XAtagdict.split(';')[:-1]:
        fields = raw.split(',')
        refname = fields[0]
        # Strand is encoded as a leading +/- on the position field.
        is_reverse = fields[1].startswith('-')
        start = int(fields[1][1:])
        cigarstring = fields[2]
        NM = int(fields[3])
        if NM <= maxNM and 'S' not in cigarstring and 'H' not in cigarstring:
            alignments.addXA(refname, start, is_reverse, error_prob**NM)
    return alignments
194 |
def main():
    """Stream a name-sorted BAM of bwa aln alignments and emit the
    read-pair-by-transcript likelihood matrix as pickled sparse CSR chunks.

    Outputs:
      <prefix>_TE_list.txt -- column (transcript) names and effective lengths
      <prefix>.<k>.pk2     -- pickled CSR chunks of up to reads_per_pickle rows
      <prefix>_list.txt    -- names of the chunk files, one per line
    """
    bamfile, error_prob, max_start2start_len, reads_per_pickle, prefix, NMdiff, insert_mean, flanking, as_start, wiggle, min_len, min_exon_len = GetArgs()

    pickle_num = 0

    bam = pysam.Samfile(bamfile, "rb")
    rnames = bam.references
    rlens = bam.lengths
    nreps = len(rnames)
    rnames_index = dict()
    for i in range(nreps):
        rnames_index[rnames[i]] = i

    # Write transcript (column) names
    # Order must match the column layout in get_concardant_alignments:
    # senserunthrough, antisenserunthrough, only, 3prunon, antisense.
    TEnamefile = open(prefix+'_TE_list.txt','w')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_senserunthrough'+'\t'+str(rlens[i]+2*flanking)+'\n')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_antisenserunthrough'+'\t'+str(rlens[i]+2*flanking)+'\n')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_only'+'\t'+str(rlens[i])+'\n')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_3prunon'+'\t'+str(rlens[i]+flanking)+'\n')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_antisense'+'\t'+str(flanking+as_start)+'\n')
    TEnamefile.close()

    read_id = None

    G_of_R = None
    G_of_R_list_file = open(prefix+'_list.txt','w')
    G_of_R_row = 0

    starttime = datetime.datetime.now()

    # Read through the name sorted bam file
    for alignment in bam:
        read_length = alignment.query_length
        # Throw out alignments that are unmapped, clipped or low quality
        if alignment.is_unmapped:
            continue
        # Only plain-match CIGARs (M/I/D) are scored; anything else is skipped.
        if 'N' in alignment.cigarstring or 'S' in alignment.cigarstring or 'H' in alignment.cigarstring or 'P' in alignment.cigarstring or '=' in alignment.cigarstring or 'X' in alignment.cigarstring:
            continue
        # Require mean base quality of at least 30.
        if numpy.mean(alignment.query_qualities) < 30:
            continue

        # First usable alignment: start the first fragment group.
        if not read_id:
            read_id = alignment.qname
            new_read_id1 = True
            new_read_id2 = True

        # Once we have read all entries for a given query name, create a row for that fragment
        if read_id != alignment.qname:
            # Only emit a row when both mates had at least one usable alignment.
            if not (new_read_id1 or new_read_id2):
                this_G_of_R = get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len)
                # Don't add empty rows.
                if this_G_of_R.nnz > 0:
                    if G_of_R_row > 0:
                        G_of_R = sparse.vstack([G_of_R,this_G_of_R])
                    else:
                        G_of_R = this_G_of_R
                    G_of_R_row += 1
                    # If necessary, break up matrix into multiple pickle files.
                    if G_of_R_row >= reads_per_pickle:
                        pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL)
                        G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n')
                        pickle_num += 1
                        G_of_R_row = 0
                        G_of_R = None
                        print('wrote '+str(reads_per_pickle)+' reads in '+str(datetime.datetime.now()-starttime))
                        starttime = datetime.datetime.now()

            read_id = alignment.qname
            new_read_id1 = True
            new_read_id2 = True

        # Parse primary alignment
        # There's a bug in bwa samse (0.7.17) when writing NM tag for overlapping read pairs,
        # so edit distance is reconstructed from XM (mismatches) plus the lengths of all
        # non-M CIGAR operations (op code > 0, i.e. indels).
        NMtag = dict(alignment.tags)['XM']
        for pair in alignment.cigartuples:
            NMtag += (pair[0]>0)*pair[1]
        P = error_prob**NMtag

        # Accumulate candidate placements separately for read1 and read2.
        if alignment.is_read1:
            if new_read_id1:
                alignments1 = read_alignments(alignment,rnames,P)
                new_read_id1 = False
            else:
                alignments1.add(alignment,rnames,P)
        else:
            if new_read_id2:
                alignments2 = read_alignments(alignment,rnames,P)
                new_read_id2 = False
            else:
                alignments2.add(alignment,rnames,P)

        # Parse secondary alignments (XA entries within NMdiff of the primary).
        if 'XA' in dict(alignment.tags):
            if alignment.is_read1:
                alignments1 = parseXA(alignments1,dict(alignment.tags)['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse)
            else:
                alignments2 = parseXA(alignments2,dict(alignment.tags)['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse)

    # Make row for last read
    if read_id and not (new_read_id1 or new_read_id2):
        this_G_of_R = get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len)
        if this_G_of_R.nnz > 0:
            if G_of_R_row > 0:
                G_of_R = sparse.vstack([G_of_R,this_G_of_R])
            else:
                G_of_R = this_G_of_R

    # Write matrix to disk.
    pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL)
    G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n')
    # Report the total number of rows (read pairs) written.
    print(G_of_R_row+reads_per_pickle*pickle_num)
310 |
311 | if __name__ == '__main__':
312 | main()
313 |
--------------------------------------------------------------------------------
/L1EM/G_of_R_single_unstranded.py:
--------------------------------------------------------------------------------
1 | import pysam
2 | import sys
3 | import numpy
4 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
5 | try:
6 | import cPickle as pickle
7 | except ImportError:
8 | import pickle
9 | from scipy import sparse
10 | import datetime
11 | import argparse
12 |
13 | """
14 | This script reads through a bam file resulting from a bwa aln alignment to the L1EM reference.
15 | The output is a sparse matrix in which the rows are reads, the columns are transcripts
16 | and the entries are the likelihood of that read arising from that transcript.
17 | The matrix is pickled and saved. The column names are writted to a text file.
18 |
19 | Copyright (C) 2019 Wilson McKerrow
20 |
21 | This program is free software: you can redistribute it and/or modify
22 | it under the terms of the GNU General Public License as published by
23 | the Free Software Foundation, either version 3 of the License, or
24 | (at your option) any later version.
25 |
26 | This program is distributed in the hope that it will be useful,
27 | but WITHOUT ANY WARRANTY; without even the implied warranty of
28 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 | GNU General Public License for more details.
30 |
31 | You should have received a copy of the GNU General Public License
32 | along with this program. If not, see .
33 |
34 | """
35 |
36 | """
37 | This class stores relevant information about a read's potential alignment as a dictionary
38 | with references names as keys and as list of potential alignments to that reference name
39 | as values.
40 | """
class read_alignments(object):
    """Collects one read's candidate alignments, keyed by reference name.

    Each value is a list of alignment_at_name records (start, strand, likelihood P).
    """
    def __init__(self, alignment, rnames, P):
        ref = rnames[alignment.rname]
        self.alignments = {ref: [alignment_at_name(alignment.reference_start, alignment.is_reverse, P)]}

    # Record another candidate placement from a pysam aligned segment.
    def add(self, alignment, rnames, P):
        record = alignment_at_name(alignment.reference_start, alignment.is_reverse, P)
        self.alignments.setdefault(rnames[alignment.rname], []).append(record)

    # Record a candidate placement parsed from a bwa XA tag (see parseXA).
    def addXA(self, refname, start, is_reverse, P):
        record = alignment_at_name(start, is_reverse, P)
        self.alignments.setdefault(refname, []).append(record)
57 |
58 | # Stores position, strand and likelihood for an alignment.
class alignment_at_name(object):
    """One candidate alignment: reference start, strand, and likelihood.

    P is the alignment likelihood, error_prob ** edit_distance (see callers).
    """
    def __init__(self,start,is_reverse,P):
        # 0-based reference start (pysam reference_start convention).
        self.start = start
        # True when the read aligned to the reverse strand.
        self.is_reverse = is_reverse
        # Likelihood assigned to this alignment.
        self.P = P
64 |
65 | # Read command line arguments
def GetArgs():
    """Parse command line arguments for G_of_R_single_unstranded.py.

    Returns a tuple: (bamfile, error_prob, reads_per_pickle, prefix, NMdiff,
    flanking, wiggle, min_len, min_exon_len).

    Note: the original nested ParseArgs defined a Parser(argparse.ArgumentParser)
    subclass that was never instantiated (dead code); it has been removed and the
    needless nesting flattened. Behavior is unchanged.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--bamfile',
                        type=str,
                        required=True,
                        help='Bam to generate alignments from. Required.')
    parser.add_argument('-e', '--error_prob',
                        required=False,
                        default=0.01,
                        type=float,
                        help='Probability of an alignment mismatch. [0.01]')
    parser.add_argument('-r', '--reads_per_pickle',
                        required=False,
                        default=12500,
                        type=int,
                        help='Split output into chunks of this many reads. [12500]')
    parser.add_argument('-p', '--prefix',
                        required=False,
                        default='G_of_R',
                        type=str,
                        help='Prefix for output file(s) [G_of_R]')
    parser.add_argument('-n', '--NMdiff',
                        required=False,
                        default=2,
                        type=int,
                        help='Ignore alignments with edit distance that exceed the best alignment by more than this number. [2]')
    parser.add_argument('--flanking',
                        required=False,
                        default=400,
                        type=int,
                        help='Number of flanking bases included on each end of repeats in reference fasta. [400]')
    parser.add_argument('-w', '--wiggle',
                        required=False,
                        default=20,
                        type=int,
                        help='Extend L1 annotation this many bases in both directions. [20]')
    parser.add_argument('--min_len',
                        required=False,
                        default=500,
                        type=int,
                        help='When alignments probabilities are normalized for element length take max of elements length and this value. [500]')
    parser.add_argument('--min_exon_len',
                        required=False,
                        default=100,
                        type=int,
                        help='When alignments probabilities are normalized for exon length take max of elements length and this value. [100]')
    args = parser.parse_args()

    return args.bamfile, args.error_prob, args.reads_per_pickle, args.prefix, args.NMdiff, args.flanking, args.wiggle, args.min_len, args.min_exon_len
125 |
126 | """
127 | Takes as input alignments (read_alignments class) of two paired reads and returns a sparse
128 | row matrix with the likelihoods of all properly paired alignments.
129 | """
def make_G_of_R_row(alignments,rnames_index,rlens,nreps,read_length,flanking,wiggle,min_len,min_exon_len):
    """Score a single (unpaired) read's alignments against each transcript model.

    Returns a 1 x (3*nreps) sparse CSR row. The column layout matches the
    *_TE_list.txt written by main(): [0,n) runthrough, [n,2n) 'only' (proper
    L1 transcript), [2n,3n) 3' runon, where n = nreps. Each contribution is
    P divided by an effective transcript length.
    """
    this_G_of_R = numpy.zeros(3*nreps)
    for refname in alignments.alignments:
        for aln in alignments.alignments[refname]:
            # Reference naming: second dot-field '1' appears to mark loci with
            # a 5' UTR -- TODO confirm against the reference naming convention.
            has_5pUTR = refname.split('.')[1]=='1'
            # Read contained within the element boundaries (with `wiggle` slack)?
            # References include `flanking` extra bases on each side.
            within_5p = aln.start > flanking -wiggle
            within_3p = aln.start+read_length < rlens[rnames_index[refname]]-flanking +wiggle
            # Require at least partial overlap with the element itself.
            overlap_element = aln.start+read_length > flanking and aln.start < rlens[rnames_index[refname]]-flanking
            if not overlap_element:
                continue
            # Runthrough: any overlapping read qualifies.
            this_G_of_R[rnames_index[refname]] += aln.P/(rlens[rnames_index[refname]]-2*flanking+read_length+2*wiggle)
            # 'only' transcript: fully contained read on a 5'-UTR-bearing locus.
            if within_5p and within_3p and has_5pUTR:
                this_G_of_R[1*nreps+rnames_index[refname]] += aln.P/max(rlens[rnames_index[refname]]-2*flanking-read_length+2*wiggle,min_len)
            # 3' runon: only the 5' boundary is required.
            if within_5p and has_5pUTR:
                this_G_of_R[2*nreps+rnames_index[refname]] += aln.P/(rlens[rnames_index[refname]]-2*flanking+2*wiggle)
    return sparse.csr_matrix(this_G_of_R)
146 |
147 | # Parse secondary alignments in the XA tag from bwa aln.
# Parse secondary alignments in the XA tag from bwa aln.
def parseXA(alignments,XAtagdict,error_prob,maxNM,reversed):
    """Add each usable XA-tag secondary alignment to `alignments` and return it.

    XA entries look like 'ref,+123,50M,2;...'. Entries whose CIGAR is clipped
    (contains S or H) or whose edit distance exceeds maxNM are skipped.
    The `reversed` flag is unused; strand comes straight from the XA sign
    (a mate-relative interpretation exists only as commented-out history).
    """
    for raw in XAtagdict.split(';')[:-1]:
        fields = raw.split(',')
        refname = fields[0]
        # Strand is encoded as a leading +/- on the position field.
        is_reverse = fields[1].startswith('-')
        start = int(fields[1][1:])
        cigarstring = fields[2]
        NM = int(fields[3])
        if NM <= maxNM and 'S' not in cigarstring and 'H' not in cigarstring:
            alignments.addXA(refname, start, is_reverse, error_prob**NM)
    return alignments
163 |
def main():
    """Stream a name-sorted BAM of single-end bwa aln alignments and emit the
    read-by-transcript likelihood matrix as pickled sparse CSR chunks.

    Outputs:
      <prefix>_TE_list.txt -- column (transcript) names and effective lengths
      <prefix>.<k>.pk2     -- pickled CSR chunks of up to reads_per_pickle rows
      <prefix>_list.txt    -- names of the chunk files, one per line

    Fix: the final "row for last read" step now also checks `read_id`, so a BAM
    with no usable alignments no longer raises NameError on the never-assigned
    `new_read_id` (matches the guard already used in G_of_R.py).
    """
    bamfile, error_prob, reads_per_pickle, prefix, NMdiff, flanking, wiggle, min_len, min_exon_len = GetArgs()

    pickle_num = 0

    bam = pysam.Samfile(bamfile, "rb")
    rnames = bam.references
    rlens = bam.lengths
    nreps = len(rnames)
    rnames_index = dict()
    for i in range(nreps):
        rnames_index[rnames[i]] = i

    # Write transcript (column) names. Order must match the column layout in
    # make_G_of_R_row: runthrough, only, 3prunon.
    TEnamefile = open(prefix+'_TE_list.txt','w')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_runthrough'+'\t'+str(rlens[i]+2*flanking)+'\n')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_only'+'\t'+str(rlens[i])+'\n')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_3prunon'+'\t'+str(rlens[i]+flanking)+'\n')
    TEnamefile.close()

    read_id = None

    G_of_R = None
    G_of_R_list_file = open(prefix+'_list.txt','w')
    G_of_R_row = 0

    starttime = datetime.datetime.now()

    # Read through the name sorted bam file
    for alignment in bam:
        read_length = alignment.query_length
        # Throw out alignments that are unmapped, clipped or low quality
        if alignment.is_unmapped:
            continue
        if 'N' in alignment.cigarstring or 'S' in alignment.cigarstring or 'H' in alignment.cigarstring or 'P' in alignment.cigarstring or '=' in alignment.cigarstring or 'X' in alignment.cigarstring:
            continue
        # Require mean base quality of at least 30.
        if numpy.mean(alignment.query_qualities) < 30:
            continue

        # First usable alignment: start the first read group.
        if not read_id:
            read_id = alignment.qname
            new_read_id = True

        # Once we have read all entries for a given query name, create a row for that fragment
        if read_id != alignment.qname:
            if not new_read_id:
                this_G_of_R = make_G_of_R_row(alignments,rnames_index,rlens,nreps,read_length,flanking,wiggle,min_len,min_exon_len)
                # Don't add row if its empty
                if this_G_of_R.nnz > 0:
                    if G_of_R_row > 0:
                        G_of_R = sparse.vstack([G_of_R,this_G_of_R])
                    else:
                        G_of_R = this_G_of_R
                    G_of_R_row += 1
                    # If necessary, break up matrix into multiple pickle files.
                    if G_of_R_row >= reads_per_pickle:
                        pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL)
                        G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n')
                        pickle_num += 1
                        G_of_R_row = 0
                        G_of_R = None
                        print('wrote '+str(reads_per_pickle)+' reads in '+str(datetime.datetime.now()-starttime))
                        starttime = datetime.datetime.now()

            read_id = alignment.qname
            new_read_id = True

        # Parse primary alignment. The NM tag is used directly here; the
        # paired-end XM workaround in G_of_R.py is for overlapping read pairs
        # and does not apply to single-end alignments.
        NMtag = dict(alignment.tags)['NM']
        P = error_prob**NMtag

        if new_read_id:
            alignments = read_alignments(alignment,rnames,P)
            new_read_id = False
        else:
            alignments.add(alignment,rnames,P)

        # Parse secondary alignments (XA entries within NMdiff of the primary).
        if 'XA' in dict(alignment.tags):
            alignments = parseXA(alignments,dict(alignment.tags)['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse)

    # Make row for last read; guard read_id so an empty BAM is a no-op.
    if read_id and not new_read_id:
        this_G_of_R = make_G_of_R_row(alignments,rnames_index,rlens,nreps,read_length,flanking,wiggle,min_len,min_exon_len)
        if this_G_of_R.nnz > 0:
            if G_of_R_row > 0:
                G_of_R = sparse.vstack([G_of_R,this_G_of_R])
            else:
                G_of_R = this_G_of_R

    # Write matrix to disk.
    pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL)
    G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n')
    # Report the total number of rows (reads) written.
    print(G_of_R_row+reads_per_pickle*pickle_num)
262 |
# Entry point guard: run main() only when executed as a script, not on import.
if __name__ == '__main__':
	main()
265 |
--------------------------------------------------------------------------------
/L1EM/G_of_R_unstranded.py:
--------------------------------------------------------------------------------
1 | import pysam
2 | import sys
3 | import numpy
4 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
5 | try:
6 | import cPickle as pickle
7 | except ImportError:
8 | import pickle
9 | from scipy import sparse
10 | import datetime
11 | import argparse
12 |
13 | """
14 | This script reads through a bam file resulting from a bwa aln alignment to the L1EM reference.
15 | The output is a sparse matrix in which the rows are reads, the columns are transcripts
16 | and the entries are the likelihood of that read arising from that transcript.
17 | The matrix is pickled and saved. The column names are written to a text file.
18 |
19 | Copyright (C) 2019 Wilson McKerrow
20 |
21 | This program is free software: you can redistribute it and/or modify
22 | it under the terms of the GNU General Public License as published by
23 | the Free Software Foundation, either version 3 of the License, or
24 | (at your option) any later version.
25 |
26 | This program is distributed in the hope that it will be useful,
27 | but WITHOUT ANY WARRANTY; without even the implied warranty of
28 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 | GNU General Public License for more details.
30 |
31 | You should have received a copy of the GNU General Public License
32 | along with this program. If not, see <https://www.gnu.org/licenses/>.
33 |
34 | """
35 |
36 | """
37 | This class stores relevant information about a read's potential alignment as a dictionary
38 | with references names as keys and as list of potential alignments to that reference name
39 | as values.
40 | """
class read_alignments(object):
	"""All candidate alignments for a single read, grouped by reference name.

	self.alignments maps a reference name to the list of alignment_at_name
	records for that reference.
	"""
	def __init__(self, alignment, rnames, P):
		# Seed the dictionary with the first observed alignment.
		first = alignment_at_name(alignment.reference_start, alignment.is_reverse, P)
		self.alignments = {rnames[alignment.rname]: [first]}
	def add(self, alignment, rnames, P):
		"""Record another alignment from a pysam aligned segment."""
		entry = alignment_at_name(alignment.reference_start, alignment.is_reverse, P)
		self.alignments.setdefault(rnames[alignment.rname], []).append(entry)
	def addXA(self, refname, start, is_reverse, P):
		"""Record a secondary alignment parsed from a bwa XA tag."""
		entry = alignment_at_name(start, is_reverse, P)
		self.alignments.setdefault(refname, []).append(entry)
57 |
58 | # Stores position, strand and likelihood for an alignment.
class alignment_at_name(object):
	"""One candidate alignment: reference start, orientation, and likelihood."""
	def __init__(self, start, is_reverse, P):
		self.start = start              # 0-based reference start position
		self.is_reverse = is_reverse    # True if aligned to the minus strand
		self.P = P                      # alignment likelihood (error_prob ** edits)
64 |
65 | # Read command line arguments
def GetArgs():
	"""Parse command line options for building the G(R) matrix.

	Returns a 12-tuple:
	(bamfile, error_prob, max_start2start_len, reads_per_pickle, prefix,
	 NMdiff, insert_mean, flanking, as_start, wiggle, min_len, min_exon_len)

	Note: the previous version defined an unused ArgumentParser subclass and
	a needless nested helper; both were dead code and have been removed.
	"""
	parser = argparse.ArgumentParser()
	parser.add_argument('-b', '--bamfile',
						type=str,
						required=True,
						help='Bam to generate alignments from. Required.')
	parser.add_argument('-e', '--error_prob',
						required=False,
						default=0.01,
						type=float,
						help='Probability of an alignment mismatch. [0.01]')
	parser.add_argument('-m', '--max_start2start_len',
						required=False,
						default=500,
						type=int,
						help='Maximium distance between read starts to be considered concordant. [500]')
	parser.add_argument('-r', '--reads_per_pickle',
						required=False,
						default=12500,
						type=int,
						help='Split output into chunks of this many reads. [12500]')
	parser.add_argument('-p', '--prefix',
						required=False,
						default='G_of_R',
						type=str,
						help='Prefix for output file(s) [G_of_R]')
	parser.add_argument('-n', '--NMdiff',
						required=False,
						default=2,
						type=int,
						help='Ignore alignments with edit distance that exceed the best alignment by more than this number. [2]')
	parser.add_argument('-i', '--insert_mean',
						required=True,
						type=float,
						help='Median template length. Required.')
	parser.add_argument('--flanking',
						required=False,
						default=400,
						type=int,
						help='Number of flanking bases included on each end of repeats in reference fasta. [400]')
	parser.add_argument('--as_start',
						required=False,
						default=500,
						type=int,
						help='Position of the antisense TSS in L1. [500]')
	parser.add_argument('-w', '--wiggle',
						required=False,
						default=20,
						type=int,
						help='Extend L1 annotation this many bases in both directions. [20]')
	parser.add_argument('--min_len',
						required=False,
						default=500,
						type=int,
						help='When alignments probabilities are normalized for element length take max of elements length and this value. [500]')
	parser.add_argument('--min_exon_len',
						required=False,
						default=100,
						type=int,
						help='When alignments probabilities are normalized for exon length take max of elements length and this value. [100]')
	args = parser.parse_args()

	return args.bamfile, args.error_prob, args.max_start2start_len, args.reads_per_pickle, args.prefix, args.NMdiff, args.insert_mean, args.flanking, args.as_start,args.wiggle, args.min_len, args.min_exon_len
139 |
140 | """
141 | Takes as input alignments (read_alignments class) of two paired reads and returns a sparse
142 | row matrix with the likelihoods of all properly paired alignments.
143 | """
def get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len):
	"""Likelihoods of every properly paired alignment for one read pair.

	Returns a 1 x 3*nreps sparse row. Column blocks correspond to the three
	transcript variants per reference: [0, nreps) 'runthrough',
	[nreps, 2*nreps) 'only', [2*nreps, 3*nreps) '3prunon'.
	Each likelihood is the product of the two mates' alignment probabilities,
	normalized by the effective length of the transcript variant.
	NOTE(review): the as_start parameter is accepted but not used here.
	"""
	row = numpy.zeros(3*nreps)
	for refname in alignments1.alignments:
		# Only references hit by both mates can yield a proper pair.
		if refname not in alignments2.alignments:
			continue
		for a1 in alignments1.alignments[refname]:
			for a2 in alignments2.alignments[refname]:
				# Proper pairs must be on opposite strands ...
				if a1.is_reverse == a2.is_reverse:
					continue
				left = min(a1.start, a2.start)
				right = max(a1.start, a2.start)
				# ... and have their starts within the allowed template span.
				if right - left > max_start2start_len:
					continue
				idx = rnames_index[refname]
				rlen = rlens[idx]
				joint_P = a1.P * a2.P
				# References named '<fam>.2.<locus>' (second field '2') only
				# ever credit their 2*nreps column, with an exon-length norm.
				if refname.split('.')[1] == '2':
					row[2*nreps + idx] += joint_P / max(rlen - insert_mean, min_exon_len)
					continue
				has_5pUTR = refname.split('.')[1] == '1'
				within_5p = left > flanking - wiggle
				within_3p = right + read_length < rlen - flanking + wiggle
				# Skip pairs that fall entirely inside the flanking sequence.
				if not (right + read_length > flanking and left < rlen - flanking):
					continue
				row[idx] += joint_P / (rlen - 2*flanking + insert_mean + 2*wiggle)
				if has_5pUTR and within_5p and within_3p:
					row[nreps + idx] += joint_P / max(rlen - 2*flanking - insert_mean + 2*wiggle, min_len)
				if has_5pUTR and within_5p:
					row[2*nreps + idx] += joint_P / (rlen - 2*flanking + 2*wiggle)
	return sparse.csr_matrix(row)
169 |
170 | # Parse secondary alignments in the XA tag from bwa aln.
def parseXA(alignments,XAtagdict,error_prob,maxNM,reversed):
	"""Add bwa 'XA' secondary alignments to a read's alignment collection.

	XAtagdict is the raw XA tag value: ';'-terminated entries of the form
	'refname,[+-]pos,cigar,NM'. Entries that are clipped (S or H in the
	cigar) or whose edit distance exceeds maxNM are skipped; accepted
	entries get likelihood error_prob**NM.
	The primary alignment's strand ('reversed') is currently unused — the
	XA strand character alone determines is_reverse.
	Returns the (mutated) alignments object.
	"""
	for fields in (entry.split(',') for entry in XAtagdict.split(';')[:-1]):
		refname = fields[0]
		strand_pos = fields[1]
		is_reverse = strand_pos[0] == '-'
		start = int(strand_pos[1:])
		cigarstring = fields[2]
		NM = int(fields[3])
		acceptable = NM <= maxNM and 'S' not in cigarstring and 'H' not in cigarstring
		if acceptable:
			alignments.addXA(refname, start, is_reverse, error_prob**NM)
	return alignments
186 |
def main():
	"""Stream a name-sorted bam and pickle the G(R) likelihood matrix.

	For each read pair, builds one sparse row of per-transcript likelihoods
	(via get_concardant_alignments) and stacks rows into G_of_R, dumping a
	pickle every reads_per_pickle rows. Also writes the transcript (column)
	name list and the list of pickle files produced.
	"""
	bamfile, error_prob, max_start2start_len, reads_per_pickle, prefix, NMdiff, insert_mean, flanking, as_start, wiggle, min_len, min_exon_len = GetArgs()

	pickle_num = 0

	bam = pysam.Samfile(bamfile, "rb")
	rnames = bam.references
	rlens = bam.lengths
	nreps = len(rnames)
	# Map reference name -> column index within each of the three blocks.
	rnames_index = dict()
	for i in range(nreps):
		rnames_index[rnames[i]] = i

	# Write transcript (column) names: three variants per reference, in the
	# same block order used by get_concardant_alignments.
	TEnamefile = open(prefix+'_TE_list.txt','w')
	for i in range(nreps):
		TEnamefile.write(rnames[i]+'_runthrough'+'\t'+str(rlens[i]+2*flanking)+'\n')
	for i in range(nreps):
		TEnamefile.write(rnames[i]+'_only'+'\t'+str(rlens[i])+'\n')
	for i in range(nreps):
		TEnamefile.write(rnames[i]+'_3prunon'+'\t'+str(rlens[i]+flanking)+'\n')
	TEnamefile.close()

	read_id = None

	G_of_R = None
	# NOTE(review): G_of_R_list_file is never explicitly closed; relies on
	# interpreter exit to flush.
	G_of_R_list_file = open(prefix+'_list.txt','w')
	G_of_R_row = 0

	starttime = datetime.datetime.now()

	# Read through the name sorted bam file
	for alignment in bam:
		read_length = alignment.query_length
		# Throw out alignments that are unmapped, clipped or low quality
		if alignment.is_unmapped:
			continue
		if 'N' in alignment.cigarstring or 'S' in alignment.cigarstring or 'H' in alignment.cigarstring or 'P' in alignment.cigarstring or '=' in alignment.cigarstring or 'X' in alignment.cigarstring:
			continue
		if numpy.mean(alignment.query_qualities) < 30:
			continue

		# First retained alignment: initialize the per-fragment state.
		if not read_id:
			read_id = alignment.qname
			new_read_id1 = True
			new_read_id2 = True

		# Once we have read all entries for a given query name, create a row for that fragment
		if read_id != alignment.qname:
			# Only emit a row if both mates had at least one usable alignment.
			if not (new_read_id1 or new_read_id2):
				this_G_of_R = get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len)
				if this_G_of_R.nnz > 0:
					if G_of_R_row > 0:
						G_of_R = sparse.vstack([G_of_R,this_G_of_R])
					else:
						G_of_R = this_G_of_R
					G_of_R_row += 1
					# If necessary, break up matrix into multiple pickle files.
					if G_of_R_row >= reads_per_pickle:
						pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL)
						G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n')
						pickle_num += 1
						G_of_R_row = 0
						G_of_R = None
						print('wrote '+str(reads_per_pickle)+' reads in '+str(datetime.datetime.now()-starttime))
						starttime = datetime.datetime.now()

			read_id = alignment.qname
			new_read_id1 = True
			new_read_id2 = True

		# Parse primary alignment
		# There's a bug in bwa samse (0.7.17) when writing NM tag for overlapping read pairs
		# so the edit distance is rebuilt from XM (mismatches) plus the lengths
		# of all non-match (op code > 0) cigar operations.
		NMtag = dict(alignment.tags)['XM']
		for pair in alignment.cigartuples:
			NMtag += (pair[0]>0)*pair[1]
		P = error_prob**NMtag

		# Track mate 1 and mate 2 alignments separately.
		if alignment.is_read1:
			if new_read_id1:
				alignments1 = read_alignments(alignment,rnames,P)
				new_read_id1 = False
			else:
				alignments1.add(alignment,rnames,P)
		else:
			if new_read_id2:
				alignments2 = read_alignments(alignment,rnames,P)
				new_read_id2 = False
			else:
				alignments2.add(alignment,rnames,P)

		# Parse secondary alignments
		if 'XA' in dict(alignment.tags):
			if alignment.is_read1:
				alignments1 = parseXA(alignments1,dict(alignment.tags)['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse)
			else:
				alignments2 = parseXA(alignments2,dict(alignment.tags)['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse)

	# Make row for last read
	if read_id!=None and not (new_read_id1 or new_read_id2):
		this_G_of_R = get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len)
		if this_G_of_R.nnz > 0:
			if G_of_R_row > 0:
				G_of_R = sparse.vstack([G_of_R,this_G_of_R])
			else:
				G_of_R = this_G_of_R

	# Write matrix to disk. Skip only if no reads were ever accumulated.
	if G_of_R_row+reads_per_pickle*pickle_num >0:
		pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL)
		G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n')
	# Report the total number of rows (reads) written.
	print(G_of_R_row+reads_per_pickle*pickle_num)
299 |
# Entry point guard: run main() only when executed as a script, not on import.
if __name__ == '__main__':
	main()
302 |
--------------------------------------------------------------------------------
/L1EM/L1EM.py:
--------------------------------------------------------------------------------
1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
2 | try:
3 | import cPickle as pickle
4 | except ImportError:
5 | import pickle
6 | import numpy
7 | import sys
8 | import datetime
9 | from scipy import sparse
10 | from multiprocessing import Pool
11 | import argparse
12 |
13 | """
14 | This code takes as input the output of G_of_R.py and runs the EM algorithm to estimate
15 | transcript abundances.
16 |
17 | Part of the L1-EM package.
18 |
19 | Copyright (C) 2019 Wilson McKerrow
20 |
21 | This program is free software: you can redistribute it and/or modify
22 | it under the terms of the GNU General Public License as published by
23 | the Free Software Foundation, either version 3 of the License, or
24 | (at your option) any later version.
25 |
26 | This program is distributed in the hope that it will be useful,
27 | but WITHOUT ANY WARRANTY; without even the implied warranty of
28 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 | GNU General Public License for more details.
30 |
31 | You should have received a copy of the GNU General Public License
32 | along with this program. If not, see <https://www.gnu.org/licenses/>.
33 | """
34 |
35 | # Main calculation for the E step
# Main calculation for the E step
def calculate_expcounts(G_of_R_pkl,X):
	"""E-step contribution from one pickled G(R) chunk.

	G_of_R_pkl: path to a pickle holding either a sparse (nreads x nTE)
	likelihood matrix or None (an empty chunk).
	X: sparse 1 x nTE row of current expression-fraction estimates.

	Returns (exp_counts, loglik): the per-transcript expected read counts
	(1 x nTE) and the chunk's log-likelihood, or (0.0, 0.0) for an empty
	chunk. Rows whose total likelihood underflows below 1e-200 are dropped;
	a non-finite log-likelihood discards the whole chunk.
	"""
	# Close the file handle deterministically (was previously left to GC on
	# error paths).
	with open(G_of_R_pkl,'rb') as G_of_R_file:
		G_of_R = pickle.load(G_of_R_file)
	# Identity check: '== None' on a scipy sparse matrix triggers an
	# elementwise comparison instead of testing for the empty-chunk sentinel.
	if G_of_R is None:
		return 0.0,0.0
	# .tocsr() guarantees boolean row indexing works even when multiply()
	# returns a COO matrix (scipy-version dependent for broadcast multiply).
	L_of_R_mat = G_of_R.multiply(X).tocsr()
	# Per-read total likelihood under the current X.
	L_of_R = numpy.array(L_of_R_mat.sum(1))
	# Drop reads whose likelihood underflowed to (near) zero.
	L_of_R_mat = L_of_R_mat[L_of_R[:,0]>=10**-200,:]
	L_of_R = L_of_R[L_of_R>=10**-200]
	# Normalize each row to posterior probabilities, then sum over reads.
	L_of_R_inv = sparse.csr_matrix(1.0/L_of_R).transpose()
	exp_counts = L_of_R_mat.multiply(L_of_R_inv).sum(0)
	loglik = numpy.sum(numpy.log(L_of_R))
	if numpy.isfinite(loglik):
		return exp_counts,loglik
	return numpy.zeros(G_of_R.shape[1]),0.0
53 |
54 | # Divide send each thread a chunk of the G_of_R pkl files.
def calculate_expcounts_chunk(input):
	"""Worker entry: accumulate E-step results over a list of G(R) pickles.

	input is a (pickle_path_list, X) pair, where X is the current sparse
	1 x nTE expression estimate. Returns the summed expected counts and
	summed log-likelihood across all chunks in the list.
	"""
	pkl_paths, X = input
	total_counts = numpy.zeros(X.shape, dtype=numpy.float64)
	total_loglik = 0.0
	for path in pkl_paths:
		chunk_counts, chunk_loglik = calculate_expcounts(path, X)
		total_counts += chunk_counts
		total_loglik += chunk_loglik
	return total_counts, total_loglik
64 |
65 | # Parse commandline arguments
# Parse commandline arguments
def GetArgs():
	"""Parse command line options for the EM run.

	Returns a 7-tuple:
	(G_of_R_list, TE_list, stop_thresh, report_every, max_nEMsteps,
	 nThreads, prefix)

	Note: the previous version defined an unused ArgumentParser subclass and
	a needless nested helper; both were dead code and have been removed.
	"""
	parser = argparse.ArgumentParser()
	parser.add_argument('-g', '--G_of_R_list',
						type=str,
						required=True,
						help='Text file listing paths to chunks of the G(R) matrix.')
	parser.add_argument('-l', '--TE_list',
						required=True,
						type=str,
						help='Text file listing the names of all transcripts. Output of G_of_R.py.')
	parser.add_argument('-s', '--stop_thresh',
						required=False,
						default=10**-7,
						type=float,
						help='Continue EM iterations until no transcription expression fraction (X_i) changes by more than this value. [1e-7]')
	parser.add_argument('-r', '--report_every',
						required=False,
						default=100,
						type=int,
						help='Write X every this many steps. [100]')
	parser.add_argument('-m', '--max_nEMsteps',
						required=False,
						default=10000,
						type=int,
						help='Terminate if threshold has not been reached after this many EM steps [10000]')
	parser.add_argument('-t', '--nThreads',
						required=False,
						default=16,
						type=int,
						help='Divide E step into this many threads. [16]')
	parser.add_argument('-p', '--prefix',
						required=False,
						type=str,
						default='',
						help='If specified, this prefix will be used for output files.')
	args = parser.parse_args()

	return args.G_of_R_list, args.TE_list, args.stop_thresh, args.report_every, args.max_nEMsteps, args.nThreads, args.prefix
114 |
115 |
def main():
	"""Run the EM algorithm over the pickled G(R) chunks.

	Repeats E steps (parallelized across nThreads worker processes) and
	M steps (renormalize expected counts into X) until the largest change
	in any expression fraction drops below stop_thresh or max_nEMsteps is
	reached, then pickles the nonzero entries of X and their names.
	"""
	G_of_R_list, TE_list, stop_thresh, report_every, max_nEMsteps, nThreads, prefix = GetArgs()

	# All the transcripts names in the same order as the G_of_R matrix columns
	TE_names = list()
	for name in open(TE_list):
		TE_names.append(name.strip().split('\t')[0])

	# Initial guess: uniform expression over all transcripts.
	X = sparse.csr_matrix(numpy.ones((1,len(TE_names)),dtype=numpy.float64)/len(TE_names))

	# Split up the pickle files into a set for each thread. The first
	# (len % nThreads) lists get one extra file so all files are covered.
	G_of_R_pkl_fulllist = list()
	for G_of_R_pkl in open(G_of_R_list):
		G_of_R_pkl_fulllist.append(G_of_R_pkl.strip())
	G_of_R_pkl_lists = list()
	listsize = len(G_of_R_pkl_fulllist)//nThreads
	nlistsp1 = len(G_of_R_pkl_fulllist)%nThreads
	k = 0
	for i in range(nlistsp1):
		G_of_R_pkl_lists.append(G_of_R_pkl_fulllist[k:k+listsize+1])
		k+=listsize+1
	for i in range(nlistsp1,nThreads):
		G_of_R_pkl_lists.append(G_of_R_pkl_fulllist[k:k+listsize])
		k+=listsize

	masterPool = Pool(processes = nThreads)

	# Run the EM steps
	for step in range(max_nEMsteps):
		starttime = datetime.datetime.now()
		exp_counts = numpy.zeros((1,len(TE_names)),dtype=numpy.float64)
		loglik = 0.0

		# E step: each worker sums expected counts over its chunk list.
		outputs = masterPool.map(calculate_expcounts_chunk,zip(G_of_R_pkl_lists,[X]*nThreads))
		for output in outputs:
			this_exp_counts,this_loglik = output
			exp_counts += this_exp_counts
			loglik += this_loglik

		# M step: renormalize expected counts into expression fractions.
		last_X = X.copy()
		X = sparse.csr_matrix(exp_counts/numpy.sum(exp_counts))
		# Progress line: step, max change in X, log-likelihood, wall time.
		print(str(step)+" "+str(numpy.max(numpy.abs(X.toarray()-last_X.toarray())))+" "+str(loglik)+" "+str(datetime.datetime.now()-starttime))

		# Periodic checkpoint of the nonzero (> 1e-10) entries of X.
		if (step+1) % report_every == 0:
			pickle.dump(X.toarray()[X.toarray() > 10**-10],open(prefix+'X_step_'+str(step+1)+'.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
			pickle.dump(numpy.array(TE_names)[X.toarray()[0,:] > 10**-10],open(prefix+'names_step_'+str(step+1)+'.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)

		# Converged: no expression fraction changed by more than stop_thresh.
		if numpy.max(numpy.abs(X.toarray()-last_X.toarray())) < stop_thresh:
			break

	# Output the final results
	pickle.dump(X.toarray()[X.toarray() > 10**-10],open(prefix+'X_final.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
	pickle.dump(numpy.array(TE_names)[X.toarray()[0,:] > 10**-10],open(prefix+'names_final.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
170 |
# Entry point guard: run main() only when executed as a script, not on import.
if __name__ == '__main__':
	main()
173 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | TERMS AND CONDITIONS
2 |
3 | 0. Definitions.
4 |
5 | “This License” refers to version 3 of the GNU General Public License.
6 |
7 | “Copyright” also means copyright-like laws that apply to other kinds of works, such as semiconductor masks.
8 |
9 | “The Program” refers to any copyrightable work licensed under this License. Each licensee is addressed as “you”. “Licensees” and “recipients” may be individuals or organizations.
10 |
11 | To “modify” a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a “modified version” of the earlier work or a work “based on” the earlier work.
12 |
13 | A “covered work” means either the unmodified Program or a work based on the Program.
14 |
15 | To “propagate” a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well.
16 |
17 | To “convey” a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying.
18 |
19 | An interactive user interface displays “Appropriate Legal Notices” to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion.
20 |
21 | 1. Source Code.
22 |
23 | The “source code” for a work means the preferred form of the work for making modifications to it. “Object code” means any non-source form of a work.
24 |
25 | A “Standard Interface” means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language.
26 |
27 | The “System Libraries” of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A “Major Component”, in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it.
28 |
29 | The “Corresponding Source” for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work.
30 |
31 | The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source.
32 |
33 | The Corresponding Source for a work in source code form is that same work.
34 |
35 | 2. Basic Permissions.
36 |
37 | All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law.
38 |
39 | You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you.
40 |
41 | Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary.
42 |
43 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
44 |
45 | No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures.
46 |
47 | When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures.
48 |
49 | 4. Conveying Verbatim Copies.
50 |
51 | You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program.
52 |
53 | You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee.
54 |
55 | 5. Conveying Modified Source Versions.
56 |
57 | You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions:
58 |
59 | a) The work must carry prominent notices stating that you modified it, and giving a relevant date.
60 | b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to “keep intact all notices”.
61 | c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it.
62 | d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so.
63 | A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an “aggregate” if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate.
64 |
65 | 6. Conveying Non-Source Forms.
66 |
67 | You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways:
68 |
69 | a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange.
70 | b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge.
71 | c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b.
72 | d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements.
73 | e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d.
74 | A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work.
75 |
76 | A “User Product” is either (1) a “consumer product”, which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, “normally used” refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product.
77 |
78 | “Installation Information” for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made.
79 |
80 | If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM).
81 |
82 | The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network.
83 |
84 | Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying.
85 |
86 | 7. Additional Terms.
87 |
88 | “Additional permissions” are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions.
89 |
90 | When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission.
91 |
92 | Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms:
93 |
94 | a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or
95 | b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or
96 | c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or
97 | d) Limiting the use for publicity purposes of names of licensors or authors of the material; or
98 | e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or
99 | f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors.
100 | All other non-permissive additional terms are considered “further restrictions” within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying.
101 |
102 | If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms.
103 |
104 | Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way.
105 |
106 | 8. Termination.
107 |
108 | You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11).
109 |
110 | However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation.
111 |
112 | Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice.
113 |
114 | Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10.
115 |
116 | 9. Acceptance Not Required for Having Copies.
117 |
118 | You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so.
119 |
120 | 10. Automatic Licensing of Downstream Recipients.
121 |
122 | Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License.
123 |
124 | An “entity transaction” is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts.
125 |
126 | You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it.
127 |
128 | 11. Patents.
129 |
130 | A “contributor” is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's “contributor version”.
131 |
132 | A contributor's “essential patent claims” are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, “control” includes the right to grant patent sublicenses in a manner consistent with the requirements of this License.
133 |
134 | Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version.
135 |
136 | In the following three paragraphs, a “patent license” is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To “grant” such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party.
137 |
138 | If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. “Knowingly relying” means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid.
139 |
140 | If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it.
141 |
142 | A patent license is “discriminatory” if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007.
143 |
144 | Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law.
145 |
146 | 12. No Surrender of Others' Freedom.
147 |
148 | If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program.
149 |
150 | 13. Use with the GNU Affero General Public License.
151 |
152 | Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such.
153 |
154 | 14. Revised Versions of this License.
155 |
156 | The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns.
157 |
158 | Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation.
159 |
160 | If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program.
161 |
162 | Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version.
163 |
164 | 15. Disclaimer of Warranty.
165 |
166 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
167 |
168 | 16. Limitation of Liability.
169 |
170 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
171 |
172 | 17. Interpretation of Sections 15 and 16.
173 |
174 | If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Installation
2 | ### conda way
3 | You will need
4 | 1. git (https://git-scm.com/book/en/v2/Getting-Started-Installing-Git)
5 | 2. anaconda (https://docs.anaconda.com/anaconda/install/)
6 |
7 | Download from github
8 | ```
9 | git clone https://github.com/FenyoLab/L1EM
10 | ```
11 | Create conda environment
12 | ```
13 | cd L1EM
14 | conda env create -f L1EM.yml
15 | ```
16 |
17 | Before running L1EM, activate the environment:
18 | ```
19 | source activate L1EM
20 | ```
21 |
22 | When finished, deactivate the environment:
23 | ```
24 | source deactivate L1EM
25 | ```
26 |
27 | ### old way
28 | Alternatively you can install the following dependencies yourself:
29 | * python version 2.7+ (version 2.7 tested)
30 | * bwa (version 0.7.17 tested)
31 | * samtools (version 1.9 tested)
32 | * numpy (version 1.14.3 tested)
33 | * scipy (version 1.1.0 tested)
34 | * pysam (version 0.15.0 tested)
35 | * bedtools (version 2.27.1 tested)
36 |
37 | No compiling of L1EM is necessary. Python scripts will be called from inside the L1EM
38 | directory.
39 |
40 | If necessary, you can specify the path for bwa and samtools in the run\_L1EM.sh script.
41 | You must use samtools >=1.0. Early version of pysam will not work. I highly recommend
42 | that you use bwa 0.7.17. Earlier versions may differ in how they write the XA tag. This
43 | will lead to inaccurate results without throwing an error.
44 |
45 | ## Quick guide
46 | ### First time: build L1EM reference
47 | You will need the hg38 reference genome in fasta format, with bwa index.
48 | Downloaded from UCSC genome browser:
49 | ```
50 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
51 | zcat hg38.fa.gz > hg38.fa
52 | bwa index hg38.fa
53 | ```
54 | Note: this will take some time.
55 |
56 | Then you can build the L1EM reference using the provided shell script:
57 | ```
58 | bash generate_L1EM_fasta_and_index.sh /fullpathto/hg38.fa
59 | ```
60 | This should be done inside the L1EM directory
61 |
62 | ### Executing the L1-EM pipeline
63 | You will need a bam file with strand specific paired end read alignments to hg38. You can
64 | use any aligner, but make sure that all reads from the original fastq files are present
65 | trimming should be okay, but is tested. Filtering reads will potentially break the pipeline.
66 |
67 | First move to an empty directory and then execute the shell script:
68 | ```
69 | bash -e /fullpathto/run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa
70 | ```
71 | L1EM will write files with specific names, so do NOT run two instances of L1EM in the same
72 | directory.
73 |
74 | At the end of the run\_L1EM.sh script is a commented-out set of commands to delete all the
75 | intermediate files. If you wish to automatically delete intermediate files, you can
76 | uncomment these lines.
77 |
78 | ### Output
79 | At completion, three tab delimited tables will be written.
80 | 1. full\_counts.txt: raw count estimates for each L1HS/L1PA\* element with any aligned read pairs
81 | 2. l1hs\_transcript\_counts.txt: expression estimates for L1HS elements, reported as raw counts
82 | 3. filter\_L1HS\_FPM.txt: L1HS whose expression is supported by at least 100 read pairs, reported as FPM (read pairs per million properly aligned)
83 |
84 | The rows of all files are L1 loci.
85 |
86 | For full\_counts.txt each of the five transcript types:
87 | only, runon, passive (sense), passive (antisense), antisense
88 | are reported.
89 |
90 | For l1hs\_transcript\_counts.txt and filter\_L1HS\_FPM.txt only proper transcription from L1HS elements starting at the
91 | 5' UTR is reported.
92 |
93 | The results are also written as pickle files to facilitate further analysis in python. To
94 | generate a python dictionary with keys being the transcript names and values being the
95 | relative expression:
96 | ```
97 | X_est = dict(zip(pickle.load(open('names_final.pkl')),pickle.load(open('X_final.pkl'))))
98 | ```
99 |
100 | ## Additional details
101 | * Our Bioinformatics paper introducing L1EM: https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btz724/5581349
102 | * More details can be found in manual.md
103 |
104 | ## Mouse Version
105 | Scripts and annotation to measure the expression of LINE-1 loci in mm39 has been added. The mouse version uses all the same methodology as the human version, but has not been as rigorously tested.
106 | 1. Download and index the mm39 reference genome (UCSC genome browser version)
107 | ```
108 | wget http://hgdownload.cse.ucsc.edu/goldenPath/mm39/bigZips/mm39.fa.gz
109 | zcat mm39.fa.gz > mm39.fa
110 | bwa index mm39.fa
111 | ```
112 | 2. Build the mm39 L1EM reference.
113 | ```
114 | bash generate_mm39_L1EM_fasta_and_index.sh /fullpathto/mm39.fa
115 | ```
116 | 3. Run L1EM.
117 | ```
118 | bash /fullpathto/run_L1EM_mm39.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/mm39.fa
119 | ```
120 | All L1Md loci are quantified in full\_counts.txt. Normalized expression of 5' UTR intact young (L1Md\_Tf I/II/II, L1Md\_Gf I/II, L1Md\_A I/II/III) LINE-1 loci supported by at least 100 reads can be found in filter\_active\_L1Md\_FPM.txt.
121 |
122 |
123 |
124 |
125 |
--------------------------------------------------------------------------------
/generate_L1EM_fasta_and_index.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | 
3 | # Build the L1EM reference: extract stranded flanking sequences for each
4 | # annotated L1 locus from the hg38 genome and index the result with bwa.
5 | # Usage (from inside the L1EM directory):
6 | #   bash generate_L1EM_fasta_and_index.sh /fullpathto/hg38.fa
7 | 
8 | # If you need to specify package directories
9 | bedtools=$(which bedtools)
10 | bwa=$(which bwa)
11 | 
12 | # Command line
13 | hg38=$1
14 | 
15 | # Quote the variables so reference paths containing spaces do not word-split.
16 | "$bedtools" getfasta -s -name -fi "$hg38" -bed annotation/L1EM.400.bed > annotation/L1EM.400.fa
17 | "$bwa" index annotation/L1EM.400.fa
--------------------------------------------------------------------------------
/generate_mm39_L1EM_fasta_and_index.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | 
3 | # Build the mouse (mm39) L1EM reference: extract stranded flanking sequences
4 | # for each annotated L1 locus and index the result with bwa.
5 | # Usage (from inside the L1EM directory):
6 | #   bash generate_mm39_L1EM_fasta_and_index.sh /fullpathto/mm39.fa
7 | 
8 | # If you need to specify package directories
9 | bedtools=$(which bedtools)
10 | bwa=$(which bwa)
11 | 
12 | # Command line
13 | mm39=$1
14 | 
15 | # Quote the variables so reference paths containing spaces do not word-split.
16 | "$bedtools" getfasta -s -name -fi "$mm39" -bed annotation/mm39.L1EM.bed > annotation/mm39.L1EM.400.fa
17 | "$bwa" index annotation/mm39.L1EM.400.fa
18 | 
--------------------------------------------------------------------------------
/manual.md:
--------------------------------------------------------------------------------
1 | ## Pipeline Parameters
2 |
3 | The key parameters for L1EM are listed at the beginning of the run\_L1EM.sh file. Default parameters should work well in most cases, but advanced users may wish to tinker.
4 | 1. threads. Dictates the number of threads that L1EM will spawn. More threads will improve parallel performance, but memory usage scales linearly with number of threads.
5 | 2. realignNM. The number of mismatches to allow when trying to realign reads that do not align as proper pairs in the bam file provided. Default is 3, but you might want to increase for longer reads.
6 | 3. L1EM_NM. As above, but for the generation of candidate alignments to the L1EM reference. Including more candidate alignments will slow the computation, but too few candidate alignments could yield less accurate results.
7 | 4. NMdiff. Only consider alignments with at most this many more mismatches than the primary alignment. Because read likelihood diminishes exponentially with additional mismatches, increasing this parameter is unlikely to affect results but will slow the EM steps.
8 | 5. bwa\_i. By default bwa will create a large number of alignments with indels near the edge of the read. This parameter will prevent this behavior. You may wish to decrease this parameter for shorter reads.
9 | 6. error\_prob. Probability of an error. Error probability is chosen to be constant because computing the read likelihood from base quality scores is slow.
10 | 7. max\_start2start\_len=500. Maximum allowed fragment/template length. Increase if you are using data with very large fragments.
11 | 8. reads\_per\_pickle. The G(R) matrix is split into a number of pickle files, so the entire matrix doesn't need to sit in memory. Decreasing this parameter will free up memory at the G(R) construction and EM steps.
12 | 9. EM\_threshold. Run EM steps until no entry in X changes by more than this value. The parameter is chosen to be small by default to ensure convergence. Increasing the parameter modestly will improve run time.
13 | 10. template\_fraction. When computing median template length, subsample reads to this fraction. You only need about 10,000 proper pairs to get a good estimate.
14 |
15 | ## Generating new annotations
16 | If you wish to run L1-EM for another retrotransposon or for another model organism, you will need to generate a new annotation.
17 | 1. Create a bedfile with the following naming scheme:
18 | family.category.region.strand
19 | Where family is the name of the repeat family,
20 | category is 1 if the element has a promoter and 0 otherwise
21 | region is the genome region (chrom:start-stop) of the element
22 | strand is +/- depending which strand the element falls on
23 | The bedfile must have the six required fields: chrom, start, stop, name, score, strand
24 | The start and stop coordinates should include 400 positions of flanking sequence on either end.
25 | Exons overlapping the annotation can also be included.
26 | 2. Create a fasta file from your bed file and index it with bwa:
27 | ```
28 | bedtools getfasta -s -name -fi reference.fa -bed annotation.bed > annotation.fa
29 | bwa index annotation.fa
30 | ```
31 | 3. Update lines 27 and 28 to point toward your new annotation.
32 |
33 | ## Pipeline steps
34 | ### STEP 1: realign
35 | In this step reads that are not properly paired are extracted and realigned with bwa. Many aligners do not bother with highly redundant reads, so this step is included to ensure that LINE-1 aligning reads are identified.
36 |
37 | ### STEP 2: extract
38 | In this step, L1HS/L1PA reads are extracted. Any read pair for which either end overlaps an entry in the L1EM.400.bed annotation is considered.
39 |
40 | ### STEP 3: candidate alignments
41 | The extracted reads are aligned to L1EM.400.fa, all secondary alignments with up to L1EM_NM mismatches are found. The candidate alignments fastqs are split for parallelization. It is vitally important that all candidate alignments are identified. Missing some of these alignments will drastically hurt accuracy. For this reason, bwa aln is used. Do not use bwa mem or STAR as these aligners do not provide a complete enumeration of secondary alignments for highly repetitive elements (like L1HS).
42 |
43 | ### STEP 4: G(R) matrix construction
44 |
45 | The bam files of candidate alignments are read by the script G\_of\_R.py. The likelihood of each candidate alignment is calculated and added to the G(R) matrix.
46 |
47 | The following options are additional parameters that can be accessed at this step:
48 | 1. -f/--flanking specifies the amount of flanking sequence in the annotation. If you created your own annotation with more or fewer than 400 bases of flanking sequence, specify that here.
49 | 2. --as\_start. If you wish to change the TSS for antisense transcription, do that here.
50 | 3. -w/--wiggle. Some proper LINE-1 transcripts start slightly before the annotated start of the 5'UTR. This parameter extends the annotated element this many bases in either direction (default is 20).
51 | 4. --min\_len. Puts a floor on transcript effective length to prevent cases where transcription of very short elements are over predicted. Default is 500.
52 | 5. --min\_exon\_len. Corresponding minimun effective length for exon annotations. Default is 100.
53 |
54 | ### STEP 5: Expectation maximization
55 | In this step, the expectation maximization algorithm is used to compute a maximum likelihood estimate of relative expression, using the G(R) matrix output from the previous step as input.
56 | The following options are additional parameters that can be accessed at this step:
57 | 1. -r/--report\_every. Write the estimate every n steps.
58 | 2. -m/--max\_nEMsteps. By default EM stops if convergence has not been achieved after 10000 steps. Change that value here.
59 |
60 | ### STEP 6: Writing results
61 | At completion, three tab delimited tables will be written.
62 | 1. full\_counts.txt: raw count estimates for each L1HS/L1PA\* element with any aligned read pairs
63 | 2. l1hs\_transcript\_counts.txt: expression estimates for L1HS elements, reported as raw counts
64 | 3. filter\_L1HS\_FPM.txt: L1HS whose expression is supported by at least 100 read pairs, reported as FPM (read pairs per million properly aligned)
65 |
66 | ### STEP 7: Clean up
67 | All the intermediate files are deleted at this step. Comment out these lines if you want to keep them.
68 |
69 | The rows of all three files are L1 loci.
70 |
71 | For full\_counts.txt each of the five transcript types:
72 | only, runon, passive (sense), passive (antisense), antisense
73 | are reported.
74 |
75 | For l1hs\_transcript_counts.txt only proper transcription from L1HS elements start at the
76 | 5' UTR is reported.
77 |
78 | The results are also written as pickle files to facilitate further analysis in python. To
79 | generate a python dictionary with keys being the transcript names and values being the
80 | relative expression:
81 | ```
82 | X_est = dict(zip(pickle.load(open('names_final.pkl')),pickle.load(open('X_final.pkl'))))
83 | ```
84 |
85 |
86 |
87 |
88 |
--------------------------------------------------------------------------------
/parameters.sh:
--------------------------------------------------------------------------------
1 | # Parameters
2 | export threads=16 #How many threads to use for samtools, bwa and L1EM
3 | export realignNM=3 #Number of mismatches allowed in bwa realignment
4 | export L1EM_NM=3 # Number of mismatches allowed when enumerating candidate alignments
5 | export NMdiff=2 #Skip candidate alignments with greater than this many more mismatches than the best alignment
6 | export bwa_i=20 #bwa i parameter prevents indels near the edges of a read
7 | export error_prob=0.01 #Probability of a read error at a given position
8 | export max_start2start_len=500 #Max allowed template/fragment length
9 | export reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high.
10 | export EM_threshold=1e-7 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time.
11 | export template_fraction=1 #Fraction of reads to consider when calculating median template length.
12 | 
--------------------------------------------------------------------------------
/run_L1EM.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Script to execute L1-EM pipeline
# Copyright (C) 2019 Wilson McKerrow

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

# Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa

# Parameters
threads=16 #How many threads to use for samtools, bwa and L1EM
realignNM=3 #Number of mismatches allowed in bwa realignment
L1EM_NM=3 #Number of mismatches allowed when enumerating candidate alignments
NMdiff=2 #Skip candidate alignments with greater than this many more mismatches than the best alignment
bwa_i=20 #bwa i parameter prevents indels near the edges of a read
error_prob=0.01 #Probability of a read error at a given position
max_start2start_len=500 #Max allowed template/fragment length
reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high.
EM_threshold=1e-7 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time.
template_fraction=1 #Fraction of reads to consider when calculating median template length.

# If you need to specify paths to required packages
bwa=$(which bwa) # version 0.7.17 tested
samtools=$(which samtools) # version 1.9 tested
python=$(which python) # use version 2.7

# Command line arguments
bamfile=$1
L1EM_directory=$2
hg38=$3

L1EM_bed=$L1EM_directory'/annotation/L1EM.400.bed'
L1EM_fa=$L1EM_directory'/annotation/L1EM.400.fa'
L1EM_code_dir=$L1EM_directory'/L1EM/'
L1EM_utilities_dir=$L1EM_directory'/utilities/'
L1EM_CGC_dir=$L1EM_directory'/CGC/'

# Try to realign unaligned reads using bwa aln.
echo 'STEP 1: realign'
mkdir idL1reads
cd idL1reads
$samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2
$bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai
$bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai
$bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam
# Use the configured $samtools binary (was a bare 'samtools' call).
$samtools index realigned.bam

# Extract L1HS/L1PA* aligning reads.
echo 'STEP 2: extract'
$python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam
$samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2
$python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam
$samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2
cat temp.fq1 >> L1.fq1
cat temp.fq2 >> L1.fq2
# rm temp*

# Split the L1 fastq files for parallel execution
cd ..
mkdir split_fqs
split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}')
split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1.
split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2.
cd split_fqs

# Generate candidate alignments
echo 'STEP 3: candidate alignments'
for name in *.fq1.*
do reads1=$name
reads2=$(echo $name|sed 's/fq1/fq2/g')
ref=$L1EM_fa
base=$(echo $name|sed 's/.fq1//g')
$bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai
$bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai
done
for name in *.fq1.*
do reads1=$name
reads2=$(echo $name|sed 's/fq1/fq2/g')
ref=$L1EM_fa
base=$(echo $name|sed 's/.fq1//g')
$bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 | $samtools view -bS - | $samtools sort -n - > $base.aln.bam &
done
wait

# Make G_of_R matrix
echo 'STEP 4: G(R) matrix construction'
mkdir ../G_of_R
cd ../G_of_R
$python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt
medianinsert=$(head -1 ../baminfo.txt)
# $bam already carries the ../split_fqs/ prefix from the glob, so pass it
# directly (was '-b ../split_fqs/$bam', which doubled the path).
for bam in ../split_fqs/*.bam
do $python ${L1EM_code_dir}G_of_R.py -b $bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff &
done
wait

# RUN EM
echo 'STEP 5: Expectation maximization'
mkdir ../L1EM/
cd ../L1EM/
ls ../G_of_R/*pk2 > G_of_R_list.txt
cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt
# Use the configured $python binary (was a bare 'python' call).
$python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold

#Write results as text file
echo 'STEP 6: Writing results'

$python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt
$python ${L1EM_utilities_dir}report_l1_exp_counts.py > ../full_counts.txt
$python ${L1EM_utilities_dir}report_l1hs_transcription.py > ../l1hs_transcript_counts.txt
$python ${L1EM_utilities_dir}filtered_and_normalized_l1hs.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_L1HS_FPM.txt

#Clean up
echo 'STEP 7: Clean up'
cp *final.pkl ../
cd ..

# rm idL1reads/*
# rmdir idL1reads
# rm split_fqs/*
# rmdir split_fqs
# rm G_of_R/*
# rmdir G_of_R
# rm L1EM/*
# rmdir L1EM
--------------------------------------------------------------------------------
/run_L1EM_fortcga.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Usage: bash run_L1EM_fortcga.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/L1HS.fa

# Parameters
threads=16 #How many threads to use for samtools, bwa and L1EM
realignNM=3 #Number of mismatches allowed in bwa realignment
L1EM_NM=2 #Number of mismatches allowed when enumerating candidate alignments
NMdiff=1 #Skip candidate alignments with greater than this many more mismatches than the best alignment
bwa_i=20 #bwa i parameter prevents indels near the edges of a read
error_prob=0.01 #Probability of a read error at a given position
max_start2start_len=500 #Max allowed template/fragment length
reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high.
EM_threshold=1e-6 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time.
template_fraction=0.0001 #Fraction of reads to consider when calculating median template length.

# If you need to specify paths to required packages
bwa=$(which bwa) # version 0.7.17 tested
samtools=$(which samtools) # version 1.9 tested
python=$(which python) # use version 2.7

# Command line arguments
bamfile=$1
L1EM_directory=$2
L1HS=$3

L1EM_bed=$L1EM_directory'/annotation/L1EM.400.bed'
L1EM_fa=$L1EM_directory'/annotation/L1EM.400.fa'
L1EM_code_dir=$L1EM_directory'/L1EM/'
L1EM_utilities_dir=$L1EM_directory'/utilities/'

# Try to realign unaligned reads using bwa aln.
echo 'STEP 1: realign'
mkdir idL1reads
cd idL1reads
$samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2
$bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $L1HS unaligned.fq1 > 1.sai
$bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $L1HS unaligned.fq2 > 2.sai
# -f 2: keep only properly paired realignments against the L1HS reference.
$bwa sampe $L1HS 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -f 2 -@ $threads - | $samtools sort -@ $threads - > realigned.bam

# Extract L1HS/L1PA* aligning reads.
echo 'STEP 2: extract'
$python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam
$samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2
$samtools fastq realigned.bam -1 temp.fq1 -2 temp.fq2
cat temp.fq1 >> L1.fq1
cat temp.fq2 >> L1.fq2
rm temp*

# Split the L1 fastq files for parallel execution
cd ..
mkdir split_fqs
split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}')
split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1.
split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2.
cd split_fqs

# Generate candidate alignments
echo 'STEP 3: candidate alignments'
for name in *.fq1.*
do reads1=$name
reads2=$(echo $name|sed 's/fq1/fq2/g')
ref=$L1EM_fa
base=$(echo $name|sed 's/.fq1//g')
$bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai
$bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai
done
for name in *.fq1.*
do reads1=$name
reads2=$(echo $name|sed 's/fq1/fq2/g')
ref=$L1EM_fa
base=$(echo $name|sed 's/.fq1//g')
$bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 | $samtools view -bS - | $samtools sort -n - > $base.aln.bam &
done
wait

# Make G_of_R matrix
echo 'STEP 4: G(R) matrix construction'
mkdir ../G_of_R
cd ../G_of_R
medianinsert=$($python ${L1EM_utilities_dir}median_template.py $bamfile $template_fraction)
for bam in ../split_fqs/*.bam
do $python ${L1EM_code_dir}G_of_R_unstranded.py -b $bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff &
done
wait

# RUN EM
echo 'STEP 5: Expectation maximization'
mkdir ../L1EM/
cd ../L1EM/
ls ../G_of_R/*pk2 > G_of_R_list.txt
cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt
# Use the configured $python binary (was a bare 'python' call).
$python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold

#Write results as text file
echo 'STEP 6: Writing results'

$python ${L1EM_utilities_dir}report_l1_exp_counts_unstranded.py > ../full_counts.txt
# This pipeline is unstranded (G_of_R_unstranded.py above), so use the
# unstranded report script to match, as run_L1EM_unstranded.sh does.
$python ${L1EM_utilities_dir}report_l1hs_transcription_unstranded.py > ../l1hs_transcript_counts.txt

#Clean up
echo 'STEP 7: Clean up'
cp *final.pkl ../
cd ..

#rm idL1reads/*
#rmdir idL1reads
#rm split_fqs/*
#rmdir split_fqs
#rm G_of_R/*
#rmdir G_of_R
#rm L1EM/*
#rmdir L1EM
--------------------------------------------------------------------------------
/run_L1EM_mm39.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Script to execute L1-EM pipeline (mouse mm39 annotation)
# Copyright (C) 2019 Wilson McKerrow

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

# Usage: bash run_L1EM_mm39.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/mm39.fa

# Parameters
threads=16 #How many threads to use for samtools, bwa and L1EM
realignNM=2 #Number of mismatches allowed in bwa realignment
L1EM_NM=2 #Number of mismatches allowed when enumerating candidate alignments
NMdiff=1 #Skip candidate alignments with greater than this many more mismatches than the best alignment
bwa_i=20 #bwa i parameter prevents indels near the edges of a read
error_prob=0.01 #Probability of a read error at a given position
max_start2start_len=500 #Max allowed template/fragment length
reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high.
EM_threshold=1e-6 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time.
template_fraction=1 #Fraction of reads to consider when calculating median template length.

# If you need to specify paths to required packages
bwa=$(which bwa) # version 0.7.17 tested
samtools=$(which samtools) # version 1.9 tested
python=$(which python) # use version 2.7

# Command line arguments
bamfile=$1
L1EM_directory=$2
hg38=$3 # reference genome fasta (mm39 for this script, despite the variable name)

L1EM_bed=$L1EM_directory'/annotation/mm39.L1EM.bed'
L1EM_fa=$L1EM_directory'/annotation/mm39.L1EM.400.fa'
L1EM_code_dir=$L1EM_directory'/L1EM/'
L1EM_utilities_dir=$L1EM_directory'/utilities/'
L1EM_CGC_dir=$L1EM_directory'/CGC/'

# Try to realign unaligned reads using bwa aln.
echo 'STEP 1: realign'
mkdir idL1reads
cd idL1reads
$samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2
$bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai
$bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai
$bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam
# Use the configured $samtools binary (was a bare 'samtools' call).
$samtools index realigned.bam

# Extract L1HS/L1PA* aligning reads.
echo 'STEP 2: extract'
$python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam
$samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2
$python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam
$samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2
cat temp.fq1 >> L1.fq1
cat temp.fq2 >> L1.fq2
rm temp*

# Split the L1 fastq files for parallel execution
cd ..
mkdir split_fqs
split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*10*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}')
split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1.
split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2.
cd split_fqs

# Generate candidate alignments
echo 'STEP 3: candidate alignments'
# Use the configured $bwa/$samtools binaries throughout (were bare calls).
for name in *.fq1.*
do reads1=$name
reads2=$(echo $name|sed 's/fq1/fq2/g')
ref=$L1EM_fa
base=$(echo $name|sed 's/.fq1//g')
$bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai
$bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai
$bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 > temp.$base.aln.sam
$samtools view -@ $threads -bS temp.$base.aln.sam > temp.$base.aln.bam
$samtools sort -@ $threads -n temp.$base.aln.bam > $base.aln.bam
rm temp.$base.aln.sam temp.$base.aln.bam $base.R1.aln.sai $base.R2.aln.sai
done

# Make G_of_R matrix
echo 'STEP 4: G(R) matrix construction'
mkdir ../G_of_R
cd ../G_of_R
$python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt
medianinsert=$(head -1 ../baminfo.txt)
ls ../split_fqs/*.bam > list_of_bams.txt
split -l $threads list_of_bams.txt list_of_bams.txt.
# $bam already carries the ../split_fqs/ prefix from the listing above, so
# pass it directly (was '-b ../split_fqs/$bam', which doubled the path).
for bamlist in list_of_bams.txt.*
do for bam in $(cat $bamlist)
do $python ${L1EM_code_dir}G_of_R.py -b $bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff &
done
wait
done

# RUN EM
echo 'STEP 5: Expectation maximization'
mkdir ../L1EM/
cd ../L1EM/
ls ../G_of_R/*pk2 > G_of_R_list.txt
cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt
$python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold

#Write results as text file
echo 'STEP 6: Writing results'

$python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt
$python ${L1EM_utilities_dir}report_l1_exp_counts.py > ../full_counts.txt
$python ${L1EM_utilities_dir}filtered_and_normalized_active_l1md.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_active_L1Md_FPM.txt

#Clean up
echo 'STEP 7: Clean up'
cp *final.pkl ../
cd ..

# rm idL1reads/*
# rmdir idL1reads
# rm split_fqs/*
# rmdir split_fqs
# rm G_of_R/*
# rmdir G_of_R
# rm L1EM/*
# rmdir L1EM
--------------------------------------------------------------------------------
/run_L1EM_mm39_unstranded.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Script to execute L1-EM pipeline (mouse mm39 annotation, unstranded libraries)
# Copyright (C) 2019 Wilson McKerrow

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

# Usage: bash run_L1EM_mm39_unstranded.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/mm39.fa

# Parameters
threads=16 #How many threads to use for samtools, bwa and L1EM
realignNM=2 #Number of mismatches allowed in bwa realignment
L1EM_NM=2 #Number of mismatches allowed when enumerating candidate alignments
NMdiff=1 #Skip candidate alignments with greater than this many more mismatches than the best alignment
bwa_i=20 #bwa i parameter prevents indels near the edges of a read
error_prob=0.01 #Probability of a read error at a given position
max_start2start_len=500 #Max allowed template/fragment length
reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high.
EM_threshold=1e-6 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time.
template_fraction=1 #Fraction of reads to consider when calculating median template length.

# If you need to specify paths to required packages
bwa=$(which bwa) # version 0.7.17 tested
samtools=$(which samtools) # version 1.9 tested
python=$(which python) # use version 2.7

# Command line arguments
bamfile=$1
L1EM_directory=$2
hg38=$3 # reference genome fasta (mm39 for this script, despite the variable name)

L1EM_bed=$L1EM_directory'/annotation/mm39.L1EM.bed'
L1EM_fa=$L1EM_directory'/annotation/mm39.L1EM.400.fa'
L1EM_code_dir=$L1EM_directory'/L1EM/'
L1EM_utilities_dir=$L1EM_directory'/utilities/'
L1EM_CGC_dir=$L1EM_directory'/CGC/'

# Try to realign unaligned reads using bwa aln.
echo 'STEP 1: realign'
mkdir idL1reads
cd idL1reads
$samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2
$bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai
$bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai
$bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam
# Use the configured $samtools binary (was a bare 'samtools' call).
$samtools index realigned.bam

# Extract L1HS/L1PA* aligning reads.
echo 'STEP 2: extract'
$python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam
$samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2
$python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam
$samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2
cat temp.fq1 >> L1.fq1
cat temp.fq2 >> L1.fq2
rm temp*

# Split the L1 fastq files for parallel execution
cd ..
mkdir split_fqs
split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*10*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}')
split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1.
split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2.
cd split_fqs

# Generate candidate alignments
echo 'STEP 3: candidate alignments'
# Use the configured $bwa/$samtools binaries throughout (were bare calls).
for name in *.fq1.*
do reads1=$name
reads2=$(echo $name|sed 's/fq1/fq2/g')
ref=$L1EM_fa
base=$(echo $name|sed 's/.fq1//g')
$bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai
$bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai
$bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 > temp.$base.aln.sam
$samtools view -@ $threads -bS temp.$base.aln.sam > temp.$base.aln.bam
$samtools sort -@ $threads -n temp.$base.aln.bam > $base.aln.bam
rm temp.$base.aln.sam temp.$base.aln.bam $base.R1.aln.sai $base.R2.aln.sai
done

# Make G_of_R matrix
echo 'STEP 4: G(R) matrix construction'
mkdir ../G_of_R
cd ../G_of_R
$python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt
medianinsert=$(head -1 ../baminfo.txt)
ls ../split_fqs/*.bam > list_of_bams.txt
split -l $threads list_of_bams.txt list_of_bams.txt.
# $bam already carries the ../split_fqs/ prefix from the listing above, so
# pass it directly (was '-b ../split_fqs/$bam', which doubled the path).
for bamlist in list_of_bams.txt.*
do for bam in $(cat $bamlist)
do $python ${L1EM_code_dir}G_of_R_unstranded.py -b $bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff &
done
wait
done

# RUN EM
echo 'STEP 5: Expectation maximization'
mkdir ../L1EM/
cd ../L1EM/
ls ../G_of_R/*pk2 > G_of_R_list.txt
cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt
$python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold

#Write results as text file
echo 'STEP 6: Writing results'

$python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt
# This pipeline is unstranded (G_of_R_unstranded.py above), so use the
# unstranded report script to match, as run_L1EM_unstranded.sh does.
$python ${L1EM_utilities_dir}report_l1_exp_counts_unstranded.py > ../full_counts.txt
$python ${L1EM_utilities_dir}filtered_and_normalized_active_l1md_unstranded.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_active_L1Md_FPM.txt

#Clean up
echo 'STEP 7: Clean up'
cp *final.pkl ../
cd ..

# rm idL1reads/*
# rmdir idL1reads
# rm split_fqs/*
# rmdir split_fqs
# rm G_of_R/*
# rmdir G_of_R
# rm L1EM/*
# rmdir L1EM
--------------------------------------------------------------------------------
/run_L1EM_unstranded.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Script to execute L1-EM pipeline (unstranded libraries)
# Copyright (C) 2019 Wilson McKerrow

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

# Usage: bash run_L1EM_unstranded.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa

# Parameters
threads=16 #How many threads to use for samtools, bwa and L1EM
realignNM=3 #Number of mismatches allowed in bwa realignment
L1EM_NM=3 #Number of mismatches allowed when enumerating candidate alignments
NMdiff=2 #Skip candidate alignments with greater than this many more mismatches than the best alignment
bwa_i=20 #bwa i parameter prevents indels near the edges of a read
error_prob=0.01 #Probability of a read error at a given position
max_start2start_len=500 #Max allowed template/fragment length
reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high.
EM_threshold=1e-7 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time.
template_fraction=1 #Fraction of reads to consider when calculating median template length.

# If you need to specify paths to required packages
bwa=$(which bwa) # version 0.7.17 tested
samtools=$(which samtools) # version 1.9 tested
python=$(which python) # use version 2.7

# Command line arguments
bamfile=$1
L1EM_directory=$2
hg38=$3

L1EM_bed=$L1EM_directory'/annotation/L1EM.400.bed'
L1EM_fa=$L1EM_directory'/annotation/L1EM.400.fa'
L1EM_code_dir=$L1EM_directory'/L1EM/'
L1EM_utilities_dir=$L1EM_directory'/utilities/'
L1EM_CGC_dir=$L1EM_directory'/CGC/'

# Try to realign unaligned reads using bwa aln.
echo 'STEP 1: realign'
mkdir idL1reads
cd idL1reads
$samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2
$bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai
$bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai
$bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam
# Use the configured $samtools binary (was a bare 'samtools' call).
$samtools index realigned.bam

# Extract L1HS/L1PA* aligning reads.
echo 'STEP 2: extract'
$python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam
$samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2
$python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam
$samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2
cat temp.fq1 >> L1.fq1
cat temp.fq2 >> L1.fq2
# rm temp*

# Split the L1 fastq files for parallel execution
cd ..
mkdir split_fqs
split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}')
split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1.
split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2.
cd split_fqs

# Generate candidate alignments
echo 'STEP 3: candidate alignments'
for name in *.fq1.*
do reads1=$name
reads2=$(echo $name|sed 's/fq1/fq2/g')
ref=$L1EM_fa
base=$(echo $name|sed 's/.fq1//g')
$bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai
$bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai
done
for name in *.fq1.*
do reads1=$name
reads2=$(echo $name|sed 's/fq1/fq2/g')
ref=$L1EM_fa
base=$(echo $name|sed 's/.fq1//g')
$bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 | $samtools view -bS - | $samtools sort -n - > $base.aln.bam &
done
wait

# Make G_of_R matrix
echo 'STEP 4: G(R) matrix construction'
mkdir ../G_of_R
cd ../G_of_R
$python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt
medianinsert=$(head -1 ../baminfo.txt)
# $bam already carries the ../split_fqs/ prefix from the glob, so pass it
# directly (was '-b ../split_fqs/$bam', which doubled the path).
for bam in ../split_fqs/*.bam
do $python ${L1EM_code_dir}G_of_R_unstranded.py -b $bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff &
done
wait

# RUN EM
echo 'STEP 5: Expectation maximization'
mkdir ../L1EM/
cd ../L1EM/
ls ../G_of_R/*pk2 > G_of_R_list.txt
cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt
# Use the configured $python binary (was a bare 'python' call).
$python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold

#Write results as text file
echo 'STEP 6: Writing results'

$python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt
$python ${L1EM_utilities_dir}report_l1_exp_counts_unstranded.py > ../full_counts.txt
$python ${L1EM_utilities_dir}report_l1hs_transcription_unstranded.py > ../l1hs_transcript_counts.txt
$python ${L1EM_utilities_dir}filtered_and_normalized_l1hs_unstranded.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_L1HS_FPM.txt

#Clean up
echo 'STEP 7: Clean up'
cp *final.pkl ../
cd ..

# rm idL1reads/*
# rmdir idL1reads
# rm split_fqs/*
# rmdir split_fqs
# rm G_of_R/*
# rmdir G_of_R
# rm L1EM/*
# rmdir L1EM
--------------------------------------------------------------------------------
/run_L1EM_unstranded_fromdocker.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Script to execute L1-EM pipeline (unstranded, inside the docker image)
# Copyright (C) 2019 Wilson McKerrow

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

# Usage: bash run_L1EM_unstranded_fromdocker.sh parameters.sh /fullpathto/alignments.bam /fullpathto/hg38.fa

# Command line arguments
# Source (not 'bash') the parameters file so the variables it exports
# ($threads, $realignNM, $bwa_i, ...) are set in THIS shell; 'bash $1'
# would set them only in a discarded child process.
source $1
bamfile=$2
hg38=$3

# Locations within L1EM directory
L1EM_bed=/annotation/L1EM.400.bed
L1EM_fa=/annotation/L1EM.400.fa
L1EM_code_dir=/L1EM/L1EM/
L1EM_utilities_dir=/L1EM/utilities/
L1EM_CGC_dir=/L1EM/CGC/

# Try to realign unaligned reads using bwa aln.
echo 'STEP 1: realign'
mkdir idL1reads
cd idL1reads
samtools view -@ $threads -b -F 2 $bamfile | samtools sort -@ $threads -n - | samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2
# '-i $bwa_i' (was the literal string 'bwa_i', missing the $ sigil).
bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai
bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai
bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | samtools view -b -@ $threads - | samtools sort -@ $threads - > realigned.bam
samtools index realigned.bam

# Extract L1HS/L1PA* aligning reads.
echo 'STEP 2: extract'
python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam
samtools sort -@ $threads -n temp.bam | samtools fastq - -1 L1.fq1 -2 L1.fq2
python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam
samtools sort -@ $threads -n temp.bam | samtools fastq - -1 temp.fq1 -2 temp.fq2
cat temp.fq1 >> L1.fq1
cat temp.fq2 >> L1.fq2
# rm temp*

# Split the L1 fastq files for parallel execution
cd ..
mkdir split_fqs
split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}')
split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1.
split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2.
cd split_fqs

# Generate candidate alignments
echo 'STEP 3: candidate alignments'
for name in *.fq1.*
do reads1=$name
reads2=$(echo $name|sed 's/fq1/fq2/g')
ref=$L1EM_fa
base=$(echo $name|sed 's/.fq1//g')
# '-i $bwa_i' (was the literal string 'bwa_i', missing the $ sigil).
bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai
bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai
done
for name in *.fq1.*
do reads1=$name
reads2=$(echo $name|sed 's/fq1/fq2/g')
ref=$L1EM_fa
base=$(echo $name|sed 's/.fq1//g')
bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 | samtools view -bS - | samtools sort -n - > $base.aln.bam &
done
wait
79 |
80 | # Make G_of_R matrix
81 | echo 'STEP 4: G(R) matrix construction'
82 | mkdir ../G_of_R
83 | cd ../G_of_R
84 | python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt
85 | medianinsert=$(head -1 ../baminfo.txt)
86 | for bam in ../split_fqs/*.bam
87 | do python ${L1EM_code_dir}G_of_R_unstranded.py -b ../split_fqs/$bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff &
88 | done
89 | wait
90 |
91 | # RUN EM
92 | echo 'STEP 5: Expectation maximization'
93 | mkdir ../L1EM/
94 | cd ../L1EM/
95 | ls ../G_of_R/*pk2 > G_of_R_list.txt
96 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt
97 | python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold
98 |
99 | #Write results as text file
100 | echo 'STEP 6: Writing results'
101 |
102 | python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt
103 | python ${L1EM_utilities_dir}report_l1_exp_counts_unstranded.py > ../full_counts.txt
104 | python ${L1EM_utilities_dir}report_l1hs_transcription_unstranded.py > ../l1hs_transcript_counts.txt
105 | python ${L1EM_utilities_dir}filtered_and_normalized_l1hs_unstranded.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_L1HS_FPM.txt
106 |
107 | #Clean up
108 | echo 'STEP 7: Clean up'
109 | cp *final.pkl ../
110 | cd ..
111 |
112 | # rm idL1reads/*
113 | # rmdir idL1reads
114 | # rm split_fqs/*
115 | # rmdir split_fqs
116 | # rm G_of_R/*
117 | # rmdir G_of_R
118 | # rm L1EM/*
119 | # rmdir L1EM
120 |
--------------------------------------------------------------------------------
/run_L1EM_withlessmemory.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Script to execute L1-EM pipeline (lower-memory variant)
# Copyright (C) 2019 Wilson McKerrow

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa

# Parameters
threads=16 #How many threads to use for samtools, bwa and L1EM
realignNM=3 #Number of mismatches allowed in bwa realignment
L1EM_NM=3 # Number of mismatches allowed when enumerated candidate alignments
NMdiff=2 #Skip candidate alignments with greater than this many more mismatches than the best alignment
bwa_i=20 #bwa i parameter prevents indels near the edges of a read
error_prob=0.01 #Probability of a read error at a given position
max_start2start_len=500 #Max allowed template/fragment length
reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high.
EM_threshold=1e-7 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time.
template_fraction=1 #Fraction of reads to consider when calculated median template length.

# If you need to specify paths to required packages
bwa=$(which bwa) # version 0.7.17 tested
samtools=$(which samtools) # version 1.9 tested
python=$(which python) # use version 2.7

# Command line arguments
bamfile=$1
L1EM_directory=$2
hg38=$3

L1EM_bed=$L1EM_directory'/annotation/L1EM.400.bed'
L1EM_fa=$L1EM_directory'/annotation/L1EM.400.fa'
L1EM_code_dir=$L1EM_directory'/L1EM/'
L1EM_utilities_dir=$L1EM_directory'/utilities/'
L1EM_CGC_dir=$L1EM_directory'/CGC/'

# Try to realign unaligned reads using bwa aln.
echo 'STEP 1: realign'
mkdir idL1reads
cd idL1reads
$samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2
$bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai
$bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai
$bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam
# CONSISTENCY FIX: use the $samtools path chosen above, as everywhere else.
$samtools index realigned.bam

# Extract L1HS/L1PA* aligning reads.
echo 'STEP 2: extract'
$python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam
$samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2
$python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam
$samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2
cat temp.fq1 >> L1.fq1
cat temp.fq2 >> L1.fq2
# rm temp*

# Split the L1 fastq files for parallel execution
cd ..
mkdir split_fqs
# Chunk size: total lines / threads, rounded up to a multiple of 4 so that
# no fastq record straddles two chunks.
split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}')
split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1.
split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2.
cd split_fqs

# Generate candidate alignments. Unlike run_L1EM.sh, alignments are produced
# one chunk at a time (sequentially) and intermediates deleted immediately,
# to keep peak disk/memory use down.
echo 'STEP 3: candidate alignments'
for name in *.fq1.*
do reads1=$name
	reads2=$(echo $name|sed 's/fq1/fq2/g')
	ref=$L1EM_fa
	base=$(echo $name|sed 's/.fq1//g')
	# CONSISTENCY FIX: call $bwa/$samtools (the paths resolved above),
	# not bare bwa/samtools, as in the rest of the script.
	$bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai
	$bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai
	$bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 > temp.$base.aln.sam
	$samtools view -@ $threads -bS temp.$base.aln.sam > temp.$base.aln.bam
	$samtools sort -@ $threads -n temp.$base.aln.bam > $base.aln.bam
	rm temp.$base.aln.sam temp.$base.aln.bam $base.R1.aln.sai $base.R2.aln.sai
done

# Make G_of_R matrix
echo 'STEP 4: G(R) matrix construction'
mkdir ../G_of_R
cd ../G_of_R
# NOTE(review): 0.001 is hard-coded here although $template_fraction is
# declared above -- confirm whether the parameter was meant to be used.
$python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt
medianinsert=$(head -1 ../baminfo.txt)
# Process the bams $threads at a time to bound concurrent memory use.
ls ../split_fqs/*.bam > list_of_bams.txt
split -l $threads list_of_bams.txt list_of_bams.txt.
for bamlist in list_of_bams.txt.*
do for bam in $(cat $bamlist)
	do $python ${L1EM_code_dir}G_of_R.py -b ../split_fqs/$bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff &
	done
	wait
done

# RUN EM
echo 'STEP 5: Expectation maximization'
mkdir ../L1EM/
cd ../L1EM/
ls ../G_of_R/*pk2 > G_of_R_list.txt
cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt
$python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold

#Write results as text file
echo 'STEP 6: Writing results'

$python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt
$python ${L1EM_utilities_dir}report_l1_exp_counts.py > ../full_counts.txt
$python ${L1EM_utilities_dir}report_l1hs_transcription.py > ../l1hs_transcript_counts.txt
$python ${L1EM_utilities_dir}filtered_and_normalized_l1hs.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_L1HS_FPM.txt

#Clean up
echo 'STEP 7: Clean up'
cp *final.pkl ../
cd ..

# rm idL1reads/*
# rmdir idL1reads
# rm split_fqs/*
# rmdir split_fqs
# rm G_of_R/*
# rmdir G_of_R
# rm L1EM/*
# rmdir L1EM

--------------------------------------------------------------------------------
/utilities/L1EM_readpairs.py:
--------------------------------------------------------------------------------
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle

"""
Report the total number of read pairs passed to L1EM.

Copyright (C) 2019 Wilson McKerrow

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

"""

# Each file listed in G_of_R_list.txt holds one pickled chunk of the G(R)
# matrix; a chunk may be None when its split produced no usable read pairs.
total = 0
for line in open('G_of_R_list.txt'):
    G_of_R = pickle.load(open(line.strip(), 'rb'))
    # BUGFIX: use "is not None" -- "!= None" is an elementwise comparison on
    # array-like objects, whose truth value is ambiguous.
    if G_of_R is not None:
        total += G_of_R.shape[0]

print(total)

--------------------------------------------------------------------------------
/utilities/filtered_and_normalized_active_l1md.py:
--------------------------------------------------------------------------------
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle
import sys

"""
Extract the LINE-1 transcript estimates from mm39 version of L1EM.

Reports, for each potentially active mouse L1 locus, FPM of proper ("only")
and 3' run-on transcription. A locus is printed only when L1 signal exceeds
10x the run-through signal and is supported by more than 100 read pairs.

argv: names.pkl X.pkl proper_pairs_in_original_bam total_read_pairs

Copyright (C) 2021 Wilson McKerrow

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

"""

# name -> EM-estimated fraction of read pairs
X_est = dict(zip(pickle.load(open(sys.argv[1], 'rb')), pickle.load(open(sys.argv[2], 'rb'))))

# Denominator for FPM normalization.
proper_pairs_in_original_bam = float(sys.argv[3])

# Number of read pairs given to L1EM: converts fractions to pair counts.
total = float(sys.argv[4])

# Potentially retrotransposition-active mouse L1 families.
ACTIVE_FAMILIES = ('L1MdTf_', 'L1MdGf_', 'L1MdA_I', 'L1MdA_II', 'L1MdA_III')

written_seqs = set()

print("family.category.locus.strand\tonly\t3prunon")

for name in list(X_est.keys()):
    if not any(family in name for family in ACTIVE_FAMILIES):
        continue
    # Strip the transcript-category suffix to recover the locus name.
    seq_name = '_'.join(name.split('_')[:-1])
    if seq_name in written_seqs:
        continue
    written_seqs.add(seq_name)
    # FIX: use .get() with a 0.0 default rather than inserting keys into
    # X_est; removed the unused print_string variable.
    only_pairs = total * X_est.get(seq_name + '_only', 0.0)
    runon_pairs = total * X_est.get(seq_name + '_3prunon', 0.0)
    runthrough_pairs = total * (X_est.get(seq_name + '_senserunthrough', 0.0) +
                                X_est.get(seq_name + '_antisenserunthrough', 0.0))
    # FIX: logical "and" instead of bitwise "&" for the boolean filter.
    if (only_pairs + runon_pairs > 10 * runthrough_pairs) and (only_pairs + runon_pairs > 100):
        print(seq_name.split('(')[0] + '\t' +
              str(only_pairs / proper_pairs_in_original_bam * 10**6) + '\t' +
              str(runon_pairs / proper_pairs_in_original_bam * 10**6))

--------------------------------------------------------------------------------
/utilities/filtered_and_normalized_active_l1md_unstranded.py:
--------------------------------------------------------------------------------
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle
import sys

"""
Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC
to analyze TCGA data.

Unstranded variant: run-through transcription is a single "_runthrough"
category. Reports FPM of proper ("only") and 3' run-on transcription per
potentially active mouse L1 locus.

argv: names.pkl X.pkl proper_pairs_in_original_bam total_read_pairs

Copyright (C) 2021 Wilson McKerrow

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

"""

# name -> EM-estimated fraction of read pairs
X_est = dict(zip(pickle.load(open(sys.argv[1], 'rb')), pickle.load(open(sys.argv[2], 'rb'))))

# Denominator for FPM normalization.
proper_pairs_in_original_bam = float(sys.argv[3])

# Number of read pairs given to L1EM: converts fractions to pair counts.
total = float(sys.argv[4])

# Potentially retrotransposition-active mouse L1 families.
ACTIVE_FAMILIES = ('L1MdTf_', 'L1MdGf_', 'L1MdA_I', 'L1MdA_II', 'L1MdA_III')

written_seqs = set()

print("family.category.locus.strand\tonly\t3prunon")

for name in list(X_est.keys()):
    # BUGFIX: original line read "if if 'L1MdTf_' in name ..." -- a syntax
    # error that made the script unrunnable.
    if not any(family in name for family in ACTIVE_FAMILIES):
        continue
    # Strip the transcript-category suffix to recover the locus name.
    seq_name = '_'.join(name.split('_')[:-1])
    if seq_name in written_seqs:
        continue
    written_seqs.add(seq_name)
    only_pairs = total * X_est.get(seq_name + '_only', 0.0)
    runon_pairs = total * X_est.get(seq_name + '_3prunon', 0.0)
    runthrough_pairs = total * X_est.get(seq_name + '_runthrough', 0.0)
    # FIX: logical "and" instead of bitwise "&" for the boolean filter.
    if (only_pairs + runon_pairs > 10 * runthrough_pairs) and (only_pairs + runon_pairs > 100):
        print(seq_name.split('(')[0] + '\t' +
              str(only_pairs / proper_pairs_in_original_bam * 10**6) + '\t' +
              str(runon_pairs / proper_pairs_in_original_bam * 10**6))

--------------------------------------------------------------------------------
/utilities/filtered_and_normalized_l1hs.py:
--------------------------------------------------------------------------------
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle
import sys

"""
Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC
to analyze TCGA data.

Reports, for each L1HS locus, FPM of proper ("only") and 3' run-on
transcription. A locus is printed only when L1 signal exceeds 3x the
run-through signal and is supported by more than 100 read pairs.

argv: names.pkl X.pkl proper_pairs_in_original_bam total_read_pairs

Copyright (C) 2019 Wilson McKerrow

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

"""

# name -> EM-estimated fraction of read pairs
X_est = dict(zip(pickle.load(open(sys.argv[1], 'rb')), pickle.load(open(sys.argv[2], 'rb'))))

# Denominator for FPM normalization.
proper_pairs_in_original_bam = float(sys.argv[3])

# Number of read pairs given to L1EM: converts fractions to pair counts.
total = float(sys.argv[4])

written_seqs = set()

print("family.category.locus.strand\tonly\t3prunon")

for name in list(X_est.keys()):
    if 'L1HS' not in name:
        continue
    # Strip the transcript-category suffix to recover the locus name.
    seq_name = '_'.join(name.split('_')[:-1])
    if seq_name in written_seqs:
        continue
    written_seqs.add(seq_name)
    # FIX: use .get() with a 0.0 default rather than inserting keys into
    # X_est; removed the unused print_string variable.
    only_pairs = total * X_est.get(seq_name + '_only', 0.0)
    runon_pairs = total * X_est.get(seq_name + '_3prunon', 0.0)
    runthrough_pairs = total * (X_est.get(seq_name + '_senserunthrough', 0.0) +
                                X_est.get(seq_name + '_antisenserunthrough', 0.0))
    # FIX: logical "and" instead of bitwise "&" for the boolean filter.
    if (only_pairs + runon_pairs > 3 * runthrough_pairs) and (only_pairs + runon_pairs > 100):
        print(seq_name.split('(')[0] + '\t' +
              str(only_pairs / proper_pairs_in_original_bam * 10**6) + '\t' +
              str(runon_pairs / proper_pairs_in_original_bam * 10**6))

--------------------------------------------------------------------------------
/utilities/filtered_and_normalized_l1hs_unstranded.py:
--------------------------------------------------------------------------------
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle
import sys

"""
Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC
to analyze TCGA data.

Unstranded variant: run-through transcription is a single "_runthrough"
category. Reports FPM of proper ("only") and 3' run-on transcription per
L1HS locus passing the 3x run-through / >100 pairs filter.

argv: names.pkl X.pkl proper_pairs_in_original_bam total_read_pairs

Copyright (C) 2019 Wilson McKerrow

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

"""

# name -> EM-estimated fraction of read pairs
X_est = dict(zip(pickle.load(open(sys.argv[1], 'rb')), pickle.load(open(sys.argv[2], 'rb'))))

# Denominator for FPM normalization.
proper_pairs_in_original_bam = float(sys.argv[3])

# Number of read pairs given to L1EM: converts fractions to pair counts.
total = float(sys.argv[4])

written_seqs = set()

print("family.category.locus.strand\tonly\t3prunon")

for name in list(X_est.keys()):
    if 'L1HS' not in name:
        continue
    # Strip the transcript-category suffix to recover the locus name.
    seq_name = '_'.join(name.split('_')[:-1])
    if seq_name in written_seqs:
        continue
    written_seqs.add(seq_name)
    # FIX: use .get() with a 0.0 default rather than inserting keys into
    # X_est; removed the unused print_string variable.
    only_pairs = total * X_est.get(seq_name + '_only', 0.0)
    runon_pairs = total * X_est.get(seq_name + '_3prunon', 0.0)
    runthrough_pairs = total * X_est.get(seq_name + '_runthrough', 0.0)
    # FIX: logical "and" instead of bitwise "&" for the boolean filter.
    if (only_pairs + runon_pairs > 3 * runthrough_pairs) and (only_pairs + runon_pairs > 100):
        print(seq_name.split('(')[0] + '\t' +
              str(only_pairs / proper_pairs_in_original_bam * 10**6) + '\t' +
              str(runon_pairs / proper_pairs_in_original_bam * 10**6))

--------------------------------------------------------------------------------
/utilities/median_template.py:
--------------------------------------------------------------------------------
import sys
import pysam
import random
import numpy

"""
Estimate median template length of a bam file.

Part of the L1-EM package.

Copyright (C) 2019 Wilson McKerrow

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

"""

# argv: bam path, fraction of mapped reads to sample
bam_path = sys.argv[1]
sample_fraction = float(sys.argv[2])

# Randomly subsample mapped reads and record each one's template length.
# (random.random() is only consumed for mapped reads, via short-circuiting.)
sampled_tlens = [
    read.template_length
    for read in pysam.AlignmentFile(bam_path)
    if not read.is_unmapped and random.random() < sample_fraction
]

# Template lengths are signed by mate orientation; take absolute values.
print(numpy.median(numpy.abs(sampled_tlens)))

--------------------------------------------------------------------------------
/utilities/read_or_pair_overlap_bed.py:
--------------------------------------------------------------------------------
import pysam
import sys

"""
Extract reads or pairs of reads that overlap a bed file.

Part of the L1-EM package.

Copyright (C) 2019 Wilson McKerrow

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

"""

def main():
    """Write primary alignments of read pairs with a clean hit inside the bed intervals.

    argv: bed, input bam, output bam, [flanking shrink, default 400],
    [max NM mismatches, default 4].
    """
    bed_path = sys.argv[1]
    bam_path = sys.argv[2]
    out_path = sys.argv[3]
    flanking = int(sys.argv[4]) if len(sys.argv) > 4 else 400
    maxNM = int(sys.argv[5]) if len(sys.argv) > 5 else 4

    inbam = pysam.AlignmentFile(bam_path, 'rb')
    outbam = pysam.AlignmentFile(out_path, 'wb', template=inbam)

    # Pass 1: collect query names of reads whose primary, fully-matched
    # (no soft-clip 'S', no splice 'N'), low-mismatch alignment falls inside
    # a bed interval shrunk by `flanking` on each side.
    wanted = set()
    for line in open(bed_path):
        chrom, start, stop = line.strip().split('\t')[:3]
        start = int(start) + flanking
        stop = int(stop) - flanking
        if chrom not in inbam.references:
            continue
        for read in inbam.fetch(chrom, start, stop):
            if read.is_unmapped or read.is_secondary or read.is_supplementary:
                continue
            if 'S' in read.cigarstring or 'N' in read.cigarstring:
                continue
            if read.has_tag('NM') and read.get_tag('NM') > maxNM:
                continue
            wanted.add(read.query_name)

    # Pass 2: reopen the bam and emit every primary alignment whose query
    # name was collected (this keeps both mates of each selected pair).
    inbam.close()
    inbam = pysam.AlignmentFile(bam_path, 'rb')
    for read in inbam:
        if read.query_name in wanted:
            if not read.is_secondary and not read.is_supplementary:
                outbam.write(read)

    inbam.close()
    outbam.close()

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/utilities/report_l1_exp_counts.py:
--------------------------------------------------------------------------------
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle

"""
Extract the estimate of proper transcription of L1HS elements.

Copyright (C) 2019 Wilson McKerrow

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

"""

# Total read pairs passed to L1EM: sum the row counts of all non-empty
# G(R) chunks.
total = 0
for line in open('G_of_R_list.txt'):
    G_of_R = pickle.load(open(line.strip(), 'rb'))
    # BUGFIX: reuse the chunk already in memory instead of unpickling the
    # same file a second time, and test with "is not None" ("!= None" is
    # elementwise on array-like objects).
    if G_of_R is not None:
        total += G_of_R.shape[0]

# name -> EM-estimated fraction of read pairs
X_est = dict(zip(pickle.load(open('names_final.pkl', 'rb')), pickle.load(open('X_final.pkl', 'rb'))))

written_seqs = set()

print("family.category.locus.strand\tonly\t3prunon\tpassive_sense\tpassive_antisense\tantisense")

# Transcript categories reported per locus, in output-column order.
SUFFIXES = ('_only', '_3prunon', '_senserunthrough', '_antisenserunthrough', '_antisense')

for name in list(X_est.keys()):
    if 'exon' in name:
        continue
    # Strip the transcript-category suffix to recover the locus name.
    seq_name = '_'.join(name.split('_')[:-1])
    if seq_name in written_seqs:
        continue
    written_seqs.add(seq_name)
    fields = [seq_name.split('(')[0]]
    # Missing categories count as 0.0 read pairs.
    fields += [str(total * X_est.get(seq_name + suffix, 0.0)) for suffix in SUFFIXES]
    print('\t'.join(fields))

--------------------------------------------------------------------------------
/utilities/report_l1_exp_counts_clip.py:
--------------------------------------------------------------------------------
# On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
try:
    import cPickle as pickle
except ImportError:
    import pickle

"""
Extract the estimate of proper transcription of L1HS elements.

Copyright (C) 2019 Wilson McKerrow

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

"""

# Total read pairs passed to L1EM: sum the row counts of all non-empty
# G(R) chunks.
total = 0
for line in open('G_of_R_list.txt'):
    G_of_R = pickle.load(open(line.strip(), 'rb'))
    # BUGFIX: reuse the chunk already in memory instead of unpickling the
    # same file a second time, and test with "is not None" ("!= None" is
    # elementwise on array-like objects).
    if G_of_R is not None:
        total += G_of_R.shape[0]

# name -> EM-estimated fraction of read pairs
X_est = dict(zip(pickle.load(open('names_final.pkl', 'rb')), pickle.load(open('X_final.pkl', 'rb'))))

written_seqs = set()

# BUGFIX: header was the malformed "...strand\sesne\tantisense" (literal
# backslash-s and transposed "sense"); corrected to tab-separated columns
# matching the values printed below.
print("family.category.locus.strand\tsense\tantisense")

# Transcript categories reported per locus, in output-column order.
SUFFIXES = ('_sense', '_antisense')

for name in list(X_est.keys()):
    if 'exon' in name:
        continue
    # Strip the transcript-category suffix to recover the locus name.
    seq_name = '_'.join(name.split('_')[:-1])
    if seq_name in written_seqs:
        continue
    written_seqs.add(seq_name)
    fields = [seq_name.split('(')[0]]
    # Missing categories count as 0.0 read pairs.
    fields += [str(total * X_est.get(seq_name + suffix, 0.0)) for suffix in SUFFIXES]
    print('\t'.join(fields))

--------------------------------------------------------------------------------
/utilities/report_l1_exp_counts_unstranded.py:
--------------------------------------------------------------------------------
1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
2 | try:
3 | import cPickle as pickle
4 | except ImportError:
5 | import pickle
6 | import sys
7 |
8 | """
9 | Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC
10 | to analyze TCGA data.
11 |
12 | Copyright (C) 2019 Wilson McKerrow
13 |
14 | This program is free software: you can redistribute it and/or modify
15 | it under the terms of the GNU General Public License as published by
16 | the Free Software Foundation, either version 3 of the License, or
17 | (at your option) any later version.
18 |
19 | This program is distributed in the hope that it will be useful,
20 | but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | GNU General Public License for more details.
23 |
24 | You should have received a copy of the GNU General Public License
25 | along with this program. If not, see <http://www.gnu.org/licenses/>.
26 |
27 | """
28 |
# Total read pairs = sum of rows over all non-empty G(R) matrices. Each pickle
# listed in G_of_R_list.txt holds a (read pairs x transcripts) matrix, or None
# when that chunk contained no usable reads.
total = 0
with open('G_of_R_list.txt') as g_of_r_list:
	for line in g_of_r_list:
		with open(line.strip(), 'rb') as g_of_r_file:
			G_of_R = pickle.load(g_of_r_file)
		if G_of_R is not None:
			# Reuse the matrix already in memory; the original unpickled the
			# same file a second time just to read .shape[0].
			total += G_of_R.shape[0]

# Map transcript name -> EM-estimated relative expression fraction.
with open('names_final.pkl', 'rb') as names_file, open('X_final.pkl', 'rb') as x_file:
	X_est = dict(zip(pickle.load(names_file), pickle.load(x_file)))

written_seqs = set([])

# Header for the tab-separated report. NOTE(review): the third column is
# labeled "passive" but is filled from the "_runthrough" category below —
# presumably runthrough transcription is what "passive" means here; confirm
# against downstream consumers before renaming either side.
print("family.category.locus.strand\tonly\t3prunon\tpassive")

names = list(X_est.keys())

for name in names:
	if 'exon' not in name:
		# Drop the trailing category suffix (_only/_3prunon/_runthrough) to
		# recover the locus name; report each locus only once.
		seq_name = '_'.join(name.split('_')[:-1])
		if seq_name in written_seqs:
			continue
		written_seqs.add(seq_name)
		print_string = seq_name.split('(')[0]
		# Categories absent from the EM output contribute zero reads.
		only_name = seq_name + '_only'
		if only_name not in X_est:
			X_est[only_name] = 0.0
		print_string += '\t' + str(total * X_est[only_name])
		runon_name = seq_name + '_3prunon'
		if runon_name not in X_est:
			X_est[runon_name] = 0.0
		print_string += '\t' + str(total * X_est[runon_name])
		runthrough_name = seq_name + '_runthrough'
		if runthrough_name not in X_est:
			X_est[runthrough_name] = 0.0
		print_string += '\t' + str(total * X_est[runthrough_name])
		print(print_string)
63 |
--------------------------------------------------------------------------------
/utilities/report_l1hs_transcription.py:
--------------------------------------------------------------------------------
1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
2 | try:
3 | import cPickle as pickle
4 | except ImportError:
5 | import pickle
6 |
7 | """
8 | Extract the estimate of proper transcription of L1HS elements.
9 |
10 | Copyright (C) 2019 Wilson McKerrow
11 |
12 | This program is free software: you can redistribute it and/or modify
13 | it under the terms of the GNU General Public License as published by
14 | the Free Software Foundation, either version 3 of the License, or
15 | (at your option) any later version.
16 |
17 | This program is distributed in the hope that it will be useful,
18 | but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 | GNU General Public License for more details.
21 |
22 | You should have received a copy of the GNU General Public License
23 | along with this program. If not, see <http://www.gnu.org/licenses/>.
24 |
25 | """
26 |
# Total read pairs = sum of rows over all non-empty G(R) matrices. Each pickle
# listed in G_of_R_list.txt holds a (read pairs x transcripts) matrix, or None
# when that chunk contained no usable reads.
total = 0
with open('G_of_R_list.txt') as g_of_r_list:
	for line in g_of_r_list:
		with open(line.strip(), 'rb') as g_of_r_file:
			G_of_R = pickle.load(g_of_r_file)
		if G_of_R is not None:
			total += G_of_R.shape[0]

# Map transcript name -> EM-estimated relative expression fraction.
with open('names_final.pkl', 'rb') as names_file, open('X_final.pkl', 'rb') as x_file:
	X_est = dict(zip(pickle.load(names_file), pickle.load(x_file)))

written_seqs = set([])

print("family.category.locus.strand\tonly\t3prunon")

names = list(X_est.keys())

for name in names:
	if 'L1HS' in name:
		# Drop the trailing category suffix to recover the locus name; handle
		# each locus only once.
		seq_name = '_'.join(name.split('_')[:-1])
		if seq_name in written_seqs:
			continue
		written_seqs.add(seq_name)
		print_string = seq_name.split('(')[0]

		# Proper transcription = "only" + "3prunon" categories; passive
		# transcription = sense + antisense runthrough categories.
		total_proper = 0.0
		total_passive = 0.0

		# Categories absent from the EM output contribute zero reads.
		only_name = seq_name + '_only'
		if only_name not in X_est:
			X_est[only_name] = 0.0
		print_string += '\t' + str(total * X_est[only_name])
		total_proper += total * X_est[only_name]
		runon_name = seq_name + '_3prunon'
		if runon_name not in X_est:
			X_est[runon_name] = 0.0
		print_string += '\t' + str(total * X_est[runon_name])
		total_proper += total * X_est[runon_name]
		senserunthrough_name = seq_name + '_senserunthrough'
		if senserunthrough_name not in X_est:
			X_est[senserunthrough_name] = 0.0
		total_passive += total * X_est[senserunthrough_name]
		antisenserunthrough_name = seq_name + '_antisenserunthrough'
		if antisenserunthrough_name not in X_est:
			X_est[antisenserunthrough_name] = 0.0
		# BUGFIX: the original added the SENSE runthrough estimate here a
		# second time, so antisense runthrough never entered total_passive.
		total_passive += total * X_est[antisenserunthrough_name]
		# Report a locus only when proper transcription clearly dominates
		# passive runthrough (3x threshold).
		if total_proper > 3 * total_passive:
			print(print_string)
72 |
--------------------------------------------------------------------------------
/utilities/report_l1hs_transcription_unstranded.py:
--------------------------------------------------------------------------------
1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
2 | try:
3 | import cPickle as pickle
4 | except ImportError:
5 | import pickle
6 |
7 | """
8 | Extract the estimate of proper transcription of L1HS elements.
9 |
10 | Copyright (C) 2019 Wilson McKerrow
11 |
12 | This program is free software: you can redistribute it and/or modify
13 | it under the terms of the GNU General Public License as published by
14 | the Free Software Foundation, either version 3 of the License, or
15 | (at your option) any later version.
16 |
17 | This program is distributed in the hope that it will be useful,
18 | but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 | GNU General Public License for more details.
21 |
22 | You should have received a copy of the GNU General Public License
23 | along with this program. If not, see <http://www.gnu.org/licenses/>.
24 |
25 | """
26 |
# Total read pairs = sum of rows over all non-empty G(R) matrices. Each pickle
# listed in G_of_R_list.txt holds a (read pairs x transcripts) matrix, or None
# when that chunk contained no usable reads.
total = 0
with open('G_of_R_list.txt') as g_of_r_list:
	for line in g_of_r_list:
		with open(line.strip(), 'rb') as g_of_r_file:
			G_of_R = pickle.load(g_of_r_file)
		if G_of_R is not None:
			total += G_of_R.shape[0]

# Map transcript name -> EM-estimated relative expression fraction.
with open('names_final.pkl', 'rb') as names_file, open('X_final.pkl', 'rb') as x_file:
	X_est = dict(zip(pickle.load(names_file), pickle.load(x_file)))

written_seqs = set([])

print("family.category.locus.strand\tonly\t3prunon")

names = list(X_est.keys())

for name in names:
	if 'L1HS' in name:
		# Drop the trailing category suffix to recover the locus name; handle
		# each locus only once.
		seq_name = '_'.join(name.split('_')[:-1])
		if seq_name in written_seqs:
			continue
		written_seqs.add(seq_name)
		print_string = seq_name.split('(')[0]

		# Proper transcription = "only" + "3prunon"; passive = "_runthrough"
		# (unstranded data has a single, strand-agnostic runthrough category).
		total_proper = 0.0
		total_passive = 0.0

		# Categories absent from the EM output contribute zero reads.
		only_name = seq_name + '_only'
		if only_name not in X_est:
			X_est[only_name] = 0.0
		print_string += '\t' + str(total * X_est[only_name])
		total_proper += total * X_est[only_name]
		runon_name = seq_name + '_3prunon'
		if runon_name not in X_est:
			X_est[runon_name] = 0.0
		print_string += '\t' + str(total * X_est[runon_name])
		total_proper += total * X_est[runon_name]
		runthrough_name = seq_name + '_runthrough'
		if runthrough_name not in X_est:
			X_est[runthrough_name] = 0.0
		total_passive += total * X_est[runthrough_name]
		# Report a locus only when proper transcription clearly dominates
		# passive runthrough (3x threshold).
		if total_proper > 3 * total_passive:
			print(print_string)
67 | print(print_string)
68 |
--------------------------------------------------------------------------------